development/scripts/disassemble_tombstone.py

#!/usr/bin/python

"""Disassemble the code stored in a tombstone.

The classes in this module use an interface, ProcessLine, so that they can be
chained together to do arbitrary processing. The current classes support
disassembling the bytes embedded in tombstones and printing output to stdout.
"""


import re
import subprocess
import sys
import tempfile
import architecture


# Assembly text written at the top of the scratch file before the code bytes:
# it declares _start as a global function symbol and defines its label.
STANDARD_PROLOGUE = """
       .type   _start, %function
       .globl  _start
_start:
"""


# Prologue written instead of STANDARD_PROLOGUE when the abi is 'arm' and bit
# 0x20 of cpsr is set: the standard prologue followed by directives that
# select 16-bit code and define the thumb_start function label.
THUMB_PROLOGUE = STANDARD_PROLOGUE + """
       .code   16
       .thumb_func
       .type   thumb_start, %function
thumb_start:
"""


def Disassemble(line_generator):
  """Yield tombstone lines, replacing raw code blocks with their disassembly.

  Lines are passed through unchanged until the header naming the ABI is seen.
  After that, register dumps are recorded and every code block is handed to
  ProcessCodeBlock, whose output replaces the raw words.

  Args:
    line_generator: any iterable of tombstone lines (an open file, a
      generator, a list, ...).

  Yields:
    Output lines, each carrying its own line terminator.
  """
  # Work on one shared iterator. The loops below and the helpers they call
  # each resume where the previous consumer stopped; a re-iterable input such
  # as a list would otherwise be restarted by every loop and emitted twice.
  line_generator = iter(line_generator)
  abi_line = re.compile("(ABI: \'(.*)\')")
  abi = None
  tools = None
  # Process global headers
  for line in line_generator:
    yield line
    abi_header = abi_line.search(line)
    if abi_header:
      abi = abi_header.group(2)
      # Look up the tools here so we don't do a lookup for each code block.
      tools = architecture.Architecture(abi)
      break
  # The rest of the file consists of:
  #   o Lines that should pass through unchanged
  #   o Blocks of register values, which follow a 'pid: ...' line and end with
  #     'backtrace:' line
  #   o Blocks of code represented as words, which start with 'code around ...'
  #     and end with a line that doesn't look like a list of words.
  #
  # The only constraint on the ordering of these blocks is that the register
  # values must come before the first code block.
  #
  # It's easiest to nest register processing in the codeblock search loop.
  register_list_re = re.compile('^pid: ')
  codeblock_re = re.compile('^code around ([a-z0-9]+)|memory near (pc)')
  register_text = {}
  for line in line_generator:
    yield line
    if register_list_re.search(line):
      # Start from a clean slate so values from an earlier dump are dropped.
      register_text = {}
      for output in ProcessRegisterList(line_generator, register_text):
        yield output
    code_match = codeblock_re.search(line)
    if code_match:
      # Only one of the two groups can match; the other contributes ''.
      code_reg = ''.join(code_match.groups(''))
      for output in ProcessCodeBlock(
          abi, tools, code_reg, register_text, line_generator):
        yield output


def ProcessRegisterList(line_generator, rval):
  """Pass a register dump through while recording the register values.

  Every line read from line_generator is yielded unchanged. Reading stops
  once a line starting with 'backtrace:' has been yielded.

  Args:
    line_generator: iterator of tombstone lines to consume from.
    rval: dict updated in place, mapping register name to its value text.

  Yields:
    Each consumed line, unmodified.
  """
  for text in line_generator:
    yield text
    if text.startswith('backtrace:'):
      return
    if not text.startswith(' '):
      continue
    # Indented lines hold the registers as alternating name, value tokens.
    tokens = text.split()
    assert len(tokens) % 2 == 0
    names = tokens[0::2]
    values = tokens[1::2]
    for position, name in enumerate(names):
      rval[name] = values[position]


def ProcessCodeBlock(abi, tools, register_name, register_text, line_generator):
  """Replace a block of raw code words with its disassembly.

  Consumes lines from line_generator for as long as they have the length
  expected of a memory dump line and writes their bytes to a scratch assembly
  file as .byte directives. That file is then assembled, linked at the
  block's start address and disassembled with the commands supplied by tools.

  Args:
    abi: ABI name. 'arm' enables the cpsr check that selects the thumb
      prologue; any value starting with 'arm' enables the symbol workaround
      applied to the object file.
    tools: object providing WordToBytes(word) as well as Assemble(args),
      Link(args) and Disassemble(args), whose results are run as commands
      through subprocess.
    register_name: key in register_text of the register whose value marks
      the instruction to flag with an arrow.
    register_text: dict mapping register names to hexadecimal value text.
    line_generator: iterator positioned just after the block's header line.
      The line that ends the block is consumed and not emitted.

  Yields:
    Disassembly lines prefixed with '--->' for the instruction at the
    register's address and with four spaces otherwise, then a single newline.

  Raises:
    KeyError: register_name (or 'cpsr' when abi is 'arm') is not in
      register_text.
    AssertionError: the register value is neither 8 nor 16 characters long,
      or a dump line does not start at the expected address.
    subprocess.CalledProcessError: the assembler, sed or linker fails.
  """
  program_counter = register_text[register_name]
  program_counter_val = int(program_counter, 16)
  # Everything opened below is tracked here (or in disassembler) so that the
  # finally clause releases it even when a tool fails or the consumer stops
  # iterating early. Closing a temporary file twice is harmless, so the early
  # close() calls on the success path are kept.
  temp_files = []
  disassembler = None
  try:
    scratch_file = tempfile.NamedTemporaryFile(suffix='.s')
    temp_files.append(scratch_file)
    # ARM code comes in two flavors: arm and thumb. Figure out the one
    # to use by peeking in the cpsr.
    if abi == 'arm' and int(register_text['cpsr'], 16) & 0x20:
      scratch_file.write(THUMB_PROLOGUE)
    else:
      scratch_file.write(STANDARD_PROLOGUE)
    # Retains the hexadecimal text for the start of the block
    start_address = None
    # Maintains a numeric counter for the address of the current byte
    current_address = None
    # Handle the 3 different file formats that we've observed.
    if len(program_counter) == 8:
      block_line_len = [67]
      block_num_words = 4
    else:
      assert len(program_counter) == 16
      block_line_len = [57, 73]
      block_num_words = 2
    # Now generate assembly from the bytes in the code block.
    # NOTE: if the very first line fails the length check, start_address stays
    # None and building the linker command below raises TypeError.
    for line in line_generator:
      words = line.split()
      # Be conservative and stop interpreting if the line length is wrong
      # We can't count words because spaces can appear in the text
      # representation of the memory.
      if len(line) not in block_line_len:
        break
      # Double check the address at the start of each line
      if current_address is None:
        start_address = words[0]
        current_address = int(start_address, 16)
      else:
        assert current_address == int(words[0], 16)
      for word in words[1:block_num_words+1]:
        # Handle byte swapping
        for byte in tools.WordToBytes(word):
          # Emit a label at the desired program counter.
          # This will cause the disassembler to resynchronize at this point,
          # allowing us to position the arrow and also ensuring that we decode
          # the instruction properly.
          if current_address == program_counter_val:
            scratch_file.write('program_counter_was_here:\n')
          scratch_file.write('  .byte 0x%s\n' % byte)
          current_address += 1
    # Make sure the assembler sees everything written so far.
    scratch_file.flush()
    # Assemble the scratch file and relocate it to the block address with the
    # linker.
    object_file = tempfile.NamedTemporaryFile(suffix='.o')
    temp_files.append(object_file)
    subprocess.check_call(tools.Assemble([
        '-o', object_file.name, scratch_file.name]))
    scratch_file.close()

    # Work around ARM data tagging: rewrite the NUL-delimited symbol name
    # '$d' (0x24 0x64) in the object file to '$q' (0x24 0x71).
    # NOTE(review): this comment used to say '$t' (which would be 0x74);
    # confirm which replacement is intended.
    if abi.startswith('arm'):
      subprocess.check_call(
          ['sed', '-i', '-e', "s/\\x00\\x24\\x64\\x00/\\x00\\x24\\x71\\x00/", object_file.name])

    linked_file = tempfile.NamedTemporaryFile(suffix='.o')
    temp_files.append(linked_file)
    cmd = tools.Link([
        '-Ttext', '0x' + start_address, '-o', linked_file.name, object_file.name])
    subprocess.check_call(cmd)
    object_file.close()
    disassembler = subprocess.Popen(tools.Disassemble([
        '-S', linked_file.name]), stdout=subprocess.PIPE)
    # Skip some of the annoying assembler headers.
    emit = False
    start_pattern = start_address + ' '
    # objdump padding varies between 32 bit and 64 bit architectures
    arrow_pattern = re.compile('^[ 0]*%8x:\t' % program_counter_val)
    for line in disassembler.stdout:
      emit = emit or line.startswith(start_pattern)
      if emit and len(line) > 1 and 'program_counter_was_here' not in line:
        if arrow_pattern.search(line):
          yield '--->' + line
        else:
          yield '    ' + line
  finally:
    if disassembler is not None:
      # Close our end of the pipe first so a still-running disassembler
      # cannot block on a full pipe, then reap it to avoid leaving a zombie.
      disassembler.stdout.close()
      disassembler.wait()
    for temp_file in temp_files:
      temp_file.close()
  yield '\n'


def main(argv):
  for fn in argv[1:]:
    for line in Disassemble(open(fn, 'r')):
      print line,


# When run as a script, process the files named on the command line.
if __name__ == '__main__':
  main(sys.argv)