#!/usr/bin/python
#
# Copyright 2013 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import fileinput
import operator
import optparse
import os
import pprint
import re
import subprocess
import sys
import json


def format_bytes(bytes):
    """Pretty-print a number of bytes.

    Values above 1e6 are rendered as '<x.y>m', values above 1e3 as
    '<x.y>k', anything else as the plain number.
    """
    if bytes > 1e6:
        bytes = bytes / 1.0e6
        return '%.1fm' % bytes
    if bytes > 1e3:
        bytes = bytes / 1.0e3
        return '%.1fk' % bytes
    return str(bytes)


def symbol_type_to_human(type):
    """Convert a symbol type as printed by nm into a human-readable name.

    Raises KeyError for a type letter that is not in the table.
    """
    return {
        'b': 'bss',
        'd': 'data',
        'r': 'read-only data',
        't': 'code',
        'u': 'weak symbol',  # Unique global.
        'w': 'weak symbol',
        'v': 'weak symbol'
    }[type]


def parse_nm(input):
    """Parse nm output.

    Argument: an iterable over lines of nm output.

    Yields: (symbol name, symbol type, symbol size, source file path).
    Path may be None if nm couldn't figure out the source file.

    Lines without a size, BSS symbols and external/weak references are
    skipped; any other line that matches none of the patterns is reported
    on stderr as 'unparsed'.
    """
    # Match lines with size + symbol + optional filename.
    sym_re = re.compile(r'^[0-9a-f]+ ([0-9a-f]+) (.) ([^\t]+)(?:\t(.*):\d+)?$')

    # Match lines with addr but no size.
    addr_re = re.compile(r'^[0-9a-f]+ (.) ([^\t]+)(?:\t.*)?$')
    # Match lines that don't have an address at all -- typically external
    # symbols.
    noaddr_re = re.compile(r'^ + (.) (.*)$')

    for line in input:
        line = line.rstrip()
        match = sym_re.match(line)
        if match:
            size, sym_type, sym = match.groups()[0:3]
            size = int(size, 16)
            sym_type = sym_type.lower()
            if sym_type in ('u', 'v'):
                sym_type = 'w'  # just call them all weak
            if sym_type == 'b':
                continue  # skip all BSS for now
            path = match.group(4)
            yield sym, sym_type, size, path
            continue
        if addr_re.match(line):
            # No size == we don't care.
            continue
        match = noaddr_re.match(line)
        if match:
            sym_type, sym = match.groups()
            if sym_type in ('U', 'w'):
                # external or weak symbol
                continue

        sys.stderr.write('unparsed: %s\n' % repr(line))


def demangle(ident, cppfilt):
    """Return ident demangled by the cppfilt binary when it looks mangled.

    Identifiers that do not start with '_Z', or any identifier when
    cppfilt is falsy, are returned unchanged.
    """
    if cppfilt and ident.startswith('_Z'):
        # Demangle names when possible. Mangled names all start with _Z.
        # universal_newlines makes the output text on Python 3 as well.
        ident = subprocess.check_output(
            [cppfilt, ident], universal_newlines=True).strip()
    return ident


class Suffix(object):
    """A compiler-generated symbol suffix and the tag that replaces it."""

    def __init__(self, suffix, replacement):
        self.pattern = '^(.*)' + suffix + '(.*)$'
        self.re = re.compile(self.pattern)
        self.replacement = replacement


class SuffixCleanup(object):
    """Pre-compile suffix regular expressions."""

    def __init__(self):
        self.suffixes = [
            Suffix(r'\.part\.([0-9]+)', 'part'),
            Suffix(r'\.constprop\.([0-9]+)', 'constprop'),
            Suffix(r'\.isra\.([0-9]+)', 'isra'),
        ]

    def cleanup(self, ident, cppfilt):
        """Cleanup identifiers that have suffixes preventing demangling,
        and demangle if possible.

        Each recognized suffix is cut out of the identifier and re-added
        at the end as ' [<kind>.<number>]'.
        """
        to_append = []
        for s in self.suffixes:
            found = s.re.match(ident)
            if not found:
                continue
            to_append.append(
                ' [' + s.replacement + '.' + found.group(2) + ']')
            ident = found.group(1) + found.group(3)
        if to_append:
            # Only try to demangle if there were suffixes.
            ident = demangle(ident, cppfilt)
        return ident + ''.join(to_append)


suffix_cleanup = SuffixCleanup()


def parse_cpp_name(name, cppfilt):
    """Split a (demangled) C++ symbol name into its namespace-like parts.

    Returns a list of components, e.g. 'a::b<int>::c(int)' becomes
    ['a', 'b<int>', 'c(int)'].
    """
    name = suffix_cleanup.cleanup(name, cppfilt)

    # Turn prefixes into suffixes so namespacing works.
    prefixes = [
        ['bool ', ''],
        ['construction vtable for ', ' [construction vtable]'],
        ['global constructors keyed to ', ' [global constructors]'],
        ['guard variable for ', ' [guard variable]'],
        ['int ', ''],
        ['non-virtual thunk to ', ' [non-virtual thunk]'],
        ['typeinfo for ', ' [typeinfo]'],
        ['typeinfo name for ', ' [typeinfo name]'],
        ['virtual thunk to ', ' [virtual thunk]'],
        ['void ', ''],
        ['vtable for ', ' [vtable]'],
        ['VTT for ', ' [VTT]'],
    ]
    for prefix, replacement in prefixes:
        if name.startswith(prefix):
            name = name[len(prefix):] + replacement
    # Simplify parenthesis parsing.
    replacements = [
        ['(anonymous namespace)', '[anonymous namespace]'],
    ]
    for value, replacement in replacements:
        name = name.replace(value, replacement)

    def parse_one(val):
        """Returns (leftmost-part, remaining)."""
        # The length guard keeps a bare 'operator' identifier from
        # indexing past the end of the string.
        if (val.startswith('operator') and len(val) > 8 and
                not (val[8].isalnum() or val[8] == '_')):
            # Operator overload function, terminate.
            return (val, '')
        co = val.find('::')
        lt = val.find('<')
        pa = val.find('(')
        co = len(val) if co == -1 else co
        lt = len(val) if lt == -1 else lt
        pa = len(val) if pa == -1 else pa
        if co < lt and co < pa:
            # Namespace or type name.
            return (val[:co], val[co+2:])
        if lt < pa:
            # Template. Make sure we capture nested templates too.
            open_tmpl = 1
            gt = lt
            while val[gt] != '>' or open_tmpl != 0:
                gt = gt + 1
                if val[gt] == '<':
                    open_tmpl = open_tmpl + 1
                if val[gt] == '>':
                    open_tmpl = open_tmpl - 1
            ret = val[gt+1:]
            if ret.startswith('::'):
                ret = ret[2:]
            if ret.startswith('('):
                # Template function, terminate.
                return (val, '')
            return (val[:gt+1], ret)
        # Terminate with any function name, identifier, or unmangled name.
        return (val, '')

    parts = []
    while len(name) > 0:
        (part, name) = parse_one(name)
        assert len(part) > 0
        parts.append(part)
    return parts


def treeify_syms(symbols, strip_prefix=None, cppfilt=None):
    """Group symbols into a nested dict keyed by namespace or source path.

    symbols: iterable of (name, type, size, path) as yielded by parse_nm.
    strip_prefix: optional path prefix removed from source paths.
    cppfilt: optional path to c++filt, forwarded to the demangling helpers.

    Inner nodes are dicts carrying a '$bloat_symbols' {type: count} entry;
    leaves are (total size, {type: count}) tuples.
    """
    dirs = {}
    for sym, sym_type, size, path in symbols:
        if path:
            path = os.path.normpath(path)
            if strip_prefix and path.startswith(strip_prefix):
                path = path[len(strip_prefix):]
            # Drop the leading slash even after stripping a prefix given
            # without a trailing slash; otherwise split() would produce an
            # empty first component and trip the assert below.
            if path.startswith('/'):
                path = path[1:]
            path = ['[path]'] + path.split('/')

        parts = parse_cpp_name(sym, cppfilt)
        if len(parts) == 1:
            if path:
                # No namespaces, group with path.
                parts = path + parts
            else:
                new_prefix = ['[ungrouped]']
                regroups = [
                    ['.L.str', '[str]'],
                    ['.L__PRETTY_FUNCTION__.', '[__PRETTY_FUNCTION__]'],
                    ['.L__func__.', '[__func__]'],
                    ['.Lswitch.table', '[switch table]'],
                ]
                for prefix, group in regroups:
                    if parts[0].startswith(prefix):
                        parts[0] = parts[0][len(prefix):]
                        parts[0] = demangle(parts[0], cppfilt)
                        new_prefix += [group]
                        break
                parts = new_prefix + parts

        key = parts.pop()
        tree = dirs
        try:
            for part in parts:
                assert part != '', path
                if part not in tree:
                    tree[part] = {'$bloat_symbols': {}}
                counts = tree[part]['$bloat_symbols']
                counts[sym_type] = counts.get(sym_type, 0) + 1
                tree = tree[part]
            old_size, old_symbols = tree.get(key, (0, {}))
            old_symbols[sym_type] = old_symbols.get(sym_type, 0) + 1
            tree[key] = (old_size + size, old_symbols)
        except Exception:
            # Report which symbol broke the tree, then let the error
            # propagate.
            sys.stderr.write(
                'sym `%s`\tparts `%s`\tkey `%s`\n' % (sym, parts, key))
            raise
    return dirs


def jsonify_tree(tree, name):
    """Convert a treeify_syms tree into the nested dict dumped as JSON.

    Every node gets a 'name' (label plus formatted size) and a 'data'
    dict with its '$area'; inner nodes list their 'children' sorted by
    decreasing size.
    """
    children = []
    total = 0

    for key, val in tree.items():
        if key == '$bloat_symbols':
            continue
        if isinstance(val, dict):
            subtree = jsonify_tree(val, key)
            total += subtree['data']['$area']
            children.append(subtree)
        else:
            (size, symbols) = val
            total += size
            # A leaf is expected to hold a single symbol type.
            assert len(symbols) == 1, symbols
            symbol = symbol_type_to_human(next(iter(symbols)))
            children.append({
                'name': key + ' ' + format_bytes(size),
                'data': {
                    '$area': size,
                    '$symbol': symbol,
                }
            })

    children.sort(key=lambda child: -child['data']['$area'])
    dominant_symbol = ''
    if '$bloat_symbols' in tree:
        dominant_symbol = symbol_type_to_human(
            max(tree['$bloat_symbols'].items(),
                key=operator.itemgetter(1))[0])
    return {
        'name': name + ' ' + format_bytes(total),
        'data': {
            '$area': total,
            '$dominant_symbol': dominant_symbol,
        },
        'children': children,
    }


def dump_nm(nmfile, strip_prefix, cppfilt):
    """Print the symbol tree of nmfile as a 'var kTree = ...' statement."""
    dirs = treeify_syms(parse_nm(nmfile), strip_prefix, cppfilt)
    print('var kTree = ' + json.dumps(jsonify_tree(dirs, '[everything]'),
                                      indent=2))


def parse_objdump(input):
    """Parse objdump -h output.

    Returns (sections, debug_sections), each a list of (name, size)
    pairs. A leading '.' is dropped from names, and sections whose name
    then starts with 'debug_' go to the second list without that prefix.
    """
    sec_re = re.compile(r'^\d+ (\S+) +([0-9a-z]+)')
    sections = []
    debug_sections = []

    for line in input:
        line = line.strip()
        match = sec_re.match(line)
        if match:
            name, size = match.groups()
            if name.startswith('.'):
                name = name[1:]
            if name.startswith('debug_'):
                name = name[len('debug_'):]
                debug_sections.append((name, int(size, 16)))
            else:
                sections.append((name, int(size, 16)))
            continue
    return sections, debug_sections


def jsonify_sections(name, sections):
    """Build a treemap node for a list of (section name, size) pairs."""
    children = []
    total = 0
    for section, size in sections:
        children.append({
            'name': section + ' ' + format_bytes(size),
            'data': {'$area': size}
        })
        total += size

    children.sort(key=lambda child: -child['data']['$area'])

    return {
        'name': name + ' ' + format_bytes(total),
        'data': {'$area': total},
        'children': children
    }


def dump_sections(objdump):
    """Print the sections of objdump -h output as 'var kTree = ...'."""
    sections, debug_sections = parse_objdump(objdump)
    sections = jsonify_sections('sections', sections)
    debug_sections = jsonify_sections('debug', debug_sections)
    size = sections['data']['$area'] + debug_sections['data']['$area']
    print('var kTree = ' + json.dumps({
        'name': 'top ' + format_bytes(size),
        'data': {'$area': size},
        'children': [debug_sections, sections]}))


usage = """%prog [options] MODE

Modes are:
  syms: output symbols json suitable for a treemap
  dump: print symbols sorted by size (pipe to head for best output)
  sections: output binary sections json suitable for a treemap

nm output passed to --nm-output should be from running a command
like the following (note, can take a long time -- 30 minutes):
  nm -C -S -l /path/to/binary > nm.out

objdump output passed to --objdump-output should be from a command
like:
  objdump -h /path/to/binary > objdump.out"""
parser = optparse.OptionParser(usage=usage)
parser.add_option('--nm-output', action='store', dest='nmpath',
                  metavar='PATH', default='nm.out',
                  help='path to nm output [default=nm.out]')
parser.add_option('--objdump-output', action='store', dest='objdumppath',
                  metavar='PATH', default='objdump.out',
                  help='path to objdump output [default=objdump.out]')
parser.add_option('--strip-prefix', metavar='PATH', action='store',
                  help='strip PATH prefix from paths; e.g. /path/to/src/root')
parser.add_option('--filter', action='store',
                  help='include only symbols/files matching FILTER')
parser.add_option('--c++filt', action='store', metavar='PATH', dest='cppfilt',
                  default='c++filt', help="Path to c++filt, used to demangle "
                  "symbols that weren't handled by nm. Set to an invalid path "
                  "to disable.")


def _usable_cppfilt(cppfilt):
    """Return cppfilt if it runs and echoes 'main' unchanged, else None."""
    try:
        res = subprocess.check_output([cppfilt, 'main'],
                                      universal_newlines=True)
    except (OSError, subprocess.CalledProcessError):
        sys.stderr.write("Could not find c++filt at %s, "
                         "output won't be demangled.\n" % cppfilt)
        return None
    if res.strip() != 'main':
        sys.stderr.write("%s failed demangling, "
                         "output won't be demangled.\n" % cppfilt)
        return None
    return cppfilt


def main(argv=None):
    """Run the command-line tool.

    argv: argument list without the program name; None means sys.argv[1:].
    Exits with status 1 when the number of positional arguments is not
    exactly one.
    """
    opts, args = parser.parse_args(argv)

    if len(args) != 1:
        parser.print_usage()
        sys.exit(1)

    mode = args[0]
    if mode == 'syms':
        with open(opts.nmpath, 'r') as nmfile:
            opts.cppfilt = _usable_cppfilt(opts.cppfilt)
            dump_nm(nmfile, strip_prefix=opts.strip_prefix,
                    cppfilt=opts.cppfilt)
    elif mode == 'sections':
        with open(opts.objdumppath, 'r') as objdumpfile:
            dump_sections(objdumpfile)
    elif mode == 'dump':
        with open(opts.nmpath, 'r') as nmfile:
            syms = list(parse_nm(nmfile))
        # a list of (sym, type, size, path); sort by size.
        syms.sort(key=lambda x: -x[2])
        total = 0
        for sym, sym_type, size, path in syms:
            if sym_type in ('b', 'w'):
                continue  # skip bss and weak symbols
            if path is None:
                path = ''
            if opts.filter and not (opts.filter in sym or
                                    opts.filter in path):
                continue
            print('%6s %s (%s) %s' % (format_bytes(size), sym,
                                      symbol_type_to_human(sym_type), path))
            total += size
        print('%6s %s' % (format_bytes(total), 'total'))
    else:
        print('unknown mode')
        parser.print_usage()


if __name__ == '__main__':
    main()