#!/usr/bin/env python
# Copyright 2016 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

6"""Script to transform and merge sancov files into human readable json-format.

The script supports three actions:
all: Writes a json file with all instrumented lines of all executables.
merge: Merges sancov files with coverage output into an existing json file.
split: Splits a json file into separate files per covered source file.

The json data is structured as follows:
{
  "version": 1,
  "tests": ["executable1", "executable2", ...],
  "files": {
    "file1": [[<instr line 1>, <bit_mask>], [<instr line 2>, <bit_mask>], ...],
    "file2": [...],
    ...
  }
}

The executables are sorted and determine the test bit mask. The executable at
(zero-based) index i sets bit i, i.e. its mask value is 2**i: executable1 = 1,
executable3 = 4, etc. Hence, a line covered by executable1 and executable3 will
have bit_mask == 5 == 0b101. The number of tests is restricted to 52 in
version 1, to allow JavaScript JSON parsing of the bitsets encoded as numbers.
JS max safe int is 2**53 - 1.

The line-number-bit_mask pairs are sorted by line number and don't contain
duplicates.

Split json data preserves the same format, but only contains one file per
json file.

The sancov tool is expected to be in the llvm compiler-rt third-party
directory. It's not checked out by default and must be added as a custom_deps
entry:
'v8/third_party/llvm/projects/compiler-rt':
    'https://chromium.googlesource.com/external/llvm.org/compiler-rt.git'
"""

import argparse
import json
import logging
import os
import re
import subprocess
import sys

from multiprocessing import Pool, cpu_count


logging.basicConfig(level=logging.INFO)

# Files to exclude from coverage. Dropping their data early speeds up
# processing. The cc files they contain are already excluded from
# instrumentation, but inlined data is referenced through v8's object files.
EXCLUSIONS = [
  'buildtools',
  'src/third_party',
  'third_party',
  'test',
  'testing',
]

# Executables found in the build output for which no coverage is generated.
# Exclude them from the coverage data file.
EXE_BLACKLIST = [
  'generate-bytecode-expectations',
  'hello-world',
  'mksnapshot',
  'parser-shell',
  'process',
  'shell',
]

# V8 checkout directory.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(
    os.path.abspath(__file__))))

# The sancov tool location.
SANCOV_TOOL = os.path.join(
    BASE_DIR, 'third_party', 'llvm', 'projects', 'compiler-rt',
    'lib', 'sanitizer_common', 'scripts', 'sancov.py')

# Simple script to sanitize the PCs from objdump.
SANITIZE_PCS = os.path.join(BASE_DIR, 'tools', 'sanitizers', 'sanitize_pcs.py')

# The llvm symbolizer location.
SYMBOLIZER = os.path.join(
    BASE_DIR, 'third_party', 'llvm-build', 'Release+Asserts', 'bin',
    'llvm-symbolizer')

# Number of cpus.
CPUS = cpu_count()

# Regexp to find sancov files as output by sancov_merger.py. Also grabs the
# executable name in group 1.
SANCOV_FILE_RE = re.compile(r'^(.*)\.result\.sancov$')
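# For illustration (executable name hypothetical): 'd8.result.sancov' matches,
# with group 1 == 'd8'.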


def executables(build_dir):
  """Iterates over executable files in the build directory."""
  for f in os.listdir(build_dir):
    file_path = os.path.join(build_dir, f)
    if (os.path.isfile(file_path) and
        os.access(file_path, os.X_OK) and
        f not in EXE_BLACKLIST):
      yield file_path


def process_symbolizer_output(output, build_dir):
  """Post-process llvm symbolizer output.

  Excludes files outside the v8 checkout or matching the exclusion list above
  from further processing. Drops the character index in each line.

  Returns: A mapping of file names to lists of line numbers. The file names
           have relative paths to the v8 base directory. The lists of line
           numbers don't contain duplicate lines and are sorted.
  """
  # Path prefix added by the llvm symbolizer including trailing slash.
  output_path_prefix = os.path.join(build_dir, '..', '..', '')

  # Drop path prefix when iterating lines. The path is redundant and takes
  # too much space. Drop files outside that path, e.g. generated files in
  # the build dir and absolute paths to c++ library headers.
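  # Illustrative (hypothetical paths): with build_dir '/work/v8/out/Release'
  # the prefix is '/work/v8/out/Release/../../', so the symbolizer line
  # '/work/v8/out/Release/../../src/api.cc:123:7' becomes 'src/api.cc:123:7'.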
  def iter_lines():
    for line in output.strip().splitlines():
      if line.startswith(output_path_prefix):
        yield line[len(output_path_prefix):]

  # Map file names to sets of instrumented line numbers.
  file_map = {}
  for line in iter_lines():
    # Drop the character number; we only care about line numbers. Each line
    # has the form: <file name>:<line number>:<character number>.
    file_name, number, _ = line.split(':')
    file_map.setdefault(file_name, set([])).add(int(number))

  # Remove exclusion patterns from file map. It's cheaper to do it after the
  # mapping, as there are few excluded files and we don't want to do this
  # check for numerous lines in ordinary files.
  def keep(file_name):
    for e in EXCLUSIONS:
      if file_name.startswith(e):
        return False
    return True

  # Return in serializable form and filter.
  return {k: sorted(file_map[k]) for k in file_map if keep(k)}


def get_instrumented_lines(executable):
  """Return the instrumented lines of an executable.

  Called through a multiprocessing pool.

  Returns: Post-processed llvm output as returned by process_symbolizer_output.
  """
  # The first two pipe stages mirror llvm's sancov.py tool, with 0x added to
  # the hex numbers. The results are piped into the llvm symbolizer, which
  # outputs for each PC: <file name with abs path>:<line number>:<character
  # number>. We bypass the sancov tool itself for speed.
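  # An objdump line matched by the greps looks roughly like this (addresses
  # hypothetical):
  #   4005d0:  e8 0b 13 00 00  callq  4018e0 <__sanitizer_cov@plt>
  # The second grep then extracts only the leading PC ('4005d0' here).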
  process = subprocess.Popen(
      'objdump -d %s | '
      'grep \'^\s\+[0-9a-f]\+:.*\scall\(q\|\)\s\+[0-9a-f]\+ '
      '<__sanitizer_cov\(_with_check\|\|_trace_pc_guard\)\(@plt\|\)>\' | '
      'grep \'^\s\+[0-9a-f]\+\' -o | '
      '%s | '
      '%s --obj %s -functions=none' %
          (executable, SANITIZE_PCS, SYMBOLIZER, executable),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      stdin=subprocess.PIPE,
      cwd=BASE_DIR,
      shell=True,
  )
  output, _ = process.communicate()
  assert process.returncode == 0
  return process_symbolizer_output(output, os.path.dirname(executable))


def merge_instrumented_line_results(exe_list, results):
  """Merge multiprocessing results for all instrumented lines.

  Args:
    exe_list: List of all executable names with absolute paths.
    results: List of results as returned by get_instrumented_lines.

  Returns: Dict to be used as json data as specified at the top of this file.
           The dictionary contains all instrumented lines of all files
           referenced by all executables.
  """
  def merge_files(x, y):
    for file_name, lines in y.iteritems():
      x.setdefault(file_name, set([])).update(lines)
    return x
  result = reduce(merge_files, results, {})

  # Return data as file->lines mapping. The lines are saved as lists
  # with (line number, test bits (as int)). The test bits are initialized with
  # 0, meaning instrumented, but no coverage.
  # The order of the test bits is given with key 'tests'. For now, these are
  # the executable names. We use a _list_ with two items instead of a tuple to
  # ease merging by allowing mutation of the second item.
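  # Illustrative shape of the returned dict (file and test names
  # hypothetical):
  #   {'version': 1, 'tests': ['d8'],
  #    'files': {'src/api.cc': [[12, 0], [15, 0]]}}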
  return {
    'version': 1,
    'tests': sorted(map(os.path.basename, exe_list)),
    'files': {f: map(lambda l: [l, 0], sorted(result[f])) for f in result},
  }


def write_instrumented(options):
  """Implements the 'all' action of this tool."""
  exe_list = list(executables(options.build_dir))
  logging.info('Reading instrumented lines from %d executables.',
               len(exe_list))
  pool = Pool(CPUS)
  try:
    results = pool.imap_unordered(get_instrumented_lines, exe_list)
  finally:
    pool.close()

  # Merge multiprocessing results and prepare output data.
  data = merge_instrumented_line_results(exe_list, results)

  logging.info('Read data from %d executables, which covers %d files.',
               len(data['tests']), len(data['files']))
  logging.info('Writing results to %s', options.json_output)

  # Write json output.
  with open(options.json_output, 'w') as f:
    json.dump(data, f, sort_keys=True)


def get_covered_lines(args):
  """Return the covered lines of an executable.

  Called through a multiprocessing pool. The args are expected to unpack to:
    cov_dir: Folder with sancov files merged by sancov_merger.py.
    executable: Absolute path to the executable that was called to produce the
                given coverage data.
    sancov_file: The merged sancov file with coverage data.

  Returns: A tuple of post-processed llvm output as returned by
           process_symbolizer_output and the executable name.
  """
  cov_dir, executable, sancov_file = args

  # Let the sancov tool print the covered PCs and pipe them through the llvm
  # symbolizer.
  process = subprocess.Popen(
      '%s print %s 2> /dev/null | '
      '%s --obj %s -functions=none' %
          (SANCOV_TOOL,
           os.path.join(cov_dir, sancov_file),
           SYMBOLIZER,
           executable),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      stdin=subprocess.PIPE,
      cwd=BASE_DIR,
      shell=True,
  )
  output, _ = process.communicate()
  assert process.returncode == 0
  return (
      process_symbolizer_output(output, os.path.dirname(executable)),
      os.path.basename(executable),
  )


def merge_covered_line_results(data, results):
  """Merge multiprocessing results for covered lines.

  The data is mutated; the results are merged into it in place.

  Args:
    data: Existing coverage data from json file containing all instrumented
          lines.
    results: List of results as returned by get_covered_lines.
  """

  # List of executables and mapping to the test bit mask. The number of
  # tests is restricted to 52, to allow JavaScript JSON parsing of
  # the bitsets encoded as numbers. JS max safe int is 2**53 - 1.
  exe_list = data['tests']
  assert len(exe_list) <= 52, 'Max 52 different tests are supported.'
  test_bit_masks = {exe: 1 << i for i, exe in enumerate(exe_list)}
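  # e.g. tests ['exe_a', 'exe_b', 'exe_c'] (names hypothetical) yield
  # {'exe_a': 1, 'exe_b': 2, 'exe_c': 4}.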

  def merge_lines(old_lines, new_lines, mask):
    """Merge the coverage data of a list of lines.

    Args:
      old_lines: Lines as list of pairs with line number and test bit mask.
                 The new lines will be merged into the list in place.
      new_lines: List of new (covered) lines (sorted).
      mask: The bit to be set for covered lines. The bit index is the test
            index of the executable that covered the line.
    """
    i = 0
    # Iterate over old and new lines, both are sorted.
    for l in new_lines:
      while old_lines[i][0] < l:
        # Forward instrumented lines not present in this coverage data.
        i += 1
        # TODO: Add more context to the assert message.
        assert i < len(old_lines), 'Covered line %d not in input file.' % l
      assert old_lines[i][0] == l, 'Covered line %d not in input file.' % l

      # Add coverage information to the line.
      old_lines[i][1] |= mask

  def merge_files(data, result):
    """Merge result into data.

    The data is mutated in place.

    Args:
      data: Merged coverage data from the previous reduce step.
      result: New result to be merged in. The type is as returned by
              get_covered_lines.
    """
    file_map, executable = result
    files = data['files']
    for file_name, lines in file_map.iteritems():
      merge_lines(files[file_name], lines, test_bit_masks[executable])
    return data

  reduce(merge_files, results, data)


def merge(options):
  """Implements the 'merge' action of this tool."""

  # Check if folder with coverage output exists.
  assert (os.path.exists(options.coverage_dir) and
          os.path.isdir(options.coverage_dir))

  # Inputs for multiprocessing. List of tuples of:
  # Coverage dir, absolute path to executable, sancov file name.
  inputs = []
  for sancov_file in os.listdir(options.coverage_dir):
    match = SANCOV_FILE_RE.match(sancov_file)
    if match:
      inputs.append((
          options.coverage_dir,
          os.path.join(options.build_dir, match.group(1)),
          sancov_file,
      ))

  logging.info('Merging %d sancov files into %s',
               len(inputs), options.json_input)

  # Post-process covered lines in parallel.
  pool = Pool(CPUS)
  try:
    results = pool.imap_unordered(get_covered_lines, inputs)
  finally:
    pool.close()

  # Load existing json data file for merging the results.
  with open(options.json_input, 'r') as f:
    data = json.load(f)

  # Merge multiprocessing results. Mutates data.
  merge_covered_line_results(data, results)

  logging.info('Merged data from %d executables, which covers %d files.',
               len(data['tests']), len(data['files']))
  logging.info('Writing results to %s', options.json_output)

  # Write merged results to file.
  with open(options.json_output, 'w') as f:
    json.dump(data, f, sort_keys=True)


def split(options):
  """Implements the 'split' action of this tool."""
  # Load existing json data file for splitting.
  with open(options.json_input, 'r') as f:
    data = json.load(f)

  logging.info('Splitting off %d coverage files from %s',
               len(data['files']), options.json_input)

  for file_name, coverage in data['files'].iteritems():
    # Preserve relative directories that are part of the file name.
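    # e.g. (file name hypothetical) 'src/heap/spaces.cc' is written to
    # <output_dir>/src/heap/spaces.cc.json.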
    file_path = os.path.join(options.output_dir, file_name + '.json')
    try:
      os.makedirs(os.path.dirname(file_path))
    except OSError:
      # Ignore existing directories.
      pass

    with open(file_path, 'w') as f:
      # Flat-copy the old dict.
      new_data = dict(data)

      # Update current file.
      new_data['files'] = {file_name: coverage}

      # Write json data.
      json.dump(new_data, f, sort_keys=True)


def main(args=None):
  parser = argparse.ArgumentParser()
  # TODO(machenbach): Make this required and deprecate the default.
  parser.add_argument('--build-dir',
                      default=os.path.join(BASE_DIR, 'out', 'Release'),
                      help='Path to the build output directory.')
  parser.add_argument('--coverage-dir',
                      help='Path to the sancov output files.')
  parser.add_argument('--json-input',
                      help='Path to an existing json file with coverage data.')
  parser.add_argument('--json-output',
                      help='Path to a file to write json output to.')
  parser.add_argument('--output-dir',
                      help='Directory to write split output files to.')
  parser.add_argument('action', choices=['all', 'merge', 'split'],
                      help='Action to perform.')

  options = parser.parse_args(args)
  options.build_dir = os.path.abspath(options.build_dir)
  if options.action.lower() == 'all':
    if not options.json_output:
      print '--json-output is required'
      return 1
    write_instrumented(options)
  elif options.action.lower() == 'merge':
    if not options.coverage_dir:
      print '--coverage-dir is required'
      return 1
    if not options.json_input:
      print '--json-input is required'
      return 1
    if not options.json_output:
      print '--json-output is required'
      return 1
    merge(options)
  elif options.action.lower() == 'split':
    if not options.json_input:
      print '--json-input is required'
      return 1
    if not options.output_dir:
      print '--output-dir is required'
      return 1
    split(options)
  return 0


if __name__ == '__main__':
  sys.exit(main())