#!/usr/bin/env python
# Copyright 2016 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Script to transform and merge sancov files into human readable json-format.

The script supports three actions:
all: Writes a json file with all instrumented lines of all executables.
merge: Merges sancov files with coverage output into an existing json file.
split: Splits the json file into separate files per covered source file.

The json data is structured as follows:
{
  "version": 1,
  "tests": ["executable1", "executable2", ...],
  "files": {
    "file1": [[<instr line 1>, <bit_mask>], [<instr line 2>, <bit_mask>], ...],
    "file2": [...],
    ...
  }
}

The executables are sorted and determine the test bit mask. The executable at
sorted index i sets bit i, i.e. it contributes 2^i to the mask: executable1 =
1, executable3 = 4, etc. Hence, a line covered by executable1 and executable3
will have bit_mask == 5 == 0b101. The number of tests is restricted to 52 in
version 1, to allow JavaScript JSON parsing of the bitsets encoded as numbers.
The JS max safe integer is 2^53 - 1.

The line-number/bit_mask pairs are sorted by line number and don't contain
duplicates.

Split json data preserves the same format, but only contains one file per
json file.

The sancov tool is expected to be in the llvm compiler-rt third-party
directory. It's not checked out by default and must be added as a custom deps
entry:
'v8/third_party/llvm/projects/compiler-rt':
    'https://chromium.googlesource.com/external/llvm.org/compiler-rt.git'
"""

# for py2/py3 compatibility
from __future__ import print_function
from functools import reduce

import argparse
import json
import logging
import os
import re
import subprocess
import sys

from multiprocessing import Pool, cpu_count


logging.basicConfig(level=logging.INFO)

# Files to exclude from coverage. Dropping their data early makes processing
# faster. The contained cc files are already excluded from instrumentation,
# but inlined data is referenced through v8's object files.
EXCLUSIONS = [
  'buildtools',
  'src/third_party',
  'third_party',
  'test',
  'testing',
]

# Executables found in the build output for which no coverage is generated.
# Exclude them from the coverage data file.
EXE_EXCLUSIONS = [
  'generate-bytecode-expectations',
  'hello-world',
  'mksnapshot',
  'parser-shell',
  'process',
  'shell',
]

# V8 checkout directory.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(
    os.path.abspath(__file__))))

# The sancov tool location.
SANCOV_TOOL = os.path.join(
    BASE_DIR, 'third_party', 'llvm', 'projects', 'compiler-rt',
    'lib', 'sanitizer_common', 'scripts', 'sancov.py')

# Simple script to sanitize the PCs from objdump.
SANITIZE_PCS = os.path.join(BASE_DIR, 'tools', 'sanitizers', 'sanitize_pcs.py')

# The llvm symbolizer location.
SYMBOLIZER = os.path.join(
    BASE_DIR, 'third_party', 'llvm-build', 'Release+Asserts', 'bin',
    'llvm-symbolizer')

# Number of cpus.
CPUS = cpu_count()

# Regexp to find sancov files as output by sancov_merger.py. Also grabs the
# executable name in group 1.
SANCOV_FILE_RE = re.compile(r'^(.*)\.result.sancov$')
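# For illustration, with a hypothetical file name, the regexp yields:
#   SANCOV_FILE_RE.match('d8.result.sancov').group(1) == 'd8'
#   SANCOV_FILE_RE.match('d8.sancov') is None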


def executables(build_dir):
  """Iterates over executable files in the build directory."""
  for f in os.listdir(build_dir):
    file_path = os.path.join(build_dir, f)
    if (os.path.isfile(file_path) and
        os.access(file_path, os.X_OK) and
        f not in EXE_EXCLUSIONS):
      yield file_path


def process_symbolizer_output(output, build_dir):
  """Post-process llvm symbolizer output.

  Excludes files outside the v8 checkout or in the exclusion list above
  from further processing. Drops the character index in each line.

  Returns: A mapping of file names to lists of line numbers. The file names
           have relative paths to the v8 base directory. The lists of line
           numbers don't contain duplicate lines and are sorted.
  """
  # Path prefix added by the llvm symbolizer including trailing slash.
  output_path_prefix = os.path.join(build_dir, '..', '..', '')

  # Drop the path prefix when iterating lines. The path is redundant and takes
  # too much space. Drop files outside that path, e.g. generated files in
  # the build dir and absolute paths to c++ library headers.
  def iter_lines():
    for line in output.strip().splitlines():
      if line.startswith(output_path_prefix):
        yield line[len(output_path_prefix):]

  # Map file names to sets of instrumented line numbers.
  file_map = {}
  for line in iter_lines():
    # Drop the character number, we only care about line numbers. Each line
    # has the form: <file name>:<line number>:<character number>.
    file_name, number, _ = line.split(':')
    file_map.setdefault(file_name, set()).add(int(number))

  # Remove exclusion patterns from the file map. It's cheaper to do it after
  # the mapping, as there are few excluded files and we don't want to do this
  # check for numerous lines in ordinary files.
  def keep(file_name):
    for e in EXCLUSIONS:
      if file_name.startswith(e):
        return False
    return True

  # Return in serializable form and filter.
  return {k: sorted(file_map[k]) for k in file_map if keep(k)}
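

# A minimal sketch of process_symbolizer_output's transformation, using
# hypothetical paths and a build dir of <v8>/out/Release (so the stripped
# prefix resolves to the v8 root):
#   symbolizer line: '/path/to/v8/out/Release/../../src/api.cc:1234:7'
#   resulting entry: {'src/api.cc': [1234]}
# Lines under excluded prefixes, e.g. 'third_party/...', are dropped.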


def get_instrumented_lines(executable):
  """Return the instrumented lines of an executable.

  Called through multiprocessing pool.

  Returns: Post-processed llvm output as returned by process_symbolizer_output.
  """
  # The first two pipes are from llvm's tool sancov.py with 0x added to the
  # hex numbers. The results are piped into the llvm symbolizer, which outputs
  # for each PC: <file name with abs path>:<line number>:<character number>.
  # We don't call the sancov tool to get more speed. Backslashes are doubled
  # so that grep receives single ones and python 3 sees no invalid escapes.
  process = subprocess.Popen(
      'objdump -d %s | '
      'grep \'^\\s\\+[0-9a-f]\\+:.*\\scall\\(q\\|\\)\\s\\+[0-9a-f]\\+ '
      '<__sanitizer_cov\\(_with_check\\|\\|_trace_pc_guard\\)\\(@plt\\|\\)>\' | '
      'grep \'^\\s\\+[0-9a-f]\\+\' -o | '
      '%s | '
      '%s --obj %s -functions=none' % (
          executable, SANITIZE_PCS, SYMBOLIZER, executable),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      stdin=subprocess.PIPE,
      cwd=BASE_DIR,
      shell=True,
      # Decode output as text so it can be processed uniformly on py2/py3.
      universal_newlines=True,
  )
  output, _ = process.communicate()
  assert process.returncode == 0
  return process_symbolizer_output(output, os.path.dirname(executable))


def merge_instrumented_line_results(exe_list, results):
  """Merge multiprocessing results for all instrumented lines.

  Args:
    exe_list: List of all executable names with absolute paths.
    results: List of results as returned by get_instrumented_lines.

  Returns: Dict to be used as json data as specified at the top of this file.
           The dictionary contains all instrumented lines of all files
           referenced by all executables.
  """
  def merge_files(x, y):
    for file_name, lines in y.items():
      x.setdefault(file_name, set()).update(lines)
    return x
  result = reduce(merge_files, results, {})

  # Return data as file->lines mapping. The lines are saved as lists
  # with (line number, test bits (as int)). The test bits are initialized with
  # 0, meaning instrumented, but no coverage.
  # The order of the test bits is given with key 'tests'. For now, these are
  # the executable names. We use a _list_ with two items instead of a tuple to
  # ease merging by allowing mutation of the second item.
  return {
    'version': 1,
    'tests': sorted(map(os.path.basename, exe_list)),
    'files': {f: [[line, 0] for line in sorted(result[f])] for f in result},
  }


def write_instrumented(options):
  """Implements the 'all' action of this tool."""
  exe_list = list(executables(options.build_dir))
  logging.info('Reading instrumented lines from %d executables.',
               len(exe_list))
  pool = Pool(CPUS)
  try:
    results = pool.imap_unordered(get_instrumented_lines, exe_list)
  finally:
    pool.close()

  # Merge multiprocessing results and prepare output data.
  data = merge_instrumented_line_results(exe_list, results)

  logging.info('Read data from %d executables, which covers %d files.',
               len(data['tests']), len(data['files']))
  logging.info('Writing results to %s', options.json_output)

  # Write json output.
  with open(options.json_output, 'w') as f:
    json.dump(data, f, sort_keys=True)


def get_covered_lines(args):
  """Return the covered lines of an executable.

  Called through multiprocessing pool. The args are expected to unpack to:
    cov_dir: Folder with sancov files merged by sancov_merger.py.
    executable: Absolute path to the executable that was called to produce the
                given coverage data.
    sancov_file: The merged sancov file with coverage data.

  Returns: A tuple of post-processed llvm output as returned by
           process_symbolizer_output and the executable name.
  """
  cov_dir, executable, sancov_file = args

  # Let the sancov tool print the covered PCs and pipe them through the llvm
  # symbolizer.
  process = subprocess.Popen(
      '%s print %s 2> /dev/null | '
      '%s --obj %s -functions=none' % (
          SANCOV_TOOL,
          os.path.join(cov_dir, sancov_file),
          SYMBOLIZER,
          executable),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      stdin=subprocess.PIPE,
      cwd=BASE_DIR,
      shell=True,
      # Decode output as text so it can be processed uniformly on py2/py3.
      universal_newlines=True,
  )
  output, _ = process.communicate()
  assert process.returncode == 0
  return (
    process_symbolizer_output(output, os.path.dirname(executable)),
    os.path.basename(executable),
  )
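

# A short sketch of the bit-mask encoding handled below, with a hypothetical
# test list:
#   data['tests'] == ['cctest', 'd8']  ->  cctest sets bit 0, d8 sets bit 1,
#   i.e. test_bit_masks == {'cctest': 1, 'd8': 2}. A line entry [1234, 3]
#   then means line 1234 was covered by both executables.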


def merge_covered_line_results(data, results):
  """Merge multiprocessing results for covered lines.

  The data is mutated, the results are merged into it in place.

  Args:
    data: Existing coverage data from json file containing all instrumented
          lines.
    results: List of results as returned by get_covered_lines.
  """

  # List of executables and mapping to the test bit mask. The number of
  # tests is restricted to 52, to allow JavaScript JSON parsing of
  # the bitsets encoded as numbers. The JS max safe integer is 2^53 - 1.
  exe_list = data['tests']
  assert len(exe_list) <= 52, 'Max 52 different tests are supported.'
  test_bit_masks = {exe: 1 << i for i, exe in enumerate(exe_list)}

  def merge_lines(old_lines, new_lines, mask):
    """Merge the coverage data of a list of lines.

    Args:
      old_lines: Lines as list of pairs with line number and test bit mask.
                 The new lines will be merged into the list in place.
      new_lines: List of new (covered) lines (sorted).
      mask: The bit to be set for covered lines. The bit index is the test
            index of the executable that covered the line.
    """
    i = 0
    # Iterate over old and new lines, both are sorted.
    for l in new_lines:
      while old_lines[i][0] < l:
        # Forward instrumented lines not present in this coverage data.
        i += 1
        # TODO: Add more context to the assert message.
        assert i < len(old_lines), 'Covered line %d not in input file.' % l
      assert old_lines[i][0] == l, 'Covered line %d not in input file.' % l

      # Add coverage information to the line.
      old_lines[i][1] |= mask

  def merge_files(data, result):
    """Merge result into data.

    The data is mutated in place.

    Args:
      data: Merged coverage data from the previous reduce step.
      result: New result to be merged in. The type is as returned by
              get_covered_lines.
    """
    file_map, executable = result
    files = data['files']
    for file_name, lines in file_map.items():
      merge_lines(files[file_name], lines, test_bit_masks[executable])
    return data

  reduce(merge_files, results, data)


def merge(options):
  """Implements the 'merge' action of this tool."""

  # Check if the folder with coverage output exists.
  assert (os.path.exists(options.coverage_dir) and
          os.path.isdir(options.coverage_dir))

  # Inputs for multiprocessing. List of tuples of:
  # Coverage dir, absolute path to executable, sancov file name.
  inputs = []
  for sancov_file in os.listdir(options.coverage_dir):
    match = SANCOV_FILE_RE.match(sancov_file)
    if match:
      inputs.append((
        options.coverage_dir,
        os.path.join(options.build_dir, match.group(1)),
        sancov_file,
      ))

  logging.info('Merging %d sancov files into %s',
               len(inputs), options.json_input)

  # Post-process covered lines in parallel.
  pool = Pool(CPUS)
  try:
    results = pool.imap_unordered(get_covered_lines, inputs)
  finally:
    pool.close()

  # Load existing json data file for merging the results.
  with open(options.json_input, 'r') as f:
    data = json.load(f)

  # Merge multiprocessing results. Mutates data.
  merge_covered_line_results(data, results)

  logging.info('Merged data from %d executables, which covers %d files.',
               len(data['tests']), len(data['files']))
  logging.info('Writing results to %s', options.json_output)

  # Write merged results to file.
  with open(options.json_output, 'w') as f:
    json.dump(data, f, sort_keys=True)


def split(options):
  """Implements the 'split' action of this tool."""
  # Load existing json data file for splitting.
  with open(options.json_input, 'r') as f:
    data = json.load(f)

  logging.info('Splitting off %d coverage files from %s',
               len(data['files']), options.json_input)

  for file_name, coverage in data['files'].items():
    # Preserve relative directories that are part of the file name.
    file_path = os.path.join(options.output_dir, file_name + '.json')
    try:
      os.makedirs(os.path.dirname(file_path))
    except OSError:
      # Ignore existing directories.
      pass

    with open(file_path, 'w') as f:
      # Flat-copy the old dict.
      new_data = dict(data)

      # Update current file.
      new_data['files'] = {file_name: coverage}

      # Write json data.
      json.dump(new_data, f, sort_keys=True)
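

# Example invocations (the output paths and the coverage dir are
# hypothetical; --build-dir defaults to out/Release under the V8 checkout):
#   tools/sanitizers/sancov_formatter.py all --json-output=instrumented.json
#   tools/sanitizers/sancov_formatter.py merge --coverage-dir=cov \
#       --json-input=instrumented.json --json-output=merged.json
#   tools/sanitizers/sancov_formatter.py split --json-input=merged.json \
#       --output-dir=split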


def main(args=None):
  parser = argparse.ArgumentParser()
  # TODO(machenbach): Make this required and deprecate the default.
  parser.add_argument('--build-dir',
                      default=os.path.join(BASE_DIR, 'out', 'Release'),
                      help='Path to the build output directory.')
  parser.add_argument('--coverage-dir',
                      help='Path to the sancov output files.')
  parser.add_argument('--json-input',
                      help='Path to an existing json file with coverage data.')
  parser.add_argument('--json-output',
                      help='Path to a file to write json output to.')
  parser.add_argument('--output-dir',
                      help='Directory to write the split output files to.')
  parser.add_argument('action', choices=['all', 'merge', 'split'],
                      help='Action to perform.')

  options = parser.parse_args(args)
  options.build_dir = os.path.abspath(options.build_dir)
  if options.action.lower() == 'all':
    if not options.json_output:
      print('--json-output is required')
      return 1
    write_instrumented(options)
  elif options.action.lower() == 'merge':
    if not options.coverage_dir:
      print('--coverage-dir is required')
      return 1
    if not options.json_input:
      print('--json-input is required')
      return 1
    if not options.json_output:
      print('--json-output is required')
      return 1
    merge(options)
  elif options.action.lower() == 'split':
    if not options.json_input:
      print('--json-input is required')
      return 1
    if not options.output_dir:
      print('--output-dir is required')
      return 1
    split(options)
  return 0


if __name__ == '__main__':
  sys.exit(main())