#!/usr/bin/env python3
# Copyright 2016 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Script to transform and merge sancov files into human readable json-format.

The script supports three actions:
all: Writes a json file with all instrumented lines of all executables.
merge: Merges sancov files with coverage output into an existing json file.
split: Split json file into separate files per covered source file.

The json data is structured as follows:
{
  "version": 1,
  "tests": ["executable1", "executable2", ...],
  "files": {
    "file1": [[<instr line 1>, <bit_mask>], [<instr line 2>, <bit_mask>], ...],
    "file2": [...],
    ...
  }
}

The executables are sorted and determine the test bit mask. Their index+1 is
the bit, e.g. executable1 = 1, executable3 = 4, etc. Hence, a line covered by
executable1 and executable3 will have bit_mask == 5 == 0b101. The number of
tests is restricted to 52 in version 1, to allow javascript JSON parsing of
the bitsets encoded as numbers. JS max safe int is (1 << 53) - 1.

The line-number-bit_mask pairs are sorted by line number and don't contain
duplicates.

Split json data preserves the same format, but only contains one file per
json file.

The sancov tool is expected to be in the llvm compiler-rt third-party
directory. It's not checked out by default and must be added as a custom deps:
'v8/third_party/llvm/projects/compiler-rt':
    'https://chromium.googlesource.com/external/llvm.org/compiler-rt.git'
"""

import argparse
import json
import logging
import os
import re
import subprocess
import sys

# reduce is no longer a builtin on Python 3.
from functools import reduce
from multiprocessing import Pool, cpu_count


logging.basicConfig(level=logging.INFO)

# Files to exclude from coverage. Dropping their data early adds more speed.
# The contained cc files are already excluded from instrumentation, but inlined
# data is referenced through v8's object files.
EXCLUSIONS = [
  'buildtools',
  'src/third_party',
  'third_party',
  'test',
  'testing',
]

# Executables found in the build output for which no coverage is generated.
# Exclude them from the coverage data file.
EXE_BLACKLIST = [
  'generate-bytecode-expectations',
  'hello-world',
  'mksnapshot',
  'parser-shell',
  'process',
  'shell',
]

# V8 checkout directory. This file is expected to live two directories below
# the checkout root (tools/sanitizers/).
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(
    os.path.abspath(__file__))))

# The sancov tool location.
SANCOV_TOOL = os.path.join(
    BASE_DIR, 'third_party', 'llvm', 'projects', 'compiler-rt',
    'lib', 'sanitizer_common', 'scripts', 'sancov.py')

# Simple script to sanitize the PCs from objdump.
SANITIZE_PCS = os.path.join(BASE_DIR, 'tools', 'sanitizers', 'sanitize_pcs.py')

# The llvm symbolizer location.
SYMBOLIZER = os.path.join(
    BASE_DIR, 'third_party', 'llvm-build', 'Release+Asserts', 'bin',
    'llvm-symbolizer')

# Number of cpus.
CPUS = cpu_count()

# Regexp to find sancov files as output by sancov_merger.py. Also grabs the
# executable name in group 1.
SANCOV_FILE_RE = re.compile(r'^(.*)\.result.sancov$')


def executables(build_dir):
  """Iterates over executable files in the build directory.

  Args:
    build_dir: Path to the build output directory.

  Yields: Absolute paths of executable files, excluding blacklisted ones.
  """
  for f in os.listdir(build_dir):
    file_path = os.path.join(build_dir, f)
    if (os.path.isfile(file_path) and
        os.access(file_path, os.X_OK) and
        f not in EXE_BLACKLIST):
      yield file_path


def process_symbolizer_output(output, build_dir):
  """Post-process llvm symbolizer output.

  Excludes files outside the v8 checkout or given in exclusion list above
  from further processing. Drops the character index in each line.

  Args:
    output: Raw llvm symbolizer output (str), one
            <file name>:<line>:<character> entry per line.
    build_dir: Path to the build output directory; paths in the output are
               expected to be prefixed with <build_dir>/../../.

  Returns: A mapping of file names to lists of line numbers. The file names
           have relative paths to the v8 base directory. The lists of line
           numbers don't contain duplicate lines and are sorted.
  """
  # Path prefix added by the llvm symbolizer including trailing slash.
  output_path_prefix = os.path.join(build_dir, '..', '..', '')

  # Drop path prefix when iterating lines. The path is redundant and takes
  # too much space. Drop files outside that path, e.g. generated files in
  # the build dir and absolute paths to c++ library headers.
  def iter_lines():
    for line in output.strip().splitlines():
      if line.startswith(output_path_prefix):
        yield line[len(output_path_prefix):]

  # Map file names to sets of instrumented line numbers. Sets deduplicate
  # lines cheaply; multiple PCs frequently map to the same source line.
  file_map = {}
  for line in iter_lines():
    # Drop character number, we only care for line numbers. Each line has the
    # form: <file name>:<line number>:<character number>.
    file_name, number, _ = line.split(':')
    file_map.setdefault(file_name, set()).add(int(number))

  # Remove exclusion patterns from file map. It's cheaper to do it after the
  # mapping, as there are few excluded files and we don't want to do this
  # check for numerous lines in ordinary files.
  def keep(file_name):
    return not any(file_name.startswith(e) for e in EXCLUSIONS)

  # Return in serializable form and filter.
  return {k: sorted(v) for k, v in file_map.items() if keep(k)}


def get_instrumented_lines(executable):
  """Return the instrumented lines of an executable.

  Called through multiprocessing pool.

  Args:
    executable: Absolute path to the instrumented executable.

  Returns: Post-processed llvm output as returned by process_symbolizer_output.
  """
  # The first two pipes are from llvm's tool sancov.py with 0x added to the hex
  # numbers. The results are piped into the llvm symbolizer, which outputs for
  # each PC: <file name with abs path>:<line number>:<character number>.
  # We don't call the sancov tool to get more speed.
  # Raw strings keep the grep escapes (\s, \+, \(...) literal without
  # triggering Python's invalid-escape warnings.
  process = subprocess.Popen(
      'objdump -d %s | '
      r"grep '^\s\+[0-9a-f]\+:.*\scall\(q\|\)\s\+[0-9a-f]\+ "
      r"<__sanitizer_cov\(_with_check\|\|_trace_pc_guard\)\(@plt\|\)>' | "
      r"grep '^\s\+[0-9a-f]\+' -o | "
      '%s | '
      '%s --obj %s -functions=none' %
          (executable, SANITIZE_PCS, SYMBOLIZER, executable),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      stdin=subprocess.PIPE,
      cwd=BASE_DIR,
      shell=True,
      # Decode the pipe output to str for post-processing.
      universal_newlines=True,
  )
  output, _ = process.communicate()
  assert process.returncode == 0
  return process_symbolizer_output(output, os.path.dirname(executable))


def merge_instrumented_line_results(exe_list, results):
  """Merge multiprocessing results for all instrumented lines.

  Args:
    exe_list: List of all executable names with absolute paths.
    results: List of results as returned by get_instrumented_lines.

  Returns: Dict to be used as json data as specified on the top of this page.
           The dictionary contains all instrumented lines of all files
           referenced by all executables.
  """
  def merge_files(x, y):
    for file_name, lines in y.items():
      x.setdefault(file_name, set()).update(lines)
    return x
  result = reduce(merge_files, results, {})

  # Return data as file->lines mapping. The lines are saved as lists
  # with (line number, test bits (as int)). The test bits are initialized with
  # 0, meaning instrumented, but no coverage.
  # The order of the test bits is given with key 'tests'. For now, these are
  # the executable names. We use a _list_ with two items instead of a tuple to
  # ease merging by allowing mutation of the second item.
  return {
    'version': 1,
    'tests': sorted(os.path.basename(exe) for exe in exe_list),
    'files': {f: [[line, 0] for line in sorted(result[f])] for f in result},
  }


def write_instrumented(options):
  """Implements the 'all' action of this tool."""
  exe_list = list(executables(options.build_dir))
  logging.info('Reading instrumented lines from %d executables.',
               len(exe_list))
  pool = Pool(CPUS)
  try:
    results = pool.imap_unordered(get_instrumented_lines, exe_list)
  finally:
    pool.close()

  # Merge multiprocessing results and prepare output data.
  data = merge_instrumented_line_results(exe_list, results)

  logging.info('Read data from %d executables, which covers %d files.',
               len(data['tests']), len(data['files']))
  logging.info('Writing results to %s', options.json_output)

  # Write json output.
  with open(options.json_output, 'w') as f:
    json.dump(data, f, sort_keys=True)


def get_covered_lines(args):
  """Return the covered lines of an executable.

  Called through multiprocessing pool. The args are expected to unpack to:
    cov_dir: Folder with sancov files merged by sancov_merger.py.
    executable: Absolute path to the executable that was called to produce the
                given coverage data.
    sancov_file: The merged sancov file with coverage data.

  Returns: A tuple of post-processed llvm output as returned by
           process_symbolizer_output and the executable name.
  """
  cov_dir, executable, sancov_file = args

  # Let the sancov tool print the covered PCs and pipe them through the llvm
  # symbolizer.
  process = subprocess.Popen(
      '%s print %s 2> /dev/null | '
      '%s --obj %s -functions=none' %
          (SANCOV_TOOL,
           os.path.join(cov_dir, sancov_file),
           SYMBOLIZER,
           executable),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      stdin=subprocess.PIPE,
      cwd=BASE_DIR,
      shell=True,
      # Decode the pipe output to str for post-processing.
      universal_newlines=True,
  )
  output, _ = process.communicate()
  assert process.returncode == 0
  return (
    process_symbolizer_output(output, os.path.dirname(executable)),
    os.path.basename(executable),
  )


def merge_covered_line_results(data, results):
  """Merge multiprocessing results for covered lines.

  The data is mutated, the results are merged into it in place.

  Args:
    data: Existing coverage data from json file containing all instrumented
          lines.
    results: List of results as returned by get_covered_lines.
  """

  # List of executables and mapping to the test bit mask. The number of
  # tests is restricted to 52, to allow javascript JSON parsing of
  # the bitsets encoded as numbers. JS max safe int is (1 << 53) - 1.
  exe_list = data['tests']
  assert len(exe_list) <= 52, 'Max 52 different tests are supported.'
  test_bit_masks = {exe: 1 << i for i, exe in enumerate(exe_list)}

  def merge_lines(old_lines, new_lines, mask):
    """Merge the coverage data of a list of lines.

    Args:
      old_lines: Lines as list of pairs with line number and test bit mask.
                 The new lines will be merged into the list in place.
      new_lines: List of new (covered) lines (sorted).
      mask: The bit to be set for covered lines. The bit index is the test
            index of the executable that covered the line.
    """
    i = 0
    # Iterate over old and new lines, both are sorted.
    for l in new_lines:
      while old_lines[i][0] < l:
        # Forward instrumented lines not present in this coverage data.
        i += 1
        # TODO: Add more context to the assert message.
        assert i < len(old_lines), 'Covered line %d not in input file.' % l
      assert old_lines[i][0] == l, 'Covered line %d not in input file.' % l

      # Add coverage information to the line.
      old_lines[i][1] |= mask

  def merge_files(data, result):
    """Merge result into data.

    The data is mutated in place.

    Args:
      data: Merged coverage data from the previous reduce step.
      result: New result to be merged in. The type is as returned by
              get_covered_lines.
    """
    file_map, executable = result
    files = data['files']
    for file_name, lines in file_map.items():
      merge_lines(files[file_name], lines, test_bit_masks[executable])
    return data

  reduce(merge_files, results, data)


def merge(options):
  """Implements the 'merge' action of this tool."""

  # Check if folder with coverage output exists.
  assert (os.path.exists(options.coverage_dir) and
          os.path.isdir(options.coverage_dir))

  # Inputs for multiprocessing. List of tuples of:
  # Coverage dir, absolute path to executable, sancov file name.
  inputs = []
  for sancov_file in os.listdir(options.coverage_dir):
    match = SANCOV_FILE_RE.match(sancov_file)
    if match:
      inputs.append((
        options.coverage_dir,
        os.path.join(options.build_dir, match.group(1)),
        sancov_file,
      ))

  logging.info('Merging %d sancov files into %s',
               len(inputs), options.json_input)

  # Post-process covered lines in parallel.
  pool = Pool(CPUS)
  try:
    results = pool.imap_unordered(get_covered_lines, inputs)
  finally:
    pool.close()

  # Load existing json data file for merging the results.
  with open(options.json_input, 'r') as f:
    data = json.load(f)

  # Merge multiprocessing results. Mutates data.
  merge_covered_line_results(data, results)

  logging.info('Merged data from %d executables, which covers %d files.',
               len(data['tests']), len(data['files']))
  logging.info('Writing results to %s', options.json_output)

  # Write merged results to file.
  with open(options.json_output, 'w') as f:
    json.dump(data, f, sort_keys=True)


def split(options):
  """Implements the 'split' action of this tool."""
  # Load existing json data file for splitting.
  with open(options.json_input, 'r') as f:
    data = json.load(f)

  logging.info('Splitting off %d coverage files from %s',
               len(data['files']), options.json_input)

  for file_name, coverage in data['files'].items():
    # Preserve relative directories that are part of the file name.
    file_path = os.path.join(options.output_dir, file_name + '.json')
    # Ignore existing directories.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    with open(file_path, 'w') as f:
      # Flat-copy the old dict.
      new_data = dict(data)

      # Update current file.
      new_data['files'] = {file_name: coverage}

      # Write json data.
      json.dump(new_data, f, sort_keys=True)


def main(args=None):
  parser = argparse.ArgumentParser()
  # TODO(machenbach): Make this required and deprecate the default.
  parser.add_argument('--build-dir',
                      default=os.path.join(BASE_DIR, 'out', 'Release'),
                      help='Path to the build output directory.')
  parser.add_argument('--coverage-dir',
                      help='Path to the sancov output files.')
  parser.add_argument('--json-input',
                      help='Path to an existing json file with coverage data.')
  parser.add_argument('--json-output',
                      help='Path to a file to write json output to.')
  parser.add_argument('--output-dir',
                      help='Directory where to put split output files to.')
  parser.add_argument('action', choices=['all', 'merge', 'split'],
                      help='Action to perform.')

  options = parser.parse_args(args)
  options.build_dir = os.path.abspath(options.build_dir)
  if options.action.lower() == 'all':
    if not options.json_output:
      print('--json-output is required')
      return 1
    write_instrumented(options)
  elif options.action.lower() == 'merge':
    if not options.coverage_dir:
      print('--coverage-dir is required')
      return 1
    if not options.json_input:
      print('--json-input is required')
      return 1
    if not options.json_output:
      print('--json-output is required')
      return 1
    merge(options)
  elif options.action.lower() == 'split':
    if not options.json_input:
      print('--json-input is required')
      return 1
    if not options.output_dir:
      print('--output-dir is required')
      return 1
    split(options)
  return 0


if __name__ == '__main__':
  sys.exit(main())