#!/usr/bin/env python3
# Copyright 2020 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
"""Script for collecting dataflow traces using DFSan compiled binary. The script
imitates `CollectDataFlow` function from libFuzzer but provides some flexibility
for skipping long and/or slow corpus elements.

Follow https://github.com/google/oss-fuzz/issues/1632 for more details."""
import hashlib
import os
import subprocess
import sys

# These can be controlled by the runner in order to change the values without
# rebuilding OSS-Fuzz base images.
FILE_SIZE_LIMIT = int(os.getenv('DFT_FILE_SIZE_LIMIT', 32 * 1024))
MIN_TIMEOUT = float(os.getenv('DFT_MIN_TIMEOUT', 1.0))
TIMEOUT_RANGE = float(os.getenv('DFT_TIMEOUT_RANGE', 3.0))

DFSAN_OPTIONS = 'fast16labels=1:warn_unimplemented=0'


def _error(msg):
  """Writes |msg| followed by a newline to stderr."""
  sys.stderr.write(msg + '\n')


def _list_dir(dirpath):
  """Yields the path of every file found under |dirpath|, recursively."""
  for root, _, files in os.walk(dirpath):
    for f in files:
      yield os.path.join(root, f)


def _sha1(filepath):
  """Returns the hex SHA-1 digest of the contents of |filepath|."""
  h = hashlib.sha1()
  with open(filepath, 'rb') as f:
    h.update(f.read())
  return h.hexdigest()


def _run(cmd, timeout=None):
  """Runs |cmd| and captures its stdout and stderr.

  Args:
    cmd: Argument list passed to subprocess.run.
    timeout: Optional timeout in seconds.

  Returns:
    The subprocess.CompletedProcess, or None if the command could not be run
    at all (the exception is reported on stderr). A non-zero exit code is
    reported on stderr but the result is still returned.

  Raises:
    subprocess.TimeoutExpired: if |cmd| does not finish within |timeout|.
  """
  result = None
  try:
    result = subprocess.run(cmd,
                            timeout=timeout,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    if result.returncode:
      _error('{command} finished with non-zero code: {code}'.format(
          command=str(cmd), code=result.returncode))

  except subprocess.TimeoutExpired:
    # Callers count slow inputs themselves, so let the timeout propagate.
    raise
  except Exception as e:
    # Best effort: any other failure (e.g. the binary cannot be launched) is
    # reported and signalled to the caller by returning None.
    _error('Exception: ' + str(e))

  return result


def _timeout(size):
  """Returns the timeout in seconds for an input of |size| bytes.

  The value grows linearly from MIN_TIMEOUT (empty input) to
  MIN_TIMEOUT + TIMEOUT_RANGE (input of FILE_SIZE_LIMIT bytes).
  """
  # Dynamic timeout value (proportional to file size) to discard slow units.
  if FILE_SIZE_LIMIT <= 0:
    # The limit comes from the environment; avoid dividing by zero.
    return MIN_TIMEOUT
  return MIN_TIMEOUT + size * TIMEOUT_RANGE / FILE_SIZE_LIMIT


def collect_traces(binary, corpus_dir, dft_dir):
  """Runs |binary| on every corpus file that is not larger than the limit.

  Each input is passed to the binary together with an output path inside
  |dft_dir| named after the SHA-1 of the input contents. Inputs are processed
  from smallest to largest.

  Args:
    binary: Path to the executable to run.
    corpus_dir: Directory that is walked recursively for input files.
    dft_dir: Directory that receives the output files.

  Returns:
    A dict of counters: 'total' inputs seen, and how many of them were
    'traced' (exit code 0), 'long' (skipped, larger than FILE_SIZE_LIMIT),
    'slow' (timed out) or 'failed' (non-zero exit code or could not be run).
  """
  stats = {
      'total': 0,
      'traced': 0,
      'long': 0,
      'slow': 0,
      'failed': 0,
  }

  files_and_sizes = {}
  for f in _list_dir(corpus_dir):
    stats['total'] += 1
    size = os.path.getsize(f)
    if size > FILE_SIZE_LIMIT:
      stats['long'] += 1
      print('Skipping large file ({size}b): {path}'.format(size=size, path=f))
      continue
    files_and_sizes[f] = size

  for f in sorted(files_and_sizes, key=files_and_sizes.get):
    output_path = os.path.join(dft_dir, _sha1(f))
    try:
      # The timeout must be derived from the size of *this* input, not from
      # whatever value the listing loop above happened to leave behind.
      result = _run([binary, f, output_path],
                    timeout=_timeout(files_and_sizes[f]))
    except subprocess.TimeoutExpired as e:
      _error('Slow input: ' + str(e))
      stats['slow'] += 1
      continue

    # _run returns None when the binary could not be executed at all.
    if result is None or result.returncode:
      stats['failed'] += 1
    else:
      stats['traced'] += 1

  return stats


def dump_functions(binary, dft_dir):
  """Runs |binary| without arguments and saves its stdout to functions.txt.

  The file is written into |dft_dir|.

  Returns:
    True on success, False if the binary could not be run or exited with a
    non-zero code.
  """
  result = _run([binary])
  if result is None or result.returncode:
    return False

  with open(os.path.join(dft_dir, 'functions.txt'), 'wb') as f:
    f.write(result.stdout)

  return True


def main():
  """Entry point: <binary> <corpus_dir> <dft_dir>."""
  if len(sys.argv) < 4:
    _error('Usage: {0} <binary> <corpus_dir> <dft_dir>'.format(sys.argv[0]))
    sys.exit(1)

  binary = sys.argv[1]
  corpus_dir = sys.argv[2]
  dft_dir = sys.argv[3]

  # Child processes inherit the environment, so this applies to every run.
  os.environ['DFSAN_OPTIONS'] = DFSAN_OPTIONS

  if not dump_functions(binary, dft_dir):
    _error('Failed to dump functions. Something is wrong.')
    sys.exit(1)

  stats = collect_traces(binary, corpus_dir, dft_dir)
  for k, v in stats.items():
    print('{0}: {1}'.format(k, v))

  # Checksum that we didn't lose track of any of the inputs: every input is
  # counted once in 'total' and once in exactly one of the other counters.
  assert stats['total'] * 2 == sum(stats.values())
  sys.exit(0)


if __name__ == "__main__":
  main()