#!/usr/bin/env python3
# Copyright 2020 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
"""Script for collecting dataflow traces using DFSan compiled binary. The script
imitates `CollectDataFlow` function from libFuzzer but provides some flexibility
for skipping long and/or slow corpus elements.

Follow https://github.com/google/oss-fuzz/issues/1632 for more details."""
import hashlib
import os
import subprocess
import sys

# pylint: skip-file

# See https://github.com/google/oss-fuzz/pull/5024#discussion_r561313003 for why
# we are disabling pylint for this file (we can't do it in .pylintrc, probably
# because of weirdness with this file's package, so we do it here).

# These can be controlled by the runner in order to change the values without
# rebuilding OSS-Fuzz base images.
FILE_SIZE_LIMIT = int(os.getenv('DFT_FILE_SIZE_LIMIT', 32 * 1024))
MIN_TIMEOUT = float(os.getenv('DFT_MIN_TIMEOUT', 1.0))
TIMEOUT_RANGE = float(os.getenv('DFT_TIMEOUT_RANGE', 3.0))

DFSAN_OPTIONS = 'fast16labels=1:warn_unimplemented=0'


def _error(msg):
    """Write `msg` followed by a newline to stderr."""
    sys.stderr.write(msg + '\n')


def _list_dir(dirpath):
    """Yield the path of every file found under `dirpath`, recursively."""
    for root, _, files in os.walk(dirpath):
        for f in files:
            yield os.path.join(root, f)


def _sha1(filepath):
    """Return the hex SHA-1 digest of the contents of `filepath`."""
    h = hashlib.sha1()
    with open(filepath, 'rb') as f:
        h.update(f.read())
    return h.hexdigest()


def _run(cmd, timeout=None):
    """Run `cmd` with stdout and stderr captured.

    Returns the `subprocess.CompletedProcess`, or None when the process could
    not be run at all (the exception is reported on stderr). A non-zero exit
    code is reported on stderr but the result is still returned.

    Raises:
      subprocess.TimeoutExpired: if `cmd` runs longer than `timeout` seconds.
    """
    result = None
    try:
        result = subprocess.run(cmd,
                                timeout=timeout,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        if result.returncode:
            _error('{command} finished with non-zero code: {code}'.format(
                command=str(cmd), code=result.returncode))

    except subprocess.TimeoutExpired:
        # Callers count slow inputs themselves, so let this one propagate.
        raise
    except Exception as e:
        _error('Exception: ' + str(e))

    return result


def _timeout(size):
    """Return the timeout in seconds for an input of `size` bytes."""
    # Dynamic timeout value (proportional to file size) to discard slow units.
    if FILE_SIZE_LIMIT <= 0:
        # Only empty inputs can pass the size filter in this configuration;
        # avoid dividing by zero and fall back to the minimum timeout.
        return MIN_TIMEOUT
    timeout = MIN_TIMEOUT
    timeout += size * TIMEOUT_RANGE / FILE_SIZE_LIMIT
    return timeout


def collect_traces(binary, corpus_dir, dft_dir):
    """Run `binary` on every corpus file and store its trace in `dft_dir`.

    Files larger than FILE_SIZE_LIMIT are skipped. The remaining files are
    processed from smallest to largest, each with a timeout derived from its
    own size. The trace for a file is written to `dft_dir` under the SHA-1 of
    the file contents.

    Returns:
      A dict of counters: 'total' (files seen), 'traced' (exit code zero),
      'long' (skipped for size), 'slow' (timed out) and 'failed' (non-zero
      exit code, or the binary could not be run).
    """
    stats = {
        'total': 0,
        'traced': 0,
        'long': 0,
        'slow': 0,
        'failed': 0,
    }

    files_and_sizes = {}
    for f in _list_dir(corpus_dir):
        stats['total'] += 1
        size = os.path.getsize(f)
        if size > FILE_SIZE_LIMIT:
            stats['long'] += 1
            print('Skipping large file ({size}b): {path}'.format(size=size,
                                                                 path=f))
            continue
        files_and_sizes[f] = size

    by_size = sorted(files_and_sizes.items(), key=lambda item: item[1])
    for f, size in by_size:
        output_path = os.path.join(dft_dir, _sha1(f))
        try:
            # The timeout must come from this file's size, not from whatever
            # value the scanning loop above happened to leave behind.
            result = _run([binary, f, output_path], timeout=_timeout(size))
        except subprocess.TimeoutExpired as e:
            _error('Slow input: ' + str(e))
            stats['slow'] += 1
            continue

        # `_run` returns None when the process could not be started at all.
        if not result or result.returncode:
            stats['failed'] += 1
        else:
            stats['traced'] += 1

    return stats


def dump_functions(binary, dft_dir):
    """Run `binary` without arguments and save its stdout as functions.txt.

    Returns:
      True if the output was written to `dft_dir`/functions.txt, False if the
      binary could not be run or exited with a non-zero code.
    """
    result = _run([binary])
    if not result or result.returncode:
        return False

    with open(os.path.join(dft_dir, 'functions.txt'), 'wb') as f:
        f.write(result.stdout)

    return True


def main():
    """Entry point: dump the function list, collect traces, print stats."""
    if len(sys.argv) < 4:
        _error('Usage: {0} <binary> <corpus_dir> <dft_dir>'.format(sys.argv[0]))
        sys.exit(1)

    binary = sys.argv[1]
    corpus_dir = sys.argv[2]
    dft_dir = sys.argv[3]

    # Child processes inherit the environment, so the binary sees this value.
    os.environ['DFSAN_OPTIONS'] = DFSAN_OPTIONS

    if not dump_functions(binary, dft_dir):
        _error('Failed to dump functions. Something is wrong.')
        sys.exit(1)

    stats = collect_traces(binary, corpus_dir, dft_dir)
    for k, v in stats.items():
        print('{0}: {1}'.format(k, v))

    # Checksum that we didn't lose track of any of the inputs: every file is
    # counted once in 'total' and once in exactly one of the other buckets.
    assert stats['total'] * 2 == sum(v for v in stats.values())
    sys.exit(0)


if __name__ == "__main__":
    main()