• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2# Copyright 2020 Google Inc.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#      http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#
16################################################################################
17"""Script for collecting dataflow traces using DFSan compiled binary. The script
imitates the `CollectDataFlow` function from libFuzzer but provides some
flexibility for skipping long and/or slow corpus elements.
20
21Follow https://github.com/google/oss-fuzz/issues/1632 for more details."""
22import hashlib
23import os
24import subprocess
25import sys
26
# Tunables are read from the environment so that the runner can change them
# without rebuilding OSS-Fuzz base images.

# Corpus files larger than this many bytes are skipped instead of traced.
FILE_SIZE_LIMIT = int(os.environ.get('DFT_FILE_SIZE_LIMIT', 32 * 1024))
# Base timeout applied to every input, regardless of its size.
MIN_TIMEOUT = float(os.environ.get('DFT_MIN_TIMEOUT', 1.0))
# Extra timeout added on top of MIN_TIMEOUT, scaled by size / FILE_SIZE_LIMIT.
TIMEOUT_RANGE = float(os.environ.get('DFT_TIMEOUT_RANGE', 3.0))

# Value exported as the DFSAN_OPTIONS environment variable before tracing.
DFSAN_OPTIONS = 'fast16labels=1:warn_unimplemented=0'
34
35
36def _error(msg):
37  sys.stderr.write(msg + '\n')
38
39
40def _list_dir(dirpath):
41  for root, _, files in os.walk(dirpath):
42    for f in files:
43      yield os.path.join(root, f)
44
45
46def _sha1(filepath):
47  h = hashlib.sha1()
48  with open(filepath, 'rb') as f:
49    h.update(f.read())
50  return h.hexdigest()
51
52
53def _run(cmd, timeout=None):
54  result = None
55  try:
56    result = subprocess.run(cmd,
57                            timeout=timeout,
58                            stdout=subprocess.PIPE,
59                            stderr=subprocess.PIPE)
60    if result.returncode:
61      _error('{command} finished with non-zero code: {code}'.format(
62          command=str(cmd), code=result.returncode))
63
64  except subprocess.TimeoutExpired:
65    raise
66  except Exception as e:
67    _error('Exception: ' + str(e))
68
69  return result
70
71
def _timeout(size):
  """Returns the timeout to use for an input of `size` bytes.

  The value grows linearly with the file size so that slow units can be
  discarded: it equals MIN_TIMEOUT for an empty file and
  MIN_TIMEOUT + TIMEOUT_RANGE for a file of exactly FILE_SIZE_LIMIT bytes.
  """
  return MIN_TIMEOUT + size * TIMEOUT_RANGE / FILE_SIZE_LIMIT
77
78
def collect_traces(binary, corpus_dir, dft_dir):
  """Runs `binary` on every small-enough corpus file to collect traces.

  Files under `corpus_dir` (searched recursively) that are larger than
  FILE_SIZE_LIMIT are skipped. The remaining files are processed from smallest
  to largest; each one is passed to `binary` together with an output path
  inside `dft_dir` named after the SHA-1 of the input's contents.

  Args:
    binary: Path to the binary to run on each input.
    corpus_dir: Directory containing the corpus files.
    dft_dir: Directory that receives one output file per input.

  Returns:
    A dict of counters: 'total' (all files seen), 'long' (skipped for size),
    'traced' (binary exited with code 0), 'failed' (binary exited with a
    non-zero code or could not be run) and 'slow' (binary timed out). Every
    file is counted in 'total' and in exactly one of the other counters.
  """
  stats = {
      'total': 0,
      'traced': 0,
      'long': 0,
      'slow': 0,
      'failed': 0,
  }

  files_and_sizes = {}
  for f in _list_dir(corpus_dir):
    stats['total'] += 1
    size = os.path.getsize(f)
    if size > FILE_SIZE_LIMIT:
      stats['long'] += 1
      print('Skipping large file ({size}b): {path}'.format(size=size, path=f))
      continue
    files_and_sizes[f] = size

  # Iterate over (path, size) pairs so that the timeout is derived from the
  # size of the file actually being traced, not from whichever file happened
  # to be listed last in the loop above.
  for f, size in sorted(files_and_sizes.items(), key=lambda item: item[1]):
    output_path = os.path.join(dft_dir, _sha1(f))
    try:
      result = _run([binary, f, output_path], timeout=_timeout(size))
    except subprocess.TimeoutExpired as e:
      _error('Slow input: ' + str(e))
      stats['slow'] += 1
      continue

    # _run returns None when the process could not be run at all; count that
    # as a failure instead of crashing on `None.returncode`.
    if result is None or result.returncode:
      stats['failed'] += 1
    else:
      stats['traced'] += 1

  return stats
112
113
def dump_functions(binary, dft_dir):
  """Runs `binary` with no arguments and saves its stdout to functions.txt.

  Args:
    binary: Path to the binary to execute.
    dft_dir: Directory in which `functions.txt` is written.

  Returns:
    True if the binary ran and exited with code 0 (the file was written),
    False if it could not be run or exited with a non-zero code.
  """
  result = _run([binary])
  if result is None or result.returncode != 0:
    return False

  # The output is stored as raw bytes, exactly as the binary printed it.
  functions_path = os.path.join(dft_dir, 'functions.txt')
  with open(functions_path, 'wb') as out:
    out.write(result.stdout)

  return True
123
124
def main():
  """Command-line entry point: `<binary> <corpus_dir> <dft_dir>`.

  Exports DFSAN_OPTIONS to the environment, dumps the function list, collects
  traces for the corpus and prints the resulting counters. Exits with status 1
  on bad usage or if the function dump fails, and with status 0 otherwise.
  """
  if len(sys.argv) < 4:
    _error('Usage: {0} <binary> <corpus_dir> <dft_dir>'.format(sys.argv[0]))
    sys.exit(1)

  binary, corpus_dir, dft_dir = sys.argv[1:4]

  os.environ['DFSAN_OPTIONS'] = DFSAN_OPTIONS

  functions_dumped = dump_functions(binary, dft_dir)
  if not functions_dumped:
    _error('Failed to dump functions. Something is wrong.')
    sys.exit(1)

  stats = collect_traces(binary, corpus_dir, dft_dir)
  for name in stats:
    print('{0}: {1}'.format(name, stats[name]))

  # Checksum that we didn't lose track of any of the inputs: 'total' is itself
  # one of the values, and every input is counted in exactly one other bucket,
  # so all values together must add up to twice the total.
  assert stats['total'] * 2 == sum(stats.values())
  sys.exit(0)
147
148
# Run the collector only when executed as a script, not when imported.
if __name__ == '__main__':
  main()
151