• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2# Copyright 2020 Google Inc.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#      http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#
16################################################################################
"""Script for collecting dataflow traces using a DFSan-compiled binary. The
script imitates the `CollectDataFlow` function from libFuzzer but provides some
flexibility for skipping long and/or slow corpus elements.
20
21Follow https://github.com/google/oss-fuzz/issues/1632 for more details."""
22import hashlib
23import os
24import subprocess
25import sys
26
27# pylint: skip-file
28
29# See https://github.com/google/oss-fuzz/pull/5024#discussion_r561313003 for why
30# we are disabling pylint for this file (we can't do it in .pylintrc, probably
31# because of weirdness with this file's package, so we do it here).
32
# The runner may override these through environment variables, which lets the
# values change without rebuilding OSS-Fuzz base images.

# Largest corpus file (in bytes) that will still be traced.
FILE_SIZE_LIMIT = int(os.environ.get('DFT_FILE_SIZE_LIMIT', 32 * 1024))
# Base timeout (in seconds) granted to every traced input.
MIN_TIMEOUT = float(os.environ.get('DFT_MIN_TIMEOUT', 1.0))
# Extra timeout (in seconds) added on top for an input of FILE_SIZE_LIMIT bytes.
TIMEOUT_RANGE = float(os.environ.get('DFT_TIMEOUT_RANGE', 3.0))

# Value exported as the DFSAN_OPTIONS environment variable before tracing.
DFSAN_OPTIONS = 'fast16labels=1:warn_unimplemented=0'
40
41
42def _error(msg):
43  sys.stderr.write(msg + '\n')
44
45
46def _list_dir(dirpath):
47  for root, _, files in os.walk(dirpath):
48    for f in files:
49      yield os.path.join(root, f)
50
51
52def _sha1(filepath):
53  h = hashlib.sha1()
54  with open(filepath, 'rb') as f:
55    h.update(f.read())
56  return h.hexdigest()
57
58
59def _run(cmd, timeout=None):
60  result = None
61  try:
62    result = subprocess.run(cmd,
63                            timeout=timeout,
64                            stdout=subprocess.PIPE,
65                            stderr=subprocess.PIPE)
66    if result.returncode:
67      _error('{command} finished with non-zero code: {code}'.format(
68          command=str(cmd), code=result.returncode))
69
70  except subprocess.TimeoutExpired:
71    raise
72  except Exception as e:
73    _error('Exception: ' + str(e))
74
75  return result
76
77
def _timeout(size):
  """Returns the timeout for an input of |size| bytes.

  The value starts at MIN_TIMEOUT and grows linearly with the file size, so
  that slow units can be discarded; an input of FILE_SIZE_LIMIT bytes gets
  MIN_TIMEOUT + TIMEOUT_RANGE.
  """
  return MIN_TIMEOUT + size * TIMEOUT_RANGE / FILE_SIZE_LIMIT
83
84
def collect_traces(binary, corpus_dir, dft_dir):
  """Runs |binary| on every eligible corpus file and tallies the outcomes.

  Files larger than FILE_SIZE_LIMIT are skipped. The remaining files are
  processed from smallest to largest; each one is passed to |binary| together
  with an output path inside |dft_dir| named after the SHA-1 of the input.

  Args:
    binary: Path of the executable to run for every input.
    corpus_dir: Directory that is walked recursively for input files.
    dft_dir: Directory that receives one output file per input.

  Returns:
    A dict of counters: 'total' (all files seen), 'long' (skipped for size),
    'traced' (exit code 0), 'failed' (non-zero exit code or the command could
    not be run) and 'slow' (timed out). Every file lands in exactly one of the
    last four buckets.
  """
  stats = {
      'total': 0,
      'traced': 0,
      'long': 0,
      'slow': 0,
      'failed': 0,
  }

  files_and_sizes = {}
  for f in _list_dir(corpus_dir):
    stats['total'] += 1
    size = os.path.getsize(f)
    if size > FILE_SIZE_LIMIT:
      stats['long'] += 1
      print('Skipping large file ({size}b): {path}'.format(size=size, path=f))
      continue
    files_and_sizes[f] = size

  for f in sorted(files_and_sizes, key=files_and_sizes.get):
    output_path = os.path.join(dft_dir, _sha1(f))
    # The timeout must be derived from THIS file's size, not from whatever
    # value the enumeration loop above happened to leave behind.
    timeout = _timeout(files_and_sizes[f])
    try:
      result = _run([binary, f, output_path], timeout=timeout)
      # _run returns None when the command could not be executed at all;
      # count that as a failure so the input is not lost from the stats.
      if result is None or result.returncode:
        stats['failed'] += 1
      else:
        stats['traced'] += 1

    except subprocess.TimeoutExpired as e:
      _error('Slow input: ' + str(e))
      stats['slow'] += 1

  return stats
118
119
def dump_functions(binary, dft_dir):
  """Runs |binary| without arguments and saves its stdout as functions.txt.

  Args:
    binary: Path of the executable to run.
    dft_dir: Directory in which 'functions.txt' is written.

  Returns:
    True if the binary ran with exit code 0 and the file was written, False
    otherwise (nothing is written in that case).
  """
  outcome = _run([binary])
  if outcome and not outcome.returncode:
    functions_path = os.path.join(dft_dir, 'functions.txt')
    with open(functions_path, 'wb') as out:
      out.write(outcome.stdout)
    return True

  return False
129
130
def main():
  """Command-line entry point: <binary> <corpus_dir> <dft_dir>.

  Exports DFSAN_OPTIONS, dumps the function list, collects traces for the
  corpus and prints the resulting counters. Exits with status 1 on bad usage
  or when the function list cannot be dumped, and with status 0 otherwise.
  """
  argv = sys.argv
  if len(argv) < 4:
    _error('Usage: {0} <binary> <corpus_dir> <dft_dir>'.format(argv[0]))
    sys.exit(1)

  binary, corpus_dir, dft_dir = argv[1:4]

  # Child processes inherit the environment, so set the options up front.
  os.environ['DFSAN_OPTIONS'] = DFSAN_OPTIONS

  dumped = dump_functions(binary, dft_dir)
  if not dumped:
    _error('Failed to dump functions. Something is wrong.')
    sys.exit(1)

  stats = collect_traces(binary, corpus_dir, dft_dir)
  for name in stats:
    print('{0}: {1}'.format(name, stats[name]))

  # Checksum that we didn't lose track of any of the inputs: 'total' plus the
  # four outcome buckets must add up to twice 'total'.
  assert sum(stats.values()) == 2 * stats['total']
  sys.exit(0)
153
154
# Run the collector only when this file is executed as a script.
if __name__ == "__main__":
  main()
157