• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2#
3# Copyright 2016 The Chromium Authors
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6"""Generate a dictionary for libFuzzer or AFL-based fuzzer.
7
8Invoked manually using a fuzzer binary and target format/protocol specification.
9Works better for text formats or protocols. For binary ones may be useless.
10"""
11
import argparse
import html
import logging
import os
import re
import shutil
import subprocess
import sys
import tempfile

try:
  # Legacy Python 2 module; it does not exist on Python 3, where the `html`
  # module is used instead.
  import HTMLParser
except ImportError:
  HTMLParser = None
21
# Encodings tried, one after another, when extracting strings from the binary.
ENCODING_TYPES = ['ascii', 'utf_16_be', 'utf_16_le', 'utf_32_be', 'utf_32_le']
# Default minimum length of a printable run for it to be extracted as a string.
MIN_STRING_LENGTH = 4
24
25
def DecodeHTML(html_data):
  """Decode HTML character references and drop non-ASCII characters.

  Args:
    html_data: Contents of a specification, either as text or as raw bytes.

  Returns:
    A native string in which character references such as "&lt;" have been
    replaced by the characters they stand for. Characters outside the ASCII
    range (present in the input or produced by the decoding) are dropped.
  """
  if isinstance(html_data, bytes):
    html_data = html_data.decode('ascii', 'ignore')

  # html.unescape() is the Python 3 replacement for the unescape() method of
  # the Python 2 HTMLParser class.
  data = html.unescape(html_data)
  return data.encode('ascii', 'ignore').decode('ascii')
31
32
def EscapeDictionaryElement(element):
  """Escape an element so it can be written as a quoted dictionary value.

  Backslash and double quote are backslash-escaped, printable ASCII characters
  (including the single quote) are kept as is, and every other byte is written
  as a two-digit "\\xNN" hex escape.

  Args:
    element: A dictionary word, as text or as raw bytes. Text is encoded as
      UTF-8 before escaping.

  Returns:
    A native string that consists of printable ASCII characters only.
  """
  if not isinstance(element, bytes):
    element = element.encode('utf-8')

  escaped = []
  for byte in bytearray(element):
    char = chr(byte)
    if char in ('\\', '"'):
      escaped.append('\\' + char)
    elif 0x20 <= byte < 0x7F:
      # Printable ASCII. The single quote is deliberately left unescaped
      # because an escaped single quote breaks libFuzzer.
      escaped.append(char)
    else:
      # Hex escapes are used even for tab/newline: the libFuzzer dictionary
      # syntax documents only the \\, \" and \xNN escapes.
      escaped.append('\\x%02x' % byte)

  return ''.join(escaped)
41
42
def ExtractWordsFromBinary(filepath, min_length=MIN_STRING_LENGTH):
  """Extract words (whitespace-separated strings) from a binary executable.

  Args:
    filepath: Path to the binary executable file.
    min_length: Minimum length of a printable run for it to be considered.

  Returns:
    A set of native strings found in the data returned by
    PreprocessAndReadRodata(), using every encoding from ENCODING_TYPES.
  """
  rodata = PreprocessAndReadRodata(filepath)
  if not isinstance(rodata, bytes):
    # NOTE(review): only reachable if the data was read in text mode; mapping
    # code points back to single bytes is a best-effort recovery.
    rodata = rodata.encode('latin-1', 'ignore')

  # Runs of printable ASCII characters that are at least min_length long.
  strings_re = re.compile(r'[^\x00-\x1F\x7F-\xFF]{%d,}' % min_length)

  words = set()
  # Use different encodings for strings extraction.
  for encoding in ENCODING_TYPES:
    # Work on text from here on: the pattern above is a text pattern and the
    # words are later compared with the words from the specification.
    text = rodata.decode(encoding, 'ignore')
    text = text.encode('ascii', 'ignore').decode('ascii')
    for raw_string in strings_re.findall(text):
      words.update(raw_string.split())

  return words
57
58
def ExtractWordsFromLines(lines):
  """Return the set of whitespace-separated words found in the given strings."""
  unique_words = set()
  for text in lines:
    unique_words.update(text.split())
  return unique_words
67
68
def ExtractWordsFromSpec(filepath, is_html):
  """Return the set of whitespace-separated words of a specification file."""
  return set(ReadSpecification(filepath, is_html).split())
74
75
def FindIndentedText(text):
  """Find space-indented text blocks, e.g. code or data samples in RFCs.

  A block starts at a line that is indented deeper than the previous non-empty
  line, continues while the indentation stays the same, and ends at the first
  line that is indented less or at the end of the text.

  Args:
    text: The text to scan.

  Returns:
    A list of strings, one per block, where each string holds the lines of
    the block joined with a newline and with the indentation removed.
  """
  indented_blocks = []
  current_block = ''
  previous_number_of_spaces = 0

  # Go through every line and concatenate space-indented blocks into lines.
  for line in text.split('\n'):
    if not line:
      # Ignore empty lines.
      continue

    # Space-indented text blocks have more leading spaces than regular text.
    n = FindNumberOfLeadingSpaces(line)

    if n > previous_number_of_spaces:
      # Beginning of a space-indented text block, start concatenation.
      current_block = line[n:]
    elif n == previous_number_of_spaces and current_block:
      # Or continuation of a space-indented text block, concatenate lines.
      current_block += '\n' + line[n:]
    elif n < previous_number_of_spaces and current_block:
      # Current line is indented less, save previously concatenated lines.
      indented_blocks.append(current_block)
      current_block = ''

    previous_number_of_spaces = n

  if current_block:
    # The text ended inside an indented block; do not lose that block.
    indented_blocks.append(current_block)

  return indented_blocks
107
108
def FindNumberOfLeadingSpaces(line):
  """Return how many whitespace characters the string starts with."""
  for index, char in enumerate(line):
    if not char.isspace():
      return index

  # Empty string, or a string that consists of whitespace only.
  return len(line)
116
117
def GenerateDictionary(path_to_binary, path_to_spec, strategy, is_html=False):
  """Build the set of dictionary words for a fuzzer binary and a specification.

  Logs an error and exits with status 1 if either path does not exist.

  Args:
    path_to_binary: Path to the fuzzer binary.
    path_to_spec: Path to the specification file.
    strategy: String of strategy letters; any combination of 'i', 'q', 'u'.
    is_html: Whether the specification needs HTML decoding.

  Returns:
    A set with the words selected by the requested strategies.
  """
  for required_path in (path_to_binary, path_to_spec):
    if os.path.exists(required_path):
      continue
    logging.error("%s doesn't exist. Exit.", required_path)
    sys.exit(1)

  binary_words = ExtractWordsFromBinary(path_to_binary)
  spec_words = ExtractWordsFromSpec(path_to_spec, is_html)

  selected = set()

  # Strategy i: only words which are common for binary and for specification.
  if 'i' in strategy:
    selected = binary_words & spec_words

  # Strategy q: add words from all quoted strings from specification.
  # TODO(mmoroz): experimental and very noisy. Not recommended to use.
  if 'q' in strategy:
    indented = FindIndentedText(ReadSpecification(path_to_spec, is_html))
    selected |= ExtractWordsFromLines(indented)

  # Strategy u: add all uppercase words from specification.
  if 'u' in strategy:
    selected |= {word for word in spec_words if word.isupper()}

  return selected
148
149
def _CallTool(command, **kwargs):
  """Run an external tool; return its exit code, or 1 if it cannot be run."""
  try:
    return subprocess.call(command, **kwargs)
  except OSError as error:
    logging.warning('Unable to run %s: %s', command[0], error)
    return 1


def PreprocessAndReadRodata(filepath):
  """Create a stripped copy of the binary and extract .rodata section.

  Both steps are best effort: if stripping fails the original file is used,
  and if the .rodata section cannot be extracted the whole file is read.

  Args:
    filepath: Path to the binary executable file.

  Returns:
    The raw bytes of the extracted section (or of the fallback file).
  """
  # Intermediate files live in a private directory that is removed as a whole,
  # so cleanup works on every exit path and does not depend on whether the
  # external tools replaced or removed the files they were given.
  work_dir = tempfile.mkdtemp(prefix='.dictionary_generator_')
  try:
    stripped_filepath = os.path.join(work_dir, 'stripped')
    shutil.copyfile(filepath, stripped_filepath)

    # Strip all symbols to reduce amount of redundant strings.
    strip_cmd = ['strip', '--strip-all', stripped_filepath]
    if _CallTool(strip_cmd):
      logging.warning('Failed to strip the binary. Using the original version.')
      stripped_filepath = filepath

    # Extract .rodata section to reduce amount of redundant strings.
    rodata_filepath = os.path.join(work_dir, 'rodata')
    objcopy_cmd = [
        'objcopy', '-j', '.rodata', stripped_filepath, rodata_filepath
    ]

    # Hide output from stderr since objcopy prints a warning.
    with open(os.devnull, 'w') as devnull:
      result = _CallTool(objcopy_cmd, stderr=devnull)

    if result or not os.path.isfile(rodata_filepath):
      logging.warning('Failed to extract .rodata section. Using the whole file.')
      rodata_filepath = stripped_filepath

    # The data is binary, so it must not go through text decoding.
    with open(rodata_filepath, 'rb') as file_handle:
      return file_handle.read()
  finally:
    shutil.rmtree(work_dir, ignore_errors=True)
183
184
def ReadSpecification(filepath, is_html):
  """Read a specification file and return its contents.

  Args:
    filepath: Path to the specification file.
    is_html: If true, the contents are passed through DecodeHTML().

  Returns:
    The contents of the file as text. The file is decoded as UTF-8 and bytes
    that cannot be decoded are dropped, so reading neither depends on the
    locale of the machine nor fails on stray bytes.
  """
  with open(filepath, 'r', encoding='utf-8', errors='ignore') as file_handle:
    data = file_handle.read()

  if is_html:
    data = DecodeHTML(data)

  return data
194
195
def WriteDictionary(dictionary_path, dictionary):
  """Write given dictionary to a file.

  The file starts with a comment line, followed by one double-quoted, escaped
  word per line. Empty words are skipped.

  Args:
    dictionary_path: Path of the file to write; it is overwritten.
    dictionary: Iterable of words.
  """
  # Text mode: the lines written below are native strings.
  with open(dictionary_path, 'w') as file_handle:
    file_handle.write('# This is an automatically generated dictionary.\n')
    # Sorted so that the same set of words always produces the same file.
    for word in sorted(dictionary):
      if not word:
        continue
      line = '"%s"\n' % EscapeDictionaryElement(word)
      file_handle.write(line)
205
206
def main():
  """Parse the command line, generate the dictionary and write it to a file."""
  parser = argparse.ArgumentParser(description='Generate fuzzer dictionary.')
  parser.add_argument('--fuzzer',
                      required=True,
                      help='Path to a fuzzer binary executable. It is '
                      'recommended to use a binary built with '
                      '"use_libfuzzer=false is_asan=false" to get a better '
                      'dictionary with fewer number of redundant elements.')
  parser.add_argument('--spec',
                      required=True,
                      help='Path to a target specification (in textual form).')
  # Parsed as an integer: without a type the value "0" would be kept as a
  # non-empty string, which is truthy and would enable HTML decoding.
  parser.add_argument('--html',
                      type=int,
                      choices=(0, 1),
                      default=0,
                      help='Decode HTML [01] (0 is default value): '
                      '1 - if specification has HTML entities to be decoded.')
  parser.add_argument('--out',
                      required=True,
                      help='Path to a file to write a dictionary into.')
  parser.add_argument('--strategy',
                      default='iu',
                      help='Generation strategy [iqu] ("iu" is default value): '
                      'i - intersection, q - quoted, u - uppercase.')
  args = parser.parse_args()

  dictionary = GenerateDictionary(args.fuzzer,
                                  args.spec,
                                  args.strategy,
                                  is_html=bool(args.html))
  WriteDictionary(args.out, dictionary)


if __name__ == '__main__':
  main()
240