#!/usr/bin/env python3
#
# Copyright 2016 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Generate a dictionary for libFuzzer or AFL-based fuzzer.

Invoked manually using a fuzzer binary and target format/protocol specification.
Works better for text formats or protocols. For binary ones may be useless.

Both inputs are treated as raw data, so every word handled by this module is a
`bytes` object; only the escaped dictionary entries written out are text.
"""

import argparse
import html
import logging
import os
import re
import shutil
import subprocess
import sys
import tempfile

ENCODING_TYPES = ['ascii', 'utf_16_be', 'utf_16_le', 'utf_32_be', 'utf_32_le']
MIN_STRING_LENGTH = 4


def DecodeHTML(html_data):
  """Replace HTML entities in the data with the characters they stand for.

  Args:
    html_data: Specification contents as bytes.

  Returns:
    The decoded contents as bytes. Non-ASCII input bytes, and entities that
    decode to non-ASCII characters, are dropped.
  """
  text = html.unescape(html_data.decode('ascii', 'ignore'))
  return text.encode('ascii', 'ignore')


def EscapeDictionaryElement(element):
  """Escape all unprintable and control characters in an element.

  Args:
    element: A dictionary word as bytes.

  Returns:
    An ASCII string that can be placed between double quotes in a dictionary
    file.
  """
  escaped = []
  for byte in bytearray(element):
    char = chr(byte)
    if char in '\\"':
      # Backslash and double quote are the only characters escaped with a
      # backslash. Single quote is left as is because escaping it breaks
      # libFuzzer.
      escaped.append('\\' + char)
    elif 0x20 <= byte < 0x7F:
      escaped.append(char)
    else:
      # Everything else (control and non-ASCII bytes) is written as \xNN.
      escaped.append('\\x%02x' % byte)
  return ''.join(escaped)


def ExtractWordsFromBinary(filepath, min_length=MIN_STRING_LENGTH):
  """Extract words (split strings) from a binary executable file.

  Args:
    filepath: Path to the fuzzer binary.
    min_length: Minimal length of a printable string to be considered.

  Returns:
    A set of bytes words found in the binary.
  """
  rodata = PreprocessAndReadRodata(filepath)
  words = set()

  # Runs of printable ASCII characters that are at least |min_length| long.
  strings_re = re.compile(rb'[^\x00-\x1F\x7F-\xFF]{%d,}' % min_length)

  # Use different encodings for strings extraction.
  for encoding in ENCODING_TYPES:
    data = rodata.decode(encoding, 'ignore').encode('ascii', 'ignore')
    for raw_string in strings_re.findall(data):
      words.update(raw_string.split())

  return words


def ExtractWordsFromLines(lines):
  """Extract all whitespace-separated words from a list of strings.

  Args:
    lines: An iterable of bytes strings.

  Returns:
    A set with every word found in any of the strings.
  """
  return {word for line in lines for word in line.split()}


def ExtractWordsFromSpec(filepath, is_html):
  """Extract whitespace-separated words from a specification.

  Args:
    filepath: Path to the specification file.
    is_html: Whether HTML entities in the specification should be decoded.

  Returns:
    A set of bytes words found in the specification.
  """
  return set(ReadSpecification(filepath, is_html).split())


def FindIndentedText(text):
  """Find space-indented text blocks, e.g. code or data samples in RFCs.

  Args:
    text: Specification contents as bytes.

  Returns:
    A list of bytes blocks. Every block consists of consecutive equally
    indented lines joined with a newline, with the indentation removed.
  """
  indented_blocks = []
  current_block = b''
  previous_number_of_spaces = 0

  # Go through every line and concatenate space-indented blocks into lines.
  for line in text.split(b'\n'):
    if not line:
      # Ignore empty lines.
      continue

    # Space-indented text blocks have more leading spaces than regular text.
    n = FindNumberOfLeadingSpaces(line)

    if n > previous_number_of_spaces:
      # Beginning of a space-indented text block, start concatenation.
      current_block = line[n:]
    elif n == previous_number_of_spaces and current_block:
      # Or continuation of a space-indented text block, concatenate lines.
      current_block += b'\n' + line[n:]

    if n < previous_number_of_spaces and current_block:
      # Current line is not indented, save previously concatenated lines.
      indented_blocks.append(current_block)
      current_block = b''

    previous_number_of_spaces = n

  if current_block:
    # The text ended inside an indented block, do not lose that block.
    indented_blocks.append(current_block)

  return indented_blocks


def FindNumberOfLeadingSpaces(line):
  """Calculate number of leading whitespace characters in the string."""
  return len(line) - len(line.lstrip())


def GenerateDictionary(path_to_binary, path_to_spec, strategy, is_html=False):
  """Generate a dictionary for given pair of fuzzer binary and specification.

  Args:
    path_to_binary: Path to the fuzzer binary.
    path_to_spec: Path to the specification file.
    strategy: A string with any combination of strategy letters:
        i - words which are common for binary and for specification,
        q - words from space-indented blocks of the specification,
        u - all uppercase words from the specification.
    is_html: Whether HTML entities in the specification should be decoded.

  Returns:
    A set of bytes words selected by the requested strategies. Exits the
    process with status 1 if any of the input files does not exist.
  """
  for filepath in [path_to_binary, path_to_spec]:
    if not os.path.exists(filepath):
      logging.error("%s doesn't exist. Exit.", filepath)
      sys.exit(1)

  words_from_binary = ExtractWordsFromBinary(path_to_binary)
  words_from_spec = ExtractWordsFromSpec(path_to_spec, is_html)

  dictionary_words = set()

  if 'i' in strategy:
    # Strategy i: only words which are common for binary and for specification.
    dictionary_words = words_from_binary.intersection(words_from_spec)

  if 'q' in strategy:
    # Strategy q: add words from all quoted strings from specification.
    # TODO(mmoroz): experimental and very noisy. Not recommended to use.
    spec_data = ReadSpecification(path_to_spec, is_html)
    quoted_strings = FindIndentedText(spec_data)
    dictionary_words |= ExtractWordsFromLines(quoted_strings)

  if 'u' in strategy:
    # Strategy u: add all uppercase words from specification.
    dictionary_words |= {w for w in words_from_spec if w.isupper()}

  return dictionary_words


def _RunTool(command, stderr=None):
  """Run an external tool and tell whether it finished successfully.

  A tool that cannot be started at all (e.g. it is not installed) is reported
  as a failure instead of raising, so that callers can use their fallback.
  """
  try:
    return subprocess.call(command, stderr=stderr) == 0
  except OSError as error:
    logging.warning('Failed to run %s: %s', command[0], error)
    return False


def PreprocessAndReadRodata(filepath):
  """Create a stripped copy of the binary and extract .rodata section.

  Args:
    filepath: Path to the fuzzer binary. The file itself is never modified.

  Returns:
    Contents of the .rodata section as bytes. If stripping or the section
    extraction fails, a warning is logged and contents of the whole (stripped
    or original) file are returned instead.
  """
  # All intermediate files live in a temporary directory, which is removed on
  # exit even if a tool has replaced or deleted the file it was working on.
  with tempfile.TemporaryDirectory(prefix='.dictgen_') as temp_dir:
    stripped_filepath = os.path.join(temp_dir, 'stripped')
    shutil.copyfile(filepath, stripped_filepath)

    # Strip all symbols to reduce amount of redundant strings.
    if not _RunTool(['strip', '--strip-all', stripped_filepath]):
      logging.warning('Failed to strip the binary. Using the original version.')
      stripped_filepath = filepath

    # Extract .rodata section to reduce amount of redundant strings.
    rodata_filepath = os.path.join(temp_dir, 'rodata')
    objcopy_cmd = [
        'objcopy', '-j', '.rodata', stripped_filepath, rodata_filepath
    ]

    # Hide output from stderr since objcopy prints a warning.
    extracted = _RunTool(objcopy_cmd, stderr=subprocess.DEVNULL)
    if not extracted or not os.path.exists(rodata_filepath):
      logging.warning('Failed to extract .rodata section. Using the whole file.')
      rodata_filepath = stripped_filepath

    with open(rodata_filepath, 'rb') as file_handle:
      return file_handle.read()


def ReadSpecification(filepath, is_html):
  """Read a specification file and return its contents.

  Args:
    filepath: Path to the specification file.
    is_html: Whether HTML entities in the specification should be decoded.

  Returns:
    Contents of the file as bytes, HTML-decoded if requested.
  """
  with open(filepath, 'rb') as file_handle:
    data = file_handle.read()

  return DecodeHTML(data) if is_html else data


def WriteDictionary(dictionary_path, dictionary):
  """Write given dictionary to a file.

  Args:
    dictionary_path: Path to the file to be (over)written.
    dictionary: An iterable of bytes words. Empty words are skipped.
  """
  # Sets have no stable iteration order; sort the entries so that the same
  # input always produces the same file.
  entries = sorted('"%s"\n' % EscapeDictionaryElement(word)
                   for word in dictionary
                   if word)

  with open(dictionary_path, 'w', encoding='ascii',
            newline='\n') as file_handle:
    file_handle.write('# This is an automatically generated dictionary.\n')
    file_handle.writelines(entries)


def main():
  """Parse command line arguments and generate the dictionary file."""
  parser = argparse.ArgumentParser(description='Generate fuzzer dictionary.')
  parser.add_argument('--fuzzer',
                      required=True,
                      help='Path to a fuzzer binary executable. It is '
                      'recommended to use a binary built with '
                      '"use_libfuzzer=false is_asan=false" to get a better '
                      'dictionary with fewer number of redundant elements.')
  parser.add_argument('--spec',
                      required=True,
                      help='Path to a target specification (in textual form).')
  # Parsed as an integer: a plain string value would make "--html 0" truthy.
  parser.add_argument('--html',
                      type=int,
                      choices=[0, 1],
                      default=0,
                      help='Decode HTML [01] (0 is default value): '
                      '1 - if specification has HTML entities to be decoded.')
  parser.add_argument('--out',
                      required=True,
                      help='Path to a file to write a dictionary into.')
  parser.add_argument('--strategy',
                      default='iu',
                      help='Generation strategy [iqu] ("iu" is default value): '
                      'i - intersection, q - quoted, u - uppercase.')
  args = parser.parse_args()

  dictionary = GenerateDictionary(args.fuzzer,
                                  args.spec,
                                  args.strategy,
                                  is_html=bool(args.html))
  WriteDictionary(args.out, dictionary)


if __name__ == '__main__':
  main()