#!/usr/bin/env python
# Copyright 2016 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
"""Group seed corpus files into one zip archive per ffmpeg fuzzer.

Usage: <script> <seed_corpus_directory> <fuzzers_directory>

For every 'ffmpeg_*_fuzzer' file found in the fuzzers directory, corpus files
whose path contains a tag derived from the fuzzer name are archived into
'<fuzzer path>_seed_corpus.zip'.
"""

from __future__ import print_function
import logging
import os
import re
import sys
import zipfile


logging.basicConfig(level=logging.INFO, format='INFO: %(message)s')
CODEC_NAME_REGEXP = re.compile(r'codec_id_(.+?)_fuzzer')

# Generic words cut out of codec names (e.g. 'mpeg1video', 'msvideo1'). The
# order matters only in that each word truncates the name at its first
# occurrence, one word after another.
_COMMON_CODEC_WORDS = ('video', 'audio', 'subtitle', 'text')


def get_fuzzer_tags(fuzzer_name):
    """Extract tags (used to filter samples) from the given fuzzer name.

    Args:
      fuzzer_name: Base name of a fuzzer; matching is case-insensitive.

    Returns:
      A list of lowercase tags. It may be empty and may contain duplicates
      (e.g. a subtitle fuzzer for a codec whose name starts with 'sub').
    """
    tags = []
    fuzzer_name = fuzzer_name.lower()
    # All subtitle samples are in 'sub' directory, need to add 'sub' tag
    # manually.
    if 'subtitle' in fuzzer_name:
        tags.append('sub')

    match = CODEC_NAME_REGEXP.search(fuzzer_name)
    if not match:
        return tags

    # Some names are complex, need to split them and filter common strings.
    for codec in match.group(1).split('_'):
        for word in _COMMON_CODEC_WORDS:
            codec = codec.split(word)[0]
        if codec:
            # Some codec names have trailing characters: 'VP6F', 'FLV1',
            # 'JPEGLS'. Keep at most the first 3 characters; shorter names are
            # kept whole.
            tags.append(codec[:3])

    return tags


def parse_corpus(corpus_directory):
    """Recursively list all files in the given directory, ignoring checksums.

    Args:
      corpus_directory: Root directory of the seed corpus.

    Returns:
      A list of file paths (each joined onto the walked directory), excluding
      any file whose name contains 'md5sum'.
    """
    all_corpus_files = []
    for root, _, files in os.walk(corpus_directory):
        for filename in files:
            # Skip checksum files, they are useless in corpus.
            if 'md5sum' in filename:
                continue
            all_corpus_files.append(os.path.join(root, filename))

    logging.info('Parsed %d corpus files from %s', len(all_corpus_files),
                 corpus_directory)
    return all_corpus_files


def parse_fuzzers(fuzzers_directory):
    """List all ffmpeg fuzzers located directly in the given directory.

    Args:
      fuzzers_directory: Directory that contains the built fuzzers.

    Returns:
      A sorted list of paths to entries named 'ffmpeg_*_fuzzer'. The listing
      is not recursive.
    """
    all_fuzzers = [
        os.path.join(fuzzers_directory, filename)
        # Sorted so that the result does not depend on os.listdir() ordering.
        for filename in sorted(os.listdir(fuzzers_directory))
        # Skip non-ffmpeg and non-fuzzer files in the given directory.
        if filename.startswith('ffmpeg_') and filename.endswith('_fuzzer')
    ]

    logging.info('Parsed %d fuzzers from %s', len(all_fuzzers),
                 fuzzers_directory)
    return all_fuzzers


def _find_relevant_corpus(corpus_files, fuzzer_tags):
    """Return the set of corpus files whose path matches any of the tags."""
    # Lowercase first, then remove 'ffmpeg', so that any spelling of it in a
    # path ('FFmpeg', 'FFMPEG') does not make every file match 'MPEG' codecs.
    sanitized = [(filename, filename.lower().replace('ffmpeg', ''))
                 for filename in corpus_files]

    relevant = set(filename for filename, haystack in sanitized
                   if any(tag in haystack for tag in fuzzer_tags))
    if relevant:
        return relevant

    # Strip last symbol from tags if we haven't found relevant corpus.
    # It helps for such codecs as 'RV40' ('RV4' -> 'RV') or 'PCX' (-> 'PC').
    # One-character tags are skipped: stripping them leaves an empty string,
    # which is a substring of every path.
    short_tags = [tag[:-1] for tag in fuzzer_tags if len(tag) > 1]
    return set(filename for filename, haystack in sanitized
               if any(tag in haystack for tag in short_tags))


def zip_relevant_corpus(corpus_files, fuzzers):
    """Find relevant corpus files and archive them for every fuzzer given.

    A corpus file is relevant for a fuzzer when its path contains one of the
    fuzzer's tags. Only when no file matches any full tag are the tags retried
    with their last character removed. Fuzzers without any relevant file get
    no archive.

    Args:
      corpus_files: Paths of all available corpus files.
      fuzzers: Paths of the fuzzers; each archive is written next to its
        fuzzer as '<fuzzer path>_seed_corpus.zip'.
    """
    for fuzzer in fuzzers:
        fuzzer_name = os.path.basename(fuzzer)
        fuzzer_tags = get_fuzzer_tags(fuzzer_name)
        relevant_corpus_files = _find_relevant_corpus(corpus_files,
                                                      fuzzer_tags)

        logging.info('Found %d relevant samples for %s',
                     len(relevant_corpus_files), fuzzer_name)

        if not relevant_corpus_files:
            continue

        zip_archive_name = fuzzer + "_seed_corpus.zip"
        with zipfile.ZipFile(zip_archive_name, 'w') as archive:
            # Sorted so the archive member order is the same on every run.
            for filename in sorted(relevant_corpus_files):
                archive.write(filename)


def main():
    """Parse command line arguments and build the per-fuzzer seed archives."""
    if len(sys.argv) < 3:
        print('Usage: %s <seed_corpus_directory> <fuzzers_directory>' %
              __file__)
        sys.exit(1)

    seed_corpus_directory = sys.argv[1]
    fuzzers_directory = sys.argv[2]

    corpus_files = parse_corpus(seed_corpus_directory)
    fuzzers = parse_fuzzers(fuzzers_directory)
    zip_relevant_corpus(corpus_files, fuzzers)


if __name__ == '__main__':
    sys.exit(main())