#!/usr/bin/env bash
#
# american fuzzy lop++ - corpus minimization tool
# ---------------------------------------------
#
# Originally written by Michal Zalewski
#
# Copyright 2014, 2015 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
#   https://www.apache.org/licenses/LICENSE-2.0
#
# This tool tries to find the smallest subset of files in the input directory
# that still trigger the full range of instrumentation data points seen in
# the starting corpus. This has two uses:
#
#   - Screening large corpora of input files before using them as a seed for
#     afl-fuzz. The tool will remove functionally redundant files and likely
#     leave you with a much smaller set.
#
#     (In this case, you probably also want to consider running afl-tmin on
#     the individual files later on to reduce their size.)
#
#   - Minimizing the corpus generated organically by afl-fuzz, perhaps when
#     planning to feed it to more resource-intensive tools. The tool achieves
#     this by removing all entries that used to trigger unique behaviors in
#     the past, but have been made obsolete by later finds.
#
# Note that the tool doesn't modify the files themselves. For that, you want
# afl-tmin.
#
# This script must use bash because other shells may have hardcoded limits on
# array sizes.
#
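# A typical invocation (paths below are hypothetical) for a target that takes
# its input as a file argument, marked with @@ just as in afl-fuzz:
#
#   ./afl-cmin.bash -i seed_corpus/ -o seed_corpus_min/ -- ./target --parse @@
#
# For a target that reads its input from stdin, simply omit the @@ marker.
#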

echo "corpus minimization tool for afl-fuzz by Michal Zalewski"
echo

#########
# SETUP #
#########

# Process command-line options...

MEM_LIMIT=none
TIMEOUT=none

unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN \
  AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE UNICORN_MODE FRIDA_MODE

export AFL_QUIET=1

while getopts "+i:o:f:m:t:eOQUCh" opt; do

  case "$opt" in

    "h")
      ;;

    "i")
      IN_DIR="$OPTARG"
      ;;

    "o")
      OUT_DIR="$OPTARG"
      ;;
    "f")
      STDIN_FILE="$OPTARG"
      ;;
    "m")
      MEM_LIMIT="$OPTARG"
      MEM_LIMIT_GIVEN=1
      ;;
    "t")
      TIMEOUT="$OPTARG"
      ;;
    "e")
      EXTRA_PAR="$EXTRA_PAR -e"
      ;;
    "C")
      export AFL_CMIN_CRASHES_ONLY=1
      ;;
    "O")
      EXTRA_PAR="$EXTRA_PAR -O"
      FRIDA_MODE=1
      ;;
    "Q")
      EXTRA_PAR="$EXTRA_PAR -Q"
      QEMU_MODE=1
      ;;
    "U")
      EXTRA_PAR="$EXTRA_PAR -U"
      UNICORN_MODE=1
      ;;
    "?")
      exit 1
      ;;

  esac

done

shift $((OPTIND-1))

TARGET_BIN="$1"

if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then

  cat 1>&2 <<_EOF_
Usage: $0 [ options ] -- /path/to/target_app [ ... ]

Required parameters:

  -i dir        - input directory with the starting corpus
  -o dir        - output directory for minimized files

Execution control settings:

  -f file       - location read by the fuzzed program (stdin)
  -m megs       - memory limit for child process ($MEM_LIMIT MB)
  -t msec       - run time limit for child process (none)
  -O            - use binary-only instrumentation (FRIDA mode)
  -Q            - use binary-only instrumentation (QEMU mode)
  -U            - use unicorn-based instrumentation (Unicorn mode)

Minimization settings:

  -C            - keep crashing inputs, reject everything else
  -e            - solve for edge coverage only, ignore hit counts

For additional tips, please consult README.md.

Environment variables used:
AFL_KEEP_TRACES: leave the temporary <out_dir>/.traces directory
AFL_NO_FORKSRV: run target via execve instead of using the forkserver
AFL_PATH: last resort location to find the afl-showmap binary
AFL_SKIP_BIN_CHECK: skip check for target binary
_EOF_
  exit 1
fi

# Do a sanity check to discourage the use of /tmp, since we can't really
# handle this safely from a shell script.

#if [ "$AFL_ALLOW_TMP" = "" ]; then
#
#  echo "$IN_DIR" | grep -qE '^(/var)?/tmp/'
#  T1="$?"
#
#  echo "$TARGET_BIN" | grep -qE '^(/var)?/tmp/'
#  T2="$?"
#
#  echo "$OUT_DIR" | grep -qE '^(/var)?/tmp/'
#  T3="$?"
#
#  echo "$STDIN_FILE" | grep -qE '^(/var)?/tmp/'
#  T4="$?"
#
#  echo "$PWD" | grep -qE '^(/var)?/tmp/'
#  T5="$?"
#
#  if [ "$T1" = "0" -o "$T2" = "0" -o "$T3" = "0" -o "$T4" = "0" -o "$T5" = "0" ]; then
#    echo "[-] Error: do not use this script in /tmp or /var/tmp." 1>&2
#    exit 1
#  fi
#
#fi

# If @@ is specified, but there's no -f, let's come up with a temporary input
# file name.

TRACE_DIR="$OUT_DIR/.traces"

if [ "$STDIN_FILE" = "" ]; then

  if echo "$*" | grep -qF '@@'; then
    STDIN_FILE="$TRACE_DIR/.cur_input"
  fi

fi

# Check for obvious errors.

if [ ! "$MEM_LIMIT" = "none" ]; then

  if [ "$MEM_LIMIT" -lt "5" ]; then
    echo "[-] Error: dangerously low memory limit." 1>&2
    exit 1
  fi

fi

if [ ! "$TIMEOUT" = "none" ]; then

  if [ "$TIMEOUT" -lt "10" ]; then
    echo "[-] Error: dangerously low timeout." 1>&2
    exit 1
  fi

fi

if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then

  TNEW="`which "$TARGET_BIN" 2>/dev/null`"

  if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then
    echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
    exit 1
  fi

  TARGET_BIN="$TNEW"

fi

if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" -a "$FRIDA_MODE" = "" -a "$UNICORN_MODE" = "" ]; then

  if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then
    echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
    exit 1
  fi

fi

if [ ! -d "$IN_DIR" ]; then
  echo "[-] Error: directory '$IN_DIR' not found." 1>&2
  exit 1
fi

test -d "$IN_DIR/default" && IN_DIR="$IN_DIR/default"
test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue"

find "$OUT_DIR" -maxdepth 1 -name 'id[:_]*' -exec rm -- {} \; 2>/dev/null
rm -rf "$TRACE_DIR" 2>/dev/null

rmdir "$OUT_DIR" 2>/dev/null

if [ -d "$OUT_DIR" ]; then
  echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
  exit 1
fi

mkdir -m 700 -p "$TRACE_DIR" || exit 1

if [ ! "$STDIN_FILE" = "" ]; then
  rm -f "$STDIN_FILE" || exit 1
  touch "$STDIN_FILE" || exit 1
fi

SHOWMAP=`command -v afl-showmap 2>/dev/null`

if [ -z "$SHOWMAP" ]; then
  TMP="${0%/afl-cmin.bash}/afl-showmap"
  if [ -x "$TMP" ]; then
    SHOWMAP=$TMP
  fi
fi

if [ -z "$SHOWMAP" -a -x "./afl-showmap" ]; then
  SHOWMAP="./afl-showmap"
else
  if [ -n "$AFL_PATH" ]; then
    SHOWMAP="$AFL_PATH/afl-showmap"
  fi
fi

if [ ! -x "$SHOWMAP" ]; then
  echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi

IN_COUNT=$((`ls -- "$IN_DIR" 2>/dev/null | wc -l`))

if [ "$IN_COUNT" = "0" ]; then
  echo "[+] Hmm, no inputs in the target directory. Nothing to be done."
  rm -rf "$TRACE_DIR"
  exit 1
fi

FIRST_FILE=`ls "$IN_DIR" | head -1`

# Make sure that we're not dealing with a directory.

if [ -d "$IN_DIR/$FIRST_FILE" ]; then
  echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi

# Check for the more efficient way to copy files...

if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
  CP_TOOL=ln
else
  CP_TOOL=cp
fi

# Make sure that we can actually get anything out of afl-showmap before we
# waste too much time.

echo "[*] Testing the target binary..."

if [ "$STDIN_FILE" = "" ]; then

  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE"

else

  cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null

fi

FIRST_COUNT=$((`grep -c . "$TRACE_DIR/.run_test"`))

if [ "$FIRST_COUNT" -gt "0" ]; then

  echo "[+] OK, $FIRST_COUNT tuples recorded."

else

  echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1

fi

# Let's roll!

#############################
# STEP 1: COLLECTING TRACES #
#############################

echo "[*] Obtaining traces for input files in '$IN_DIR'..."

(

  CUR=0

  if [ "$STDIN_FILE" = "" ]; then

    ls "$IN_DIR" | while read -r fn; do

      CUR=$((CUR+1))
      printf "\\r    Processing file $CUR/$IN_COUNT... "

      "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"

    done

  else

    ls "$IN_DIR" | while read -r fn; do

      CUR=$((CUR+1))
      printf "\\r    Processing file $CUR/$IN_COUNT... "

      cp "$IN_DIR/$fn" "$STDIN_FILE"

      "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null

    done

  fi

)

echo

##########################
# STEP 2: SORTING TUPLES #
##########################

# With this out of the way, we sort all tuples by popularity across all
# datasets. The reasoning here is that we won't be able to avoid the files
# that trigger unique tuples anyway, so we will want to start with them and
# see what's left.

echo "[*] Sorting trace sets (this may take a while)..."

ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \
  sort | uniq -c | sort -k 1,1 -n >"$TRACE_DIR/.all_uniq"

TUPLE_COUNT=$((`grep -c . "$TRACE_DIR/.all_uniq"`))

echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files."
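# To make the intermediate format concrete (the values below are made up):
# each per-file trace in "$TRACE_DIR/<fn>" is expected to hold one tuple
# identifier per line, so after the pipeline above, .all_uniq contains lines
# such as
#
#       1 28734
#      87 1193
#
# i.e. "<number of files whose trace includes the tuple> <tuple id>", sorted
# in ascending order of popularity (rarest tuples first).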

#####################################
# STEP 3: SELECTING CANDIDATE FILES #
#####################################

# The next step is to find the best candidate for each tuple. The "best"
# part is understood simply as the smallest input that includes a particular
# tuple in its trace. Empirical evidence suggests that this produces smaller
# datasets than more involved algorithms that could still be pulled off in
# a shell script.

echo "[*] Finding best candidates for each tuple..."

CUR=0

ls -rS "$IN_DIR" | while read -r fn; do

  CUR=$((CUR+1))
  printf "\\r    Processing file $CUR/$IN_COUNT... "

  sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"

done

echo

##############################
# STEP 4: LOADING CANDIDATES #
##############################

# At this point, we have a file of tuple-file pairs, sorted by file size
# in ascending order (as a consequence of ls -rS). By sorting keyed only
# by the tuple (-k1,1) and configured to output only the first line for
# every key (-s -u), we end up with the smallest file for each tuple.
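# Continuing the made-up example from above: a .candidate_list line of the
# form
#
#   28734 id:000042,orig:seed.bin
#
# ("<tuple id> <file name>") is rewritten by the sed command below into a
# line of shell code that can be sourced directly:
#
#   BEST_FILE[28734]="id:000042,orig:seed.bin"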

echo "[*] Sorting candidate list (be patient)..."

sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \
  sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script"

if [ ! -s "$TRACE_DIR/.candidate_script" ]; then
  echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1
fi

# The sed command converted the sorted list to a shell script that populates
# BEST_FILE[tuple]="fname". Let's load that!

. "$TRACE_DIR/.candidate_script"

##########################
# STEP 5: WRITING OUTPUT #
##########################

# The final trick is to grab the top pick for each tuple, unless said tuple is
# already set due to the inclusion of an earlier candidate; and then put all
# tuples associated with the newly-added file on the "already have" list. The
# loop works from the least popular tuples toward the most common ones.

echo "[*] Processing candidates and writing output files..."

CUR=0

touch "$TRACE_DIR/.already_have"

while read -r cnt tuple; do

  CUR=$((CUR+1))
  printf "\\r    Processing tuple $CUR/$TUPLE_COUNT with count $cnt... "

  # If we already have this tuple, skip it.

  grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue

  FN=${BEST_FILE[tuple]}

#  echo "tuple nr $CUR ($tuple cnt=$cnt) -> $FN" >> "$TRACE_DIR/.log"
  $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN"

  if [ "$((CUR % 5))" = "0" ]; then
    sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
    mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
  else
    cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
  fi

done <"$TRACE_DIR/.all_uniq"

echo

OUT_COUNT=`ls -- "$OUT_DIR" | wc -l`

if [ "$OUT_COUNT" = "1" ]; then
  echo "[!] WARNING: All test cases had the same traces, check syntax!"
fi

echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
echo

test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"

exit 0