#!/usr/bin/env bash
#
# american fuzzy lop++ - corpus minimization tool
# ---------------------------------------------
#
# Originally written by Michal Zalewski
#
# Copyright 2014, 2015 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
#   https://www.apache.org/licenses/LICENSE-2.0
#
# This tool tries to find the smallest subset of files in the input directory
# that still trigger the full range of instrumentation data points seen in
# the starting corpus. This has two uses:
#
#   - Screening large corpora of input files before using them as a seed for
#     afl-fuzz. The tool will remove functionally redundant files and likely
#     leave you with a much smaller set.
#
#     (In this case, you probably also want to consider running afl-tmin on
#     the individual files later on to reduce their size.)
#
#   - Minimizing the corpus generated organically by afl-fuzz, perhaps when
#     planning to feed it to more resource-intensive tools. The tool achieves
#     this by removing all entries that used to trigger unique behaviors in the
#     past, but have been made obsolete by later finds.
#
# Note that the tool doesn't modify the files themselves. For that, you want
# afl-tmin.
#
# This script must use bash because other shells may have hardcoded limits on
# array sizes.
#

# Banner, followed by a blank line (identical output to two echo calls).
printf '%s\n\n' "corpus minimization tool for afl-fuzz by Michal Zalewski"

#########
# SETUP #
#########

# Process command-line options...

# Defaults: no memory limit, no timeout.
MEM_LIMIT=none
TIMEOUT=none

# Start from a clean slate so stray environment values cannot leak in.
unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN \
  AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE UNICORN_MODE

# Keep the afl-showmap child runs quiet.
export AFL_QUIET=1
# Parse command-line flags; see the usage text printed further down for what
# each flag means. NOTE(review): the leading '+' in the optstring looks like a
# getopt(3)-style behavior prefix; bash's getopts tolerates it — confirm intent.
while getopts "+i:o:f:m:t:eOQUCh" opt; do

  case "$opt" in

    "h")
         # -h: no-op here; the usage text below is shown when -i/-o/target
         # are missing.
	;;

    "i")
         IN_DIR="$OPTARG"
         ;;

    "o")
         OUT_DIR="$OPTARG"
         ;;
    "f")
         STDIN_FILE="$OPTARG"
         ;;
    "m")
         MEM_LIMIT="$OPTARG"
         MEM_LIMIT_GIVEN=1
         ;;
    "t")
         TIMEOUT="$OPTARG"
         ;;
    "e")
         # Edge coverage only — forwarded verbatim to afl-showmap.
         EXTRA_PAR="$EXTRA_PAR -e"
         ;;
    "C")
         # Exported for afl-showmap: keep only crashing inputs.
         export AFL_CMIN_CRASHES_ONLY=1
         ;;
    "O")
         EXTRA_PAR="$EXTRA_PAR -O"
         FRIDA_MODE=1
         ;;
    "Q")
         EXTRA_PAR="$EXTRA_PAR -Q"
         QEMU_MODE=1
         ;;
    "U")
         EXTRA_PAR="$EXTRA_PAR -U"
         UNICORN_MODE=1
         ;;
    "?")
         # Unknown option; getopts already printed a diagnostic.
         exit 1
         ;;

   esac

done

# Drop the parsed options; "$1" is now the target binary.
shift $((OPTIND-1))

# First remaining positional argument is the instrumented target binary;
# everything after it is passed through to afl-showmap unchanged.
TARGET_BIN="$1"

# -i, -o and the target binary are all mandatory; otherwise print usage and
# bail out. (Fix: the traces directory is <out_dir>/.traces, not
# <out_dir>\.traces — the backslash was a typo in the help text.)
if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then

  cat 1>&2 <<_EOF_
Usage: $0 [ options ] -- /path/to/target_app [ ... ]

Required parameters:

  -i dir        - input directory with the starting corpus
  -o dir        - output directory for minimized files

Execution control settings:

  -f file       - location read by the fuzzed program (stdin)
  -m megs       - memory limit for child process ($MEM_LIMIT MB)
  -t msec       - run time limit for child process (none)
  -O            - use binary-only instrumentation (FRIDA mode)
  -Q            - use binary-only instrumentation (QEMU mode)
  -U            - use unicorn-based instrumentation (Unicorn mode)

Minimization settings:

  -C            - keep crashing inputs, reject everything else
  -e            - solve for edge coverage only, ignore hit counts

For additional tips, please consult README.md.

Environment variables used:
AFL_KEEP_TRACES: leave the temporary <out_dir>/.traces directory
AFL_NO_FORKSRV: run target via execve instead of using the forkserver
AFL_PATH: last resort location to find the afl-showmap binary
AFL_SKIP_BIN_CHECK: skip check for target binary
_EOF_
  exit 1
fi

# Do a sanity check to discourage the use of /tmp, since we can't really
# handle this safely from a shell script.

#if [ "$AFL_ALLOW_TMP" = "" ]; then
#
#  echo "$IN_DIR" | grep -qE '^(/var)?/tmp/'
#  T1="$?"
#
#  echo "$TARGET_BIN" | grep -qE '^(/var)?/tmp/'
#  T2="$?"
#
#  echo "$OUT_DIR" | grep -qE '^(/var)?/tmp/'
#  T3="$?"
#
#  echo "$STDIN_FILE" | grep -qE '^(/var)?/tmp/'
#  T4="$?"
#
#  echo "$PWD" | grep -qE '^(/var)?/tmp/'
#  T5="$?"
#
#  if [ "$T1" = "0" -o "$T2" = "0" -o "$T3" = "0" -o "$T4" = "0" -o "$T5" = "0" ]; then
#    echo "[-] Error: do not use this script in /tmp or /var/tmp." 1>&2
#    exit 1
#  fi
#
#fi

# If @@ is specified, but there's no -f, let's come up with a temporary input
# file name.

TRACE_DIR="$OUT_DIR/.traces"

if [ -z "$STDIN_FILE" ]; then

  # Scan the remaining arguments for the @@ placeholder without forking grep.
  case "$*" in
    *@@*) STDIN_FILE="$TRACE_DIR/.cur_input" ;;
  esac

fi

# Check for obvious errors.

# Refuse limits so low that every run would fail or time out.
if [ "$MEM_LIMIT" != "none" ] && [ "$MEM_LIMIT" -lt "5" ]; then
  echo "[-] Error: dangerously low memory limit." 1>&2
  exit 1
fi

if [ "$TIMEOUT" != "none" ] && [ "$TIMEOUT" -lt "10" ]; then
  echo "[-] Error: dangerously low timeout." 1>&2
  exit 1
fi

# Resolve TARGET_BIN to an executable file, searching $PATH when the given
# path is not directly usable.
if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then

  # 'type -P' is a bash builtin that searches PATH for executable *files*
  # only — same behavior as the deprecated external 'which', minus the fork.
  TNEW="$(type -P "$TARGET_BIN")"

  if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then
    echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
    exit 1
  fi

  TARGET_BIN="$TNEW"

fi

# Unless the check is skipped or a binary-only mode is in use, require the
# afl instrumentation marker to be present in the target.
if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" -a "$FRIDA_MODE" = "" -a "$UNICORN_MODE" = "" ]; then

  if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then
    echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
    exit 1
  fi

fi

# Validate the input directory and prepare the output/trace directories.
if [ ! -d "$IN_DIR" ]; then
  echo "[-] Error: directory '$IN_DIR' not found." 1>&2
  exit 1
fi

# If pointed at an afl-fuzz output directory, descend into the actual queue.
test -d "$IN_DIR/default" && IN_DIR="$IN_DIR/default"
test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue"

# Remove leftovers from a previous run. Note: -maxdepth is a global option
# and must precede tests such as -name, otherwise GNU find warns.
find "$OUT_DIR" -maxdepth 1 -name 'id[:_]*' -exec rm -- {} \; 2>/dev/null
rm -rf "$TRACE_DIR" 2>/dev/null

rmdir "$OUT_DIR" 2>/dev/null

# If rmdir failed, the directory still has unrelated content - refuse to run.
if [ -d "$OUT_DIR" ]; then
  echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
  exit 1
fi

# Private (0700) scratch space for the per-input trace files.
mkdir -m 700 -p "$TRACE_DIR" || exit 1

if [ ! "$STDIN_FILE" = "" ]; then
  rm -f "$STDIN_FILE" || exit 1
  touch "$STDIN_FILE" || exit 1
fi

# Locate afl-showmap. Preference order:
#   1) whatever is in $PATH
#   2) the directory this script lives in
#   3) ./afl-showmap in the current working directory
#   4) $AFL_PATH — documented above as the *last resort* location
SHOWMAP=`command -v afl-showmap 2>/dev/null`

if [ -z "$SHOWMAP" ]; then
  TMP="${0%/afl-cmin.bash}/afl-showmap"
  if [ -x "$TMP" ]; then
    SHOWMAP=$TMP
  fi
fi

if [ -z "$SHOWMAP" -a -x "./afl-showmap" ]; then
  SHOWMAP="./afl-showmap"
fi

# Bug fix: only consult $AFL_PATH when nothing usable was found yet.
# Previously a set AFL_PATH clobbered a binary already located above,
# contradicting its documented "last resort" role.
if [ -z "$SHOWMAP" -a -n "$AFL_PATH" ]; then
  SHOWMAP="$AFL_PATH/afl-showmap"
fi

if [ ! -x "$SHOWMAP" ]; then
  echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi

# Count the input files. NOTE(review): ls-based, so filenames containing
# newlines would throw the count off — consistent with the rest of the script.
IN_COUNT=$((`ls -- "$IN_DIR" 2>/dev/null | wc -l`))

if [ "$IN_COUNT" = "0" ]; then
  echo "[+] Hmm, no inputs in the target directory. Nothing to be done."
  rm -rf "$TRACE_DIR"
  exit 1
fi

# Grab one sample file for the sanity checks below.
FIRST_FILE=`ls "$IN_DIR" | head -1`

# Make sure that we're not dealing with a directory.

if [ -d "$IN_DIR/$FIRST_FILE" ]; then
  echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi

# Check for the more efficient way to copy files...

# Prefer hard links over copies when the output filesystem allows it.
if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
  CP_TOOL=ln
else
  CP_TOOL=cp
fi

# Make sure that we can actually get anything out of afl-showmap before we
# waste too much time.

echo "[*] Testing the target binary..."

if [ "$STDIN_FILE" = "" ]; then

  # Target reads from stdin: feed the first input file directly.
  # AFL_CMIN_ALLOW_ANY=1 relaxes afl-showmap's result filtering for this
  # smoke test; NOTE(review): -Z appears to select cmin-compatible output —
  # confirm against afl-showmap's help.
  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE"

else

  # Target reads from a file (-f or @@): stage the input there, close stdin,
  # and tell afl-showmap about the location via -H.
  cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null

fi

# Each non-empty line of the trace is one tuple; zero means no coverage data.
FIRST_COUNT=$((`grep -c . "$TRACE_DIR/.run_test"`))

if [ "$FIRST_COUNT" -gt "0" ]; then

  echo "[+] OK, $FIRST_COUNT tuples recorded."

else

  echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1

fi

# Let's roll!

#############################
# STEP 1: COLLECTING TRACES #
#############################

echo "[*] Obtaining traces for input files in '$IN_DIR'..."

(

  CUR=0

  if [ "$STDIN_FILE" = "" ]; then

    # One afl-showmap run per input; trace goes to $TRACE_DIR/<name>.
    # The 'ls | while' pipeline runs the loop body in a subshell, so CUR
    # updates are lost afterwards — it is only a progress counter.
    ls "$IN_DIR" | while read -r fn; do

      CUR=$((CUR+1))
      printf "\\r    Processing file $CUR/$IN_COUNT... "

      "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"

    done

  else

    # Same as above, but stage each input at $STDIN_FILE for targets that
    # read from a file rather than stdin.
    ls "$IN_DIR" | while read -r fn; do

      CUR=$((CUR+1))
      printf "\\r    Processing file $CUR/$IN_COUNT... "

      cp "$IN_DIR/$fn" "$STDIN_FILE"

      "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null

    done


  fi

)

372echo
373
374##########################
375# STEP 2: SORTING TUPLES #
376##########################
377
378# With this out of the way, we sort all tuples by popularity across all
379# datasets. The reasoning here is that we won't be able to avoid the files
380# that trigger unique tuples anyway, so we will want to start with them and
381# see what's left.
382
383echo "[*] Sorting trace sets (this may take a while)..."
384
385ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \
386  sort | uniq -c | sort -k 1,1 -n >"$TRACE_DIR/.all_uniq"
387
388TUPLE_COUNT=$((`grep -c . "$TRACE_DIR/.all_uniq"`))
389
390echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files."
391
#####################################
# STEP 3: SELECTING CANDIDATE FILES #
#####################################

# The next step is to find the best candidate for each tuple. The "best"
# part is understood simply as the smallest input that includes a particular
# tuple in its trace. Empirical evidence suggests that this produces smaller
# datasets than more involved algorithms that could be still pulled off in
# a shell script.

echo "[*] Finding best candidates for each tuple..."

CUR=0

# Walk the inputs from smallest to largest (ls -rS reverses the default
# largest-first size sort), so the first occurrence of a tuple in the stable
# sort of step 4 belongs to the smallest file that contains it.
ls -rS "$IN_DIR" | while read -r fn; do

  CUR=$((CUR+1))
  printf "\\r    Processing file $CUR/$IN_COUNT... "

  # Append " <filename>" to every tuple line of this file's trace.
  # NOTE(review): a '#' or '&' in $fn would confuse this sed expression.
  sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"

done

echo

##############################
# STEP 4: LOADING CANDIDATES #
##############################

# At this point, we have a file of tuple-file pairs, sorted by file size
# in ascending order (as a consequence of ls -rS). By doing sort keyed
# only by tuple (-k 1,1) and configured to output only the first line for
# every key (-s -u), we end up with the smallest file for each tuple.

echo "[*] Sorting candidate list (be patient)..."

# Rewrite each "tuple fname" pair into a shell assignment of the form
#   BEST_FILE[tuple]="fname"
sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \
  sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script"

# An empty candidate script means no trace produced any tuples at all.
if [ ! -s "$TRACE_DIR/.candidate_script" ]; then
  echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1
fi

# The sed command converted the sorted list to a shell script that populates
# BEST_FILE[tuple]="fname". Let's load that!

. "$TRACE_DIR/.candidate_script"

##########################
# STEP 5: WRITING OUTPUT #
##########################

# The final trick is to grab the top pick for each tuple, unless said tuple is
# already set due to the inclusion of an earlier candidate; and then put all
# tuples associated with the newly-added file to the "already have" list. The
# loop works from least popular tuples and toward the most common ones.

echo "[*] Processing candidates and writing output files..."

CUR=0

touch "$TRACE_DIR/.already_have"

# .all_uniq lines look like "<count> <tuple>", rarest tuples first.
while read -r cnt tuple; do

  CUR=$((CUR+1))
  printf "\\r    Processing tuple $CUR/$TUPLE_COUNT with count $cnt... "

  # If we already have this tuple, skip it.
  # NOTE(review): $tuple is interpolated into the regex; assumes tuple ids
  # contain no regex metacharacters beyond what afl-showmap emits.

  grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue

  # Bash evaluates the subscript, so this is a lookup of BEST_FILE[$tuple].
  FN=${BEST_FILE[tuple]}

#  echo "tuple nr $CUR ($tuple cnt=$cnt) -> $FN" >> "$TRACE_DIR/.log"
  $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN"

  # Merge this file's tuples into .already_have. A full sort -u is done only
  # every fifth iteration; otherwise the trace is simply appended.
  if [ "$((CUR % 5))" = "0" ]; then
    sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
    mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
  else
    cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
  fi

done <"$TRACE_DIR/.all_uniq"

echo

OUT_COUNT=`ls -- "$OUT_DIR" | wc -l`

if [ "$OUT_COUNT" = "1" ]; then
  echo "[!] WARNING: All test cases had the same traces, check syntax!"
fi

echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
echo

# Scratch traces are kept only when AFL_KEEP_TRACES is set.
test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"

exit 0
