/* A fuzz test for CPython.

  The only exposed function is LLVMFuzzerTestOneInput, which is called by
  fuzzers and by the _fuzz module for smoke tests.

  To build exactly one fuzz test, as when running in oss-fuzz etc.,
  build with -D _Py_FUZZ_ONE and -D _Py_FUZZ_<test_name>. e.g. to build
  LLVMFuzzerTestOneInput to only run "fuzz_builtin_float", build this file with
      -D _Py_FUZZ_ONE -D _Py_FUZZ_fuzz_builtin_float.

  See the source code for LLVMFuzzerTestOneInput for details. */

#include <Python.h>
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>

17 /*  Fuzz PyFloat_FromString as a proxy for float(str). */
fuzz_builtin_float(const char * data,size_t size)18 static int fuzz_builtin_float(const char* data, size_t size) {
19     PyObject* s = PyBytes_FromStringAndSize(data, size);
20     if (s == NULL) return 0;
21     PyObject* f = PyFloat_FromString(s);
22     if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_ValueError)) {
23         PyErr_Clear();
24     }
25 
26     Py_XDECREF(f);
27     Py_DECREF(s);
28     return 0;
29 }
30 
31 #define MAX_INT_TEST_SIZE 0x10000
32 
33 /* Fuzz PyLong_FromUnicodeObject as a proxy for int(str). */
fuzz_builtin_int(const char * data,size_t size)34 static int fuzz_builtin_int(const char* data, size_t size) {
35     /* Ignore test cases with very long ints to avoid timeouts
36        int("9" * 1000000) is not a very interesting test caase */
37     if (size > MAX_INT_TEST_SIZE) {
38         return 0;
39     }
40     /* Pick a random valid base. (When the fuzzed function takes extra
41        parameters, it's somewhat normal to hash the input to generate those
42        parameters. We want to exercise all code paths, so we do so here.) */
43     int base = _Py_HashBytes(data, size) % 37;
44     if (base == 1) {
45         // 1 is the only number between 0 and 36 that is not a valid base.
46         base = 0;
47     }
48     if (base == -1) {
49         return 0;  // An error occurred, bail early.
50     }
51     if (base < 0) {
52         base = -base;
53     }
54 
55     PyObject* s = PyUnicode_FromStringAndSize(data, size);
56     if (s == NULL) {
57         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
58             PyErr_Clear();
59         }
60         return 0;
61     }
62     PyObject* l = PyLong_FromUnicodeObject(s, base);
63     if (l == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
64         PyErr_Clear();
65     }
66     PyErr_Clear();
67     Py_XDECREF(l);
68     Py_DECREF(s);
69     return 0;
70 }
71 
72 /* Fuzz PyUnicode_FromStringAndSize as a proxy for unicode(str). */
fuzz_builtin_unicode(const char * data,size_t size)73 static int fuzz_builtin_unicode(const char* data, size_t size) {
74     PyObject* s = PyUnicode_FromStringAndSize(data, size);
75     if (s == NULL && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
76         PyErr_Clear();
77     }
78     Py_XDECREF(s);
79     return 0;
80 }
81 
82 
83 PyObject* struct_unpack_method = NULL;
84 PyObject* struct_error = NULL;
85 /* Called by LLVMFuzzerTestOneInput for initialization */
init_struct_unpack()86 static int init_struct_unpack() {
87     /* Import struct.unpack */
88     PyObject* struct_module = PyImport_ImportModule("struct");
89     if (struct_module == NULL) {
90         return 0;
91     }
92     struct_error = PyObject_GetAttrString(struct_module, "error");
93     if (struct_error == NULL) {
94         return 0;
95     }
96     struct_unpack_method = PyObject_GetAttrString(struct_module, "unpack");
97     return struct_unpack_method != NULL;
98 }
99 /* Fuzz struct.unpack(x, y) */
fuzz_struct_unpack(const char * data,size_t size)100 static int fuzz_struct_unpack(const char* data, size_t size) {
101     /* Everything up to the first null byte is considered the
102        format. Everything after is the buffer */
103     const char* first_null = memchr(data, '\0', size);
104     if (first_null == NULL) {
105         return 0;
106     }
107 
108     size_t format_length = first_null - data;
109     size_t buffer_length = size - format_length - 1;
110 
111     PyObject* pattern = PyBytes_FromStringAndSize(data, format_length);
112     if (pattern == NULL) {
113         return 0;
114     }
115     PyObject* buffer = PyBytes_FromStringAndSize(first_null + 1, buffer_length);
116     if (buffer == NULL) {
117         Py_DECREF(pattern);
118         return 0;
119     }
120 
121     PyObject* unpacked = PyObject_CallFunctionObjArgs(
122         struct_unpack_method, pattern, buffer, NULL);
123     /* Ignore any overflow errors, these are easily triggered accidentally */
124     if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_OverflowError)) {
125         PyErr_Clear();
126     }
127     /* The pascal format string will throw a negative size when passing 0
128        like: struct.unpack('0p', b'') */
129     if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_SystemError)) {
130         PyErr_Clear();
131     }
132     /* Ignore any struct.error exceptions, these can be caused by invalid
133        formats or incomplete buffers both of which are common. */
134     if (unpacked == NULL && PyErr_ExceptionMatches(struct_error)) {
135         PyErr_Clear();
136     }
137 
138     Py_XDECREF(unpacked);
139     Py_DECREF(pattern);
140     Py_DECREF(buffer);
141     return 0;
142 }
143 
144 
145 #define MAX_JSON_TEST_SIZE 0x10000
146 
147 PyObject* json_loads_method = NULL;
148 /* Called by LLVMFuzzerTestOneInput for initialization */
init_json_loads()149 static int init_json_loads() {
150     /* Import json.loads */
151     PyObject* json_module = PyImport_ImportModule("json");
152     if (json_module == NULL) {
153         return 0;
154     }
155     json_loads_method = PyObject_GetAttrString(json_module, "loads");
156     return json_loads_method != NULL;
157 }
158 /* Fuzz json.loads(x) */
fuzz_json_loads(const char * data,size_t size)159 static int fuzz_json_loads(const char* data, size_t size) {
160     /* Since python supports arbitrarily large ints in JSON,
161        long inputs can lead to timeouts on boring inputs like
162        `json.loads("9" * 100000)` */
163     if (size > MAX_JSON_TEST_SIZE) {
164         return 0;
165     }
166     PyObject* input_bytes = PyBytes_FromStringAndSize(data, size);
167     if (input_bytes == NULL) {
168         return 0;
169     }
170     PyObject* parsed = PyObject_CallOneArg(json_loads_method, input_bytes);
171     if (parsed == NULL) {
172         /* Ignore ValueError as the fuzzer will more than likely
173            generate some invalid json and values */
174         if (PyErr_ExceptionMatches(PyExc_ValueError) ||
175         /* Ignore RecursionError as the fuzzer generates long sequences of
176            arrays such as `[[[...` */
177             PyErr_ExceptionMatches(PyExc_RecursionError) ||
178         /* Ignore unicode errors, invalid byte sequences are common */
179             PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)
180         ) {
181             PyErr_Clear();
182         }
183     }
184     Py_DECREF(input_bytes);
185     Py_XDECREF(parsed);
186     return 0;
187 }
188 
189 #define MAX_RE_TEST_SIZE 0x10000
190 
191 PyObject* sre_compile_method = NULL;
192 PyObject* sre_error_exception = NULL;
193 int SRE_FLAG_DEBUG = 0;
194 /* Called by LLVMFuzzerTestOneInput for initialization */
init_sre_compile()195 static int init_sre_compile() {
196     /* Import sre_compile.compile and sre.error */
197     PyObject* sre_compile_module = PyImport_ImportModule("sre_compile");
198     if (sre_compile_module == NULL) {
199         return 0;
200     }
201     sre_compile_method = PyObject_GetAttrString(sre_compile_module, "compile");
202     if (sre_compile_method == NULL) {
203         return 0;
204     }
205 
206     PyObject* sre_constants = PyImport_ImportModule("sre_constants");
207     if (sre_constants == NULL) {
208         return 0;
209     }
210     sre_error_exception = PyObject_GetAttrString(sre_constants, "error");
211     if (sre_error_exception == NULL) {
212         return 0;
213     }
214     PyObject* debug_flag = PyObject_GetAttrString(sre_constants, "SRE_FLAG_DEBUG");
215     if (debug_flag == NULL) {
216         return 0;
217     }
218     SRE_FLAG_DEBUG = PyLong_AsLong(debug_flag);
219     return 1;
220 }
221 /* Fuzz _sre.compile(x) */
fuzz_sre_compile(const char * data,size_t size)222 static int fuzz_sre_compile(const char* data, size_t size) {
223     /* Ignore really long regex patterns that will timeout the fuzzer */
224     if (size > MAX_RE_TEST_SIZE) {
225         return 0;
226     }
227     /* We treat the first 2 bytes of the input as a number for the flags */
228     if (size < 2) {
229         return 0;
230     }
231     uint16_t flags = ((uint16_t*) data)[0];
232     /* We remove the SRE_FLAG_DEBUG if present. This is because it
233        prints to stdout which greatly decreases fuzzing speed */
234     flags &= ~SRE_FLAG_DEBUG;
235 
236     /* Pull the pattern from the remaining bytes */
237     PyObject* pattern_bytes = PyBytes_FromStringAndSize(data + 2, size - 2);
238     if (pattern_bytes == NULL) {
239         return 0;
240     }
241     PyObject* flags_obj = PyLong_FromUnsignedLong(flags);
242     if (flags_obj == NULL) {
243         Py_DECREF(pattern_bytes);
244         return 0;
245     }
246 
247     /* compiled = _sre.compile(data[2:], data[0:2] */
248     PyObject* compiled = PyObject_CallFunctionObjArgs(
249         sre_compile_method, pattern_bytes, flags_obj, NULL);
250     /* Ignore ValueError as the fuzzer will more than likely
251        generate some invalid combination of flags */
252     if (compiled == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
253         PyErr_Clear();
254     }
255     /* Ignore some common errors thrown by sre_parse:
256        Overflow, Assertion, Recursion and Index */
257     if (compiled == NULL && (PyErr_ExceptionMatches(PyExc_OverflowError) ||
258                              PyErr_ExceptionMatches(PyExc_AssertionError) ||
259                              PyErr_ExceptionMatches(PyExc_RecursionError) ||
260                              PyErr_ExceptionMatches(PyExc_IndexError))
261     ) {
262         PyErr_Clear();
263     }
264     /* Ignore re.error */
265     if (compiled == NULL && PyErr_ExceptionMatches(sre_error_exception)) {
266         PyErr_Clear();
267     }
268 
269     Py_DECREF(pattern_bytes);
270     Py_DECREF(flags_obj);
271     Py_XDECREF(compiled);
272     return 0;
273 }
274 
275 /* Some random patterns used to test re.match.
276    Be careful not to add catostraphically slow regexes here, we want to
277    exercise the matching code without causing timeouts.*/
278 static const char* regex_patterns[] = {
279     ".", "^", "abc", "abc|def", "^xxx$", "\\b", "()", "[a-zA-Z0-9]",
280     "abc+", "[^A-Z]", "[x]", "(?=)", "a{z}", "a+b", "a*?", "a??", "a+?",
281     "{}", "a{,}", "{", "}", "^\\(*\\d{3}\\)*( |-)*\\d{3}( |-)*\\d{4}$",
282     "(?:a*)*", "a{1,2}?"
283 };
284 const size_t NUM_PATTERNS = sizeof(regex_patterns) / sizeof(regex_patterns[0]);
285 PyObject** compiled_patterns = NULL;
286 /* Called by LLVMFuzzerTestOneInput for initialization */
init_sre_match()287 static int init_sre_match() {
288     PyObject* re_module = PyImport_ImportModule("re");
289     if (re_module == NULL) {
290         return 0;
291     }
292     compiled_patterns = (PyObject**) PyMem_RawMalloc(
293         sizeof(PyObject*) * NUM_PATTERNS);
294     if (compiled_patterns == NULL) {
295         PyErr_NoMemory();
296         return 0;
297     }
298 
299     /* Precompile all the regex patterns on the first run for faster fuzzing */
300     for (size_t i = 0; i < NUM_PATTERNS; i++) {
301         PyObject* compiled = PyObject_CallMethod(
302             re_module, "compile", "y", regex_patterns[i]);
303         /* Bail if any of the patterns fail to compile */
304         if (compiled == NULL) {
305             return 0;
306         }
307         compiled_patterns[i] = compiled;
308     }
309     return 1;
310 }
311 /* Fuzz re.match(x) */
fuzz_sre_match(const char * data,size_t size)312 static int fuzz_sre_match(const char* data, size_t size) {
313     if (size < 1 || size > MAX_RE_TEST_SIZE) {
314         return 0;
315     }
316     /* Use the first byte as a uint8_t specifying the index of the
317        regex to use */
318     unsigned char idx = (unsigned char) data[0];
319     idx = idx % NUM_PATTERNS;
320 
321     /* Pull the string to match from the remaining bytes */
322     PyObject* to_match = PyBytes_FromStringAndSize(data + 1, size - 1);
323     if (to_match == NULL) {
324         return 0;
325     }
326 
327     PyObject* pattern = compiled_patterns[idx];
328     PyObject* match_callable = PyObject_GetAttrString(pattern, "match");
329 
330     PyObject* matches = PyObject_CallOneArg(match_callable, to_match);
331 
332     Py_XDECREF(matches);
333     Py_DECREF(match_callable);
334     Py_DECREF(to_match);
335     return 0;
336 }
337 
338 #define MAX_CSV_TEST_SIZE 0x10000
339 PyObject* csv_module = NULL;
340 PyObject* csv_error = NULL;
341 /* Called by LLVMFuzzerTestOneInput for initialization */
init_csv_reader()342 static int init_csv_reader() {
343     /* Import csv and csv.Error */
344     csv_module = PyImport_ImportModule("csv");
345     if (csv_module == NULL) {
346         return 0;
347     }
348     csv_error = PyObject_GetAttrString(csv_module, "Error");
349     return csv_error != NULL;
350 }
351 /* Fuzz csv.reader([x]) */
fuzz_csv_reader(const char * data,size_t size)352 static int fuzz_csv_reader(const char* data, size_t size) {
353     if (size < 1 || size > MAX_CSV_TEST_SIZE) {
354         return 0;
355     }
356     /* Ignore non null-terminated strings since _csv can't handle
357        embeded nulls */
358     if (memchr(data, '\0', size) == NULL) {
359         return 0;
360     }
361 
362     PyObject* s = PyUnicode_FromString(data);
363     /* Ignore exceptions until we have a valid string */
364     if (s == NULL) {
365         PyErr_Clear();
366         return 0;
367     }
368 
369     /* Split on \n so we can test multiple lines */
370     PyObject* lines = PyObject_CallMethod(s, "split", "s", "\n");
371     if (lines == NULL) {
372         Py_DECREF(s);
373         return 0;
374     }
375 
376     PyObject* reader = PyObject_CallMethod(csv_module, "reader", "N", lines);
377     if (reader) {
378         /* Consume all of the reader as an iterator */
379         PyObject* parsed_line;
380         while ((parsed_line = PyIter_Next(reader))) {
381             Py_DECREF(parsed_line);
382         }
383     }
384 
385     /* Ignore csv.Error because we're probably going to generate
386        some bad files (embeded new-lines, unterminated quotes etc) */
387     if (PyErr_ExceptionMatches(csv_error)) {
388         PyErr_Clear();
389     }
390 
391     Py_XDECREF(reader);
392     Py_DECREF(s);
393     return 0;
394 }
395 
/* Run fuzzer and abort on failure.

   Calls `fuzzer` with the input reinterpreted as `const char*`. If a Python
   exception is still pending afterwards it is printed and the process
   aborts; otherwise the fuzzer's return value is passed through. */
static int _run_fuzz(const uint8_t *data, size_t size, int(*fuzzer)(const char* , size_t)) {
    const int status = fuzzer((const char*) data, size);
    if (!PyErr_Occurred()) {
        /* Someday the return value might mean something, propagate it. */
        return status;
    }
    /* Fuzz tests should handle expected errors for themselves.
       This is last-ditch check in case they didn't. */
    PyErr_Print();
    abort();
}

/* CPython generates a lot of leak warnings for whatever reason.
   Always returns 1. */
int __lsan_is_turned_off(void)
{
    return 1;
}


/* Decodes the first command-line argument with Py_DecodeLocale and passes
   it to Py_SetProgramName. `argc` is not used. Always returns 0. */
int LLVMFuzzerInitialize(int *argc, char ***argv) {
    char* program_path = (*argv)[0];
    wchar_t* decoded_name = Py_DecodeLocale(program_path, NULL);
    Py_SetProgramName(decoded_name);
    return 0;
}

419 /* Fuzz test interface.
420    This returns the bitwise or of all fuzz test's return values.
421 
422    All fuzz tests must return 0, as all nonzero return codes are reserved for
423    future use -- we propagate the return values for that future case.
424    (And we bitwise or when running multiple tests to verify that normally we
425    only return 0.) */
LLVMFuzzerTestOneInput(const uint8_t * data,size_t size)426 int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
427     if (!Py_IsInitialized()) {
428         /* LLVMFuzzerTestOneInput is called repeatedly from the same process,
429            with no separate initialization phase, sadly, so we need to
430            initialize CPython ourselves on the first run. */
431         Py_InitializeEx(0);
432     }
433 
434     int rv = 0;
435 
436 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_float)
437     rv |= _run_fuzz(data, size, fuzz_builtin_float);
438 #endif
439 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_int)
440     rv |= _run_fuzz(data, size, fuzz_builtin_int);
441 #endif
442 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_unicode)
443     rv |= _run_fuzz(data, size, fuzz_builtin_unicode);
444 #endif
445 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_struct_unpack)
446     static int STRUCT_UNPACK_INITIALIZED = 0;
447     if (!STRUCT_UNPACK_INITIALIZED && !init_struct_unpack()) {
448         PyErr_Print();
449         abort();
450     } else {
451         STRUCT_UNPACK_INITIALIZED = 1;
452     }
453     rv |= _run_fuzz(data, size, fuzz_struct_unpack);
454 #endif
455 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads)
456     static int JSON_LOADS_INITIALIZED = 0;
457     if (!JSON_LOADS_INITIALIZED && !init_json_loads()) {
458         PyErr_Print();
459         abort();
460     } else {
461         JSON_LOADS_INITIALIZED = 1;
462     }
463 
464     rv |= _run_fuzz(data, size, fuzz_json_loads);
465 #endif
466 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_compile)
467     static int SRE_COMPILE_INITIALIZED = 0;
468     if (!SRE_COMPILE_INITIALIZED && !init_sre_compile()) {
469         PyErr_Print();
470         abort();
471     } else {
472         SRE_COMPILE_INITIALIZED = 1;
473     }
474 
475     rv |= _run_fuzz(data, size, fuzz_sre_compile);
476 #endif
477 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_match)
478     static int SRE_MATCH_INITIALIZED = 0;
479     if (!SRE_MATCH_INITIALIZED && !init_sre_match()) {
480         PyErr_Print();
481         abort();
482     } else {
483         SRE_MATCH_INITIALIZED = 1;
484     }
485 
486     rv |= _run_fuzz(data, size, fuzz_sre_match);
487 #endif
488 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_csv_reader)
489     static int CSV_READER_INITIALIZED = 0;
490     if (!CSV_READER_INITIALIZED && !init_csv_reader()) {
491         PyErr_Print();
492         abort();
493     } else {
494         CSV_READER_INITIALIZED = 1;
495     }
496 
497     rv |= _run_fuzz(data, size, fuzz_csv_reader);
498 #endif
499   return rv;
500 }
501