• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include "Python.h"
2 #include "structmember.h"
/* Provide Py_TYPE() when building against a Python older than 2.6 that
   does not already define it. */
#if PY_VERSION_HEX < 0x02060000 && !defined(Py_TYPE)
#  define Py_TYPE(ob)     (((PyObject*)(ob))->ob_type)
#endif

/* For a Python older than 2.5 without PY_SSIZE_T_MIN, map Py_ssize_t and
   its helpers onto plain int / long equivalents. */
#if PY_VERSION_HEX < 0x02050000 && !defined(PY_SSIZE_T_MIN)
typedef int Py_ssize_t;
#  define PY_SSIZE_T_MAX INT_MAX
#  define PY_SSIZE_T_MIN INT_MIN
#  define PyInt_FromSsize_t PyInt_FromLong
#  define PyInt_AsSsize_t PyInt_AsLong
#endif

/* Fallback: a value is finite when it is neither infinite nor NaN. */
#if !defined(Py_IS_FINITE)
#  define Py_IS_FINITE(X) (!Py_IS_INFINITY(X) && !Py_IS_NAN(X))
#endif

/* UNUSED expands to GCC's "unused" attribute, and to nothing elsewhere. */
#if defined(__GNUC__)
#  define UNUSED __attribute__((__unused__))
#else
#  define UNUSED
#endif
22 
23 #define DEFAULT_ENCODING "utf-8"
24 
25 #define PyScanner_Check(op) PyObject_TypeCheck(op, &PyScannerType)
26 #define PyScanner_CheckExact(op) (Py_TYPE(op) == &PyScannerType)
27 #define PyEncoder_Check(op) PyObject_TypeCheck(op, &PyEncoderType)
28 #define PyEncoder_CheckExact(op) (Py_TYPE(op) == &PyEncoderType)
29 
30 static PyTypeObject PyScannerType;
31 static PyTypeObject PyEncoderType;
32 
33 typedef struct _PyScannerObject {
34     PyObject_HEAD
35     PyObject *encoding;
36     PyObject *strict;
37     PyObject *object_hook;
38     PyObject *pairs_hook;
39     PyObject *parse_float;
40     PyObject *parse_int;
41     PyObject *parse_constant;
42 } PyScannerObject;
43 
44 static PyMemberDef scanner_members[] = {
45     {"encoding", T_OBJECT, offsetof(PyScannerObject, encoding), READONLY, "encoding"},
46     {"strict", T_OBJECT, offsetof(PyScannerObject, strict), READONLY, "strict"},
47     {"object_hook", T_OBJECT, offsetof(PyScannerObject, object_hook), READONLY, "object_hook"},
48     {"object_pairs_hook", T_OBJECT, offsetof(PyScannerObject, pairs_hook), READONLY, "object_pairs_hook"},
49     {"parse_float", T_OBJECT, offsetof(PyScannerObject, parse_float), READONLY, "parse_float"},
50     {"parse_int", T_OBJECT, offsetof(PyScannerObject, parse_int), READONLY, "parse_int"},
51     {"parse_constant", T_OBJECT, offsetof(PyScannerObject, parse_constant), READONLY, "parse_constant"},
52     {NULL}
53 };
54 
55 typedef struct _PyEncoderObject {
56     PyObject_HEAD
57     PyObject *markers;
58     PyObject *defaultfn;
59     PyObject *encoder;
60     PyObject *indent;
61     PyObject *key_separator;
62     PyObject *item_separator;
63     PyObject *sort_keys;
64     PyObject *skipkeys;
65     int fast_encode;
66     int allow_nan;
67 } PyEncoderObject;
68 
69 static PyMemberDef encoder_members[] = {
70     {"markers", T_OBJECT, offsetof(PyEncoderObject, markers), READONLY, "markers"},
71     {"default", T_OBJECT, offsetof(PyEncoderObject, defaultfn), READONLY, "default"},
72     {"encoder", T_OBJECT, offsetof(PyEncoderObject, encoder), READONLY, "encoder"},
73     {"indent", T_OBJECT, offsetof(PyEncoderObject, indent), READONLY, "indent"},
74     {"key_separator", T_OBJECT, offsetof(PyEncoderObject, key_separator), READONLY, "key_separator"},
75     {"item_separator", T_OBJECT, offsetof(PyEncoderObject, item_separator), READONLY, "item_separator"},
76     {"sort_keys", T_OBJECT, offsetof(PyEncoderObject, sort_keys), READONLY, "sort_keys"},
77     {"skipkeys", T_OBJECT, offsetof(PyEncoderObject, skipkeys), READONLY, "skipkeys"},
78     {NULL}
79 };
80 
81 static Py_ssize_t
82 ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars);
83 static PyObject *
84 ascii_escape_unicode(PyObject *pystr);
85 static PyObject *
86 ascii_escape_str(PyObject *pystr);
87 static PyObject *
88 py_encode_basestring_ascii(PyObject* self UNUSED, PyObject *pystr);
89 void init_json(void);
90 static PyObject *
91 scan_once_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr);
92 static PyObject *
93 scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr);
94 static PyObject *
95 _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx);
96 static PyObject *
97 scanner_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
98 static int
99 scanner_init(PyObject *self, PyObject *args, PyObject *kwds);
100 static void
101 scanner_dealloc(PyObject *self);
102 static int
103 scanner_clear(PyObject *self);
104 static PyObject *
105 encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
106 static int
107 encoder_init(PyObject *self, PyObject *args, PyObject *kwds);
108 static void
109 encoder_dealloc(PyObject *self);
110 static int
111 encoder_clear(PyObject *self);
112 static int
113 encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level);
114 static int
115 encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level);
116 static int
117 encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level);
118 static PyObject *
119 _encoded_const(PyObject *obj);
120 static void
121 raise_errmsg(char *msg, PyObject *s, Py_ssize_t end);
122 static PyObject *
123 encoder_encode_string(PyEncoderObject *s, PyObject *obj);
124 static int
125 _convertPyInt_AsSsize_t(PyObject *o, Py_ssize_t *size_ptr);
126 static PyObject *
127 _convertPyInt_FromSsize_t(Py_ssize_t *size_ptr);
128 static PyObject *
129 encoder_encode_float(PyEncoderObject *s, PyObject *obj);
130 
/* S_CHAR(c): true when c is printable ASCII (' ' through '~') other than
   backslash and double quote, i.e. a character the escaping routines copy
   to the output unchanged.  The argument is parenthesised at every use so
   that expressions containing lower-precedence operators expand correctly.
   NOTE: c is evaluated several times; do not pass an expression with side
   effects. */
#define S_CHAR(c) ((c) >= ' ' && (c) <= '~' && (c) != '\\' && (c) != '"')

/* IS_WHITESPACE(c): true for space, tab, line feed and carriage return. */
#define IS_WHITESPACE(c) (((c) == ' ') || ((c) == '\t') || ((c) == '\n') || ((c) == '\r'))

/* MIN_EXPANSION is the length of one "\uXXXX" escape.  On wide-unicode
   builds a single input character may need an escaped surrogate pair
   ("\uXXXX\uXXXX"), so MAX_EXPANSION is twice that. */
#define MIN_EXPANSION 6
#ifdef Py_UNICODE_WIDE
#define MAX_EXPANSION (2 * MIN_EXPANSION)
#else
#define MAX_EXPANSION MIN_EXPANSION
#endif
140 
141 static int
_convertPyInt_AsSsize_t(PyObject * o,Py_ssize_t * size_ptr)142 _convertPyInt_AsSsize_t(PyObject *o, Py_ssize_t *size_ptr)
143 {
144     /* PyObject to Py_ssize_t converter */
145     *size_ptr = PyInt_AsSsize_t(o);
146     if (*size_ptr == -1 && PyErr_Occurred())
147         return 0;
148     return 1;
149 }
150 
151 static PyObject *
_convertPyInt_FromSsize_t(Py_ssize_t * size_ptr)152 _convertPyInt_FromSsize_t(Py_ssize_t *size_ptr)
153 {
154     /* Py_ssize_t to PyObject converter */
155     return PyInt_FromSsize_t(*size_ptr);
156 }
157 
158 static Py_ssize_t
ascii_escape_char(Py_UNICODE c,char * output,Py_ssize_t chars)159 ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars)
160 {
161     /* Escape unicode code point c to ASCII escape sequences
162     in char *output. output must have at least 12 bytes unused to
163     accommodate an escaped surrogate pair "\uXXXX\uXXXX" */
164     output[chars++] = '\\';
165     switch (c) {
166         case '\\': output[chars++] = (char)c; break;
167         case '"': output[chars++] = (char)c; break;
168         case '\b': output[chars++] = 'b'; break;
169         case '\f': output[chars++] = 'f'; break;
170         case '\n': output[chars++] = 'n'; break;
171         case '\r': output[chars++] = 'r'; break;
172         case '\t': output[chars++] = 't'; break;
173         default:
174 #ifdef Py_UNICODE_WIDE
175             if (c >= 0x10000) {
176                 /* UTF-16 surrogate pair */
177                 Py_UNICODE v = c - 0x10000;
178                 c = 0xd800 | ((v >> 10) & 0x3ff);
179                 output[chars++] = 'u';
180                 output[chars++] = "0123456789abcdef"[(c >> 12) & 0xf];
181                 output[chars++] = "0123456789abcdef"[(c >>  8) & 0xf];
182                 output[chars++] = "0123456789abcdef"[(c >>  4) & 0xf];
183                 output[chars++] = "0123456789abcdef"[(c      ) & 0xf];
184                 c = 0xdc00 | (v & 0x3ff);
185                 output[chars++] = '\\';
186             }
187 #endif
188             output[chars++] = 'u';
189             output[chars++] = "0123456789abcdef"[(c >> 12) & 0xf];
190             output[chars++] = "0123456789abcdef"[(c >>  8) & 0xf];
191             output[chars++] = "0123456789abcdef"[(c >>  4) & 0xf];
192             output[chars++] = "0123456789abcdef"[(c      ) & 0xf];
193     }
194     return chars;
195 }
196 
197 static PyObject *
ascii_escape_unicode(PyObject * pystr)198 ascii_escape_unicode(PyObject *pystr)
199 {
200     /* Take a PyUnicode pystr and return a new ASCII-only escaped PyString */
201     Py_ssize_t i;
202     Py_ssize_t input_chars;
203     Py_ssize_t output_size;
204     Py_ssize_t max_output_size;
205     Py_ssize_t chars;
206     Py_ssize_t incr;
207     PyObject *rval;
208     char *output;
209     Py_UNICODE *input_unicode;
210 
211     input_chars = PyUnicode_GET_SIZE(pystr);
212     input_unicode = PyUnicode_AS_UNICODE(pystr);
213 
214     output_size = input_chars;
215     incr = 2; /* for quotes */
216     /* One char input can be up to 6 chars output, estimate 4 of these */
217     incr += MIN_EXPANSION * 4;
218     if (PY_SSIZE_T_MAX - incr < output_size) {
219         PyErr_NoMemory();
220         return NULL;
221     }
222     output_size += incr;
223     if (PY_SSIZE_T_MAX / MAX_EXPANSION < input_chars ||
224         PY_SSIZE_T_MAX - 2 < input_chars * MAX_EXPANSION)
225         max_output_size = PY_SSIZE_T_MAX;
226     else
227         max_output_size = 2 + (input_chars * MAX_EXPANSION);
228     rval = PyString_FromStringAndSize(NULL, output_size);
229     if (rval == NULL) {
230         return NULL;
231     }
232     output = PyString_AS_STRING(rval);
233     chars = 0;
234     output[chars++] = '"';
235     for (i = 0; i < input_chars; i++) {
236         Py_UNICODE c = input_unicode[i];
237         if (S_CHAR(c)) {
238             output[chars++] = (char)c;
239         }
240         else {
241             chars = ascii_escape_char(c, output, chars);
242         }
243         if (output_size - chars < (1 + MAX_EXPANSION)) {
244             if (output_size == PY_SSIZE_T_MAX) {
245                 Py_DECREF(rval);
246                 PyErr_NoMemory();
247                 return NULL;
248             }
249             /* There's more than four, so let's resize by a lot */
250             if (PY_SSIZE_T_MAX / 2 >= output_size && output_size * 2 < max_output_size)
251                 output_size *= 2;
252             else
253                 output_size = max_output_size;
254             if (_PyString_Resize(&rval, output_size) == -1) {
255                 return NULL;
256             }
257             output = PyString_AS_STRING(rval);
258         }
259     }
260     output[chars++] = '"';
261     if (_PyString_Resize(&rval, chars) == -1) {
262         return NULL;
263     }
264     return rval;
265 }
266 
267 static PyObject *
ascii_escape_str(PyObject * pystr)268 ascii_escape_str(PyObject *pystr)
269 {
270     /* Take a PyString pystr and return a new ASCII-only escaped PyString */
271     Py_ssize_t i;
272     Py_ssize_t input_chars;
273     Py_ssize_t output_size;
274     Py_ssize_t max_output_size;
275     Py_ssize_t chars;
276     Py_ssize_t incr;
277     PyObject *rval;
278     char *output;
279     char *input_str;
280 
281     input_chars = PyString_GET_SIZE(pystr);
282     input_str = PyString_AS_STRING(pystr);
283 
284     /* Fast path for a string that's already ASCII */
285     for (i = 0; i < input_chars; i++) {
286         Py_UNICODE c = (Py_UNICODE)(unsigned char)input_str[i];
287         if (!S_CHAR(c)) {
288             /* If we have to escape something, scan the string for unicode */
289             Py_ssize_t j;
290             for (j = i; j < input_chars; j++) {
291                 c = (Py_UNICODE)(unsigned char)input_str[j];
292                 if (c > 0x7f) {
293                     /* We hit a non-ASCII character, bail to unicode mode */
294                     PyObject *uni;
295                     uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict");
296                     if (uni == NULL) {
297                         return NULL;
298                     }
299                     rval = ascii_escape_unicode(uni);
300                     Py_DECREF(uni);
301                     return rval;
302                 }
303             }
304             break;
305         }
306     }
307 
308     output_size = input_chars;
309     incr = 2; /* for quotes */
310     if (i != input_chars) {
311         /* One char input can be up to 6 chars output, estimate 4 of these */
312         incr += MIN_EXPANSION * 4;
313     }
314     if (PY_SSIZE_T_MAX - incr < output_size) {
315         PyErr_NoMemory();
316         return NULL;
317     }
318     output_size += incr;
319     if (PY_SSIZE_T_MAX / MIN_EXPANSION < input_chars ||
320         PY_SSIZE_T_MAX - 2 < input_chars * MIN_EXPANSION)
321         max_output_size = PY_SSIZE_T_MAX;
322     else
323         max_output_size = 2 + (input_chars * MIN_EXPANSION);
324     rval = PyString_FromStringAndSize(NULL, output_size);
325     if (rval == NULL) {
326         return NULL;
327     }
328     output = PyString_AS_STRING(rval);
329     output[0] = '"';
330 
331     /* We know that everything up to i is ASCII already */
332     chars = i + 1;
333     memcpy(&output[1], input_str, i);
334 
335     for (; i < input_chars; i++) {
336         Py_UNICODE c = (Py_UNICODE)(unsigned char)input_str[i];
337         if (S_CHAR(c)) {
338             output[chars++] = (char)c;
339         }
340         else {
341             chars = ascii_escape_char(c, output, chars);
342         }
343         /* An ASCII char can't possibly expand to a surrogate! */
344         if (output_size - chars < (1 + MIN_EXPANSION)) {
345             if (output_size == PY_SSIZE_T_MAX) {
346                 Py_DECREF(rval);
347                 PyErr_NoMemory();
348                 return NULL;
349             }
350             /* There's more than four, so let's resize by a lot */
351             if (PY_SSIZE_T_MAX / 2 >= output_size && output_size * 2 < max_output_size)
352                 output_size *= 2;
353             else
354                 output_size = max_output_size;
355             if (_PyString_Resize(&rval, output_size) == -1) {
356                 return NULL;
357             }
358             output = PyString_AS_STRING(rval);
359         }
360     }
361     output[chars++] = '"';
362     if (_PyString_Resize(&rval, chars) == -1) {
363         return NULL;
364     }
365     return rval;
366 }
367 
368 static void
raise_errmsg(char * msg,PyObject * s,Py_ssize_t end)369 raise_errmsg(char *msg, PyObject *s, Py_ssize_t end)
370 {
371     /* Use the Python function json.decoder.errmsg to raise a nice
372     looking ValueError exception */
373     static PyObject *errmsg_fn = NULL;
374     PyObject *pymsg;
375     if (errmsg_fn == NULL) {
376         PyObject *decoder = PyImport_ImportModule("json.decoder");
377         if (decoder == NULL)
378             return;
379         errmsg_fn = PyObject_GetAttrString(decoder, "errmsg");
380         Py_DECREF(decoder);
381         if (errmsg_fn == NULL)
382             return;
383     }
384     pymsg = PyObject_CallFunction(errmsg_fn, "(zOO&)", msg, s, _convertPyInt_FromSsize_t, &end);
385     if (pymsg) {
386         PyErr_SetObject(PyExc_ValueError, pymsg);
387         Py_DECREF(pymsg);
388     }
389 }
390 
391 static PyObject *
join_list_unicode(PyObject * lst)392 join_list_unicode(PyObject *lst)
393 {
394     /* return u''.join(lst) */
395     static PyObject *joinfn = NULL;
396     if (joinfn == NULL) {
397         PyObject *ustr = PyUnicode_FromUnicode(NULL, 0);
398         if (ustr == NULL)
399             return NULL;
400 
401         joinfn = PyObject_GetAttrString(ustr, "join");
402         Py_DECREF(ustr);
403         if (joinfn == NULL)
404             return NULL;
405     }
406     return PyObject_CallFunctionObjArgs(joinfn, lst, NULL);
407 }
408 
409 static PyObject *
_build_rval_index_tuple(PyObject * rval,Py_ssize_t idx)410 _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) {
411     /* return (rval, idx) tuple, stealing reference to rval */
412     PyObject *tpl;
413     PyObject *pyidx;
414     /*
415     steal a reference to rval, returns (rval, idx)
416     */
417     if (rval == NULL) {
418         return NULL;
419     }
420     pyidx = PyInt_FromSsize_t(idx);
421     if (pyidx == NULL) {
422         Py_DECREF(rval);
423         return NULL;
424     }
425     tpl = PyTuple_New(2);
426     if (tpl == NULL) {
427         Py_DECREF(pyidx);
428         Py_DECREF(rval);
429         return NULL;
430     }
431     PyTuple_SET_ITEM(tpl, 0, rval);
432     PyTuple_SET_ITEM(tpl, 1, pyidx);
433     return tpl;
434 }
435 
436 static PyObject *
scanstring_str(PyObject * pystr,Py_ssize_t end,char * encoding,int strict,Py_ssize_t * next_end_ptr)437 scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_ssize_t *next_end_ptr)
438 {
439     /* Read the JSON string from PyString pystr.
440     end is the index of the first character after the quote.
441     encoding is the encoding of pystr (must be an ASCII superset)
442     if strict is zero then literal control characters are allowed
443     *next_end_ptr is a return-by-reference index of the character
444         after the end quote
445 
446     Return value is a new PyString (if ASCII-only) or PyUnicode
447     */
448     PyObject *rval;
449     Py_ssize_t len = PyString_GET_SIZE(pystr);
450     Py_ssize_t begin = end - 1;
451     Py_ssize_t next;
452     char *buf = PyString_AS_STRING(pystr);
453     PyObject *chunks = PyList_New(0);
454     if (chunks == NULL) {
455         goto bail;
456     }
457     if (end < 0 || len <= end) {
458         PyErr_SetString(PyExc_ValueError, "end is out of bounds");
459         goto bail;
460     }
461     while (1) {
462         /* Find the end of the string or the next escape */
463         Py_UNICODE c = 0;
464         PyObject *chunk = NULL;
465         for (next = end; next < len; next++) {
466             c = (unsigned char)buf[next];
467             if (c == '"' || c == '\\') {
468                 break;
469             }
470             else if (strict && c <= 0x1f) {
471                 raise_errmsg("Invalid control character at", pystr, next);
472                 goto bail;
473             }
474         }
475         if (!(c == '"' || c == '\\')) {
476             raise_errmsg("Unterminated string starting at", pystr, begin);
477             goto bail;
478         }
479         /* Pick up this chunk if it's not zero length */
480         if (next != end) {
481             PyObject *strchunk = PyString_FromStringAndSize(&buf[end], next - end);
482             if (strchunk == NULL) {
483                 goto bail;
484             }
485             chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL);
486             Py_DECREF(strchunk);
487             if (chunk == NULL) {
488                 goto bail;
489             }
490             if (PyList_Append(chunks, chunk)) {
491                 Py_DECREF(chunk);
492                 goto bail;
493             }
494             Py_DECREF(chunk);
495         }
496         next++;
497         if (c == '"') {
498             end = next;
499             break;
500         }
501         if (next == len) {
502             raise_errmsg("Unterminated string starting at", pystr, begin);
503             goto bail;
504         }
505         c = buf[next];
506         if (c != 'u') {
507             /* Non-unicode backslash escapes */
508             end = next + 1;
509             switch (c) {
510                 case '"': break;
511                 case '\\': break;
512                 case '/': break;
513                 case 'b': c = '\b'; break;
514                 case 'f': c = '\f'; break;
515                 case 'n': c = '\n'; break;
516                 case 'r': c = '\r'; break;
517                 case 't': c = '\t'; break;
518                 default: c = 0;
519             }
520             if (c == 0) {
521                 raise_errmsg("Invalid \\escape", pystr, end - 2);
522                 goto bail;
523             }
524         }
525         else {
526             c = 0;
527             next++;
528             end = next + 4;
529             if (end >= len) {
530                 raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
531                 goto bail;
532             }
533             /* Decode 4 hex digits */
534             for (; next < end; next++) {
535                 Py_UNICODE digit = buf[next];
536                 c <<= 4;
537                 switch (digit) {
538                     case '0': case '1': case '2': case '3': case '4':
539                     case '5': case '6': case '7': case '8': case '9':
540                         c |= (digit - '0'); break;
541                     case 'a': case 'b': case 'c': case 'd': case 'e':
542                     case 'f':
543                         c |= (digit - 'a' + 10); break;
544                     case 'A': case 'B': case 'C': case 'D': case 'E':
545                     case 'F':
546                         c |= (digit - 'A' + 10); break;
547                     default:
548                         raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
549                         goto bail;
550                 }
551             }
552 #ifdef Py_UNICODE_WIDE
553             /* Surrogate pair */
554             if ((c & 0xfc00) == 0xd800 && end + 6 < len &&
555                 buf[next++] == '\\' &&
556                 buf[next++] == 'u') {
557                 Py_UNICODE c2 = 0;
558                 end += 6;
559                 /* Decode 4 hex digits */
560                 for (; next < end; next++) {
561                     Py_UNICODE digit = buf[next];
562                     c2 <<= 4;
563                     switch (digit) {
564                         case '0': case '1': case '2': case '3': case '4':
565                         case '5': case '6': case '7': case '8': case '9':
566                             c2 |= (digit - '0'); break;
567                         case 'a': case 'b': case 'c': case 'd': case 'e':
568                         case 'f':
569                             c2 |= (digit - 'a' + 10); break;
570                         case 'A': case 'B': case 'C': case 'D': case 'E':
571                         case 'F':
572                             c2 |= (digit - 'A' + 10); break;
573                         default:
574                             raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
575                             goto bail;
576                     }
577                 }
578                 if ((c2 & 0xfc00) == 0xdc00)
579                     c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
580                 else
581                     end -= 6;
582             }
583 #endif
584         }
585         chunk = PyUnicode_FromUnicode(&c, 1);
586         if (chunk == NULL) {
587             goto bail;
588         }
589         if (PyList_Append(chunks, chunk)) {
590             Py_DECREF(chunk);
591             goto bail;
592         }
593         Py_DECREF(chunk);
594     }
595 
596     rval = join_list_unicode(chunks);
597     if (rval == NULL) {
598         goto bail;
599     }
600     Py_CLEAR(chunks);
601     *next_end_ptr = end;
602     return rval;
603 bail:
604     *next_end_ptr = -1;
605     Py_XDECREF(chunks);
606     return NULL;
607 }
608 
609 
610 static PyObject *
scanstring_unicode(PyObject * pystr,Py_ssize_t end,int strict,Py_ssize_t * next_end_ptr)611 scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr)
612 {
613     /* Read the JSON string from PyUnicode pystr.
614     end is the index of the first character after the quote.
615     if strict is zero then literal control characters are allowed
616     *next_end_ptr is a return-by-reference index of the character
617         after the end quote
618 
619     Return value is a new PyUnicode
620     */
621     PyObject *rval;
622     Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
623     Py_ssize_t begin = end - 1;
624     Py_ssize_t next;
625     const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
626     PyObject *chunks = PyList_New(0);
627     if (chunks == NULL) {
628         goto bail;
629     }
630     if (end < 0 || len <= end) {
631         PyErr_SetString(PyExc_ValueError, "end is out of bounds");
632         goto bail;
633     }
634     while (1) {
635         /* Find the end of the string or the next escape */
636         Py_UNICODE c = 0;
637         PyObject *chunk = NULL;
638         for (next = end; next < len; next++) {
639             c = buf[next];
640             if (c == '"' || c == '\\') {
641                 break;
642             }
643             else if (strict && c <= 0x1f) {
644                 raise_errmsg("Invalid control character at", pystr, next);
645                 goto bail;
646             }
647         }
648         if (!(c == '"' || c == '\\')) {
649             raise_errmsg("Unterminated string starting at", pystr, begin);
650             goto bail;
651         }
652         /* Pick up this chunk if it's not zero length */
653         if (next != end) {
654             chunk = PyUnicode_FromUnicode(&buf[end], next - end);
655             if (chunk == NULL) {
656                 goto bail;
657             }
658             if (PyList_Append(chunks, chunk)) {
659                 Py_DECREF(chunk);
660                 goto bail;
661             }
662             Py_DECREF(chunk);
663         }
664         next++;
665         if (c == '"') {
666             end = next;
667             break;
668         }
669         if (next == len) {
670             raise_errmsg("Unterminated string starting at", pystr, begin);
671             goto bail;
672         }
673         c = buf[next];
674         if (c != 'u') {
675             /* Non-unicode backslash escapes */
676             end = next + 1;
677             switch (c) {
678                 case '"': break;
679                 case '\\': break;
680                 case '/': break;
681                 case 'b': c = '\b'; break;
682                 case 'f': c = '\f'; break;
683                 case 'n': c = '\n'; break;
684                 case 'r': c = '\r'; break;
685                 case 't': c = '\t'; break;
686                 default: c = 0;
687             }
688             if (c == 0) {
689                 raise_errmsg("Invalid \\escape", pystr, end - 2);
690                 goto bail;
691             }
692         }
693         else {
694             c = 0;
695             next++;
696             end = next + 4;
697             if (end >= len) {
698                 raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
699                 goto bail;
700             }
701             /* Decode 4 hex digits */
702             for (; next < end; next++) {
703                 Py_UNICODE digit = buf[next];
704                 c <<= 4;
705                 switch (digit) {
706                     case '0': case '1': case '2': case '3': case '4':
707                     case '5': case '6': case '7': case '8': case '9':
708                         c |= (digit - '0'); break;
709                     case 'a': case 'b': case 'c': case 'd': case 'e':
710                     case 'f':
711                         c |= (digit - 'a' + 10); break;
712                     case 'A': case 'B': case 'C': case 'D': case 'E':
713                     case 'F':
714                         c |= (digit - 'A' + 10); break;
715                     default:
716                         raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
717                         goto bail;
718                 }
719             }
720 #ifdef Py_UNICODE_WIDE
721             /* Surrogate pair */
722             if ((c & 0xfc00) == 0xd800 && end + 6 < len &&
723                 buf[next++] == '\\' && buf[next++] == 'u') {
724                 Py_UNICODE c2 = 0;
725                 end += 6;
726                 /* Decode 4 hex digits */
727                 for (; next < end; next++) {
728                     Py_UNICODE digit = buf[next];
729                     c2 <<= 4;
730                     switch (digit) {
731                         case '0': case '1': case '2': case '3': case '4':
732                         case '5': case '6': case '7': case '8': case '9':
733                             c2 |= (digit - '0'); break;
734                         case 'a': case 'b': case 'c': case 'd': case 'e':
735                         case 'f':
736                             c2 |= (digit - 'a' + 10); break;
737                         case 'A': case 'B': case 'C': case 'D': case 'E':
738                         case 'F':
739                             c2 |= (digit - 'A' + 10); break;
740                         default:
741                             raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
742                             goto bail;
743                     }
744                 }
745                 if ((c2 & 0xfc00) == 0xdc00)
746                     c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
747                 else
748                     end -= 6;
749             }
750 #endif
751         }
752         chunk = PyUnicode_FromUnicode(&c, 1);
753         if (chunk == NULL) {
754             goto bail;
755         }
756         if (PyList_Append(chunks, chunk)) {
757             Py_DECREF(chunk);
758             goto bail;
759         }
760         Py_DECREF(chunk);
761     }
762 
763     rval = join_list_unicode(chunks);
764     if (rval == NULL) {
765         goto bail;
766     }
767     Py_DECREF(chunks);
768     *next_end_ptr = end;
769     return rval;
770 bail:
771     *next_end_ptr = -1;
772     Py_XDECREF(chunks);
773     return NULL;
774 }
775 
776 PyDoc_STRVAR(pydoc_scanstring,
777     "scanstring(basestring, end, encoding, strict=True) -> (str, end)\n"
778     "\n"
779     "Scan the string s for a JSON string. End is the index of the\n"
780     "character in s after the quote that started the JSON string.\n"
781     "Unescapes all valid JSON string escape sequences and raises ValueError\n"
782     "on attempt to decode an invalid string. If strict is False then literal\n"
783     "control characters are allowed in the string.\n"
784     "\n"
785     "Returns a tuple of the decoded string and the index of the character in s\n"
786     "after the end quote."
787 );
788 
789 static PyObject *
py_scanstring(PyObject * self UNUSED,PyObject * args)790 py_scanstring(PyObject* self UNUSED, PyObject *args)
791 {
792     PyObject *pystr;
793     PyObject *rval;
794     Py_ssize_t end;
795     Py_ssize_t next_end = -1;
796     char *encoding = NULL;
797     int strict = 1;
798     if (!PyArg_ParseTuple(args, "OO&|zi:scanstring", &pystr, _convertPyInt_AsSsize_t, &end, &encoding, &strict)) {
799         return NULL;
800     }
801     if (encoding == NULL) {
802         encoding = DEFAULT_ENCODING;
803     }
804     if (PyString_Check(pystr)) {
805         rval = scanstring_str(pystr, end, encoding, strict, &next_end);
806     }
807     else if (PyUnicode_Check(pystr)) {
808         rval = scanstring_unicode(pystr, end, strict, &next_end);
809     }
810     else {
811         PyErr_Format(PyExc_TypeError,
812                      "first argument must be a string, not %.80s",
813                      Py_TYPE(pystr)->tp_name);
814         return NULL;
815     }
816     return _build_rval_index_tuple(rval, next_end);
817 }
818 
819 PyDoc_STRVAR(pydoc_encode_basestring_ascii,
820     "encode_basestring_ascii(basestring) -> str\n"
821     "\n"
822     "Return an ASCII-only JSON representation of a Python string"
823 );
824 
825 static PyObject *
py_encode_basestring_ascii(PyObject * self UNUSED,PyObject * pystr)826 py_encode_basestring_ascii(PyObject* self UNUSED, PyObject *pystr)
827 {
828     /* Return an ASCII-only JSON representation of a Python string */
829     /* METH_O */
830     if (PyString_Check(pystr)) {
831         return ascii_escape_str(pystr);
832     }
833     else if (PyUnicode_Check(pystr)) {
834         return ascii_escape_unicode(pystr);
835     }
836     else {
837         PyErr_Format(PyExc_TypeError,
838                      "first argument must be a string, not %.80s",
839                      Py_TYPE(pystr)->tp_name);
840         return NULL;
841     }
842 }
843 
844 static void
scanner_dealloc(PyObject * self)845 scanner_dealloc(PyObject *self)
846 {
847     /* Deallocate scanner object */
848     scanner_clear(self);
849     Py_TYPE(self)->tp_free(self);
850 }
851 
852 static int
scanner_traverse(PyObject * self,visitproc visit,void * arg)853 scanner_traverse(PyObject *self, visitproc visit, void *arg)
854 {
855     PyScannerObject *s;
856     assert(PyScanner_Check(self));
857     s = (PyScannerObject *)self;
858     Py_VISIT(s->encoding);
859     Py_VISIT(s->strict);
860     Py_VISIT(s->object_hook);
861     Py_VISIT(s->pairs_hook);
862     Py_VISIT(s->parse_float);
863     Py_VISIT(s->parse_int);
864     Py_VISIT(s->parse_constant);
865     return 0;
866 }
867 
868 static int
scanner_clear(PyObject * self)869 scanner_clear(PyObject *self)
870 {
871     PyScannerObject *s;
872     assert(PyScanner_Check(self));
873     s = (PyScannerObject *)self;
874     Py_CLEAR(s->encoding);
875     Py_CLEAR(s->strict);
876     Py_CLEAR(s->object_hook);
877     Py_CLEAR(s->pairs_hook);
878     Py_CLEAR(s->parse_float);
879     Py_CLEAR(s->parse_int);
880     Py_CLEAR(s->parse_constant);
881     return 0;
882 }
883 
884 static PyObject *
_parse_object_str(PyScannerObject * s,PyObject * pystr,Py_ssize_t idx,Py_ssize_t * next_idx_ptr)885 _parse_object_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) {
886     /* Read a JSON object from PyString pystr.
887     idx is the index of the first character after the opening curly brace.
888     *next_idx_ptr is a return-by-reference index to the first character after
889         the closing curly brace.
890 
891     Returns a new PyObject (usually a dict, but object_hook can change that)
892     */
893     char *str = PyString_AS_STRING(pystr);
894     Py_ssize_t end_idx = PyString_GET_SIZE(pystr) - 1;
895     PyObject *rval;
896     PyObject *pairs;
897     PyObject *item;
898     PyObject *key = NULL;
899     PyObject *val = NULL;
900     char *encoding = PyString_AS_STRING(s->encoding);
901     int strict = PyObject_IsTrue(s->strict);
902     Py_ssize_t next_idx;
903 
904     if (strict < 0)
905         return NULL;
906 
907     pairs = PyList_New(0);
908     if (pairs == NULL)
909         return NULL;
910 
911     /* skip whitespace after { */
912     while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
913 
914     /* only loop if the object is non-empty */
915     if (idx <= end_idx && str[idx] != '}') {
916         while (idx <= end_idx) {
917             /* read key */
918             if (str[idx] != '"') {
919                 raise_errmsg("Expecting property name", pystr, idx);
920                 goto bail;
921             }
922             key = scanstring_str(pystr, idx + 1, encoding, strict, &next_idx);
923             if (key == NULL)
924                 goto bail;
925             idx = next_idx;
926 
927             /* skip whitespace between key and : delimiter, read :, skip whitespace */
928             while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
929             if (idx > end_idx || str[idx] != ':') {
930                 raise_errmsg("Expecting : delimiter", pystr, idx);
931                 goto bail;
932             }
933             idx++;
934             while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
935 
936             /* read any JSON data type */
937             val = scan_once_str(s, pystr, idx, &next_idx);
938             if (val == NULL)
939                 goto bail;
940 
941             item = PyTuple_Pack(2, key, val);
942             if (item == NULL)
943                 goto bail;
944             Py_CLEAR(key);
945             Py_CLEAR(val);
946             if (PyList_Append(pairs, item) == -1) {
947                 Py_DECREF(item);
948                 goto bail;
949             }
950             Py_DECREF(item);
951             idx = next_idx;
952 
953             /* skip whitespace before } or , */
954             while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
955 
956             /* bail if the object is closed or we didn't get the , delimiter */
957             if (idx > end_idx) break;
958             if (str[idx] == '}') {
959                 break;
960             }
961             else if (str[idx] != ',') {
962                 raise_errmsg("Expecting , delimiter", pystr, idx);
963                 goto bail;
964             }
965             idx++;
966 
967             /* skip whitespace after , delimiter */
968             while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
969         }
970     }
971     /* verify that idx < end_idx, str[idx] should be '}' */
972     if (idx > end_idx || str[idx] != '}') {
973         raise_errmsg("Expecting object", pystr, end_idx);
974         goto bail;
975     }
976 
977     /* if pairs_hook is not None: rval = object_pairs_hook(pairs) */
978     if (s->pairs_hook != Py_None) {
979         val = PyObject_CallFunctionObjArgs(s->pairs_hook, pairs, NULL);
980         if (val == NULL)
981             goto bail;
982         Py_DECREF(pairs);
983         *next_idx_ptr = idx + 1;
984         return val;
985     }
986 
987     rval = PyObject_CallFunctionObjArgs((PyObject *)(&PyDict_Type),
988                                          pairs, NULL);
989     if (rval == NULL)
990         goto bail;
991     Py_CLEAR(pairs);
992 
993     /* if object_hook is not None: rval = object_hook(rval) */
994     if (s->object_hook != Py_None) {
995         val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL);
996         if (val == NULL)
997             goto bail;
998         Py_DECREF(rval);
999         rval = val;
1000         val = NULL;
1001     }
1002     *next_idx_ptr = idx + 1;
1003     return rval;
1004 bail:
1005     Py_XDECREF(key);
1006     Py_XDECREF(val);
1007     Py_XDECREF(pairs);
1008     return NULL;
1009 }
1010 
1011 static PyObject *
_parse_object_unicode(PyScannerObject * s,PyObject * pystr,Py_ssize_t idx,Py_ssize_t * next_idx_ptr)1012 _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) {
1013     /* Read a JSON object from PyUnicode pystr.
1014     idx is the index of the first character after the opening curly brace.
1015     *next_idx_ptr is a return-by-reference index to the first character after
1016         the closing curly brace.
1017 
1018     Returns a new PyObject (usually a dict, but object_hook can change that)
1019     */
1020     Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
1021     Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
1022     PyObject *rval;
1023     PyObject *pairs;
1024     PyObject *item;
1025     PyObject *key = NULL;
1026     PyObject *val = NULL;
1027     int strict = PyObject_IsTrue(s->strict);
1028     Py_ssize_t next_idx;
1029 
1030     if (strict < 0)
1031         return NULL;
1032 
1033     pairs = PyList_New(0);
1034     if (pairs == NULL)
1035         return NULL;
1036 
1037     /* skip whitespace after { */
1038     while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
1039 
1040     /* only loop if the object is non-empty */
1041     if (idx <= end_idx && str[idx] != '}') {
1042         while (idx <= end_idx) {
1043             /* read key */
1044             if (str[idx] != '"') {
1045                 raise_errmsg("Expecting property name enclosed in double quotes", pystr, idx);
1046                 goto bail;
1047             }
1048             key = scanstring_unicode(pystr, idx + 1, strict, &next_idx);
1049             if (key == NULL)
1050                 goto bail;
1051             idx = next_idx;
1052 
1053             /* skip whitespace between key and : delimiter, read :, skip whitespace */
1054             while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
1055             if (idx > end_idx || str[idx] != ':') {
1056                 raise_errmsg("Expecting ':' delimiter", pystr, idx);
1057                 goto bail;
1058             }
1059             idx++;
1060             while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
1061 
1062             /* read any JSON term */
1063             val = scan_once_unicode(s, pystr, idx, &next_idx);
1064             if (val == NULL)
1065                 goto bail;
1066 
1067             item = PyTuple_Pack(2, key, val);
1068             if (item == NULL)
1069                 goto bail;
1070             Py_CLEAR(key);
1071             Py_CLEAR(val);
1072             if (PyList_Append(pairs, item) == -1) {
1073                 Py_DECREF(item);
1074                 goto bail;
1075             }
1076             Py_DECREF(item);
1077             idx = next_idx;
1078 
1079             /* skip whitespace before } or , */
1080             while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
1081 
1082             /* bail if the object is closed or we didn't get the , delimiter */
1083             if (idx > end_idx) break;
1084             if (str[idx] == '}') {
1085                 break;
1086             }
1087             else if (str[idx] != ',') {
1088                 raise_errmsg("Expecting ',' delimiter", pystr, idx);
1089                 goto bail;
1090             }
1091             idx++;
1092 
1093             /* skip whitespace after , delimiter */
1094             while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
1095         }
1096     }
1097 
1098     /* verify that idx < end_idx, str[idx] should be '}' */
1099     if (idx > end_idx || str[idx] != '}') {
1100         raise_errmsg("Expecting object", pystr, end_idx);
1101         goto bail;
1102     }
1103 
1104     /* if pairs_hook is not None: rval = object_pairs_hook(pairs) */
1105     if (s->pairs_hook != Py_None) {
1106         val = PyObject_CallFunctionObjArgs(s->pairs_hook, pairs, NULL);
1107         if (val == NULL)
1108             goto bail;
1109         Py_DECREF(pairs);
1110         *next_idx_ptr = idx + 1;
1111         return val;
1112     }
1113 
1114     rval = PyObject_CallFunctionObjArgs((PyObject *)(&PyDict_Type),
1115                                          pairs, NULL);
1116     if (rval == NULL)
1117         goto bail;
1118     Py_CLEAR(pairs);
1119 
1120     /* if object_hook is not None: rval = object_hook(rval) */
1121     if (s->object_hook != Py_None) {
1122         val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL);
1123         if (val == NULL)
1124             goto bail;
1125         Py_DECREF(rval);
1126         rval = val;
1127         val = NULL;
1128     }
1129     *next_idx_ptr = idx + 1;
1130     return rval;
1131 bail:
1132     Py_XDECREF(key);
1133     Py_XDECREF(val);
1134     Py_XDECREF(pairs);
1135     return NULL;
1136 }
1137 
1138 static PyObject *
_parse_array_str(PyScannerObject * s,PyObject * pystr,Py_ssize_t idx,Py_ssize_t * next_idx_ptr)1139 _parse_array_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) {
1140     /* Read a JSON array from PyString pystr.
1141     idx is the index of the first character after the opening brace.
1142     *next_idx_ptr is a return-by-reference index to the first character after
1143         the closing brace.
1144 
1145     Returns a new PyList
1146     */
1147     char *str = PyString_AS_STRING(pystr);
1148     Py_ssize_t end_idx = PyString_GET_SIZE(pystr) - 1;
1149     PyObject *val = NULL;
1150     PyObject *rval = PyList_New(0);
1151     Py_ssize_t next_idx;
1152     if (rval == NULL)
1153         return NULL;
1154 
1155     /* skip whitespace after [ */
1156     while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
1157 
1158     /* only loop if the array is non-empty */
1159     if (idx <= end_idx && str[idx] != ']') {
1160         while (idx <= end_idx) {
1161 
1162             /* read any JSON term and de-tuplefy the (rval, idx) */
1163             val = scan_once_str(s, pystr, idx, &next_idx);
1164             if (val == NULL)
1165                 goto bail;
1166 
1167             if (PyList_Append(rval, val) == -1)
1168                 goto bail;
1169 
1170             Py_CLEAR(val);
1171             idx = next_idx;
1172 
1173             /* skip whitespace between term and , */
1174             while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
1175 
1176             /* bail if the array is closed or we didn't get the , delimiter */
1177             if (idx > end_idx) break;
1178             if (str[idx] == ']') {
1179                 break;
1180             }
1181             else if (str[idx] != ',') {
1182                 raise_errmsg("Expecting , delimiter", pystr, idx);
1183                 goto bail;
1184             }
1185             idx++;
1186 
1187             /* skip whitespace after , */
1188             while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
1189         }
1190     }
1191 
1192     /* verify that idx < end_idx, str[idx] should be ']' */
1193     if (idx > end_idx || str[idx] != ']') {
1194         raise_errmsg("Expecting object", pystr, end_idx);
1195         goto bail;
1196     }
1197     *next_idx_ptr = idx + 1;
1198     return rval;
1199 bail:
1200     Py_XDECREF(val);
1201     Py_DECREF(rval);
1202     return NULL;
1203 }
1204 
1205 static PyObject *
_parse_array_unicode(PyScannerObject * s,PyObject * pystr,Py_ssize_t idx,Py_ssize_t * next_idx_ptr)1206 _parse_array_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) {
1207     /* Read a JSON array from PyString pystr.
1208     idx is the index of the first character after the opening brace.
1209     *next_idx_ptr is a return-by-reference index to the first character after
1210         the closing brace.
1211 
1212     Returns a new PyList
1213     */
1214     Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
1215     Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
1216     PyObject *val = NULL;
1217     PyObject *rval = PyList_New(0);
1218     Py_ssize_t next_idx;
1219     if (rval == NULL)
1220         return NULL;
1221 
1222     /* skip whitespace after [ */
1223     while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
1224 
1225     /* only loop if the array is non-empty */
1226     if (idx <= end_idx && str[idx] != ']') {
1227         while (idx <= end_idx) {
1228 
1229             /* read any JSON term  */
1230             val = scan_once_unicode(s, pystr, idx, &next_idx);
1231             if (val == NULL)
1232                 goto bail;
1233 
1234             if (PyList_Append(rval, val) == -1)
1235                 goto bail;
1236 
1237             Py_CLEAR(val);
1238             idx = next_idx;
1239 
1240             /* skip whitespace between term and , */
1241             while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
1242 
1243             /* bail if the array is closed or we didn't get the , delimiter */
1244             if (idx > end_idx) break;
1245             if (str[idx] == ']') {
1246                 break;
1247             }
1248             else if (str[idx] != ',') {
1249                 raise_errmsg("Expecting ',' delimiter", pystr, idx);
1250                 goto bail;
1251             }
1252             idx++;
1253 
1254             /* skip whitespace after , */
1255             while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
1256         }
1257     }
1258 
1259     /* verify that idx < end_idx, str[idx] should be ']' */
1260     if (idx > end_idx || str[idx] != ']') {
1261         raise_errmsg("Expecting object", pystr, end_idx);
1262         goto bail;
1263     }
1264     *next_idx_ptr = idx + 1;
1265     return rval;
1266 bail:
1267     Py_XDECREF(val);
1268     Py_DECREF(rval);
1269     return NULL;
1270 }
1271 
1272 static PyObject *
_parse_constant(PyScannerObject * s,char * constant,Py_ssize_t idx,Py_ssize_t * next_idx_ptr)1273 _parse_constant(PyScannerObject *s, char *constant, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) {
1274     /* Read a JSON constant from PyString pystr.
1275     constant is the constant string that was found
1276         ("NaN", "Infinity", "-Infinity").
1277     idx is the index of the first character of the constant
1278     *next_idx_ptr is a return-by-reference index to the first character after
1279         the constant.
1280 
1281     Returns the result of parse_constant
1282     */
1283     PyObject *cstr;
1284     PyObject *rval;
1285     /* constant is "NaN", "Infinity", or "-Infinity" */
1286     cstr = PyString_InternFromString(constant);
1287     if (cstr == NULL)
1288         return NULL;
1289 
1290     /* rval = parse_constant(constant) */
1291     rval = PyObject_CallFunctionObjArgs(s->parse_constant, cstr, NULL);
1292     idx += PyString_GET_SIZE(cstr);
1293     Py_DECREF(cstr);
1294     *next_idx_ptr = idx;
1295     return rval;
1296 }
1297 
1298 static PyObject *
_match_number_str(PyScannerObject * s,PyObject * pystr,Py_ssize_t start,Py_ssize_t * next_idx_ptr)1299 _match_number_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ssize_t *next_idx_ptr) {
1300     /* Read a JSON number from PyString pystr.
1301     idx is the index of the first character of the number
1302     *next_idx_ptr is a return-by-reference index to the first character after
1303         the number.
1304 
1305     Returns a new PyObject representation of that number:
1306         PyInt, PyLong, or PyFloat.
1307         May return other types if parse_int or parse_float are set
1308     */
1309     char *str = PyString_AS_STRING(pystr);
1310     Py_ssize_t end_idx = PyString_GET_SIZE(pystr) - 1;
1311     Py_ssize_t idx = start;
1312     int is_float = 0;
1313     PyObject *rval;
1314     PyObject *numstr;
1315 
1316     /* read a sign if it's there, make sure it's not the end of the string */
1317     if (str[idx] == '-') {
1318         idx++;
1319         if (idx > end_idx) {
1320             PyErr_SetNone(PyExc_StopIteration);
1321             return NULL;
1322         }
1323     }
1324 
1325     /* read as many integer digits as we find as long as it doesn't start with 0 */
1326     if (str[idx] >= '1' && str[idx] <= '9') {
1327         idx++;
1328         while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++;
1329     }
1330     /* if it starts with 0 we only expect one integer digit */
1331     else if (str[idx] == '0') {
1332         idx++;
1333     }
1334     /* no integer digits, error */
1335     else {
1336         PyErr_SetNone(PyExc_StopIteration);
1337         return NULL;
1338     }
1339 
1340     /* if the next char is '.' followed by a digit then read all float digits */
1341     if (idx < end_idx && str[idx] == '.' && str[idx + 1] >= '0' && str[idx + 1] <= '9') {
1342         is_float = 1;
1343         idx += 2;
1344         while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++;
1345     }
1346 
1347     /* if the next char is 'e' or 'E' then maybe read the exponent (or backtrack) */
1348     if (idx < end_idx && (str[idx] == 'e' || str[idx] == 'E')) {
1349 
1350         /* save the index of the 'e' or 'E' just in case we need to backtrack */
1351         Py_ssize_t e_start = idx;
1352         idx++;
1353 
1354         /* read an exponent sign if present */
1355         if (idx < end_idx && (str[idx] == '-' || str[idx] == '+')) idx++;
1356 
1357         /* read all digits */
1358         while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++;
1359 
1360         /* if we got a digit, then parse as float. if not, backtrack */
1361         if (str[idx - 1] >= '0' && str[idx - 1] <= '9') {
1362             is_float = 1;
1363         }
1364         else {
1365             idx = e_start;
1366         }
1367     }
1368 
1369     /* copy the section we determined to be a number */
1370     numstr = PyString_FromStringAndSize(&str[start], idx - start);
1371     if (numstr == NULL)
1372         return NULL;
1373     if (is_float) {
1374         /* parse as a float using a fast path if available, otherwise call user defined method */
1375         if (s->parse_float != (PyObject *)&PyFloat_Type) {
1376             rval = PyObject_CallFunctionObjArgs(s->parse_float, numstr, NULL);
1377         }
1378         else {
1379             double d = PyOS_string_to_double(PyString_AS_STRING(numstr),
1380                                              NULL, NULL);
1381             if (d == -1.0 && PyErr_Occurred())
1382                 return NULL;
1383             rval = PyFloat_FromDouble(d);
1384         }
1385     }
1386     else {
1387         /* parse as an int using a fast path if available, otherwise call user defined method */
1388         if (s->parse_int != (PyObject *)&PyInt_Type) {
1389             rval = PyObject_CallFunctionObjArgs(s->parse_int, numstr, NULL);
1390         }
1391         else {
1392             rval = PyInt_FromString(PyString_AS_STRING(numstr), NULL, 10);
1393         }
1394     }
1395     Py_DECREF(numstr);
1396     *next_idx_ptr = idx;
1397     return rval;
1398 }
1399 
1400 static PyObject *
_match_number_unicode(PyScannerObject * s,PyObject * pystr,Py_ssize_t start,Py_ssize_t * next_idx_ptr)1401 _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ssize_t *next_idx_ptr) {
1402     /* Read a JSON number from PyUnicode pystr.
1403     idx is the index of the first character of the number
1404     *next_idx_ptr is a return-by-reference index to the first character after
1405         the number.
1406 
1407     Returns a new PyObject representation of that number:
1408         PyInt, PyLong, or PyFloat.
1409         May return other types if parse_int or parse_float are set
1410     */
1411     Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
1412     Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
1413     Py_ssize_t idx = start;
1414     int is_float = 0;
1415     PyObject *rval;
1416     PyObject *numstr;
1417 
1418     /* read a sign if it's there, make sure it's not the end of the string */
1419     if (str[idx] == '-') {
1420         idx++;
1421         if (idx > end_idx) {
1422             PyErr_SetNone(PyExc_StopIteration);
1423             return NULL;
1424         }
1425     }
1426 
1427     /* read as many integer digits as we find as long as it doesn't start with 0 */
1428     if (str[idx] >= '1' && str[idx] <= '9') {
1429         idx++;
1430         while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++;
1431     }
1432     /* if it starts with 0 we only expect one integer digit */
1433     else if (str[idx] == '0') {
1434         idx++;
1435     }
1436     /* no integer digits, error */
1437     else {
1438         PyErr_SetNone(PyExc_StopIteration);
1439         return NULL;
1440     }
1441 
1442     /* if the next char is '.' followed by a digit then read all float digits */
1443     if (idx < end_idx && str[idx] == '.' && str[idx + 1] >= '0' && str[idx + 1] <= '9') {
1444         is_float = 1;
1445         idx += 2;
1446         while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++;
1447     }
1448 
1449     /* if the next char is 'e' or 'E' then maybe read the exponent (or backtrack) */
1450     if (idx < end_idx && (str[idx] == 'e' || str[idx] == 'E')) {
1451         Py_ssize_t e_start = idx;
1452         idx++;
1453 
1454         /* read an exponent sign if present */
1455         if (idx < end_idx && (str[idx] == '-' || str[idx] == '+')) idx++;
1456 
1457         /* read all digits */
1458         while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++;
1459 
1460         /* if we got a digit, then parse as float. if not, backtrack */
1461         if (str[idx - 1] >= '0' && str[idx - 1] <= '9') {
1462             is_float = 1;
1463         }
1464         else {
1465             idx = e_start;
1466         }
1467     }
1468 
1469     /* copy the section we determined to be a number */
1470     numstr = PyUnicode_FromUnicode(&str[start], idx - start);
1471     if (numstr == NULL)
1472         return NULL;
1473     if (is_float) {
1474         /* parse as a float using a fast path if available, otherwise call user defined method */
1475         if (s->parse_float != (PyObject *)&PyFloat_Type) {
1476             rval = PyObject_CallFunctionObjArgs(s->parse_float, numstr, NULL);
1477         }
1478         else {
1479             rval = PyFloat_FromString(numstr, NULL);
1480         }
1481     }
1482     else {
1483         /* no fast path for unicode -> int, just call */
1484         rval = PyObject_CallFunctionObjArgs(s->parse_int, numstr, NULL);
1485     }
1486     Py_DECREF(numstr);
1487     *next_idx_ptr = idx;
1488     return rval;
1489 }
1490 
1491 static PyObject *
scan_once_str(PyScannerObject * s,PyObject * pystr,Py_ssize_t idx,Py_ssize_t * next_idx_ptr)1492 scan_once_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr)
1493 {
1494     /* Read one JSON term (of any kind) from PyString pystr.
1495     idx is the index of the first character of the term
1496     *next_idx_ptr is a return-by-reference index to the first character after
1497         the number.
1498 
1499     Returns a new PyObject representation of the term.
1500     */
1501     PyObject *res;
1502     int strict;
1503     char *str = PyString_AS_STRING(pystr);
1504     Py_ssize_t length = PyString_GET_SIZE(pystr);
1505     if (idx < 0) {
1506         PyErr_SetString(PyExc_ValueError, "idx cannot be negative");
1507         return NULL;
1508     }
1509     if (idx >= length) {
1510         PyErr_SetNone(PyExc_StopIteration);
1511         return NULL;
1512     }
1513     switch (str[idx]) {
1514         case '"':
1515             /* string */
1516             strict = PyObject_IsTrue(s->strict);
1517             if (strict < 0)
1518                 return NULL;
1519             return scanstring_str(pystr, idx + 1,
1520                 PyString_AS_STRING(s->encoding), strict, next_idx_ptr);
1521         case '{':
1522             /* object */
1523             if (Py_EnterRecursiveCall(" while decoding a JSON object "
1524                                       "from a byte string"))
1525                 return NULL;
1526             res = _parse_object_str(s, pystr, idx + 1, next_idx_ptr);
1527             Py_LeaveRecursiveCall();
1528             return res;
1529         case '[':
1530             /* array */
1531             if (Py_EnterRecursiveCall(" while decoding a JSON array "
1532                                       "from a byte string"))
1533                 return NULL;
1534             res = _parse_array_str(s, pystr, idx + 1, next_idx_ptr);
1535             Py_LeaveRecursiveCall();
1536             return res;
1537         case 'n':
1538             /* null */
1539             if ((idx + 3 < length) && str[idx + 1] == 'u' && str[idx + 2] == 'l' && str[idx + 3] == 'l') {
1540                 Py_INCREF(Py_None);
1541                 *next_idx_ptr = idx + 4;
1542                 return Py_None;
1543             }
1544             break;
1545         case 't':
1546             /* true */
1547             if ((idx + 3 < length) && str[idx + 1] == 'r' && str[idx + 2] == 'u' && str[idx + 3] == 'e') {
1548                 Py_INCREF(Py_True);
1549                 *next_idx_ptr = idx + 4;
1550                 return Py_True;
1551             }
1552             break;
1553         case 'f':
1554             /* false */
1555             if ((idx + 4 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'l' && str[idx + 3] == 's' && str[idx + 4] == 'e') {
1556                 Py_INCREF(Py_False);
1557                 *next_idx_ptr = idx + 5;
1558                 return Py_False;
1559             }
1560             break;
1561         case 'N':
1562             /* NaN */
1563             if ((idx + 2 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'N') {
1564                 return _parse_constant(s, "NaN", idx, next_idx_ptr);
1565             }
1566             break;
1567         case 'I':
1568             /* Infinity */
1569             if ((idx + 7 < length) && str[idx + 1] == 'n' && str[idx + 2] == 'f' && str[idx + 3] == 'i' && str[idx + 4] == 'n' && str[idx + 5] == 'i' && str[idx + 6] == 't' && str[idx + 7] == 'y') {
1570                 return _parse_constant(s, "Infinity", idx, next_idx_ptr);
1571             }
1572             break;
1573         case '-':
1574             /* -Infinity */
1575             if ((idx + 8 < length) && str[idx + 1] == 'I' && str[idx + 2] == 'n' && str[idx + 3] == 'f' && str[idx + 4] == 'i' && str[idx + 5] == 'n' && str[idx + 6] == 'i' && str[idx + 7] == 't' && str[idx + 8] == 'y') {
1576                 return _parse_constant(s, "-Infinity", idx, next_idx_ptr);
1577             }
1578             break;
1579     }
1580     /* Didn't find a string, object, array, or named constant. Look for a number. */
1581     return _match_number_str(s, pystr, idx, next_idx_ptr);
1582 }
1583 
1584 static PyObject *
scan_once_unicode(PyScannerObject * s,PyObject * pystr,Py_ssize_t idx,Py_ssize_t * next_idx_ptr)1585 scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr)
1586 {
1587     /* Read one JSON term (of any kind) from PyUnicode pystr.
1588     idx is the index of the first character of the term
1589     *next_idx_ptr is a return-by-reference index to the first character after
1590         the number.
1591 
1592     Returns a new PyObject representation of the term.
1593     */
1594     PyObject *res;
1595     int strict;
1596     Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
1597     Py_ssize_t length = PyUnicode_GET_SIZE(pystr);
1598     if (idx < 0) {
1599         PyErr_SetString(PyExc_ValueError, "idx cannot be negative");
1600         return NULL;
1601     }
1602     if (idx >= length) {
1603         PyErr_SetNone(PyExc_StopIteration);
1604         return NULL;
1605     }
1606     switch (str[idx]) {
1607         case '"':
1608             /* string */
1609             strict = PyObject_IsTrue(s->strict);
1610             if (strict < 0)
1611                 return NULL;
1612             return scanstring_unicode(pystr, idx + 1, strict, next_idx_ptr);
1613         case '{':
1614             /* object */
1615             if (Py_EnterRecursiveCall(" while decoding a JSON object "
1616                                       "from a unicode string"))
1617                 return NULL;
1618             res = _parse_object_unicode(s, pystr, idx + 1, next_idx_ptr);
1619             Py_LeaveRecursiveCall();
1620             return res;
1621         case '[':
1622             /* array */
1623             if (Py_EnterRecursiveCall(" while decoding a JSON array "
1624                                       "from a unicode string"))
1625                 return NULL;
1626             res = _parse_array_unicode(s, pystr, idx + 1, next_idx_ptr);
1627             Py_LeaveRecursiveCall();
1628             return res;
1629         case 'n':
1630             /* null */
1631             if ((idx + 3 < length) && str[idx + 1] == 'u' && str[idx + 2] == 'l' && str[idx + 3] == 'l') {
1632                 Py_INCREF(Py_None);
1633                 *next_idx_ptr = idx + 4;
1634                 return Py_None;
1635             }
1636             break;
1637         case 't':
1638             /* true */
1639             if ((idx + 3 < length) && str[idx + 1] == 'r' && str[idx + 2] == 'u' && str[idx + 3] == 'e') {
1640                 Py_INCREF(Py_True);
1641                 *next_idx_ptr = idx + 4;
1642                 return Py_True;
1643             }
1644             break;
1645         case 'f':
1646             /* false */
1647             if ((idx + 4 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'l' && str[idx + 3] == 's' && str[idx + 4] == 'e') {
1648                 Py_INCREF(Py_False);
1649                 *next_idx_ptr = idx + 5;
1650                 return Py_False;
1651             }
1652             break;
1653         case 'N':
1654             /* NaN */
1655             if ((idx + 2 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'N') {
1656                 return _parse_constant(s, "NaN", idx, next_idx_ptr);
1657             }
1658             break;
1659         case 'I':
1660             /* Infinity */
1661             if ((idx + 7 < length) && str[idx + 1] == 'n' && str[idx + 2] == 'f' && str[idx + 3] == 'i' && str[idx + 4] == 'n' && str[idx + 5] == 'i' && str[idx + 6] == 't' && str[idx + 7] == 'y') {
1662                 return _parse_constant(s, "Infinity", idx, next_idx_ptr);
1663             }
1664             break;
1665         case '-':
1666             /* -Infinity */
1667             if ((idx + 8 < length) && str[idx + 1] == 'I' && str[idx + 2] == 'n' && str[idx + 3] == 'f' && str[idx + 4] == 'i' && str[idx + 5] == 'n' && str[idx + 6] == 'i' && str[idx + 7] == 't' && str[idx + 8] == 'y') {
1668                 return _parse_constant(s, "-Infinity", idx, next_idx_ptr);
1669             }
1670             break;
1671     }
1672     /* Didn't find a string, object, array, or named constant. Look for a number. */
1673     return _match_number_unicode(s, pystr, idx, next_idx_ptr);
1674 }
1675 
1676 static PyObject *
scanner_call(PyObject * self,PyObject * args,PyObject * kwds)1677 scanner_call(PyObject *self, PyObject *args, PyObject *kwds)
1678 {
1679     /* Python callable interface to scan_once_{str,unicode} */
1680     PyObject *pystr;
1681     PyObject *rval;
1682     Py_ssize_t idx;
1683     Py_ssize_t next_idx = -1;
1684     static char *kwlist[] = {"string", "idx", NULL};
1685     PyScannerObject *s;
1686     assert(PyScanner_Check(self));
1687     s = (PyScannerObject *)self;
1688     if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&:scan_once", kwlist, &pystr, _convertPyInt_AsSsize_t, &idx))
1689         return NULL;
1690 
1691     if (PyString_Check(pystr)) {
1692         rval = scan_once_str(s, pystr, idx, &next_idx);
1693     }
1694     else if (PyUnicode_Check(pystr)) {
1695         rval = scan_once_unicode(s, pystr, idx, &next_idx);
1696     }
1697     else {
1698         PyErr_Format(PyExc_TypeError,
1699                  "first argument must be a string, not %.80s",
1700                  Py_TYPE(pystr)->tp_name);
1701         return NULL;
1702     }
1703     return _build_rval_index_tuple(rval, next_idx);
1704 }
1705 
1706 static PyObject *
scanner_new(PyTypeObject * type,PyObject * args,PyObject * kwds)1707 scanner_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
1708 {
1709     PyScannerObject *s;
1710     s = (PyScannerObject *)type->tp_alloc(type, 0);
1711     if (s != NULL) {
1712         s->encoding = NULL;
1713         s->strict = NULL;
1714         s->object_hook = NULL;
1715         s->pairs_hook = NULL;
1716         s->parse_float = NULL;
1717         s->parse_int = NULL;
1718         s->parse_constant = NULL;
1719     }
1720     return (PyObject *)s;
1721 }
1722 
1723 static int
scanner_init(PyObject * self,PyObject * args,PyObject * kwds)1724 scanner_init(PyObject *self, PyObject *args, PyObject *kwds)
1725 {
1726     /* Initialize Scanner object */
1727     PyObject *ctx;
1728     static char *kwlist[] = {"context", NULL};
1729     PyScannerObject *s;
1730 
1731     assert(PyScanner_Check(self));
1732     s = (PyScannerObject *)self;
1733 
1734     if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx))
1735         return -1;
1736 
1737     /* PyString_AS_STRING is used on encoding */
1738     s->encoding = PyObject_GetAttrString(ctx, "encoding");
1739     if (s->encoding == NULL)
1740         goto bail;
1741     if (s->encoding == Py_None) {
1742         Py_DECREF(Py_None);
1743         s->encoding = PyString_InternFromString(DEFAULT_ENCODING);
1744     }
1745     else if (PyUnicode_Check(s->encoding)) {
1746         PyObject *tmp = PyUnicode_AsEncodedString(s->encoding, NULL, NULL);
1747         Py_SETREF(s->encoding, tmp);
1748     }
1749     if (s->encoding == NULL)
1750         goto bail;
1751     if (!PyString_Check(s->encoding)) {
1752 	PyErr_Format(PyExc_TypeError,
1753 		     "encoding must be a string, not %.80s",
1754 		     Py_TYPE(s->encoding)->tp_name);
1755 	goto bail;
1756     }
1757 
1758 
1759     /* All of these will fail "gracefully" so we don't need to verify them */
1760     s->strict = PyObject_GetAttrString(ctx, "strict");
1761     if (s->strict == NULL)
1762         goto bail;
1763     s->object_hook = PyObject_GetAttrString(ctx, "object_hook");
1764     if (s->object_hook == NULL)
1765         goto bail;
1766     s->pairs_hook = PyObject_GetAttrString(ctx, "object_pairs_hook");
1767     if (s->pairs_hook == NULL)
1768         goto bail;
1769     s->parse_float = PyObject_GetAttrString(ctx, "parse_float");
1770     if (s->parse_float == NULL)
1771         goto bail;
1772     s->parse_int = PyObject_GetAttrString(ctx, "parse_int");
1773     if (s->parse_int == NULL)
1774         goto bail;
1775     s->parse_constant = PyObject_GetAttrString(ctx, "parse_constant");
1776     if (s->parse_constant == NULL)
1777         goto bail;
1778 
1779     return 0;
1780 
1781 bail:
1782     Py_CLEAR(s->encoding);
1783     Py_CLEAR(s->strict);
1784     Py_CLEAR(s->object_hook);
1785     Py_CLEAR(s->pairs_hook);
1786     Py_CLEAR(s->parse_float);
1787     Py_CLEAR(s->parse_int);
1788     Py_CLEAR(s->parse_constant);
1789     return -1;
1790 }
1791 
1792 PyDoc_STRVAR(scanner_doc, "JSON scanner object");
1793 
1794 static
1795 PyTypeObject PyScannerType = {
1796     PyObject_HEAD_INIT(NULL)
1797     0,                    /* tp_internal */
1798     "_json.Scanner",       /* tp_name */
1799     sizeof(PyScannerObject), /* tp_basicsize */
1800     0,                    /* tp_itemsize */
1801     scanner_dealloc, /* tp_dealloc */
1802     0,                    /* tp_print */
1803     0,                    /* tp_getattr */
1804     0,                    /* tp_setattr */
1805     0,                    /* tp_compare */
1806     0,                    /* tp_repr */
1807     0,                    /* tp_as_number */
1808     0,                    /* tp_as_sequence */
1809     0,                    /* tp_as_mapping */
1810     0,                    /* tp_hash */
1811     scanner_call,         /* tp_call */
1812     0,                    /* tp_str */
1813     0,/* PyObject_GenericGetAttr, */                    /* tp_getattro */
1814     0,/* PyObject_GenericSetAttr, */                    /* tp_setattro */
1815     0,                    /* tp_as_buffer */
1816     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,   /* tp_flags */
1817     scanner_doc,          /* tp_doc */
1818     scanner_traverse,                    /* tp_traverse */
1819     scanner_clear,                    /* tp_clear */
1820     0,                    /* tp_richcompare */
1821     0,                    /* tp_weaklistoffset */
1822     0,                    /* tp_iter */
1823     0,                    /* tp_iternext */
1824     0,                    /* tp_methods */
1825     scanner_members,                    /* tp_members */
1826     0,                    /* tp_getset */
1827     0,                    /* tp_base */
1828     0,                    /* tp_dict */
1829     0,                    /* tp_descr_get */
1830     0,                    /* tp_descr_set */
1831     0,                    /* tp_dictoffset */
1832     scanner_init,                    /* tp_init */
1833     0,/* PyType_GenericAlloc, */        /* tp_alloc */
1834     scanner_new,          /* tp_new */
1835     0,/* PyObject_GC_Del, */              /* tp_free */
1836 };
1837 
1838 static PyObject *
encoder_new(PyTypeObject * type,PyObject * args,PyObject * kwds)1839 encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
1840 {
1841     PyEncoderObject *s;
1842     s = (PyEncoderObject *)type->tp_alloc(type, 0);
1843     if (s != NULL) {
1844         s->markers = NULL;
1845         s->defaultfn = NULL;
1846         s->encoder = NULL;
1847         s->indent = NULL;
1848         s->key_separator = NULL;
1849         s->item_separator = NULL;
1850         s->sort_keys = NULL;
1851         s->skipkeys = NULL;
1852     }
1853     return (PyObject *)s;
1854 }
1855 
1856 static int
encoder_init(PyObject * self,PyObject * args,PyObject * kwds)1857 encoder_init(PyObject *self, PyObject *args, PyObject *kwds)
1858 {
1859     /* initialize Encoder object */
1860     static char *kwlist[] = {"markers", "default", "encoder", "indent", "key_separator", "item_separator", "sort_keys", "skipkeys", "allow_nan", NULL};
1861 
1862     PyEncoderObject *s;
1863     PyObject *markers, *defaultfn, *encoder, *indent, *key_separator;
1864     PyObject *item_separator, *sort_keys, *skipkeys, *allow_nan_obj;
1865     int allow_nan;
1866 
1867     assert(PyEncoder_Check(self));
1868     s = (PyEncoderObject *)self;
1869 
1870     if (!PyArg_ParseTupleAndKeywords(args, kwds, "OOOOOOOOO:make_encoder", kwlist,
1871         &markers, &defaultfn, &encoder, &indent, &key_separator, &item_separator,
1872         &sort_keys, &skipkeys, &allow_nan_obj))
1873         return -1;
1874 
1875     allow_nan = PyObject_IsTrue(allow_nan_obj);
1876     if (allow_nan < 0)
1877         return -1;
1878 
1879     if (markers != Py_None && !PyDict_Check(markers)) {
1880         PyErr_Format(PyExc_TypeError,
1881                      "make_encoder() argument 1 must be dict or None, "
1882                      "not %.200s", Py_TYPE(markers)->tp_name);
1883         return -1;
1884     }
1885 
1886     s->markers = markers;
1887     s->defaultfn = defaultfn;
1888     s->encoder = encoder;
1889     s->indent = indent;
1890     s->key_separator = key_separator;
1891     s->item_separator = item_separator;
1892     s->sort_keys = sort_keys;
1893     s->skipkeys = skipkeys;
1894     s->fast_encode = (PyCFunction_Check(s->encoder) && PyCFunction_GetFunction(s->encoder) == (PyCFunction)py_encode_basestring_ascii);
1895     s->allow_nan = allow_nan;
1896 
1897     Py_INCREF(s->markers);
1898     Py_INCREF(s->defaultfn);
1899     Py_INCREF(s->encoder);
1900     Py_INCREF(s->indent);
1901     Py_INCREF(s->key_separator);
1902     Py_INCREF(s->item_separator);
1903     Py_INCREF(s->sort_keys);
1904     Py_INCREF(s->skipkeys);
1905     return 0;
1906 }
1907 
1908 static PyObject *
encoder_call(PyObject * self,PyObject * args,PyObject * kwds)1909 encoder_call(PyObject *self, PyObject *args, PyObject *kwds)
1910 {
1911     /* Python callable interface to encode_listencode_obj */
1912     static char *kwlist[] = {"obj", "_current_indent_level", NULL};
1913     PyObject *obj;
1914     PyObject *rval;
1915     Py_ssize_t indent_level;
1916     PyEncoderObject *s;
1917     assert(PyEncoder_Check(self));
1918     s = (PyEncoderObject *)self;
1919     if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&:_iterencode", kwlist,
1920         &obj, _convertPyInt_AsSsize_t, &indent_level))
1921         return NULL;
1922     rval = PyList_New(0);
1923     if (rval == NULL)
1924         return NULL;
1925     if (encoder_listencode_obj(s, rval, obj, indent_level)) {
1926         Py_DECREF(rval);
1927         return NULL;
1928     }
1929     return rval;
1930 }
1931 
1932 static PyObject *
_encoded_const(PyObject * obj)1933 _encoded_const(PyObject *obj)
1934 {
1935     /* Return the JSON string representation of None, True, False */
1936     if (obj == Py_None) {
1937         static PyObject *s_null = NULL;
1938         if (s_null == NULL) {
1939             s_null = PyString_InternFromString("null");
1940         }
1941         Py_INCREF(s_null);
1942         return s_null;
1943     }
1944     else if (obj == Py_True) {
1945         static PyObject *s_true = NULL;
1946         if (s_true == NULL) {
1947             s_true = PyString_InternFromString("true");
1948         }
1949         Py_INCREF(s_true);
1950         return s_true;
1951     }
1952     else if (obj == Py_False) {
1953         static PyObject *s_false = NULL;
1954         if (s_false == NULL) {
1955             s_false = PyString_InternFromString("false");
1956         }
1957         Py_INCREF(s_false);
1958         return s_false;
1959     }
1960     else {
1961         PyErr_SetString(PyExc_ValueError, "not a const");
1962         return NULL;
1963     }
1964 }
1965 
1966 static PyObject *
encoder_encode_float(PyEncoderObject * s,PyObject * obj)1967 encoder_encode_float(PyEncoderObject *s, PyObject *obj)
1968 {
1969     /* Return the JSON representation of a PyFloat */
1970     double i = PyFloat_AS_DOUBLE(obj);
1971     if (!Py_IS_FINITE(i)) {
1972         if (!s->allow_nan) {
1973             PyErr_SetString(PyExc_ValueError, "Out of range float values are not JSON compliant");
1974             return NULL;
1975         }
1976         if (i > 0) {
1977             return PyString_FromString("Infinity");
1978         }
1979         else if (i < 0) {
1980             return PyString_FromString("-Infinity");
1981         }
1982         else {
1983             return PyString_FromString("NaN");
1984         }
1985     }
1986     /* Make sure to use the base float class repr method */
1987     return PyFloat_Type.tp_repr(obj);
1988 }
1989 
1990 static PyObject *
encoder_encode_string(PyEncoderObject * s,PyObject * obj)1991 encoder_encode_string(PyEncoderObject *s, PyObject *obj)
1992 {
1993     /* Return the JSON representation of a string */
1994     if (s->fast_encode)
1995         return py_encode_basestring_ascii(NULL, obj);
1996     else
1997         return PyObject_CallFunctionObjArgs(s->encoder, obj, NULL);
1998 }
1999 
2000 static int
_steal_list_append(PyObject * lst,PyObject * stolen)2001 _steal_list_append(PyObject *lst, PyObject *stolen)
2002 {
2003     /* Append stolen and then decrement its reference count */
2004     int rval = PyList_Append(lst, stolen);
2005     Py_DECREF(stolen);
2006     return rval;
2007 }
2008 
2009 static int
encoder_listencode_obj(PyEncoderObject * s,PyObject * rval,PyObject * obj,Py_ssize_t indent_level)2010 encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level)
2011 {
2012     /* Encode Python object obj to a JSON term, rval is a PyList */
2013     PyObject *newobj;
2014     int rv;
2015 
2016     if (obj == Py_None || obj == Py_True || obj == Py_False) {
2017         PyObject *cstr = _encoded_const(obj);
2018         if (cstr == NULL)
2019             return -1;
2020         return _steal_list_append(rval, cstr);
2021     }
2022     else if (PyString_Check(obj) || PyUnicode_Check(obj))
2023     {
2024         PyObject *encoded = encoder_encode_string(s, obj);
2025         if (encoded == NULL)
2026             return -1;
2027         return _steal_list_append(rval, encoded);
2028     }
2029     else if (PyInt_Check(obj) || PyLong_Check(obj)) {
2030         PyObject *encoded = PyObject_Str(obj);
2031         if (encoded == NULL)
2032             return -1;
2033         return _steal_list_append(rval, encoded);
2034     }
2035     else if (PyFloat_Check(obj)) {
2036         PyObject *encoded = encoder_encode_float(s, obj);
2037         if (encoded == NULL)
2038             return -1;
2039         return _steal_list_append(rval, encoded);
2040     }
2041     else if (PyList_Check(obj) || PyTuple_Check(obj)) {
2042         if (Py_EnterRecursiveCall(" while encoding a JSON object"))
2043             return -1;
2044         rv = encoder_listencode_list(s, rval, obj, indent_level);
2045         Py_LeaveRecursiveCall();
2046         return rv;
2047     }
2048     else if (PyDict_Check(obj)) {
2049         if (Py_EnterRecursiveCall(" while encoding a JSON object"))
2050             return -1;
2051         rv = encoder_listencode_dict(s, rval, obj, indent_level);
2052         Py_LeaveRecursiveCall();
2053         return rv;
2054     }
2055     else {
2056         PyObject *ident = NULL;
2057         if (s->markers != Py_None) {
2058             int has_key;
2059             ident = PyLong_FromVoidPtr(obj);
2060             if (ident == NULL)
2061                 return -1;
2062             has_key = PyDict_Contains(s->markers, ident);
2063             if (has_key) {
2064                 if (has_key != -1)
2065                     PyErr_SetString(PyExc_ValueError, "Circular reference detected");
2066                 Py_DECREF(ident);
2067                 return -1;
2068             }
2069             if (PyDict_SetItem(s->markers, ident, obj)) {
2070                 Py_DECREF(ident);
2071                 return -1;
2072             }
2073         }
2074         newobj = PyObject_CallFunctionObjArgs(s->defaultfn, obj, NULL);
2075         if (newobj == NULL) {
2076             Py_XDECREF(ident);
2077             return -1;
2078         }
2079 
2080         if (Py_EnterRecursiveCall(" while encoding a JSON object"))
2081             return -1;
2082         rv = encoder_listencode_obj(s, rval, newobj, indent_level);
2083         Py_LeaveRecursiveCall();
2084 
2085         Py_DECREF(newobj);
2086         if (rv) {
2087             Py_XDECREF(ident);
2088             return -1;
2089         }
2090         if (ident != NULL) {
2091             if (PyDict_DelItem(s->markers, ident)) {
2092                 Py_XDECREF(ident);
2093                 return -1;
2094             }
2095             Py_XDECREF(ident);
2096         }
2097         return rv;
2098     }
2099 }
2100 
2101 static int
encoder_listencode_dict(PyEncoderObject * s,PyObject * rval,PyObject * dct,Py_ssize_t indent_level)2102 encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level)
2103 {
2104     /* Encode Python dict dct a JSON term, rval is a PyList */
2105     static PyObject *open_dict = NULL;
2106     static PyObject *close_dict = NULL;
2107     static PyObject *empty_dict = NULL;
2108     PyObject *kstr = NULL;
2109     PyObject *ident = NULL;
2110     PyObject *key = NULL;
2111     PyObject *value = NULL;
2112     PyObject *it = NULL;
2113     int skipkeys;
2114     Py_ssize_t idx;
2115 
2116     if (open_dict == NULL || close_dict == NULL || empty_dict == NULL) {
2117         open_dict = PyString_InternFromString("{");
2118         close_dict = PyString_InternFromString("}");
2119         empty_dict = PyString_InternFromString("{}");
2120         if (open_dict == NULL || close_dict == NULL || empty_dict == NULL)
2121             return -1;
2122     }
2123     if (Py_SIZE(dct) == 0)
2124         return PyList_Append(rval, empty_dict);
2125 
2126     if (s->markers != Py_None) {
2127         int has_key;
2128         ident = PyLong_FromVoidPtr(dct);
2129         if (ident == NULL)
2130             goto bail;
2131         has_key = PyDict_Contains(s->markers, ident);
2132         if (has_key) {
2133             if (has_key != -1)
2134                 PyErr_SetString(PyExc_ValueError, "Circular reference detected");
2135             goto bail;
2136         }
2137         if (PyDict_SetItem(s->markers, ident, dct)) {
2138             goto bail;
2139         }
2140     }
2141 
2142     if (PyList_Append(rval, open_dict))
2143         goto bail;
2144 
2145     if (s->indent != Py_None) {
2146         /* TODO: DOES NOT RUN */
2147         indent_level += 1;
2148         /*
2149             newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
2150             separator = _item_separator + newline_indent
2151             buf += newline_indent
2152         */
2153     }
2154 
2155     /* TODO: C speedup not implemented for sort_keys */
2156 
2157     it = PyObject_GetIter(dct);
2158     if (it == NULL)
2159         goto bail;
2160     skipkeys = PyObject_IsTrue(s->skipkeys);
2161     if (skipkeys < 0)
2162         goto bail;
2163     idx = 0;
2164     while ((key = PyIter_Next(it)) != NULL) {
2165         PyObject *encoded;
2166 
2167         if (PyString_Check(key) || PyUnicode_Check(key)) {
2168             Py_INCREF(key);
2169             kstr = key;
2170         }
2171         else if (PyFloat_Check(key)) {
2172             kstr = encoder_encode_float(s, key);
2173             if (kstr == NULL)
2174                 goto bail;
2175         }
2176         else if (PyInt_Check(key) || PyLong_Check(key)) {
2177             kstr = PyObject_Str(key);
2178             if (kstr == NULL)
2179                 goto bail;
2180         }
2181         else if (key == Py_True || key == Py_False || key == Py_None) {
2182             kstr = _encoded_const(key);
2183             if (kstr == NULL)
2184                 goto bail;
2185         }
2186         else if (skipkeys) {
2187             Py_DECREF(key);
2188             continue;
2189         }
2190         else {
2191             /* TODO: include repr of key */
2192             PyErr_SetString(PyExc_TypeError, "keys must be a string");
2193             goto bail;
2194         }
2195 
2196         if (idx) {
2197             if (PyList_Append(rval, s->item_separator))
2198                 goto bail;
2199         }
2200 
2201         value = PyObject_GetItem(dct, key);
2202         if (value == NULL)
2203             goto bail;
2204 
2205         encoded = encoder_encode_string(s, kstr);
2206         Py_CLEAR(kstr);
2207         if (encoded == NULL)
2208             goto bail;
2209         if (PyList_Append(rval, encoded)) {
2210             Py_DECREF(encoded);
2211             goto bail;
2212         }
2213         Py_DECREF(encoded);
2214         if (PyList_Append(rval, s->key_separator))
2215             goto bail;
2216         if (encoder_listencode_obj(s, rval, value, indent_level))
2217             goto bail;
2218         idx += 1;
2219         Py_CLEAR(value);
2220         Py_DECREF(key);
2221     }
2222     if (PyErr_Occurred())
2223         goto bail;
2224     Py_CLEAR(it);
2225 
2226     if (ident != NULL) {
2227         if (PyDict_DelItem(s->markers, ident))
2228             goto bail;
2229         Py_CLEAR(ident);
2230     }
2231     if (s->indent != Py_None) {
2232         /* TODO: DOES NOT RUN */
2233         /*
2234             indent_level -= 1;
2235 
2236             yield '\n' + (' ' * (_indent * _current_indent_level))
2237         */
2238     }
2239     if (PyList_Append(rval, close_dict))
2240         goto bail;
2241     return 0;
2242 
2243 bail:
2244     Py_XDECREF(it);
2245     Py_XDECREF(key);
2246     Py_XDECREF(value);
2247     Py_XDECREF(kstr);
2248     Py_XDECREF(ident);
2249     return -1;
2250 }
2251 
2252 
2253 static int
encoder_listencode_list(PyEncoderObject * s,PyObject * rval,PyObject * seq,Py_ssize_t indent_level)2254 encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level)
2255 {
2256     /* Encode Python list seq to a JSON term, rval is a PyList */
2257     static PyObject *open_array = NULL;
2258     static PyObject *close_array = NULL;
2259     static PyObject *empty_array = NULL;
2260     PyObject *ident = NULL;
2261     PyObject *s_fast = NULL;
2262     Py_ssize_t i;
2263 
2264     if (open_array == NULL || close_array == NULL || empty_array == NULL) {
2265         open_array = PyString_InternFromString("[");
2266         close_array = PyString_InternFromString("]");
2267         empty_array = PyString_InternFromString("[]");
2268         if (open_array == NULL || close_array == NULL || empty_array == NULL)
2269             return -1;
2270     }
2271     ident = NULL;
2272     s_fast = PySequence_Fast(seq, "_iterencode_list needs a sequence");
2273     if (s_fast == NULL)
2274         return -1;
2275     if (PySequence_Fast_GET_SIZE(s_fast) == 0) {
2276         Py_DECREF(s_fast);
2277         return PyList_Append(rval, empty_array);
2278     }
2279 
2280     if (s->markers != Py_None) {
2281         int has_key;
2282         ident = PyLong_FromVoidPtr(seq);
2283         if (ident == NULL)
2284             goto bail;
2285         has_key = PyDict_Contains(s->markers, ident);
2286         if (has_key) {
2287             if (has_key != -1)
2288                 PyErr_SetString(PyExc_ValueError, "Circular reference detected");
2289             goto bail;
2290         }
2291         if (PyDict_SetItem(s->markers, ident, seq)) {
2292             goto bail;
2293         }
2294     }
2295 
2296     if (PyList_Append(rval, open_array))
2297         goto bail;
2298     if (s->indent != Py_None) {
2299         /* TODO: DOES NOT RUN */
2300         indent_level += 1;
2301         /*
2302             newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
2303             separator = _item_separator + newline_indent
2304             buf += newline_indent
2305         */
2306     }
2307     for (i = 0; i < PySequence_Fast_GET_SIZE(s_fast); i++) {
2308         PyObject *obj = PySequence_Fast_GET_ITEM(s_fast, i);
2309         if (i) {
2310             if (PyList_Append(rval, s->item_separator))
2311                 goto bail;
2312         }
2313         if (encoder_listencode_obj(s, rval, obj, indent_level))
2314             goto bail;
2315     }
2316     if (ident != NULL) {
2317         if (PyDict_DelItem(s->markers, ident))
2318             goto bail;
2319         Py_CLEAR(ident);
2320     }
2321     if (s->indent != Py_None) {
2322         /* TODO: DOES NOT RUN */
2323         /*
2324             indent_level -= 1;
2325 
2326             yield '\n' + (' ' * (_indent * _current_indent_level))
2327         */
2328     }
2329     if (PyList_Append(rval, close_array))
2330         goto bail;
2331     Py_DECREF(s_fast);
2332     return 0;
2333 
2334 bail:
2335     Py_XDECREF(ident);
2336     Py_DECREF(s_fast);
2337     return -1;
2338 }
2339 
2340 static void
encoder_dealloc(PyObject * self)2341 encoder_dealloc(PyObject *self)
2342 {
2343     /* Deallocate Encoder */
2344     encoder_clear(self);
2345     Py_TYPE(self)->tp_free(self);
2346 }
2347 
2348 static int
encoder_traverse(PyObject * self,visitproc visit,void * arg)2349 encoder_traverse(PyObject *self, visitproc visit, void *arg)
2350 {
2351     PyEncoderObject *s;
2352     assert(PyEncoder_Check(self));
2353     s = (PyEncoderObject *)self;
2354     Py_VISIT(s->markers);
2355     Py_VISIT(s->defaultfn);
2356     Py_VISIT(s->encoder);
2357     Py_VISIT(s->indent);
2358     Py_VISIT(s->key_separator);
2359     Py_VISIT(s->item_separator);
2360     Py_VISIT(s->sort_keys);
2361     Py_VISIT(s->skipkeys);
2362     return 0;
2363 }
2364 
2365 static int
encoder_clear(PyObject * self)2366 encoder_clear(PyObject *self)
2367 {
2368     /* Deallocate Encoder */
2369     PyEncoderObject *s;
2370     assert(PyEncoder_Check(self));
2371     s = (PyEncoderObject *)self;
2372     Py_CLEAR(s->markers);
2373     Py_CLEAR(s->defaultfn);
2374     Py_CLEAR(s->encoder);
2375     Py_CLEAR(s->indent);
2376     Py_CLEAR(s->key_separator);
2377     Py_CLEAR(s->item_separator);
2378     Py_CLEAR(s->sort_keys);
2379     Py_CLEAR(s->skipkeys);
2380     return 0;
2381 }
2382 
2383 PyDoc_STRVAR(encoder_doc, "_iterencode(obj, _current_indent_level) -> iterable");
2384 
2385 static
2386 PyTypeObject PyEncoderType = {
2387     PyObject_HEAD_INIT(NULL)
2388     0,                    /* tp_internal */
2389     "_json.Encoder",       /* tp_name */
2390     sizeof(PyEncoderObject), /* tp_basicsize */
2391     0,                    /* tp_itemsize */
2392     encoder_dealloc, /* tp_dealloc */
2393     0,                    /* tp_print */
2394     0,                    /* tp_getattr */
2395     0,                    /* tp_setattr */
2396     0,                    /* tp_compare */
2397     0,                    /* tp_repr */
2398     0,                    /* tp_as_number */
2399     0,                    /* tp_as_sequence */
2400     0,                    /* tp_as_mapping */
2401     0,                    /* tp_hash */
2402     encoder_call,         /* tp_call */
2403     0,                    /* tp_str */
2404     0,                    /* tp_getattro */
2405     0,                    /* tp_setattro */
2406     0,                    /* tp_as_buffer */
2407     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,   /* tp_flags */
2408     encoder_doc,          /* tp_doc */
2409     encoder_traverse,     /* tp_traverse */
2410     encoder_clear,        /* tp_clear */
2411     0,                    /* tp_richcompare */
2412     0,                    /* tp_weaklistoffset */
2413     0,                    /* tp_iter */
2414     0,                    /* tp_iternext */
2415     0,                    /* tp_methods */
2416     encoder_members,      /* tp_members */
2417     0,                    /* tp_getset */
2418     0,                    /* tp_base */
2419     0,                    /* tp_dict */
2420     0,                    /* tp_descr_get */
2421     0,                    /* tp_descr_set */
2422     0,                    /* tp_dictoffset */
2423     encoder_init,         /* tp_init */
2424     0,                    /* tp_alloc */
2425     encoder_new,          /* tp_new */
2426     0,                    /* tp_free */
2427 };
2428 
2429 static PyMethodDef speedups_methods[] = {
2430     {"encode_basestring_ascii",
2431         (PyCFunction)py_encode_basestring_ascii,
2432         METH_O,
2433         pydoc_encode_basestring_ascii},
2434     {"scanstring",
2435         (PyCFunction)py_scanstring,
2436         METH_VARARGS,
2437         pydoc_scanstring},
2438     {NULL, NULL, 0, NULL}
2439 };
2440 
2441 PyDoc_STRVAR(module_doc,
2442 "json speedups\n");
2443 
2444 void
init_json(void)2445 init_json(void)
2446 {
2447     PyObject *m;
2448     PyScannerType.tp_new = PyType_GenericNew;
2449     if (PyType_Ready(&PyScannerType) < 0)
2450         return;
2451     PyEncoderType.tp_new = PyType_GenericNew;
2452     if (PyType_Ready(&PyEncoderType) < 0)
2453         return;
2454     m = Py_InitModule3("_json", speedups_methods, module_doc);
2455     if (m == NULL)
2456         return;
2457     Py_INCREF((PyObject*)&PyScannerType);
2458     PyModule_AddObject(m, "make_scanner", (PyObject*)&PyScannerType);
2459     Py_INCREF((PyObject*)&PyEncoderType);
2460     PyModule_AddObject(m, "make_encoder", (PyObject*)&PyEncoderType);
2461 }
2462