"""Implementation of JSONDecoder
"""
import re
import sys
import struct

from json import scanner
try:
    from _json import scanstring as c_scanstring
except ImportError:
    c_scanstring = None

__all__ = ['JSONDecoder']

FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL

_HEX_DIGITS = '0123456789abcdefABCDEF'


def _floatconstants():
    """Return the tuple ``(nan, inf, -inf)`` built from IEEE 754 bit patterns.

    The two doubles are unpacked with an explicit big-endian format, so no
    host byte-order handling is needed.
    """
    nan, inf = struct.unpack(
        '>dd',
        b'\x7f\xf8\x00\x00\x00\x00\x00\x00'
        b'\x7f\xf0\x00\x00\x00\x00\x00\x00')
    return nan, inf, -inf

NaN, PosInf, NegInf = _floatconstants()


def linecol(doc, pos):
    """Return the 1-based ``(lineno, colno)`` of character offset ``pos``
    in ``doc``."""
    lineno = doc.count('\n', 0, pos) + 1
    if lineno == 1:
        # No preceding newline: the column is the offset itself, made
        # 1-based so it agrees with the columns reported on later lines.
        colno = pos + 1
    else:
        colno = pos - doc.rindex('\n', 0, pos)
    return lineno, colno


def errmsg(msg, doc, pos, end=None):
    """Format ``msg`` with the line/column location of ``pos`` in ``doc``.

    When ``end`` is given, the message describes the range ``pos`` - ``end``
    instead of a single position.
    """
    # Note that this function is called from _json
    lineno, colno = linecol(doc, pos)
    if end is None:
        fmt = '{0}: line {1} column {2} (char {3})'
        return fmt.format(msg, lineno, colno, pos)
    endlineno, endcolno = linecol(doc, end)
    fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
    return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)


_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}

STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
BACKSLASH = {
    '"': u'"', '\\': u'\\', '/': u'/',
    'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
}

DEFAULT_ENCODING = "utf-8"


def _decode_hex4(esc):
    """Return the integer value of the 4-hex-digit string ``esc``, or None
    if ``esc`` is not exactly four hexadecimal digits."""
    if len(esc) != 4:
        return None
    # int(x, 16) alone would also accept forms such as '0x1f', '+1ff' or
    # ' 1f ', none of which are valid inside a \uXXXX escape.
    for c in esc:
        if c not in _HEX_DIGITS:
            return None
    return int(esc, 16)


def py_scanstring(s, end, encoding=None, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    begin = end - 1
    while 1:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(terminator)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            # Unicode escape sequence
            uni = _decode_hex4(s[end + 1:end + 5])
            next_end = end + 5
            if uni is None:
                msg = "Invalid \\uXXXX escape"
                raise ValueError(errmsg(msg, s, end))
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if not s[end + 5:end + 7] == '\\u':
                    raise ValueError(errmsg(msg, s, end))
                uni2 = _decode_hex4(s[end + 7:end + 11])
                # The second escape must be a low surrogate; anything else
                # would combine into a meaningless code point.
                if uni2 is None or not 0xdc00 <= uni2 <= 0xdfff:
                    raise ValueError(errmsg(msg, s, end))
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = unichr(uni)
            end = next_end
        # Append the unescaped character
        _append(char)
    return u''.join(chunks), end


# Use speedup if available
scanstring = c_scanstring or py_scanstring

WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
WHITESPACE_STR = ' \t\n\r'


def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
        object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON object body and return ``(result, end)``.

    ``s_and_end`` is a ``(s, end)`` pair; the code expects ``s[end]`` to be
    the first character after the opening brace.  Keys are read with
    ``scanstring`` and values with ``scan_once``.  The result is
    ``object_pairs_hook(pairs)`` when that hook is given, otherwise a dict
    (passed through ``object_hook`` when given).  The returned index is the
    position just after the closing ``}``.

    Raises ValueError on malformed input.
    """
    s, end = s_and_end
    pairs = []
    pairs_append = pairs.append
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            if object_pairs_hook is not None:
                result = object_pairs_hook(pairs)
                # Step past the closing brace, exactly as the dict path
                # below does; otherwise callers see trailing "extra data".
                return result, end + 1
            pairs = {}
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)

        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting : delimiter", s, end))

        end += 1

        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            # Ran off the end of the input; scan_once reports the error.
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        pairs_append((key, value))

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end += 1
                nextchar = s[end]
                if nextchar in _ws:
                    end = _w(s, end + 1).end()
                    nextchar = s[end]
        except IndexError:
            nextchar = ''

        end += 1
        if nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end - 1))

    if object_pairs_hook is not None:
        result = object_pairs_hook(pairs)
        return result, end
    pairs = dict(pairs)
    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end


def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON array body and return ``(values, end)``.

    ``s_and_end`` is a ``(s, end)`` pair; the code expects ``s[end]`` to be
    the first character after the opening bracket.  Elements are read with
    ``scan_once``.  The returned index is the position just after the
    closing ``]``.

    Raises ValueError on malformed input.
    """
    s, end = s_and_end
    values = []
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # end was already advanced past nextchar; report the position
            # of the offending character itself, as JSONObject does.
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            # Ran off the end of the input; scan_once reports the error.
            pass

    return values, end


class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    Performs the following translations in decoding by default:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
    their corresponding ``float`` values, which is outside the JSON spec.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True,
            object_pairs_hook=None):
        """``encoding`` determines the encoding used to interpret any ``str``
        objects decoded by this instance (utf-8 by default).  It has no
        effect when decoding ``unicode`` objects.

        Note that currently only encodings that are a superset of ASCII work,
        strings of other encodings should be passed in as ``unicode``.

        ``object_hook``, if specified, will be called with the result
        of every JSON object decoded and its return value will be used in
        place of the given ``dict``.  This can be used to provide custom
        deserializations (e.g. to support JSON-RPC class hinting).

        ``object_pairs_hook``, if specified will be called with the result of
        every JSON object decoded with an ordered list of pairs.  The return
        value of ``object_pairs_hook`` will be used instead of the ``dict``.
        This feature can be used to implement custom decoders that rely on the
        order that the key and value pairs are decoded (for example,
        collections.OrderedDict will remember the order of insertion). If
        ``object_hook`` is also defined, the ``object_pairs_hook`` takes
        priority.

        ``parse_float``, if specified, will be called with the string
        of every JSON float to be decoded. By default this is equivalent to
        float(num_str). This can be used to use another datatype or parser
        for JSON floats (e.g. decimal.Decimal).

        ``parse_int``, if specified, will be called with the string
        of every JSON int to be decoded. By default this is equivalent to
        int(num_str). This can be used to use another datatype or parser
        for JSON integers (e.g. float).

        ``parse_constant``, if specified, will be called with one of the
        following strings: -Infinity, Infinity, NaN.
        This can be used to raise an exception if invalid JSON numbers
        are encountered.

        If ``strict`` is false (true is the default), then control
        characters will be allowed inside strings.  Control characters in
        this context are those with character codes in the 0-31 range,
        including ``'\\t'`` (tab), ``'\\n'``, ``'\\r'`` and ``'\\0'``.

        """
        self.encoding = encoding
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        self.parse_float = parse_float or float
        self.parse_int = parse_int or int
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        self.strict = strict
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        self.scan_once = scanner.make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` or ``unicode``
        instance containing a JSON document)

        Raises ValueError if anything other than whitespace follows the
        document.

        """
        obj, end = self.raw_decode(s, idx=_w(s, 0).end())
        end = _w(s, end).end()
        if end != len(s):
            raise ValueError(errmsg("Extra data", s, end, len(s)))
        return obj

    def raw_decode(self, s, idx=0):
        """Decode a JSON document from ``s`` (a ``str`` or ``unicode``
        beginning with a JSON document) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.

        This can be used to decode a JSON document from a string that may
        have extraneous data at the end.

        """
        try:
            obj, end = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        return obj, end