1"""Implementation of JSONDecoder
2"""
3from __future__ import absolute_import
4import re
5import sys
6from .compat import PY3, unichr
7from .scanner import make_scanner, JSONDecodeError
8
9
10def _import_c_scanstring():
11 try:
12 from ._speedups import scanstring
13 return scanstring
14 except ImportError:
15 return None
16c_scanstring = _import_c_scanstring()
17
18# NOTE (3.1.0): JSONDecodeError may still be imported from this module for
19# compatibility, but it was never in the __all__
# Only the decoder class is part of the declared public API of this module.
__all__ = ['JSONDecoder']

# Regex flags passed to the STRINGCHUNK and WHITESPACE patterns compiled
# further down in this module.
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
23
24def _floatconstants():
25 return float('nan'), float('inf'), float('-inf')
26
27NaN, PosInf, NegInf = _floatconstants()
28
# Non-standard JSON float literals mapped to the Python float they decode to.
# JSONDecoder falls back to this mapping's lookup as parse_constant when
# allow_nan is true and no parse_constant was supplied.
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}

# Matches one chunk of a JSON string body: group 1 is a (possibly empty) run
# of ordinary characters, group 2 is the single character that ends the run,
# i.e. a double quote, a backslash, or a control character in \x00-\x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
# Single-character escapes: the character following a backslash mapped to the
# text it decodes to. \uXXXX escapes are not in this table; py_scanstring
# handles them separately.
BACKSLASH = {
    '"': u'"', '\\': u'\\', '/': u'/',
    'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
}

# Encoding used when the caller passes encoding=None.
DEFAULT_ENCODING = "utf-8"
42
if hasattr(sys, 'get_int_max_str_digits'):
    # This interpreter enforces the integer string conversion limit itself,
    # so the builtin can be used as-is.
    bounded_int = int
else:
    def bounded_int(s, INT_MAX_STR_DIGITS=4300):
        """Backport of the integer string length conversion limitation

        https://docs.python.org/3/library/stdtypes.html#int-max-str-digits

        Only digit characters count towards the limit; a leading sign is
        excluded, as in the interpreter's own check, so a negative number
        with exactly INT_MAX_STR_DIGITS digits is still accepted.

        Returns ``int(s)``. Raises ValueError when *s* has more than
        INT_MAX_STR_DIGITS digits.
        """
        digits = len(s)
        # The tuple membership test is False for an empty string, so an
        # empty input is simply passed on to int() to report.
        if s[:1] in ('-', '+'):
            digits -= 1
        if digits > INT_MAX_STR_DIGITS:
            raise ValueError("Exceeds the limit (%s) for integer string conversion: value has %s digits" % (INT_MAX_STR_DIGITS, digits))
        return int(s)
54
55
def scan_four_digit_hex(s, end, _m=re.compile(r'^[0-9a-fA-F]{4}$').match):
    """Scan a four digit hex number from s[end:end + 4]

    Returns a ``(value, new_end)`` tuple, where *value* is the parsed
    integer and *new_end* is ``end + 4``. Raises JSONDecodeError positioned
    at ``end - 2`` when the slice is not exactly four hexadecimal digits.
    """
    bad_escape = "Invalid \\uXXXX escape sequence"
    error_pos = end - 2
    hex_digits = s[end:end + 4]
    if _m(hex_digits):
        try:
            code_unit = int(hex_digits, 16)
        except ValueError:
            raise JSONDecodeError(bad_escape, s, error_pos)
        return code_unit, end + 4
    raise JSONDecodeError(bad_escape, s, error_pos)
67
def py_scanstring(s, end, encoding=None, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match, _join=u''.join,
        _PY3=PY3, _maxunicode=sys.maxunicode,
        _scan_four_digit_hex=scan_four_digit_hex):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises
    JSONDecodeError on attempt to decode an invalid string. If strict is
    False then literal control characters are allowed in the string.

    *encoding* (DEFAULT_ENCODING when None) is only consulted when _PY3 is
    false, to decode non-unicode content to unicode.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    # Index of the opening quote, reported for unterminated strings
    begin = end - 1
    while 1:
        chunk = _m(s, end)
        if chunk is None:
            raise JSONDecodeError(
                "Unterminated string starting at", s, begin)
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not _PY3 and not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character %r at"
                # chunk.end() is one past the terminator, so the offending
                # control character is at end - 1. The start of the chunk
                # would instead point at any content preceding it.
                # NOTE(review): the %r placeholder is not formatted here;
                # presumably JSONDecodeError fills it in from the reported
                # position -- confirm against the scanner module.
                raise JSONDecodeError(msg, s, end - 1)
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            # The input ended right after a backslash
            raise JSONDecodeError(
                "Unterminated string starting at", s, begin)
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\X escape sequence %r"
                raise JSONDecodeError(msg, s, end)
            end += 1
        else:
            # Unicode escape sequence
            uni, end = _scan_four_digit_hex(s, end + 1)
            # Check for surrogate pair on UCS-4 systems
            # Note that this will join high/low surrogate pairs
            # but will also pass unpaired surrogates through
            if (_maxunicode > 65535 and
                uni & 0xfc00 == 0xd800 and
                s[end:end + 2] == '\\u'):
                uni2, end2 = _scan_four_digit_hex(s, end + 2)
                # Only consume the second escape when it really is a low
                # surrogate; otherwise it is scanned again on the next pass
                if uni2 & 0xfc00 == 0xdc00:
                    uni = 0x10000 + (((uni - 0xd800) << 10) |
                                     (uni2 - 0xdc00))
                    end = end2
            char = unichr(uni)
        # Append the unescaped character
        _append(char)
    return _join(chunks), end
140
141
142# Use speedup if available
# c_scanstring is None when the C extension is unavailable, in which case
# the pure-Python implementation is used.
scanstring = c_scanstring or py_scanstring

# Matches a (possibly empty) run of the four JSON whitespace characters.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
# The same four characters as a plain string, for cheap ``in`` membership
# tests that avoid a regex call.
WHITESPACE_STR = ' \t\n\r'
147
def JSONObject(state, encoding, strict, scan_once, object_hook,
        object_pairs_hook, memo=None,
        _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON object whose opening brace has already been consumed.

    *state* is an ``(s, end)`` pair: the document and the index just after
    the opening brace. Keys are scanned with ``scanstring`` (using
    *encoding* and *strict*) and values with *scan_once*. Equal keys are
    shared through *memo* (a dict; a fresh one is created when None).

    Returns ``(result, end)`` where *end* is the index after the closing
    brace. *result* is ``object_pairs_hook(pairs)`` when that hook is given
    (it takes priority), otherwise a dict that is passed through
    *object_hook* when that hook is given.

    Raises JSONDecodeError on a missing property name, ':' or ','
    delimiter, or on a trailing comma before the closing brace.
    """
    (s, end) = state
    # Backwards compatibility
    if memo is None:
        memo = {}
    # setdefault(key, key) returns the first equal key stored in memo
    memo_get = memo.setdefault
    pairs = []
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        # An empty slice (end of input) also satisfies ``in _ws``; the
        # whitespace match is then empty and nextchar stays ''
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            if object_pairs_hook is not None:
                result = object_pairs_hook(pairs)
                return result, end + 1
            pairs = {}
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise JSONDecodeError(
                "Expecting property name enclosed in double quotes or '}'",
                s, end)
    # Step past the opening quote of the first key
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)
        key = memo_get(key, key)

        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise JSONDecodeError("Expecting ':' delimiter", s, end)

        # Step past the ':'
        end += 1

        # Skip a single whitespace character cheaply and only fall back to
        # the regex when a second one follows. IndexError means the input
        # ended; scan_once is then called at that position.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        value, end = scan_once(s, end)
        pairs.append((key, value))

        # Find the delimiter after the value; '' stands for end of input
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        # end now points just past the delimiter
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            raise JSONDecodeError("Expecting ',' delimiter or '}'", s, end - 1)

        # Skip whitespace after the comma, with the same one-character
        # fast path as above
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end += 1
                nextchar = s[end]
                if nextchar in _ws:
                    end = _w(s, end + 1).end()
                    nextchar = s[end]
        except IndexError:
            nextchar = ''

        # Step past the opening quote of the next key; the error positions
        # below use end - 1 to point back at the unexpected character
        end += 1
        if nextchar != '"':
            if nextchar == '}':
                raise JSONDecodeError(
                    "Illegal trailing comma before end of object",
                    s, end - 1)
            raise JSONDecodeError(
                "Expecting property name enclosed in double quotes",
                s, end - 1)

    # object_pairs_hook receives the ordered list of (key, value) pairs and
    # takes priority over object_hook
    if object_pairs_hook is not None:
        result = object_pairs_hook(pairs)
        return result, end
    pairs = dict(pairs)
    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end
245
def JSONArray(state, scan_once, array_hook=None,
              _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON array whose opening bracket has already been consumed.

    *state* is an ``(s, end)`` pair: the document and the index just after
    the opening bracket. Each element is parsed with *scan_once*.

    Returns ``(values, end)`` where *end* is the index after the closing
    bracket and *values* is the list of elements, replaced by
    ``array_hook(values)`` when *array_hook* is given.

    Raises JSONDecodeError when the input ends before the first element,
    when a ',' or ']' is missing after an element, or on a trailing comma
    before the closing bracket.
    """
    (s, end) = state
    values = []
    # A slice never raises IndexError; at end of input it is '' (which also
    # satisfies ``in _ws``)
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        if array_hook is not None:
            values = array_hook(values)
        return values, end + 1
    elif nextchar == '':
        raise JSONDecodeError("Expecting value or ']'", s, end)
    _append = values.append
    while True:
        value, end = scan_once(s, end)
        _append(value)
        # Find the delimiter after the element, skipping whitespace
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        # end now points just past the delimiter
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            raise JSONDecodeError("Expecting ',' delimiter or ']'", s, end - 1)

        # Skip a single whitespace character cheaply and only fall back to
        # the regex when a second one follows. IndexError means the input
        # ended; scan_once is then called at that position.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        # A closing bracket directly after the comma (and any whitespace)
        # is a trailing comma
        if s[end:end + 1] == ']':
            raise JSONDecodeError(
                "Illegal trailing comma before end of array",
                s, end - 1)

    if array_hook is not None:
        values = array_hook(values)
    return values, end
291
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    Performs the following translations in decoding by default:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | str, unicode      |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    When allow_nan=True, it also understands
    ``NaN``, ``Infinity``, and ``-Infinity`` as
    their corresponding ``float`` values, which is outside the JSON spec.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
                 parse_int=None, parse_constant=None, strict=True,
                 object_pairs_hook=None, allow_nan=False,
                 array_hook=None):
        """
        *encoding* determines the encoding used to interpret any
        :class:`str` objects decoded by this instance (``'utf-8'`` by
        default).  It has no effect when decoding :class:`unicode` objects.

        Note that currently only encodings that are a superset of ASCII work,
        strings of other encodings should be passed in as :class:`unicode`.

        *object_hook*, if specified, will be called with the result of every
        JSON object decoded and its return value will be used in place of the
        given :class:`dict`.  This can be used to provide custom
        deserializations (e.g. to support JSON-RPC class hinting).

        *object_pairs_hook* is an optional function that will be called with
        the result of any object literal decode with an ordered list of pairs.
        The return value of *object_pairs_hook* will be used instead of the
        :class:`dict`.  This feature can be used to implement custom decoders
        that rely on the order that the key and value pairs are decoded (for
        example, :func:`collections.OrderedDict` will remember the order of
        insertion). If *object_hook* is also defined, the *object_pairs_hook*
        takes priority.

        *array_hook*, if specified, is stored on the decoder as
        ``self.array_hook``. :func:`JSONArray` calls its hook with each
        decoded :class:`list` and uses the return value in place of the
        list (the scanner presumably forwards this attribute to it).

        *parse_float*, if specified, will be called with the string of every
        JSON float to be decoded.  By default, this is equivalent to
        ``float(num_str)``. This can be used to use another datatype or parser
        for JSON floats (e.g. :class:`decimal.Decimal`).

        *parse_int*, if specified, will be called with the string of every
        JSON int to be decoded.  By default, this is equivalent to
        ``int(num_str)``.  This can be used to use another datatype or parser
        for JSON integers (e.g. :class:`float`).

        *allow_nan*, if True (default false), will allow the parser to
        accept the non-standard floats ``NaN``, ``Infinity``, and ``-Infinity``.

        *parse_constant*, if specified, will be
        called with one of the following strings: ``'-Infinity'``,
        ``'Infinity'``, ``'NaN'``. It is not recommended to use this feature,
        as it is rare to parse non-compliant JSON containing these values.

        *strict* controls the parser's behavior when it encounters an
        invalid control character in a string. The default setting of
        ``True`` means that unescaped control characters are parse errors, if
        ``False`` then control characters will be allowed in strings.

        """
        self.encoding = DEFAULT_ENCODING if encoding is None else encoding
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        self.parse_float = parse_float if parse_float else float
        self.parse_int = parse_int if parse_int else bounded_int
        # An explicit parse_constant wins; otherwise the non-standard
        # constants are only recognised when allow_nan is set.
        if parse_constant:
            self.parse_constant = parse_constant
        elif allow_nan:
            self.parse_constant = _CONSTANTS.__getitem__
        else:
            self.parse_constant = None
        self.strict = strict
        self.array_hook = array_hook
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        self.memo = {}
        # Built last: the scanner is constructed from this instance after
        # every attribute above has been set.
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match, _PY3=PY3):
        """Return the Python representation of ``s`` (a ``str`` or ``unicode``
        instance containing a JSON document)

        When ``_PY3`` is true, a ``bytes`` instance is also accepted and is
        first decoded using ``self.encoding``. Raises JSONDecodeError
        ("Extra data") when anything other than whitespace follows the
        document.

        """
        document = s
        if _PY3 and isinstance(document, bytes):
            document = str(document, self.encoding)
        result, stop = self.raw_decode(document)
        # Trailing whitespace after the document is allowed
        stop = _w(document, stop).end()
        if stop == len(document):
            return result
        raise JSONDecodeError("Extra data", document, stop, len(document))

    def raw_decode(self, s, idx=0, _w=WHITESPACE.match, _PY3=PY3):
        """Decode a JSON document from ``s`` (a ``str`` or ``unicode``
        beginning with a JSON document) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.
        Optionally, ``idx`` can be used to specify an offset in ``s`` where
        the JSON document begins.

        This can be used to decode a JSON document from a string that may
        have extraneous data at the end.

        Raises JSONDecodeError for a negative ``idx`` and, when ``_PY3`` is
        true, TypeError if ``s`` is not a ``str``.

        """
        if idx < 0:
            # Ensure that raw_decode bails on negative indexes, the regex
            # would otherwise mask this behavior. #98
            raise JSONDecodeError('Expecting value', s, idx)
        if _PY3 and not isinstance(s, str):
            raise TypeError("Input string must be text, not bytes")
        # strip UTF-8 bom
        if len(s) > idx:
            first = ord(s[idx])
            if first == 0xfeff:
                # BOM as a single decoded code point
                idx += 1
            elif first == 0xef and s[idx:idx + 3] == '\xef\xbb\xbf':
                # BOM as its three raw UTF-8 bytes
                idx += 3
        # Leading whitespace is skipped before handing over to the scanner
        start = _w(s, idx).end()
        return self.scan_once(s, idx=start)