1'''
2Pure-python parsing backend.
3'''
4from json.decoder import scanstring
5import re
6
7from ijson import common, utils
8import codecs
9
10
# A lexeme is either a run of characters that may form a literal or a number
# (lowercase letters, digits, exponent markers, '.', '+' and '-'), or any
# single non-whitespace character (punctuation or the opening quote of a
# string).
LEXEME_RE = re.compile(r'[a-z0-9eE\.\+-]+|\S')

# Single-character lexemes that are complete as soon as they are matched
UNARY_LEXEMES = {'[', ']', '{', '}', ','}

# Marker sent by the Lexer to its target once the input has been exhausted
EOF = (-1, None)
14
15
class UnexpectedSymbol(common.JSONError):
    '''
    Error raised when a symbol is found where it is not accepted.

    The error message includes the repr of the offending symbol and the
    position given by the caller.
    '''

    def __init__(self, symbol, pos):
        message = 'Unexpected symbol %r at %d' % (symbol, pos)
        super(UnexpectedSymbol, self).__init__(message)
21
22
@utils.coroutine
def utf8_encoder(target):
    '''
    Coroutine that incrementally decodes the UTF-8 bytes it receives and sends
    the resulting text to `target`. (Despite its name, it decodes bytes into
    text rather than encoding.)

    The input ends either when an empty chunk is received or when this
    coroutine is closed. In both cases the decoder is flushed, `target` is
    closed and the coroutine finishes.

    Raises common.IncompleteJSONError if the input is not valid UTF-8,
    including when the input ends in the middle of a multi-byte sequence.
    '''
    decoder = codecs.getincrementaldecoder('utf-8')()
    decode = decoder.decode
    send = target.send
    while True:
        try:
            bdata = (yield)
        except GeneratorExit:
            bdata = b''
        # An empty chunk ends the stream just like closing the coroutine does,
        # so the decoder must be flushed in both cases; otherwise a truncated
        # trailing multi-byte sequence would be silently dropped.
        final = not bdata
        try:
            sdata = decode(bdata, final)
        except UnicodeDecodeError as e:
            # Best-effort shutdown of the rest of the pipeline: the decoding
            # problem is the error that gets reported, whatever the target
            # raises while closing.
            try:
                target.close()
            except Exception:
                pass
            raise common.IncompleteJSONError(e)
        if sdata:
            send(sdata)
        elif final:
            target.close()
            break
48
@utils.coroutine
def Lexer(target):
    """
    Parses lexemes out of the incoming text, and sends them to `target` as
    (position, lexeme) tuples, where position is the offset of the lexeme
    counted from the beginning of all the text received so far.

    String lexemes are sent whole, including their surrounding quotes; more
    input is requested until the closing quote is found. Other lexemes that
    reach the end of the buffer are extended with further input, since they
    might continue in the next chunk.

    A special EOF result is sent when the data source has been exhausted to
    give the target the possibility of raising custom exceptions due to
    missing content.

    Raises common.IncompleteJSONError if the input ends within a string.
    """
    try:
        data = (yield)
    except GeneratorExit:
        data = ''
    buf = data
    pos = 0
    # Number of characters in the buffers already dropped, needed to report
    # absolute positions
    discarded = 0
    send = target.send
    while True:
        match = LEXEME_RE.search(buf, pos)
        if match:
            lexeme = match.group()
            if lexeme == '"':
                pos = match.start()
                start = pos + 1
                while True:
                    end = buf.find('"', start)
                    if end < 0:
                        try:
                            data = (yield)
                        except GeneratorExit:
                            data = ''
                        if not data:
                            raise common.IncompleteJSONError('Incomplete string lexeme')
                        # Nothing before this point contains a quote, so the
                        # next search only needs to look at the new data
                        start = len(buf)
                        buf += data
                        continue
                    # Count the backslashes preceding the quote: an odd
                    # number means the quote is escaped and the string
                    # continues. The scan stops at the opening quote at most.
                    escpos = end - 1
                    while buf[escpos] == '\\':
                        escpos -= 1
                    if (end - escpos) % 2 == 0:
                        start = end + 1
                    else:
                        break
                send((discarded + pos, buf[pos:end + 1]))
                pos = end + 1
            else:
                # A lexeme touching the end of the buffer might continue in
                # the next chunk, unless it is complete by definition
                while lexeme not in UNARY_LEXEMES and match.end() == len(buf):
                    try:
                        data = (yield)
                    except GeneratorExit:
                        data = ''
                    if not data:
                        break
                    buf += data
                    match = LEXEME_RE.search(buf, pos)
                    lexeme = match.group()
                send((discarded + match.start(), lexeme))
                pos = match.end()
        else:
            # Don't ask data from an already exhausted source
            if data:
                try:
                    data = (yield)
                except GeneratorExit:
                    data = ''
            if not data:
                # Normally should raise StopIteration, but can raise
                # IncompleteJSONError too, which is the point of sending EOF
                try:
                    send(EOF)
                except StopIteration:
                    pass
                break
            discarded += len(buf)
            buf = data
            pos = 0
123
124
# States kept in the stack used by parse_value
(
    _PARSE_VALUE,
    _PARSE_ARRAY_ELEMENT_END,
    _PARSE_OBJECT_KEY,
    _PARSE_OBJECT_END,
) = range(4)

# Infinity singleton, compared against parsed numbers to detect overflows
inf = float("inf")
133
@utils.coroutine
def parse_value(target, multivalue, use_float):
    """
    Parses results coming out of the Lexer into ijson events, which are sent to
    `target`. A stack keeps track of the type of object being parsed at the time
    (a value, an object or array -- the last two being values themselves).

    A special EOF result coming from the Lexer indicates that no more content is
    expected. This is used to check for incomplete content and raise the
    appropriate exception, which wouldn't be possible if the Lexer simply closed
    this co-routine (either explicitly via .close(), or implicitly by itself
    finishing and decreasing the only reference to the co-routine) since that
    causes a GeneratorExit exception that cannot be replaced with a custom one.

    Parameters:

    - target: coroutine receiving the (event, value) tuples
    - multivalue: whether more than one top-level value is accepted
    - use_float: convert numbers using common.integer_or_float instead of
      common.integer_or_decimal

    Raises common.IncompleteJSONError when the content ends prematurely,
    UnexpectedSymbol when a symbol is found where it is not accepted, and
    common.JSONError for invalid numbers, for numbers that overflow to
    positive or negative infinity, and for additional content found after the
    top-level value when `multivalue` is false.
    """

    state_stack = [_PARSE_VALUE]
    pop = state_stack.pop
    push = state_stack.append
    send = target.send
    # A symbol read ahead while parsing the start of an array or object, to be
    # processed in the next iteration instead of asking for a new one
    prev_pos, prev_symbol = None, None
    to_number = common.integer_or_float if use_float else common.integer_or_decimal
    while True:

        if prev_pos is None:
            pos, symbol = (yield)
            if (pos, symbol) == EOF:
                if state_stack:
                    raise common.IncompleteJSONError('Incomplete JSON content')
                break
        else:
            pos, symbol = prev_pos, prev_symbol
            prev_pos, prev_symbol = None, None
        try:
            state = state_stack[-1]
        except IndexError:
            # The top-level value was fully parsed already
            if multivalue:
                state = _PARSE_VALUE
                push(state)
            else:
                raise common.JSONError('Additional data found')
        assert state_stack

        if state == _PARSE_VALUE:
            # Simple, common cases
            if symbol == 'null':
                send(('null', None))
                pop()
            elif symbol == 'true':
                send(('boolean', True))
                pop()
            elif symbol == 'false':
                send(('boolean', False))
                pop()
            elif symbol[0] == '"':
                send(('string', parse_string(symbol)))
                pop()
            # Array start
            elif symbol == '[':
                send(('start_array', None))
                pos, symbol = (yield)
                if (pos, symbol) == EOF:
                    raise common.IncompleteJSONError('Incomplete JSON content')
                if symbol == ']':
                    send(('end_array', None))
                    pop()
                else:
                    prev_pos, prev_symbol = pos, symbol
                    push(_PARSE_ARRAY_ELEMENT_END)
                    push(_PARSE_VALUE)
            # Object start
            elif symbol == '{':
                send(('start_map', None))
                pos, symbol = (yield)
                if (pos, symbol) == EOF:
                    raise common.IncompleteJSONError('Incomplete JSON content')
                if symbol == '}':
                    send(('end_map', None))
                    pop()
                else:
                    prev_pos, prev_symbol = pos, symbol
                    push(_PARSE_OBJECT_KEY)
            # A number
            else:
                # JSON numbers can't contain leading zeros
                if ((len(symbol) > 1 and symbol[0] == '0' and symbol[1] not in ('e', 'E', '.')) or
                    (len(symbol) > 2 and symbol[0:2] == '-0' and symbol[2] not in ('e', 'E', '.'))):
                    raise common.JSONError('Invalid JSON number: %s' % (symbol,))
                # Fractions need a leading digit and must be followed by a digit
                if symbol[0] == '.' or symbol[-1] == '.':
                    raise common.JSONError('Invalid JSON number: %s' % (symbol,))
                try:
                    number = to_number(symbol)
                    # Overflows can happen in both directions
                    overflow = number in (inf, -inf)
                except Exception:
                    # A truncated literal means the content ended prematurely
                    if 'true'.startswith(symbol) or 'false'.startswith(symbol) or 'null'.startswith(symbol):
                        raise common.IncompleteJSONError('Incomplete JSON content')
                    raise UnexpectedSymbol(symbol, pos)
                # Raised outside the try block above so the overflow error
                # isn't replaced by an UnexpectedSymbol one
                if overflow:
                    raise common.JSONError("float overflow: %s" % (symbol,))
                send(('number', number))
                pop()

        elif state == _PARSE_OBJECT_KEY:
            if symbol[0] != '"':
                raise UnexpectedSymbol(symbol, pos)
            send(('map_key', parse_string(symbol)))
            pos, symbol = (yield)
            if (pos, symbol) == EOF:
                raise common.IncompleteJSONError('Incomplete JSON content')
            if symbol != ':':
                raise UnexpectedSymbol(symbol, pos)
            state_stack[-1] = _PARSE_OBJECT_END
            push(_PARSE_VALUE)

        elif state == _PARSE_OBJECT_END:
            if symbol == ',':
                state_stack[-1] = _PARSE_OBJECT_KEY
            elif symbol != '}':
                raise UnexpectedSymbol(symbol, pos)
            else:
                send(('end_map', None))
                # Remove both this state and the _PARSE_VALUE state that was
                # active when the object started
                pop()
                pop()

        elif state == _PARSE_ARRAY_ELEMENT_END:
            if symbol == ',':
                state_stack[-1] = _PARSE_ARRAY_ELEMENT_END
                push(_PARSE_VALUE)
            elif symbol != ']':
                raise UnexpectedSymbol(symbol, pos)
            else:
                send(('end_array', None))
                # Remove both this state and the _PARSE_VALUE state that was
                # active when the array started
                pop()
                pop()
268
269
def parse_string(symbol):
    '''
    Decodes a JSON string lexeme, given with its surrounding quotes, into the
    text it represents.
    '''
    # Scanning starts right after the opening quote; scanstring also returns
    # the index where it stopped, which is not needed here
    value, _ = scanstring(symbol, 1)
    return value
272
273
def basic_parse_basecoro(target, multiple_values=False, allow_comments=False,
                         use_float=False):
    '''
    Builds the chain of coroutines producing unprefixed events.

    The returned coroutine is a utf8_encoder feeding a Lexer, which in turn
    feeds a parse_value coroutine that sends its events to `target`.

    Parameters:

    - target: coroutine receiving the events sent by parse_value
    - multiple_values: passed to parse_value as its `multivalue` argument
    - allow_comments: not supported by this backend, a ValueError is raised
      if true
    - use_float: passed to parse_value as its `use_float` argument
    '''
    if allow_comments:
        raise ValueError("Comments are not supported by the python backend")
    parser = parse_value(target, multiple_values, use_float)
    lexer = Lexer(parser)
    return utf8_encoder(lexer)
286
287
# Hand this module's global namespace over to common.enrich_backend.
# NOTE(review): presumably this adds the rest of the backend's public API on
# top of basic_parse_basecoro -- confirm against ijson.common
common.enrich_backend(globals())