Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/jmespath/parser.py: 97%
313 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:03 +0000
"""Top down operator precedence parser.

This is an implementation of Vaughan R. Pratt's
"Top Down Operator Precedence" parser.
(http://dl.acm.org/citation.cfm?doid=512927.512931).

These are some additional resources that help explain the
general idea behind a Pratt parser:

* http://effbot.org/zone/simple-top-down-parsing.htm
* http://javascript.crockford.com/tdop/tdop.html

A few notes on the implementation.

* All the nud/led tokens are on the Parser class itself, and are dispatched
  using getattr().  This keeps all the parsing logic contained to a single
  class.
* We use two passes through the data.  One to create a list of tokens,
  then one pass through the tokens to create the AST.  While the lexer actually
  yields tokens, we convert it to a list so we can easily implement two tokens
  of lookahead.  A previous implementation used a fixed circular buffer, but it
  was significantly slower.  Also, the average jmespath expression typically
  does not have a large number of tokens so this is not an issue.  And
  interestingly enough, creating a token list first is actually faster than
  consuming from the token iterator one token at a time.

"""
28import random
30from jmespath import lexer
31from jmespath.compat import with_repr_method
32from jmespath import ast
33from jmespath import exceptions
34from jmespath import visitor
class Parser(object):
    """Top down operator precedence (Pratt) parser for JMESPath.

    Tokens are dispatched to ``_token_nud_<type>`` (null denotation:
    token in prefix position) and ``_token_led_<type>`` (left
    denotation: token in infix position) methods looked up via
    getattr().  Parsed expressions are memoized in a class-level cache
    shared by all Parser instances.
    """

    # Left binding power for each token type.  A power of 0 means the
    # token can never continue an expression in infix position, so it
    # terminates the enclosing expression.
    BINDING_POWER = {
        'eof': 0,
        'unquoted_identifier': 0,
        'quoted_identifier': 0,
        'literal': 0,
        'rbracket': 0,
        'rparen': 0,
        'comma': 0,
        'rbrace': 0,
        'number': 0,
        'current': 0,
        'expref': 0,
        'colon': 0,
        'pipe': 1,
        'or': 2,
        'and': 3,
        'eq': 5,
        'gt': 5,
        'lt': 5,
        'gte': 5,
        'lte': 5,
        'ne': 5,
        'flatten': 9,
        # Everything above stops a projection.
        'star': 20,
        'filter': 21,
        'dot': 40,
        'not': 45,
        'lbrace': 50,
        'lbracket': 55,
        'lparen': 60,
    }
    # The maximum binding power for a token that can stop
    # a projection.
    _PROJECTION_STOP = 10
    # The _MAX_SIZE most recent expressions are cached in
    # _CACHE dict.
    _CACHE = {}
    _MAX_SIZE = 128

    def __init__(self, lookahead=2):
        # ``lookahead`` only sizes the initial placeholder buffer;
        # _parse() replaces self._tokens with the full token list, so
        # arbitrary lookahead is available while parsing.
        self.tokenizer = None
        self._tokens = [None] * lookahead
        self._buffer_size = lookahead
        self._index = 0

    def parse(self, expression):
        """Parse ``expression`` into a ParsedResult, using the cache.

        Results are memoized in the shared class-level _CACHE; once the
        cache grows past _MAX_SIZE, about half of its entries are
        evicted at random (see _free_cache_entries).
        """
        cached = self._CACHE.get(expression)
        if cached is not None:
            return cached
        parsed_result = self._do_parse(expression)
        self._CACHE[expression] = parsed_result
        if len(self._CACHE) > self._MAX_SIZE:
            self._free_cache_entries()
        return parsed_result

    def _do_parse(self, expression):
        # Wrap _parse() so that any lexer/parser error carries the full
        # source expression for a better error message.
        try:
            return self._parse(expression)
        except exceptions.LexerError as e:
            e.expression = expression
            raise
        except exceptions.IncompleteExpressionError as e:
            e.set_expression(expression)
            raise
        except exceptions.ParseError as e:
            e.expression = expression
            raise

    def _parse(self, expression):
        # Pass 1: tokenize everything up front (see module docstring
        # for why a list beats consuming the iterator lazily).
        self.tokenizer = lexer.Lexer().tokenize(expression)
        self._tokens = list(self.tokenizer)
        self._index = 0
        # Pass 2: Pratt-parse the token list into an AST.
        parsed = self._expression(binding_power=0)
        if not self._current_token() == 'eof':
            # Leftover tokens after a complete expression are an error.
            t = self._lookahead_token(0)
            raise exceptions.ParseError(t['start'], t['value'], t['type'],
                                        "Unexpected token: %s" % t['value'])
        return ParsedResult(expression, parsed)

    def _expression(self, binding_power=0):
        """Core Pratt loop.

        Parse the prefix (nud) form of the current token, then keep
        extending it with infix (led) tokens while their binding power
        exceeds ``binding_power``.
        """
        left_token = self._lookahead_token(0)
        self._advance()
        nud_function = getattr(
            self, '_token_nud_%s' % left_token['type'],
            self._error_nud_token)
        left = nud_function(left_token)
        current_token = self._current_token()
        while binding_power < self.BINDING_POWER[current_token]:
            led = getattr(self, '_token_led_%s' % current_token, None)
            if led is None:
                # Token binds tightly but has no infix handler: error.
                error_token = self._lookahead_token(0)
                self._error_led_token(error_token)
            else:
                self._advance()
                left = led(left)
                current_token = self._current_token()
        return left

    def _token_nud_literal(self, token):
        return ast.literal(token['value'])

    def _token_nud_unquoted_identifier(self, token):
        return ast.field(token['value'])

    def _token_nud_quoted_identifier(self, token):
        field = ast.field(token['value'])
        # You can't have a quoted identifier as a function
        # name.
        if self._current_token() == 'lparen':
            t = self._lookahead_token(0)
            raise exceptions.ParseError(
                0, t['value'], t['type'],
                'Quoted identifier not allowed for function names.')
        return field

    def _token_nud_star(self, token):
        # A leading ``*`` is a value projection over the current node.
        left = ast.identity()
        if self._current_token() == 'rbracket':
            right = ast.identity()
        else:
            right = self._parse_projection_rhs(self.BINDING_POWER['star'])
        return ast.value_projection(left, right)

    def _token_nud_filter(self, token):
        # A leading ``[?`` filters the current node.
        return self._token_led_filter(ast.identity())

    def _token_nud_lbrace(self, token):
        return self._parse_multi_select_hash()

    def _token_nud_lparen(self, token):
        # Parenthesized sub-expression.
        expression = self._expression()
        self._match('rparen')
        return expression

    def _token_nud_flatten(self, token):
        # A leading ``[]`` flattens the current node and projects.
        left = ast.flatten(ast.identity())
        right = self._parse_projection_rhs(
            self.BINDING_POWER['flatten'])
        return ast.projection(left, right)

    def _token_nud_not(self, token):
        expr = self._expression(self.BINDING_POWER['not'])
        return ast.not_expression(expr)

    def _token_nud_lbracket(self, token):
        # A leading ``[`` can be an index/slice, a ``[*]`` projection,
        # or a multi-select list.
        if self._current_token() in ['number', 'colon']:
            right = self._parse_index_expression()
            # We could optimize this and remove the identity() node.
            # We don't really need an index_expression node, we can
            # just use emit an index node here if we're not dealing
            # with a slice.
            return self._project_if_slice(ast.identity(), right)
        elif self._current_token() == 'star' and \
                self._lookahead(1) == 'rbracket':
            self._advance()
            self._advance()
            right = self._parse_projection_rhs(self.BINDING_POWER['star'])
            return ast.projection(ast.identity(), right)
        else:
            return self._parse_multi_select_list()

    def _parse_index_expression(self):
        # We're here:
        # [<current>
        #  ^
        #  | current token
        if (self._lookahead(0) == 'colon' or
                self._lookahead(1) == 'colon'):
            return self._parse_slice_expression()
        else:
            # Parse the syntax [number]
            node = ast.index(self._lookahead_token(0)['value'])
            self._advance()
            self._match('rbracket')
            return node

    def _parse_slice_expression(self):
        # [start:end:step]
        # Where start, end, and step are optional.
        # The last colon is optional as well.
        parts = [None, None, None]
        index = 0
        current_token = self._current_token()
        while not current_token == 'rbracket' and index < 3:
            if current_token == 'colon':
                index += 1
                if index == 3:
                    # More than two colons, e.g. [::::], is invalid.
                    self._raise_parse_error_for_token(
                        self._lookahead_token(0), 'syntax error')
                self._advance()
            elif current_token == 'number':
                parts[index] = self._lookahead_token(0)['value']
                self._advance()
            else:
                self._raise_parse_error_for_token(
                    self._lookahead_token(0), 'syntax error')
            current_token = self._current_token()
        self._match('rbracket')
        return ast.slice(*parts)

    def _token_nud_current(self, token):
        return ast.current_node()

    def _token_nud_expref(self, token):
        expression = self._expression(self.BINDING_POWER['expref'])
        return ast.expref(expression)

    def _token_led_dot(self, left):
        if not self._current_token() == 'star':
            right = self._parse_dot_rhs(self.BINDING_POWER['dot'])
            if left['type'] == 'subexpression':
                # Flatten chained dots into one subexpression node.
                left['children'].append(right)
                return left
            else:
                return ast.subexpression([left, right])
        else:
            # We're creating a projection.
            self._advance()
            right = self._parse_projection_rhs(
                self.BINDING_POWER['dot'])
            return ast.value_projection(left, right)

    def _token_led_pipe(self, left):
        right = self._expression(self.BINDING_POWER['pipe'])
        return ast.pipe(left, right)

    def _token_led_or(self, left):
        right = self._expression(self.BINDING_POWER['or'])
        return ast.or_expression(left, right)

    def _token_led_and(self, left):
        right = self._expression(self.BINDING_POWER['and'])
        return ast.and_expression(left, right)

    def _token_led_lparen(self, left):
        # ``name(`` starts a function call; only a plain field node is
        # a valid function name.
        if left['type'] != 'field':
            #  0 - first func arg or closing paren.
            # -1 - '(' token
            # -2 - invalid function "name".
            prev_t = self._lookahead_token(-2)
            raise exceptions.ParseError(
                prev_t['start'], prev_t['value'], prev_t['type'],
                "Invalid function name '%s'" % prev_t['value'])
        name = left['value']
        args = []
        while not self._current_token() == 'rparen':
            expression = self._expression()
            if self._current_token() == 'comma':
                self._match('comma')
            args.append(expression)
        self._match('rparen')
        function_node = ast.function_expression(name, args)
        return function_node

    def _token_led_filter(self, left):
        # Filters are projections.
        condition = self._expression(0)
        self._match('rbracket')
        if self._current_token() == 'flatten':
            right = ast.identity()
        else:
            right = self._parse_projection_rhs(self.BINDING_POWER['filter'])
        return ast.filter_projection(left, right, condition)

    def _token_led_eq(self, left):
        return self._parse_comparator(left, 'eq')

    def _token_led_ne(self, left):
        return self._parse_comparator(left, 'ne')

    def _token_led_gt(self, left):
        return self._parse_comparator(left, 'gt')

    def _token_led_gte(self, left):
        return self._parse_comparator(left, 'gte')

    def _token_led_lt(self, left):
        return self._parse_comparator(left, 'lt')

    def _token_led_lte(self, left):
        return self._parse_comparator(left, 'lte')

    def _token_led_flatten(self, left):
        left = ast.flatten(left)
        right = self._parse_projection_rhs(
            self.BINDING_POWER['flatten'])
        return ast.projection(left, right)

    def _token_led_lbracket(self, left):
        # ``expr[...]``: either an index/slice or a ``[*]`` projection.
        token = self._lookahead_token(0)
        if token['type'] in ['number', 'colon']:
            right = self._parse_index_expression()
            if left['type'] == 'index_expression':
                # Optimization: if the left node is an index expr,
                # we can avoid creating another node and instead just add
                # the right node as a child of the left.
                left['children'].append(right)
                return left
            else:
                return self._project_if_slice(left, right)
        else:
            # We have a projection
            self._match('star')
            self._match('rbracket')
            right = self._parse_projection_rhs(self.BINDING_POWER['star'])
            return ast.projection(left, right)

    def _project_if_slice(self, left, right):
        # Slices project over their elements; plain indexes do not.
        index_expr = ast.index_expression([left, right])
        if right['type'] == 'slice':
            return ast.projection(
                index_expr,
                self._parse_projection_rhs(self.BINDING_POWER['star']))
        else:
            return index_expr

    def _parse_comparator(self, left, comparator):
        right = self._expression(self.BINDING_POWER[comparator])
        return ast.comparator(comparator, left, right)

    def _parse_multi_select_list(self):
        # [expr, expr, ...] -- at least one expression is required.
        expressions = []
        while True:
            expression = self._expression()
            expressions.append(expression)
            if self._current_token() == 'rbracket':
                break
            else:
                self._match('comma')
        self._match('rbracket')
        return ast.multi_select_list(expressions)

    def _parse_multi_select_hash(self):
        # {key: expr, key: expr, ...}
        pairs = []
        while True:
            key_token = self._lookahead_token(0)
            # Before getting the token value, verify it's
            # an identifier.
            self._match_multiple_tokens(
                token_types=['quoted_identifier', 'unquoted_identifier'])
            key_name = key_token['value']
            self._match('colon')
            value = self._expression(0)
            node = ast.key_val_pair(key_name=key_name, node=value)
            pairs.append(node)
            if self._current_token() == 'comma':
                self._match('comma')
            elif self._current_token() == 'rbrace':
                self._match('rbrace')
                break
        return ast.multi_select_dict(nodes=pairs)

    def _parse_projection_rhs(self, binding_power):
        # Parse the right hand side of the projection.
        if self.BINDING_POWER[self._current_token()] < self._PROJECTION_STOP:
            # BP of 10 are all the tokens that stop a projection.
            right = ast.identity()
        elif self._current_token() == 'lbracket':
            right = self._expression(binding_power)
        elif self._current_token() == 'filter':
            right = self._expression(binding_power)
        elif self._current_token() == 'dot':
            self._match('dot')
            right = self._parse_dot_rhs(binding_power)
        else:
            self._raise_parse_error_for_token(self._lookahead_token(0),
                                              'syntax error')
        return right

    def _parse_dot_rhs(self, binding_power):
        # From the grammar:
        # expression '.' ( identifier /
        #                  multi-select-list /
        #                  multi-select-hash /
        #                  function-expression /
        #                  *
        # In terms of tokens that means that after a '.',
        # you can have:
        lookahead = self._current_token()
        # Common case "foo.bar", so first check for an identifier.
        if lookahead in ['quoted_identifier', 'unquoted_identifier', 'star']:
            return self._expression(binding_power)
        elif lookahead == 'lbracket':
            self._match('lbracket')
            return self._parse_multi_select_list()
        elif lookahead == 'lbrace':
            self._match('lbrace')
            return self._parse_multi_select_hash()
        else:
            t = self._lookahead_token(0)
            allowed = ['quoted_identifier', 'unquoted_identifier',
                       'lbracket', 'lbrace']
            msg = (
                "Expecting: %s, got: %s" % (allowed, t['type'])
            )
            self._raise_parse_error_for_token(t, msg)

    def _error_nud_token(self, token):
        # Fallback when a token has no prefix (nud) handler.
        if token['type'] == 'eof':
            raise exceptions.IncompleteExpressionError(
                token['start'], token['value'], token['type'])
        self._raise_parse_error_for_token(token, 'invalid token')

    def _error_led_token(self, token):
        # Fallback when a token has no infix (led) handler.
        self._raise_parse_error_for_token(token, 'invalid token')

    def _match(self, token_type=None):
        # Consume the current token if it has the expected type;
        # otherwise raise ParseError (or IncompleteExpressionError at
        # eof).
        if self._current_token() == token_type:
            self._advance()
        else:
            self._raise_parse_error_maybe_eof(
                token_type, self._lookahead_token(0))

    def _match_multiple_tokens(self, token_types):
        # Like _match(), but accepts any one of several token types.
        if self._current_token() not in token_types:
            self._raise_parse_error_maybe_eof(
                token_types, self._lookahead_token(0))
        self._advance()

    def _advance(self):
        self._index += 1

    def _current_token(self):
        return self._tokens[self._index]['type']

    def _lookahead(self, number):
        # Type of the token ``number`` positions ahead of the cursor.
        return self._tokens[self._index + number]['type']

    def _lookahead_token(self, number):
        # Full token dict ``number`` positions ahead (may be negative).
        return self._tokens[self._index + number]

    def _raise_parse_error_for_token(self, token, reason):
        lex_position = token['start']
        actual_value = token['value']
        actual_type = token['type']
        raise exceptions.ParseError(lex_position, actual_value,
                                    actual_type, reason)

    def _raise_parse_error_maybe_eof(self, expected_type, token):
        # Unexpected eof gets its own exception type so callers can
        # distinguish "incomplete" from "malformed".
        lex_position = token['start']
        actual_value = token['value']
        actual_type = token['type']
        if actual_type == 'eof':
            raise exceptions.IncompleteExpressionError(
                lex_position, actual_value, actual_type)
        message = 'Expecting: %s, got: %s' % (expected_type,
                                              actual_type)
        raise exceptions.ParseError(
            lex_position, actual_value, actual_type, message)

    def _free_cache_entries(self):
        # Evict half the cache at random: cheap, and avoids tracking
        # access order.  pop() with a default tolerates keys already
        # removed (e.g. by another thread).
        for key in random.sample(list(self._CACHE.keys()), int(self._MAX_SIZE / 2)):
            self._CACHE.pop(key, None)

    @classmethod
    def purge(cls):
        """Clear the expression compilation cache."""
        cls._CACHE.clear()
@with_repr_method
class ParsedResult(object):
    """Pairs a JMESPath expression string with its parsed AST."""

    def __init__(self, expression, parsed):
        self.expression = expression
        self.parsed = parsed

    def search(self, value, options=None):
        """Evaluate the parsed expression against ``value``.

        An optional ``options`` object is forwarded to the tree
        interpreter that walks the AST.
        """
        return visitor.TreeInterpreter(options).visit(self.parsed, value)

    def _render_dot_file(self):
        """Render the parsed AST as a dot file.

        Note that this is marked as an internal method because
        the AST is an implementation detail and is subject
        to change. This method can be used to help troubleshoot
        or for development purposes, but is not considered part
        of the public supported API. Use at your own risk.

        """
        return visitor.GraphvizVisitor().visit(self.parsed)

    def __repr__(self):
        return repr(self.parsed)