Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/jmespath/parser.py: 96%
Shortcuts on this page
r m x  toggle line displays
j k    next/prev highlighted chunk
0 (zero)  top of page
1 (one)   first highlighted chunk
"""Top down operator precedence parser.

This is an implementation of Vaughan R. Pratt's
"Top Down Operator Precedence" parser.
(http://dl.acm.org/citation.cfm?doid=512927.512931).

These are some additional resources that help explain the
general idea behind a Pratt parser:

* http://effbot.org/zone/simple-top-down-parsing.htm
* http://javascript.crockford.com/tdop/tdop.html

A few notes on the implementation.

* All the nud/led tokens are on the Parser class itself, and are dispatched
  using getattr(). This keeps all the parsing logic contained to a single
  class.
* We use two passes through the data. One to create a list of tokens,
  then one pass through the tokens to create the AST. While the lexer actually
  yields tokens, we convert it to a list so we can easily implement two tokens
  of lookahead. A previous implementation used a fixed circular buffer, but it
  was significantly slower. Also, the average jmespath expression typically
  does not have a large number of tokens so this is not an issue. And
  interestingly enough, creating a token list first is actually faster than
  consuming from the token iterator one token at a time.

"""
28from jmespath import lexer
29from jmespath.compat import with_repr_method
30from jmespath import ast
31from jmespath import exceptions
32from jmespath import visitor
class Parser(object):
    """Top down operator precedence (Pratt) parser for JMESPath.

    Tokens in prefix position are handled by ``_token_nud_<type>``
    methods and tokens in infix position by ``_token_led_<type>``
    methods, both dispatched via getattr().  Parsed results are cached
    on the class in ``_CACHE`` (shared across instances).
    """

    # Binding power for every token type the lexer can produce.
    # Higher numbers bind more tightly.
    BINDING_POWER = {
        'eof': 0,
        'unquoted_identifier': 0,
        'quoted_identifier': 0,
        'literal': 0,
        'rbracket': 0,
        'rparen': 0,
        'comma': 0,
        'rbrace': 0,
        'number': 0,
        'current': 0,
        'expref': 0,
        'colon': 0,
        'pipe': 1,
        'or': 2,
        'and': 3,
        'eq': 5,
        'gt': 5,
        'lt': 5,
        'gte': 5,
        'lte': 5,
        'ne': 5,
        'flatten': 9,
        # Everything above stops a projection.
        'star': 20,
        'filter': 21,
        'dot': 40,
        'not': 45,
        'lbrace': 50,
        'lbracket': 55,
        'lparen': 60,
    }
    # The maximum binding power for a token that can stop
    # a projection.
    _PROJECTION_STOP = 10
    # The _MAX_SIZE most recent expressions are cached in
    # _CACHE dict.
    _CACHE = {}
    _MAX_SIZE = 512

    def __init__(self, lookahead=2):
        # These are all (re)initialized in _parse(); they are set here so
        # the attributes always exist on the instance.
        self.tokenizer = None
        self._tokens = [None] * lookahead
        self._buffer_size = lookahead
        # Index of the current token within self._tokens.
        self._index = 0

    def parse(self, expression):
        """Parse ``expression`` and return a ``ParsedResult``.

        Results are cached on the class.  When the cache is full, one
        (arbitrary) entry is evicted before the new result is added; if
        eviction fails the result is returned without being cached so
        the cache cannot grow unbounded.
        """
        try:
            return self._CACHE[expression]
        except KeyError:
            pass
        parsed_result = self._do_parse(expression)
        if len(self._CACHE) >= self._MAX_SIZE:
            try:
                del self._CACHE[next(iter(self._CACHE))]
            except (KeyError, StopIteration, RuntimeError):
                # KeyError - Another thread else already deleted the key.
                # RuntimeError - Another modified the cache.
                # StopIteration - (Unlikely) Cache is empty.
                #
                # If we encounter an error we should NOT be adding to the
                # cache. To ensure we do not exceed self._MAX_SIZE, we
                # can only add to the cache if we successfully removed
                # an element from the cache, otherwise this can grow
                # unbounded.
                return parsed_result
        self._CACHE[expression] = parsed_result
        return parsed_result

    def _do_parse(self, expression):
        # Attach the source expression to any lexer/parser error so the
        # caller sees which expression failed, then re-raise.
        try:
            return self._parse(expression)
        except exceptions.LexerError as e:
            e.expression = expression
            raise
        except exceptions.IncompleteExpressionError as e:
            e.set_expression(expression)
            raise
        except exceptions.ParseError as e:
            e.expression = expression
            raise

    def _parse(self, expression):
        """Tokenize ``expression`` and build its AST."""
        self.tokenizer = lexer.Lexer().tokenize(expression)
        self._tokens = list(self.tokenizer)
        self._index = 0
        parsed = self._expression(binding_power=0)
        # The whole token stream must be consumed; anything left after
        # the top-level expression is an error.
        if not self._current_token() == 'eof':
            t = self._lookahead_token(0)
            raise exceptions.ParseError(t['start'], t['value'], t['type'],
                                        "Unexpected token: %s" % t['value'])
        return ParsedResult(expression, parsed)

    def _expression(self, binding_power=0):
        """Core Pratt loop: parse an expression at ``binding_power``.

        The first token is handled by its nud (prefix) handler; then,
        while the next token binds more tightly than ``binding_power``,
        its led (infix) handler extends the left operand.
        """
        left_token = self._lookahead_token(0)
        self._advance()
        nud_function = getattr(
            self, '_token_nud_%s' % left_token['type'],
            self._error_nud_token)
        left = nud_function(left_token)
        current_token = self._current_token()
        while binding_power < self.BINDING_POWER[current_token]:
            led = getattr(self, '_token_led_%s' % current_token, None)
            if led is None:
                # Token has a binding power but no infix handler.
                error_token = self._lookahead_token(0)
                self._error_led_token(error_token)
            else:
                self._advance()
                left = led(left)
                current_token = self._current_token()
        return left

    def _token_nud_literal(self, token):
        # Literal value, e.g. `"foo"` or `[1, 2]`.
        return ast.literal(token['value'])

    def _token_nud_unquoted_identifier(self, token):
        return ast.field(token['value'])

    def _token_nud_quoted_identifier(self, token):
        field = ast.field(token['value'])
        # You can't have a quoted identifier as a function
        # name.
        if self._current_token() == 'lparen':
            t = self._lookahead_token(0)
            raise exceptions.ParseError(
                0, t['value'], t['type'],
                'Quoted identifier not allowed for function names.')
        return field

    def _token_nud_star(self, token):
        # `*` in prefix position: a value projection over the current node.
        left = ast.identity()
        if self._current_token() == 'rbracket':
            right = ast.identity()
        else:
            right = self._parse_projection_rhs(self.BINDING_POWER['star'])
        return ast.value_projection(left, right)

    def _token_nud_filter(self, token):
        # A filter expression with no explicit left side, e.g. `[?a > b]`.
        return self._token_led_filter(ast.identity())

    def _token_nud_lbrace(self, token):
        return self._parse_multi_select_hash()

    def _token_nud_lparen(self, token):
        # Parenthesized expression.
        expression = self._expression()
        self._match('rparen')
        return expression

    def _token_nud_flatten(self, token):
        # `[]` in prefix position flattens the current node and projects.
        left = ast.flatten(ast.identity())
        right = self._parse_projection_rhs(
            self.BINDING_POWER['flatten'])
        return ast.projection(left, right)

    def _token_nud_not(self, token):
        expr = self._expression(self.BINDING_POWER['not'])
        return ast.not_expression(expr)

    def _token_nud_lbracket(self, token):
        # `[` in prefix position: index/slice, `[*]` projection,
        # or a multi-select list.
        if self._current_token() in ['number', 'colon']:
            right = self._parse_index_expression()
            # We could optimize this and remove the identity() node.
            # We don't really need an index_expression node, we can
            # just use emit an index node here if we're not dealing
            # with a slice.
            return self._project_if_slice(ast.identity(), right)
        elif self._current_token() == 'star' and \
                self._lookahead(1) == 'rbracket':
            self._advance()
            self._advance()
            right = self._parse_projection_rhs(self.BINDING_POWER['star'])
            return ast.projection(ast.identity(), right)
        else:
            return self._parse_multi_select_list()

    def _parse_index_expression(self):
        # We're here:
        # [<current>
        #  ^
        #  | current token
        if (self._lookahead(0) == 'colon' or
                self._lookahead(1) == 'colon'):
            return self._parse_slice_expression()
        else:
            # Parse the syntax [number]
            node = ast.index(self._lookahead_token(0)['value'])
            self._advance()
            self._match('rbracket')
            return node

    def _parse_slice_expression(self):
        # [start:end:step]
        # Where start, end, and step are optional.
        # The last colon is optional as well.
        parts = [None, None, None]
        index = 0
        current_token = self._current_token()
        while not current_token == 'rbracket' and index < 3:
            if current_token == 'colon':
                index += 1
                if index == 3:
                    # More than two colons, e.g. `[::…:`.
                    self._raise_parse_error_for_token(
                        self._lookahead_token(0), 'syntax error')
                self._advance()
            elif current_token == 'number':
                parts[index] = self._lookahead_token(0)['value']
                self._advance()
            else:
                self._raise_parse_error_for_token(
                    self._lookahead_token(0), 'syntax error')
            current_token = self._current_token()
        self._match('rbracket')
        return ast.slice(*parts)

    def _token_nud_current(self, token):
        # `@` refers to the current node.
        return ast.current_node()

    def _token_nud_expref(self, token):
        # `&expr` creates an expression reference.
        expression = self._expression(self.BINDING_POWER['expref'])
        return ast.expref(expression)

    def _token_led_dot(self, left):
        if not self._current_token() == 'star':
            right = self._parse_dot_rhs(self.BINDING_POWER['dot'])
            if left['type'] == 'subexpression':
                # Flatten chained dots into one subexpression node.
                left['children'].append(right)
                return left
            else:
                return ast.subexpression([left, right])
        else:
            # We're creating a projection.
            self._advance()
            right = self._parse_projection_rhs(
                self.BINDING_POWER['dot'])
            return ast.value_projection(left, right)

    def _token_led_pipe(self, left):
        right = self._expression(self.BINDING_POWER['pipe'])
        return ast.pipe(left, right)

    def _token_led_or(self, left):
        right = self._expression(self.BINDING_POWER['or'])
        return ast.or_expression(left, right)

    def _token_led_and(self, left):
        right = self._expression(self.BINDING_POWER['and'])
        return ast.and_expression(left, right)

    def _token_led_lparen(self, left):
        # `name(...)` function call; `left` must be a plain field node.
        if left['type'] != 'field':
            #  0 - first func arg or closing paren.
            # -1 - '(' token
            # -2 - invalid function "name".
            prev_t = self._lookahead_token(-2)
            raise exceptions.ParseError(
                prev_t['start'], prev_t['value'], prev_t['type'],
                "Invalid function name '%s'" % prev_t['value'])
        name = left['value']
        args = []
        while not self._current_token() == 'rparen':
            expression = self._expression()
            if self._current_token() == 'comma':
                self._match('comma')
            args.append(expression)
        self._match('rparen')
        function_node = ast.function_expression(name, args)
        return function_node

    def _token_led_filter(self, left):
        # Filters are projections.
        condition = self._expression(0)
        self._match('rbracket')
        if self._current_token() == 'flatten':
            right = ast.identity()
        else:
            right = self._parse_projection_rhs(self.BINDING_POWER['filter'])
        return ast.filter_projection(left, right, condition)

    def _token_led_eq(self, left):
        return self._parse_comparator(left, 'eq')

    def _token_led_ne(self, left):
        return self._parse_comparator(left, 'ne')

    def _token_led_gt(self, left):
        return self._parse_comparator(left, 'gt')

    def _token_led_gte(self, left):
        return self._parse_comparator(left, 'gte')

    def _token_led_lt(self, left):
        return self._parse_comparator(left, 'lt')

    def _token_led_lte(self, left):
        return self._parse_comparator(left, 'lte')

    def _token_led_flatten(self, left):
        # `expr[]` flattens and starts a projection.
        left = ast.flatten(left)
        right = self._parse_projection_rhs(
            self.BINDING_POWER['flatten'])
        return ast.projection(left, right)

    def _token_led_lbracket(self, left):
        # `expr[...]`: index/slice access or a `[*]` projection.
        token = self._lookahead_token(0)
        if token['type'] in ['number', 'colon']:
            right = self._parse_index_expression()
            if left['type'] == 'index_expression':
                # Optimization: if the left node is an index expr,
                # we can avoid creating another node and instead just add
                # the right node as a child of the left.
                left['children'].append(right)
                return left
            else:
                return self._project_if_slice(left, right)
        else:
            # We have a projection
            self._match('star')
            self._match('rbracket')
            right = self._parse_projection_rhs(self.BINDING_POWER['star'])
            return ast.projection(left, right)

    def _project_if_slice(self, left, right):
        # Slices are projections; plain indexes are not.
        index_expr = ast.index_expression([left, right])
        if right['type'] == 'slice':
            return ast.projection(
                index_expr,
                self._parse_projection_rhs(self.BINDING_POWER['star']))
        else:
            return index_expr

    def _parse_comparator(self, left, comparator):
        right = self._expression(self.BINDING_POWER[comparator])
        return ast.comparator(comparator, left, right)

    def _parse_multi_select_list(self):
        # `[expr, expr, ...]` — the opening `[` has been consumed.
        expressions = []
        while True:
            expression = self._expression()
            expressions.append(expression)
            if self._current_token() == 'rbracket':
                break
            else:
                self._match('comma')
        self._match('rbracket')
        return ast.multi_select_list(expressions)

    def _parse_multi_select_hash(self):
        # `{key: expr, ...}` — the opening `{` has been consumed.
        pairs = []
        while True:
            key_token = self._lookahead_token(0)
            # Before getting the token value, verify it's
            # an identifier.
            self._match_multiple_tokens(
                token_types=['quoted_identifier', 'unquoted_identifier'])
            key_name = key_token['value']
            self._match('colon')
            value = self._expression(0)
            node = ast.key_val_pair(key_name=key_name, node=value)
            pairs.append(node)
            if self._current_token() == 'comma':
                self._match('comma')
            elif self._current_token() == 'rbrace':
                self._match('rbrace')
                break
        return ast.multi_select_dict(nodes=pairs)

    def _parse_projection_rhs(self, binding_power):
        # Parse the right hand side of the projection.
        if self.BINDING_POWER[self._current_token()] < self._PROJECTION_STOP:
            # BP of 10 are all the tokens that stop a projection.
            right = ast.identity()
        elif self._current_token() == 'lbracket':
            right = self._expression(binding_power)
        elif self._current_token() == 'filter':
            right = self._expression(binding_power)
        elif self._current_token() == 'dot':
            self._match('dot')
            right = self._parse_dot_rhs(binding_power)
        else:
            self._raise_parse_error_for_token(self._lookahead_token(0),
                                              'syntax error')
        return right

    def _parse_dot_rhs(self, binding_power):
        # From the grammar:
        # expression '.' ( identifier /
        #                  multi-select-list /
        #                  multi-select-hash /
        #                  function-expression /
        #                  *
        # In terms of tokens that means that after a '.',
        # you can have:
        lookahead = self._current_token()
        # Common case "foo.bar", so first check for an identifier.
        if lookahead in ['quoted_identifier', 'unquoted_identifier', 'star']:
            return self._expression(binding_power)
        elif lookahead == 'lbracket':
            self._match('lbracket')
            return self._parse_multi_select_list()
        elif lookahead == 'lbrace':
            self._match('lbrace')
            return self._parse_multi_select_hash()
        else:
            t = self._lookahead_token(0)
            allowed = ['quoted_identifier', 'unquoted_identifier',
                       'lbracket', 'lbrace']
            msg = (
                "Expecting: %s, got: %s" % (allowed, t['type'])
            )
            self._raise_parse_error_for_token(t, msg)

    def _error_nud_token(self, token):
        # Fallback when a token type has no nud handler.
        if token['type'] == 'eof':
            raise exceptions.IncompleteExpressionError(
                token['start'], token['value'], token['type'])
        self._raise_parse_error_for_token(token, 'invalid token')

    def _error_led_token(self, token):
        # Fallback when a token type has no led handler.
        self._raise_parse_error_for_token(token, 'invalid token')

    def _match(self, token_type=None):
        # inline'd self._current_token()
        if self._current_token() == token_type:
            # inline'd self._advance()
            self._advance()
        else:
            self._raise_parse_error_maybe_eof(
                token_type, self._lookahead_token(0))

    def _match_multiple_tokens(self, token_types):
        # Like _match(), but accepts any one of several token types.
        if self._current_token() not in token_types:
            self._raise_parse_error_maybe_eof(
                token_types, self._lookahead_token(0))
        self._advance()

    def _advance(self):
        self._index += 1

    def _current_token(self):
        # Type of the token at the current position.
        return self._tokens[self._index]['type']

    def _lookahead(self, number):
        # Type of the token ``number`` positions ahead.
        return self._tokens[self._index + number]['type']

    def _lookahead_token(self, number):
        # Full token dict ``number`` positions ahead (may be negative).
        return self._tokens[self._index + number]

    def _raise_parse_error_for_token(self, token, reason):
        lex_position = token['start']
        actual_value = token['value']
        actual_type = token['type']
        raise exceptions.ParseError(lex_position, actual_value,
                                    actual_type, reason)

    def _raise_parse_error_maybe_eof(self, expected_type, token):
        # Raise IncompleteExpressionError for a premature EOF, otherwise
        # a ParseError naming the expected token type(s).
        lex_position = token['start']
        actual_value = token['value']
        actual_type = token['type']
        if actual_type == 'eof':
            raise exceptions.IncompleteExpressionError(
                lex_position, actual_value, actual_type)
        message = 'Expecting: %s, got: %s' % (expected_type,
                                              actual_type)
        raise exceptions.ParseError(
            lex_position, actual_value, actual_type, message)

    @classmethod
    def purge(cls):
        """Clear the expression compilation cache."""
        cls._CACHE.clear()
@with_repr_method
class ParsedResult(object):
    """Container pairing a JMESPath expression string with its parsed AST."""

    def __init__(self, expression, parsed):
        self.expression = expression
        self.parsed = parsed

    def search(self, value, options=None):
        """Evaluate the parsed expression against ``value`` and return the result."""
        return visitor.TreeInterpreter(options).visit(self.parsed, value)

    def _render_dot_file(self):
        """Render the parsed AST as a dot file.

        Note that this is marked as an internal method because
        the AST is an implementation detail and is subject
        to change. This method can be used to help troubleshoot
        or for development purposes, but is not considered part
        of the public supported API. Use at your own risk.
        """
        return visitor.GraphvizVisitor().visit(self.parsed)

    def __repr__(self):
        return repr(self.parsed)