Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/jmespath/parser.py: 96%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

316 statements  

1"""Top down operator precedence parser. 

2 

3This is an implementation of Vaughan R. Pratt's 

4"Top Down Operator Precedence" parser. 

5(http://dl.acm.org/citation.cfm?doid=512927.512931). 

6 

7These are some additional resources that help explain the 

8general idea behind a Pratt parser: 

9 

10* http://effbot.org/zone/simple-top-down-parsing.htm 

11* http://javascript.crockford.com/tdop/tdop.html 

12 

13A few notes on the implementation. 

14 

15* All the nud/led tokens are on the Parser class itself, and are dispatched 

16 using getattr(). This keeps all the parsing logic contained to a single 

17 class. 

18* We use two passes through the data. One to create a list of tokens, 

19 then one pass through the tokens to create the AST. While the lexer actually 

20 yields tokens, we convert it to a list so we can easily implement two tokens 

21 of lookahead. A previous implementation used a fixed circular buffer, but it 

22 was significantly slower. Also, the average jmespath expression typically 

23 does not have a large amount of tokens so this is not an issue. And 

24 interestingly enough, creating a token list first is actually faster than 

25 consuming from the token iterator one token at a time. 

26 

27""" 

28from jmespath import lexer 

29from jmespath.compat import with_repr_method 

30from jmespath import ast 

31from jmespath import exceptions 

32from jmespath import visitor 

33 

34 

35class Parser(object): 

36 BINDING_POWER = { 

37 'eof': 0, 

38 'unquoted_identifier': 0, 

39 'quoted_identifier': 0, 

40 'literal': 0, 

41 'rbracket': 0, 

42 'rparen': 0, 

43 'comma': 0, 

44 'rbrace': 0, 

45 'number': 0, 

46 'current': 0, 

47 'expref': 0, 

48 'colon': 0, 

49 'pipe': 1, 

50 'or': 2, 

51 'and': 3, 

52 'eq': 5, 

53 'gt': 5, 

54 'lt': 5, 

55 'gte': 5, 

56 'lte': 5, 

57 'ne': 5, 

58 'flatten': 9, 

59 # Everything above stops a projection. 

60 'star': 20, 

61 'filter': 21, 

62 'dot': 40, 

63 'not': 45, 

64 'lbrace': 50, 

65 'lbracket': 55, 

66 'lparen': 60, 

67 } 

68 # The maximum binding power for a token that can stop 

69 # a projection. 

70 _PROJECTION_STOP = 10 

71 # The _MAX_SIZE most recent expressions are cached in 

72 # _CACHE dict. 

73 _CACHE = {} 

74 _MAX_SIZE = 512 

75 

76 def __init__(self, lookahead=2): 

77 self.tokenizer = None 

78 self._tokens = [None] * lookahead 

79 self._buffer_size = lookahead 

80 self._index = 0 

81 

82 def parse(self, expression): 

83 try: 

84 return self._CACHE[expression] 

85 except KeyError: 

86 pass 

87 parsed_result = self._do_parse(expression) 

88 if len(self._CACHE) >= self._MAX_SIZE: 

89 try: 

90 del self._CACHE[next(iter(self._CACHE))] 

91 except (KeyError, StopIteration, RuntimeError): 

92 # KeyError - Another thread else already deleted the key. 

93 # RuntimeError - Another modified the cache. 

94 # StopIteration - (Unlikely) Cache is empty. 

95 # 

96 # If we encounter an error we should NOT be adding to the 

97 # cache. To ensure we do not exceed self._MAX_SIZE, we 

98 # can only add to the cache if we successfully removed 

99 # an element from the cache, otherwise this can grow 

100 # unbounded. 

101 return parsed_result 

102 self._CACHE[expression] = parsed_result 

103 return parsed_result 

104 

105 def _do_parse(self, expression): 

106 try: 

107 return self._parse(expression) 

108 except exceptions.LexerError as e: 

109 e.expression = expression 

110 raise 

111 except exceptions.IncompleteExpressionError as e: 

112 e.set_expression(expression) 

113 raise 

114 except exceptions.ParseError as e: 

115 e.expression = expression 

116 raise 

117 

118 def _parse(self, expression): 

119 self.tokenizer = lexer.Lexer().tokenize(expression) 

120 self._tokens = list(self.tokenizer) 

121 self._index = 0 

122 parsed = self._expression(binding_power=0) 

123 if not self._current_token() == 'eof': 

124 t = self._lookahead_token(0) 

125 raise exceptions.ParseError(t['start'], t['value'], t['type'], 

126 "Unexpected token: %s" % t['value']) 

127 return ParsedResult(expression, parsed) 

128 

129 def _expression(self, binding_power=0): 

130 left_token = self._lookahead_token(0) 

131 self._advance() 

132 nud_function = getattr( 

133 self, '_token_nud_%s' % left_token['type'], 

134 self._error_nud_token) 

135 left = nud_function(left_token) 

136 current_token = self._current_token() 

137 while binding_power < self.BINDING_POWER[current_token]: 

138 led = getattr(self, '_token_led_%s' % current_token, None) 

139 if led is None: 

140 error_token = self._lookahead_token(0) 

141 self._error_led_token(error_token) 

142 else: 

143 self._advance() 

144 left = led(left) 

145 current_token = self._current_token() 

146 return left 

147 

148 def _token_nud_literal(self, token): 

149 return ast.literal(token['value']) 

150 

151 def _token_nud_unquoted_identifier(self, token): 

152 return ast.field(token['value']) 

153 

154 def _token_nud_quoted_identifier(self, token): 

155 field = ast.field(token['value']) 

156 # You can't have a quoted identifier as a function 

157 # name. 

158 if self._current_token() == 'lparen': 

159 t = self._lookahead_token(0) 

160 raise exceptions.ParseError( 

161 0, t['value'], t['type'], 

162 'Quoted identifier not allowed for function names.') 

163 return field 

164 

165 def _token_nud_star(self, token): 

166 left = ast.identity() 

167 if self._current_token() == 'rbracket': 

168 right = ast.identity() 

169 else: 

170 right = self._parse_projection_rhs(self.BINDING_POWER['star']) 

171 return ast.value_projection(left, right) 

172 

173 def _token_nud_filter(self, token): 

174 return self._token_led_filter(ast.identity()) 

175 

176 def _token_nud_lbrace(self, token): 

177 return self._parse_multi_select_hash() 

178 

179 def _token_nud_lparen(self, token): 

180 expression = self._expression() 

181 self._match('rparen') 

182 return expression 

183 

184 def _token_nud_flatten(self, token): 

185 left = ast.flatten(ast.identity()) 

186 right = self._parse_projection_rhs( 

187 self.BINDING_POWER['flatten']) 

188 return ast.projection(left, right) 

189 

190 def _token_nud_not(self, token): 

191 expr = self._expression(self.BINDING_POWER['not']) 

192 return ast.not_expression(expr) 

193 

194 def _token_nud_lbracket(self, token): 

195 if self._current_token() in ['number', 'colon']: 

196 right = self._parse_index_expression() 

197 # We could optimize this and remove the identity() node. 

198 # We don't really need an index_expression node, we can 

199 # just use emit an index node here if we're not dealing 

200 # with a slice. 

201 return self._project_if_slice(ast.identity(), right) 

202 elif self._current_token() == 'star' and \ 

203 self._lookahead(1) == 'rbracket': 

204 self._advance() 

205 self._advance() 

206 right = self._parse_projection_rhs(self.BINDING_POWER['star']) 

207 return ast.projection(ast.identity(), right) 

208 else: 

209 return self._parse_multi_select_list() 

210 

211 def _parse_index_expression(self): 

212 # We're here: 

213 # [<current> 

214 # ^ 

215 # | current token 

216 if (self._lookahead(0) == 'colon' or 

217 self._lookahead(1) == 'colon'): 

218 return self._parse_slice_expression() 

219 else: 

220 # Parse the syntax [number] 

221 node = ast.index(self._lookahead_token(0)['value']) 

222 self._advance() 

223 self._match('rbracket') 

224 return node 

225 

226 def _parse_slice_expression(self): 

227 # [start:end:step] 

228 # Where start, end, and step are optional. 

229 # The last colon is optional as well. 

230 parts = [None, None, None] 

231 index = 0 

232 current_token = self._current_token() 

233 while not current_token == 'rbracket' and index < 3: 

234 if current_token == 'colon': 

235 index += 1 

236 if index == 3: 

237 self._raise_parse_error_for_token( 

238 self._lookahead_token(0), 'syntax error') 

239 self._advance() 

240 elif current_token == 'number': 

241 parts[index] = self._lookahead_token(0)['value'] 

242 self._advance() 

243 else: 

244 self._raise_parse_error_for_token( 

245 self._lookahead_token(0), 'syntax error') 

246 current_token = self._current_token() 

247 self._match('rbracket') 

248 return ast.slice(*parts) 

249 

250 def _token_nud_current(self, token): 

251 return ast.current_node() 

252 

253 def _token_nud_expref(self, token): 

254 expression = self._expression(self.BINDING_POWER['expref']) 

255 return ast.expref(expression) 

256 

257 def _token_led_dot(self, left): 

258 if not self._current_token() == 'star': 

259 right = self._parse_dot_rhs(self.BINDING_POWER['dot']) 

260 if left['type'] == 'subexpression': 

261 left['children'].append(right) 

262 return left 

263 else: 

264 return ast.subexpression([left, right]) 

265 else: 

266 # We're creating a projection. 

267 self._advance() 

268 right = self._parse_projection_rhs( 

269 self.BINDING_POWER['dot']) 

270 return ast.value_projection(left, right) 

271 

272 def _token_led_pipe(self, left): 

273 right = self._expression(self.BINDING_POWER['pipe']) 

274 return ast.pipe(left, right) 

275 

276 def _token_led_or(self, left): 

277 right = self._expression(self.BINDING_POWER['or']) 

278 return ast.or_expression(left, right) 

279 

280 def _token_led_and(self, left): 

281 right = self._expression(self.BINDING_POWER['and']) 

282 return ast.and_expression(left, right) 

283 

284 def _token_led_lparen(self, left): 

285 if left['type'] != 'field': 

286 # 0 - first func arg or closing paren. 

287 # -1 - '(' token 

288 # -2 - invalid function "name". 

289 prev_t = self._lookahead_token(-2) 

290 raise exceptions.ParseError( 

291 prev_t['start'], prev_t['value'], prev_t['type'], 

292 "Invalid function name '%s'" % prev_t['value']) 

293 name = left['value'] 

294 args = [] 

295 while not self._current_token() == 'rparen': 

296 expression = self._expression() 

297 if self._current_token() == 'comma': 

298 self._match('comma') 

299 args.append(expression) 

300 self._match('rparen') 

301 function_node = ast.function_expression(name, args) 

302 return function_node 

303 

304 def _token_led_filter(self, left): 

305 # Filters are projections. 

306 condition = self._expression(0) 

307 self._match('rbracket') 

308 if self._current_token() == 'flatten': 

309 right = ast.identity() 

310 else: 

311 right = self._parse_projection_rhs(self.BINDING_POWER['filter']) 

312 return ast.filter_projection(left, right, condition) 

313 

314 def _token_led_eq(self, left): 

315 return self._parse_comparator(left, 'eq') 

316 

317 def _token_led_ne(self, left): 

318 return self._parse_comparator(left, 'ne') 

319 

320 def _token_led_gt(self, left): 

321 return self._parse_comparator(left, 'gt') 

322 

323 def _token_led_gte(self, left): 

324 return self._parse_comparator(left, 'gte') 

325 

326 def _token_led_lt(self, left): 

327 return self._parse_comparator(left, 'lt') 

328 

329 def _token_led_lte(self, left): 

330 return self._parse_comparator(left, 'lte') 

331 

332 def _token_led_flatten(self, left): 

333 left = ast.flatten(left) 

334 right = self._parse_projection_rhs( 

335 self.BINDING_POWER['flatten']) 

336 return ast.projection(left, right) 

337 

338 def _token_led_lbracket(self, left): 

339 token = self._lookahead_token(0) 

340 if token['type'] in ['number', 'colon']: 

341 right = self._parse_index_expression() 

342 if left['type'] == 'index_expression': 

343 # Optimization: if the left node is an index expr, 

344 # we can avoid creating another node and instead just add 

345 # the right node as a child of the left. 

346 left['children'].append(right) 

347 return left 

348 else: 

349 return self._project_if_slice(left, right) 

350 else: 

351 # We have a projection 

352 self._match('star') 

353 self._match('rbracket') 

354 right = self._parse_projection_rhs(self.BINDING_POWER['star']) 

355 return ast.projection(left, right) 

356 

357 def _project_if_slice(self, left, right): 

358 index_expr = ast.index_expression([left, right]) 

359 if right['type'] == 'slice': 

360 return ast.projection( 

361 index_expr, 

362 self._parse_projection_rhs(self.BINDING_POWER['star'])) 

363 else: 

364 return index_expr 

365 

366 def _parse_comparator(self, left, comparator): 

367 right = self._expression(self.BINDING_POWER[comparator]) 

368 return ast.comparator(comparator, left, right) 

369 

370 def _parse_multi_select_list(self): 

371 expressions = [] 

372 while True: 

373 expression = self._expression() 

374 expressions.append(expression) 

375 if self._current_token() == 'rbracket': 

376 break 

377 else: 

378 self._match('comma') 

379 self._match('rbracket') 

380 return ast.multi_select_list(expressions) 

381 

382 def _parse_multi_select_hash(self): 

383 pairs = [] 

384 while True: 

385 key_token = self._lookahead_token(0) 

386 # Before getting the token value, verify it's 

387 # an identifier. 

388 self._match_multiple_tokens( 

389 token_types=['quoted_identifier', 'unquoted_identifier']) 

390 key_name = key_token['value'] 

391 self._match('colon') 

392 value = self._expression(0) 

393 node = ast.key_val_pair(key_name=key_name, node=value) 

394 pairs.append(node) 

395 if self._current_token() == 'comma': 

396 self._match('comma') 

397 elif self._current_token() == 'rbrace': 

398 self._match('rbrace') 

399 break 

400 return ast.multi_select_dict(nodes=pairs) 

401 

402 def _parse_projection_rhs(self, binding_power): 

403 # Parse the right hand side of the projection. 

404 if self.BINDING_POWER[self._current_token()] < self._PROJECTION_STOP: 

405 # BP of 10 are all the tokens that stop a projection. 

406 right = ast.identity() 

407 elif self._current_token() == 'lbracket': 

408 right = self._expression(binding_power) 

409 elif self._current_token() == 'filter': 

410 right = self._expression(binding_power) 

411 elif self._current_token() == 'dot': 

412 self._match('dot') 

413 right = self._parse_dot_rhs(binding_power) 

414 else: 

415 self._raise_parse_error_for_token(self._lookahead_token(0), 

416 'syntax error') 

417 return right 

418 

419 def _parse_dot_rhs(self, binding_power): 

420 # From the grammar: 

421 # expression '.' ( identifier / 

422 # multi-select-list / 

423 # multi-select-hash / 

424 # function-expression / 

425 # * 

426 # In terms of tokens that means that after a '.', 

427 # you can have: 

428 lookahead = self._current_token() 

429 # Common case "foo.bar", so first check for an identifier. 

430 if lookahead in ['quoted_identifier', 'unquoted_identifier', 'star']: 

431 return self._expression(binding_power) 

432 elif lookahead == 'lbracket': 

433 self._match('lbracket') 

434 return self._parse_multi_select_list() 

435 elif lookahead == 'lbrace': 

436 self._match('lbrace') 

437 return self._parse_multi_select_hash() 

438 else: 

439 t = self._lookahead_token(0) 

440 allowed = ['quoted_identifier', 'unquoted_identifier', 

441 'lbracket', 'lbrace'] 

442 msg = ( 

443 "Expecting: %s, got: %s" % (allowed, t['type']) 

444 ) 

445 self._raise_parse_error_for_token(t, msg) 

446 

447 def _error_nud_token(self, token): 

448 if token['type'] == 'eof': 

449 raise exceptions.IncompleteExpressionError( 

450 token['start'], token['value'], token['type']) 

451 self._raise_parse_error_for_token(token, 'invalid token') 

452 

453 def _error_led_token(self, token): 

454 self._raise_parse_error_for_token(token, 'invalid token') 

455 

456 def _match(self, token_type=None): 

457 # inline'd self._current_token() 

458 if self._current_token() == token_type: 

459 # inline'd self._advance() 

460 self._advance() 

461 else: 

462 self._raise_parse_error_maybe_eof( 

463 token_type, self._lookahead_token(0)) 

464 

465 def _match_multiple_tokens(self, token_types): 

466 if self._current_token() not in token_types: 

467 self._raise_parse_error_maybe_eof( 

468 token_types, self._lookahead_token(0)) 

469 self._advance() 

470 

471 def _advance(self): 

472 self._index += 1 

473 

474 def _current_token(self): 

475 return self._tokens[self._index]['type'] 

476 

477 def _lookahead(self, number): 

478 return self._tokens[self._index + number]['type'] 

479 

480 def _lookahead_token(self, number): 

481 return self._tokens[self._index + number] 

482 

483 def _raise_parse_error_for_token(self, token, reason): 

484 lex_position = token['start'] 

485 actual_value = token['value'] 

486 actual_type = token['type'] 

487 raise exceptions.ParseError(lex_position, actual_value, 

488 actual_type, reason) 

489 

490 def _raise_parse_error_maybe_eof(self, expected_type, token): 

491 lex_position = token['start'] 

492 actual_value = token['value'] 

493 actual_type = token['type'] 

494 if actual_type == 'eof': 

495 raise exceptions.IncompleteExpressionError( 

496 lex_position, actual_value, actual_type) 

497 message = 'Expecting: %s, got: %s' % (expected_type, 

498 actual_type) 

499 raise exceptions.ParseError( 

500 lex_position, actual_value, actual_type, message) 

501 

502 @classmethod 

503 def purge(cls): 

504 """Clear the expression compilation cache.""" 

505 cls._CACHE.clear() 

506 

507 

@with_repr_method
class ParsedResult(object):
    """Pairs a JMESPath expression string with its parsed AST and
    provides evaluation of that AST against input data."""

    def __init__(self, expression, parsed):
        # Original expression text and the AST built from it.
        self.expression = expression
        self.parsed = parsed

    def search(self, value, options=None):
        """Evaluate the parsed expression against ``value`` and return
        the result, honoring optional evaluation ``options``."""
        return visitor.TreeInterpreter(options).visit(self.parsed, value)

    def _render_dot_file(self):
        """Render the parsed AST as a dot file.

        Note that this is marked as an internal method because
        the AST is an implementation detail and is subject
        to change. This method can be used to help troubleshoot
        or for development purposes, but is not considered part
        of the public supported API. Use at your own risk.

        """
        return visitor.GraphvizVisitor().visit(self.parsed)

    def __repr__(self):
        # Show the AST, which is more informative than the default repr.
        return repr(self.parsed)