Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/jmespath/parser.py: 97%

313 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:03 +0000

1"""Top down operator precedence parser. 

2 

3This is an implementation of Vaughan R. Pratt's 

4"Top Down Operator Precedence" parser. 

5(http://dl.acm.org/citation.cfm?doid=512927.512931). 

6 

7These are some additional resources that help explain the 

8general idea behind a Pratt parser: 

9 

10* http://effbot.org/zone/simple-top-down-parsing.htm 

11* http://javascript.crockford.com/tdop/tdop.html 

12 

13A few notes on the implementation. 

14 

15* All the nud/led tokens are on the Parser class itself, and are dispatched 

16 using getattr(). This keeps all the parsing logic contained to a single 

17 class. 

18* We use two passes through the data. One to create a list of token, 

19 then one pass through the tokens to create the AST. While the lexer actually 

20 yields tokens, we convert it to a list so we can easily implement two tokens 

21 of lookahead. A previous implementation used a fixed circular buffer, but it 

22 was significantly slower. Also, the average jmespath expression typically 

23 does not have a large amount of token so this is not an issue. And 

24 interestingly enough, creating a token list first is actually faster than 

25 consuming from the token iterator one token at a time. 

26 

27""" 

28import random 

29 

30from jmespath import lexer 

31from jmespath.compat import with_repr_method 

32from jmespath import ast 

33from jmespath import exceptions 

34from jmespath import visitor 

35 

36 

class Parser(object):
    """Top down operator precedence (Pratt) parser for JMESPath.

    The expression is tokenized eagerly into a list (giving cheap,
    arbitrary lookahead), then a single pass over that list builds the
    AST.  All nud/led handlers live on this class and are dispatched by
    token type via ``getattr`` (``_token_nud_<type>`` and
    ``_token_led_<type>``).
    """

    # Left binding power for each token type.  Any token whose binding
    # power is at or below _PROJECTION_STOP terminates a projection.
    BINDING_POWER = {
        'eof': 0,
        'unquoted_identifier': 0,
        'quoted_identifier': 0,
        'literal': 0,
        'rbracket': 0,
        'rparen': 0,
        'comma': 0,
        'rbrace': 0,
        'number': 0,
        'current': 0,
        'expref': 0,
        'colon': 0,
        'pipe': 1,
        'or': 2,
        'and': 3,
        'eq': 5,
        'gt': 5,
        'lt': 5,
        'gte': 5,
        'lte': 5,
        'ne': 5,
        'flatten': 9,
        # Everything above stops a projection.
        'star': 20,
        'filter': 21,
        'dot': 40,
        'not': 45,
        'lbrace': 50,
        'lbracket': 55,
        'lparen': 60,
    }
    # The maximum binding power for a token that can stop
    # a projection.
    _PROJECTION_STOP = 10
    # The _MAX_SIZE most recent expressions are cached in the
    # class-level _CACHE dict, shared by every Parser instance.
    _CACHE = {}
    _MAX_SIZE = 128

    def __init__(self, lookahead=2):
        # ``lookahead`` is retained for backwards compatibility; the
        # parser tokenizes eagerly, so lookahead is effectively
        # unlimited.
        self.tokenizer = None
        self._tokens = [None] * lookahead
        self._buffer_size = lookahead
        self._index = 0

    def parse(self, expression):
        """Parse ``expression`` and return a ``ParsedResult``.

        Results are memoized in a class-level cache; roughly half the
        entries are randomly evicted once the cache exceeds
        ``_MAX_SIZE``.
        """
        cached = self._CACHE.get(expression)
        if cached is not None:
            return cached
        parsed_result = self._do_parse(expression)
        self._CACHE[expression] = parsed_result
        if len(self._CACHE) > self._MAX_SIZE:
            self._free_cache_entries()
        return parsed_result

    def _do_parse(self, expression):
        # Annotate any raised error with the offending expression so
        # callers can render a useful message.
        try:
            return self._parse(expression)
        except exceptions.LexerError as e:
            e.expression = expression
            raise
        except exceptions.IncompleteExpressionError as e:
            e.set_expression(expression)
            raise
        except exceptions.ParseError as e:
            e.expression = expression
            raise

    def _parse(self, expression):
        self.tokenizer = lexer.Lexer().tokenize(expression)
        # Materialize the token stream to get arbitrary lookahead.
        self._tokens = list(self.tokenizer)
        self._index = 0
        parsed = self._expression(binding_power=0)
        if self._current_token() != 'eof':
            # Trailing tokens after a complete expression are an error.
            t = self._lookahead_token(0)
            raise exceptions.ParseError(t['start'], t['value'], t['type'],
                                        "Unexpected token: %s" % t['value'])
        return ParsedResult(expression, parsed)

    def _expression(self, binding_power=0):
        # Core Pratt loop: consume one token and apply its nud handler,
        # then keep applying led handlers while the next token binds
        # more tightly than ``binding_power``.
        left_token = self._lookahead_token(0)
        self._advance()
        nud_function = getattr(
            self, '_token_nud_%s' % left_token['type'],
            self._error_nud_token)
        left = nud_function(left_token)
        current_token = self._current_token()
        while binding_power < self.BINDING_POWER[current_token]:
            led = getattr(self, '_token_led_%s' % current_token, None)
            if led is None:
                # No led handler for this token type: always raises.
                error_token = self._lookahead_token(0)
                self._error_led_token(error_token)
            else:
                self._advance()
                left = led(left)
                current_token = self._current_token()
        return left

    def _token_nud_literal(self, token):
        return ast.literal(token['value'])

    def _token_nud_unquoted_identifier(self, token):
        return ast.field(token['value'])

    def _token_nud_quoted_identifier(self, token):
        field = ast.field(token['value'])
        # You can't have a quoted identifier as a function name.
        if self._current_token() == 'lparen':
            t = self._lookahead_token(0)
            # Report the position of the offending token rather than a
            # hard-coded 0, so the caret points at the actual problem.
            raise exceptions.ParseError(
                t['start'], t['value'], t['type'],
                'Quoted identifier not allowed for function names.')
        return field

    def _token_nud_star(self, token):
        # A leading '*' projects over the current node (identity).
        left = ast.identity()
        if self._current_token() == 'rbracket':
            right = ast.identity()
        else:
            right = self._parse_projection_rhs(self.BINDING_POWER['star'])
        return ast.value_projection(left, right)

    def _token_nud_filter(self, token):
        # A leading filter applies to the current node.
        return self._token_led_filter(ast.identity())

    def _token_nud_lbrace(self, token):
        return self._parse_multi_select_hash()

    def _token_nud_lparen(self, token):
        # Parenthesized sub-expression.
        expression = self._expression()
        self._match('rparen')
        return expression

    def _token_nud_flatten(self, token):
        # A leading '[]' flattens the current node and projects.
        left = ast.flatten(ast.identity())
        right = self._parse_projection_rhs(
            self.BINDING_POWER['flatten'])
        return ast.projection(left, right)

    def _token_nud_not(self, token):
        expr = self._expression(self.BINDING_POWER['not'])
        return ast.not_expression(expr)

    def _token_nud_lbracket(self, token):
        if self._current_token() in ('number', 'colon'):
            right = self._parse_index_expression()
            # We could optimize this and remove the identity() node.
            # We don't really need an index_expression node, we can
            # just emit an index node here if we're not dealing
            # with a slice.
            return self._project_if_slice(ast.identity(), right)
        elif self._current_token() == 'star' and \
                self._lookahead(1) == 'rbracket':
            # '[*]' at the start of an expression.
            self._advance()
            self._advance()
            right = self._parse_projection_rhs(self.BINDING_POWER['star'])
            return ast.projection(ast.identity(), right)
        else:
            return self._parse_multi_select_list()

    def _parse_index_expression(self):
        # We're here:
        # [<current>
        #  ^
        #  | current token
        if (self._lookahead(0) == 'colon' or
                self._lookahead(1) == 'colon'):
            return self._parse_slice_expression()
        else:
            # Parse the syntax [number]
            node = ast.index(self._lookahead_token(0)['value'])
            self._advance()
            self._match('rbracket')
            return node

    def _parse_slice_expression(self):
        # [start:end:step]
        # Where start, end, and step are optional.
        # The last colon is optional as well.
        parts = [None, None, None]
        index = 0
        current_token = self._current_token()
        while current_token != 'rbracket' and index < 3:
            if current_token == 'colon':
                index += 1
                if index == 3:
                    # More than two colons: malformed slice.
                    self._raise_parse_error_for_token(
                        self._lookahead_token(0), 'syntax error')
                self._advance()
            elif current_token == 'number':
                parts[index] = self._lookahead_token(0)['value']
                self._advance()
            else:
                self._raise_parse_error_for_token(
                    self._lookahead_token(0), 'syntax error')
            current_token = self._current_token()
        self._match('rbracket')
        return ast.slice(*parts)

    def _token_nud_current(self, token):
        return ast.current_node()

    def _token_nud_expref(self, token):
        expression = self._expression(self.BINDING_POWER['expref'])
        return ast.expref(expression)

    def _token_led_dot(self, left):
        if self._current_token() != 'star':
            right = self._parse_dot_rhs(self.BINDING_POWER['dot'])
            if left['type'] == 'subexpression':
                # Flatten chained dots into a single subexpression node.
                left['children'].append(right)
                return left
            else:
                return ast.subexpression([left, right])
        else:
            # We're creating a projection.
            self._advance()
            right = self._parse_projection_rhs(
                self.BINDING_POWER['dot'])
            return ast.value_projection(left, right)

    def _token_led_pipe(self, left):
        right = self._expression(self.BINDING_POWER['pipe'])
        return ast.pipe(left, right)

    def _token_led_or(self, left):
        right = self._expression(self.BINDING_POWER['or'])
        return ast.or_expression(left, right)

    def _token_led_and(self, left):
        right = self._expression(self.BINDING_POWER['and'])
        return ast.and_expression(left, right)

    def _token_led_lparen(self, left):
        # Function call: ``name(arg0, arg1, ...)``.
        if left['type'] != 'field':
            #  0 - first func arg or closing paren.
            # -1 - '(' token
            # -2 - invalid function "name".
            prev_t = self._lookahead_token(-2)
            raise exceptions.ParseError(
                prev_t['start'], prev_t['value'], prev_t['type'],
                "Invalid function name '%s'" % prev_t['value'])
        name = left['value']
        args = []
        while self._current_token() != 'rparen':
            expression = self._expression()
            if self._current_token() == 'comma':
                self._match('comma')
            args.append(expression)
        self._match('rparen')
        function_node = ast.function_expression(name, args)
        return function_node

    def _token_led_filter(self, left):
        # Filters are projections.
        condition = self._expression(0)
        self._match('rbracket')
        if self._current_token() == 'flatten':
            right = ast.identity()
        else:
            right = self._parse_projection_rhs(self.BINDING_POWER['filter'])
        return ast.filter_projection(left, right, condition)

    def _token_led_eq(self, left):
        return self._parse_comparator(left, 'eq')

    def _token_led_ne(self, left):
        return self._parse_comparator(left, 'ne')

    def _token_led_gt(self, left):
        return self._parse_comparator(left, 'gt')

    def _token_led_gte(self, left):
        return self._parse_comparator(left, 'gte')

    def _token_led_lt(self, left):
        return self._parse_comparator(left, 'lt')

    def _token_led_lte(self, left):
        return self._parse_comparator(left, 'lte')

    def _token_led_flatten(self, left):
        left = ast.flatten(left)
        right = self._parse_projection_rhs(
            self.BINDING_POWER['flatten'])
        return ast.projection(left, right)

    def _token_led_lbracket(self, left):
        token = self._lookahead_token(0)
        if token['type'] in ('number', 'colon'):
            right = self._parse_index_expression()
            if left['type'] == 'index_expression':
                # Optimization: if the left node is an index expr,
                # we can avoid creating another node and instead just add
                # the right node as a child of the left.
                left['children'].append(right)
                return left
            else:
                return self._project_if_slice(left, right)
        else:
            # We have a projection
            self._match('star')
            self._match('rbracket')
            right = self._parse_projection_rhs(self.BINDING_POWER['star'])
            return ast.projection(left, right)

    def _project_if_slice(self, left, right):
        # Slices project over their result; plain indexes do not.
        index_expr = ast.index_expression([left, right])
        if right['type'] == 'slice':
            return ast.projection(
                index_expr,
                self._parse_projection_rhs(self.BINDING_POWER['star']))
        else:
            return index_expr

    def _parse_comparator(self, left, comparator):
        right = self._expression(self.BINDING_POWER[comparator])
        return ast.comparator(comparator, left, right)

    def _parse_multi_select_list(self):
        # ``[expr, expr, ...]``
        expressions = []
        while True:
            expression = self._expression()
            expressions.append(expression)
            if self._current_token() == 'rbracket':
                break
            else:
                self._match('comma')
        self._match('rbracket')
        return ast.multi_select_list(expressions)

    def _parse_multi_select_hash(self):
        # ``{key: expr, key: expr, ...}``
        pairs = []
        while True:
            key_token = self._lookahead_token(0)
            # Before getting the token value, verify it's
            # an identifier.
            self._match_multiple_tokens(
                token_types=['quoted_identifier', 'unquoted_identifier'])
            key_name = key_token['value']
            self._match('colon')
            value = self._expression(0)
            node = ast.key_val_pair(key_name=key_name, node=value)
            pairs.append(node)
            if self._current_token() == 'comma':
                self._match('comma')
            elif self._current_token() == 'rbrace':
                self._match('rbrace')
                break
        return ast.multi_select_dict(nodes=pairs)

    def _parse_projection_rhs(self, binding_power):
        # Parse the right hand side of the projection.
        if self.BINDING_POWER[self._current_token()] < self._PROJECTION_STOP:
            # BP of 10 are all the tokens that stop a projection.
            right = ast.identity()
        elif self._current_token() == 'lbracket':
            right = self._expression(binding_power)
        elif self._current_token() == 'filter':
            right = self._expression(binding_power)
        elif self._current_token() == 'dot':
            self._match('dot')
            right = self._parse_dot_rhs(binding_power)
        else:
            self._raise_parse_error_for_token(self._lookahead_token(0),
                                              'syntax error')
        return right

    def _parse_dot_rhs(self, binding_power):
        # From the grammar:
        # expression '.' ( identifier /
        #                  multi-select-list /
        #                  multi-select-hash /
        #                  function-expression /
        #                  *
        # In terms of tokens that means that after a '.',
        # you can have:
        lookahead = self._current_token()
        # Common case "foo.bar", so first check for an identifier.
        if lookahead in ('quoted_identifier', 'unquoted_identifier', 'star'):
            return self._expression(binding_power)
        elif lookahead == 'lbracket':
            self._match('lbracket')
            return self._parse_multi_select_list()
        elif lookahead == 'lbrace':
            self._match('lbrace')
            return self._parse_multi_select_hash()
        else:
            t = self._lookahead_token(0)
            allowed = ['quoted_identifier', 'unquoted_identifier',
                       'lbracket', 'lbrace']
            msg = (
                "Expecting: %s, got: %s" % (allowed, t['type'])
            )
            self._raise_parse_error_for_token(t, msg)

    def _error_nud_token(self, token):
        # Fallback when a token type has no nud handler.
        if token['type'] == 'eof':
            raise exceptions.IncompleteExpressionError(
                token['start'], token['value'], token['type'])
        self._raise_parse_error_for_token(token, 'invalid token')

    def _error_led_token(self, token):
        self._raise_parse_error_for_token(token, 'invalid token')

    def _match(self, token_type=None):
        """Consume the current token, requiring it to be ``token_type``."""
        if self._current_token() == token_type:
            self._advance()
        else:
            self._raise_parse_error_maybe_eof(
                token_type, self._lookahead_token(0))

    def _match_multiple_tokens(self, token_types):
        """Consume the current token, requiring its type in ``token_types``."""
        if self._current_token() not in token_types:
            self._raise_parse_error_maybe_eof(
                token_types, self._lookahead_token(0))
        self._advance()

    def _advance(self):
        self._index += 1

    def _current_token(self):
        return self._tokens[self._index]['type']

    def _lookahead(self, number):
        return self._tokens[self._index + number]['type']

    def _lookahead_token(self, number):
        return self._tokens[self._index + number]

    def _raise_parse_error_for_token(self, token, reason):
        lex_position = token['start']
        actual_value = token['value']
        actual_type = token['type']
        raise exceptions.ParseError(lex_position, actual_value,
                                    actual_type, reason)

    def _raise_parse_error_maybe_eof(self, expected_type, token):
        # An unexpected EOF gets its own, more specific exception type.
        lex_position = token['start']
        actual_value = token['value']
        actual_type = token['type']
        if actual_type == 'eof':
            raise exceptions.IncompleteExpressionError(
                lex_position, actual_value, actual_type)
        message = 'Expecting: %s, got: %s' % (expected_type,
                                              actual_type)
        raise exceptions.ParseError(
            lex_position, actual_value, actual_type, message)

    def _free_cache_entries(self):
        # Evict a random half of the cache.  Random eviction avoids the
        # bookkeeping cost of an LRU for this low-stakes cache.
        for key in random.sample(list(self._CACHE.keys()),
                                 self._MAX_SIZE // 2):
            self._CACHE.pop(key, None)

    @classmethod
    def purge(cls):
        """Clear the expression compilation cache."""
        cls._CACHE.clear()

499 

500 

@with_repr_method
class ParsedResult(object):
    """A compiled JMESPath expression paired with its parsed AST."""

    def __init__(self, expression, parsed):
        self.expression = expression
        self.parsed = parsed

    def search(self, value, options=None):
        """Evaluate the compiled expression against ``value``."""
        return visitor.TreeInterpreter(options).visit(self.parsed, value)

    def _render_dot_file(self):
        """Render the parsed AST as a dot file.

        Note that this is marked as an internal method because
        the AST is an implementation detail and is subject
        to change.  This method can be used to help troubleshoot
        or for development purposes, but is not considered part
        of the public supported API.  Use at your own risk.

        """
        return visitor.GraphvizVisitor().visit(self.parsed)

    def __repr__(self):
        return repr(self.parsed)