Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/parso/python/diff.py: 41%

502 statements  

coverage.py v7.2.2, created at 2023-03-26 07:36 +0000

"""
The diff parser is trying to be a faster version of the normal parser by trying
to reuse the nodes of a previous pass over the same file. This is also called
incremental parsing in parser literature. The difference is mostly that with
incremental parsing you get a range that needs to be reparsed. Here we
calculate that range ourselves by using difflib. After that it's essentially
incremental parsing.

The biggest issue of this approach is that we reuse nodes in a mutable way. The
initial design and idea is quite problematic for this parser, but it is also
pretty fast. Measurements showed that just copying nodes in Python is simply
quite a bit slower (especially for big files >3 kLOC). Therefore we did not
want to get rid of the mutable nodes, since this is usually not an issue.

This is by far the hardest software I ever wrote, exactly because the initial
design is crappy. When you have to account for a lot of mutable state, it
creates a ton of issues that you would otherwise not have. This file took
probably 3-6 months to write, which is insane for a parser.

There is a fuzzer that helps test this whole thing. Please use it if you
make changes here. If you run the fuzzer like::

    test/fuzz_diff_parser.py random -n 100000

you can be pretty sure that everything is still fine. I sometimes run the
fuzzer up to 24h to make sure everything is still ok.
"""
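# A minimal illustration (not part of parso) of the difflib ranges mentioned
# above, assuming two lists of lines ``old`` and ``new``::
#
#     import difflib
#     sm = difflib.SequenceMatcher(None, old, new)
#     for operation, i1, i2, j1, j2 in sm.get_opcodes():
#         # operation is 'equal', 'replace', 'insert' or 'delete';
#         # old[i1:i2] corresponds to new[j1:j2]. Only the non-equal ranges
#         # have to be reparsed; for equal ranges the old nodes can be reused.
#         ...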

import re
import difflib
from collections import namedtuple
import logging

from parso.utils import split_lines
from parso.python.parser import Parser
from parso.python.tree import EndMarker
from parso.python.tokenize import PythonToken, BOM_UTF8_STRING
from parso.python.token import PythonTokenTypes

LOG = logging.getLogger(__name__)
DEBUG_DIFF_PARSER = False

_INDENTATION_TOKENS = 'INDENT', 'ERROR_DEDENT', 'DEDENT'

NEWLINE = PythonTokenTypes.NEWLINE
DEDENT = PythonTokenTypes.DEDENT
NAME = PythonTokenTypes.NAME
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
ENDMARKER = PythonTokenTypes.ENDMARKER


def _is_indentation_error_leaf(node):
    return node.type == 'error_leaf' and node.token_type in _INDENTATION_TOKENS


def _get_previous_leaf_if_indentation(leaf):
    while leaf and _is_indentation_error_leaf(leaf):
        leaf = leaf.get_previous_leaf()
    return leaf


def _get_next_leaf_if_indentation(leaf):
    while leaf and _is_indentation_error_leaf(leaf):
        leaf = leaf.get_next_leaf()
    return leaf


def _get_suite_indentation(tree_node):
    return _get_indentation(tree_node.children[1])


def _get_indentation(tree_node):
    return tree_node.start_pos[1]


def _assert_valid_graph(node):
    """
    Checks if the parent/children relationship is correct.

    This is a check that only runs during debugging/testing.
    """
    try:
        children = node.children
    except AttributeError:
        # Ignoring INDENT is necessary, because indent/dedent tokens don't
        # contain a value/prefix and only exist because of the tokenizer.
        if node.type == 'error_leaf' and node.token_type in _INDENTATION_TOKENS:
            assert not node.value
            assert not node.prefix
            return

        # Calculate the content between two start positions.
        previous_leaf = _get_previous_leaf_if_indentation(node.get_previous_leaf())
        if previous_leaf is None:
            content = node.prefix
            previous_start_pos = 1, 0
        else:
            assert previous_leaf.end_pos <= node.start_pos, \
                (previous_leaf, node)

            content = previous_leaf.value + node.prefix
            previous_start_pos = previous_leaf.start_pos

        if '\n' in content or '\r' in content:
            splitted = split_lines(content)
            line = previous_start_pos[0] + len(splitted) - 1
            actual = line, len(splitted[-1])
        else:
            actual = previous_start_pos[0], previous_start_pos[1] + len(content)
            if content.startswith(BOM_UTF8_STRING) \
                    and node.get_start_pos_of_prefix() == (1, 0):
                # Remove the byte order mark
                actual = actual[0], actual[1] - 1

        assert node.start_pos == actual, (node.start_pos, actual)
    else:
        for child in children:
            assert child.parent == node, (node, child)
            _assert_valid_graph(child)


def _assert_nodes_are_equal(node1, node2):
    try:
        children1 = node1.children
    except AttributeError:
        assert not hasattr(node2, 'children'), (node1, node2)
        assert node1.value == node2.value, (node1, node2)
        assert node1.type == node2.type, (node1, node2)
        assert node1.prefix == node2.prefix, (node1, node2)
        assert node1.start_pos == node2.start_pos, (node1, node2)
        return
    else:
        try:
            children2 = node2.children
        except AttributeError:
            assert False, (node1, node2)
    for n1, n2 in zip(children1, children2):
        _assert_nodes_are_equal(n1, n2)
    assert len(children1) == len(children2), '\n' + repr(children1) + '\n' + repr(children2)


def _get_debug_error_message(module, old_lines, new_lines):
    current_lines = split_lines(module.get_code(), keepends=True)
    current_diff = difflib.unified_diff(new_lines, current_lines)
    old_new_diff = difflib.unified_diff(old_lines, new_lines)
    import parso
    return (
        "There's an issue with the diff parser. Please "
        "report (parso v%s) - Old/New:\n%s\nActual Diff (May be empty):\n%s"
        % (parso.__version__, ''.join(old_new_diff), ''.join(current_diff))
    )


def _get_last_line(node_or_leaf):
    last_leaf = node_or_leaf.get_last_leaf()
    if _ends_with_newline(last_leaf):
        return last_leaf.start_pos[0]
    else:
        n = last_leaf.get_next_leaf()
        if n.type == 'endmarker' and '\n' in n.prefix:
            # This is a very special case and has to do with error recovery in
            # Parso. The problem is basically that there's sometimes no newline
            # leaf at the end (it's required in the grammar but not actually
            # needed before the endmarker; CPython just adds a newline to make
            # source code pass the parser; to account for that, Parso's error
            # recovery allows small_stmt instead of simple_stmt).
            return last_leaf.end_pos[0] + 1
        return last_leaf.end_pos[0]


def _skip_dedent_error_leaves(leaf):
    while leaf is not None and leaf.type == 'error_leaf' and leaf.token_type == 'DEDENT':
        leaf = leaf.get_previous_leaf()
    return leaf


def _ends_with_newline(leaf, suffix=''):
    leaf = _skip_dedent_error_leaves(leaf)

    if leaf.type == 'error_leaf':
        typ = leaf.token_type.lower()
    else:
        typ = leaf.type

    return typ == 'newline' or suffix.endswith('\n') or suffix.endswith('\r')


def _flows_finished(pgen_grammar, stack):
    """
    if, while, for and try might not be finished, because another part might
    still be parsed.
    """
    for stack_node in stack:
        if stack_node.nonterminal in ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt'):
            return False
    return True


def _func_or_class_has_suite(node):
    if node.type == 'decorated':
        node = node.children[-1]
    if node.type in ('async_funcdef', 'async_stmt'):
        node = node.children[-1]
    return node.type in ('classdef', 'funcdef') and node.children[-1].type == 'suite'


def _suite_or_file_input_is_valid(pgen_grammar, stack):
    if not _flows_finished(pgen_grammar, stack):
        return False

    for stack_node in reversed(stack):
        if stack_node.nonterminal == 'decorator':
            # A decorator is only valid with the upcoming function.
            return False

        if stack_node.nonterminal == 'suite':
            # If only a newline is in the suite, the suite is not valid yet.
            return len(stack_node.nodes) > 1
    # Not reaching a suite means that we're dealing with file_input levels
    # where there's no need for a valid statement in it. It can also be empty.
    return True


def _is_flow_node(node):
    if node.type == 'async_stmt':
        node = node.children[1]
    try:
        value = node.children[0].value
    except AttributeError:
        return False
    return value in ('if', 'for', 'while', 'try', 'with')


class _PositionUpdatingFinished(Exception):
    pass


def _update_positions(nodes, line_offset, last_leaf):
    for node in nodes:
        try:
            children = node.children
        except AttributeError:
            # Is a leaf
            node.line += line_offset
            if node is last_leaf:
                raise _PositionUpdatingFinished
        else:
            _update_positions(children, line_offset, last_leaf)


class DiffParser:
    """
    An advanced form of parsing a file faster. Unfortunately comes with huge
    side effects. It changes the given module.
    """
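    # Roughly how this class is used (a sketch, not an exact copy of the code
    # in parso that drives it; ``pgen_grammar``, ``tokenizer`` and ``module``
    # come from a previously parsed version of the same file)::
    #
    #     old_lines = split_lines(old_code, keepends=True)
    #     new_lines = split_lines(new_code, keepends=True)
    #     module = DiffParser(pgen_grammar, tokenizer, module).update(
    #         old_lines=old_lines, new_lines=new_lines)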

    def __init__(self, pgen_grammar, tokenizer, module):
        self._pgen_grammar = pgen_grammar
        self._tokenizer = tokenizer
        self._module = module

    def _reset(self):
        self._copy_count = 0
        self._parser_count = 0

        self._nodes_tree = _NodesTree(self._module)

    def update(self, old_lines, new_lines):
        '''
        The algorithm works as follows:

        Equal:
            - Assure that the start is a newline, otherwise parse until we get
              one.
            - Copy from parsed_until_line + 1 to max(i2 + 1)
            - Make sure that the indentation is correct (e.g. add DEDENT)
            - Add old and change positions
        Insert:
            - Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not
              much more.

        Returns the new module node.
        '''
        LOG.debug('diff parser start')
        # Reset the used names cache so they get regenerated.
        self._module._used_names = None

        self._parser_lines_new = new_lines

        self._reset()

        line_length = len(new_lines)
        sm = difflib.SequenceMatcher(None, old_lines, self._parser_lines_new)
        opcodes = sm.get_opcodes()
        LOG.debug('line_lengths old: %s; new: %s' % (len(old_lines), line_length))

        for operation, i1, i2, j1, j2 in opcodes:
            LOG.debug('-> code[%s] old[%s:%s] new[%s:%s]',
                      operation, i1 + 1, i2, j1 + 1, j2)

            if j2 == line_length and new_lines[-1] == '':
                # The empty part after the last newline is not relevant.
                j2 -= 1

            if operation == 'equal':
                line_offset = j1 - i1
                self._copy_from_old_parser(line_offset, i1 + 1, i2, j2)
            elif operation == 'replace':
                self._parse(until_line=j2)
            elif operation == 'insert':
                self._parse(until_line=j2)
            else:
                assert operation == 'delete'

        # With this action all changes will finally be applied and we have a
        # changed module.
        self._nodes_tree.close()

        if DEBUG_DIFF_PARSER:
            # If there is reasonable suspicion that the diff parser is not
            # behaving well, this should be enabled.
            try:
                code = ''.join(new_lines)
                assert self._module.get_code() == code
                _assert_valid_graph(self._module)
                without_diff_parser_module = Parser(
                    self._pgen_grammar,
                    error_recovery=True
                ).parse(self._tokenizer(new_lines))
                _assert_nodes_are_equal(self._module, without_diff_parser_module)
            except AssertionError:
                print(_get_debug_error_message(self._module, old_lines, new_lines))
                raise

        last_pos = self._module.end_pos[0]
        if last_pos != line_length:
            raise Exception(
                ('(%s != %s) ' % (last_pos, line_length))
                + _get_debug_error_message(self._module, old_lines, new_lines)
            )
        LOG.debug('diff parser end')
        return self._module

    def _enabled_debugging(self, old_lines, lines_new):
        if self._module.get_code() != ''.join(lines_new):
            LOG.warning('parser issue:\n%s\n%s', ''.join(old_lines), ''.join(lines_new))

    def _copy_from_old_parser(self, line_offset, start_line_old, until_line_old, until_line_new):
        last_until_line = -1
        while until_line_new > self._nodes_tree.parsed_until_line:
            parsed_until_line_old = self._nodes_tree.parsed_until_line - line_offset
            line_stmt = self._get_old_line_stmt(parsed_until_line_old + 1)
            if line_stmt is None:
                # Parse 1 line at least. We don't need more, because we just
                # want to get into a state where the old parser has statements
                # again that can be copied (e.g. not lines within parentheses).
                self._parse(self._nodes_tree.parsed_until_line + 1)
            else:
                p_children = line_stmt.parent.children
                index = p_children.index(line_stmt)

                if start_line_old == 1 \
                        and p_children[0].get_first_leaf().prefix.startswith(BOM_UTF8_STRING):
                    # If there's a BOM in the beginning, just reparse. It's too
                    # complicated to account for it otherwise.
                    copied_nodes = []
                else:
                    from_ = self._nodes_tree.parsed_until_line + 1
                    copied_nodes = self._nodes_tree.copy_nodes(
                        p_children[index:],
                        until_line_old,
                        line_offset
                    )
                # Match all the nodes that are in the wanted range.
                if copied_nodes:
                    self._copy_count += 1

                    to = self._nodes_tree.parsed_until_line

                    LOG.debug('copy old[%s:%s] new[%s:%s]',
                              copied_nodes[0].start_pos[0],
                              copied_nodes[-1].end_pos[0] - 1, from_, to)
                else:
                    # We have copied as much as possible (but definitely not too
                    # much). Therefore we just parse a bit more.
                    self._parse(self._nodes_tree.parsed_until_line + 1)
            # Since there are potential bugs that might loop here endlessly, we
            # just stop here.
            assert last_until_line != self._nodes_tree.parsed_until_line, last_until_line
            last_until_line = self._nodes_tree.parsed_until_line

    def _get_old_line_stmt(self, old_line):
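        # Find the statement in the old tree (a direct child of a suite or of
        # file_input) that starts on ``old_line``; return None if that line
        # does not start such a statement and therefore has to be reparsed.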

        leaf = self._module.get_leaf_for_position((old_line, 0), include_prefixes=True)

        if _ends_with_newline(leaf):
            leaf = leaf.get_next_leaf()
        if leaf.get_start_pos_of_prefix()[0] == old_line:
            node = leaf
            while node.parent.type not in ('file_input', 'suite'):
                node = node.parent

            # Make sure that if only the `else:` line of an if statement is
            # copied, the whole statement is not copied.
            if node.start_pos[0] >= old_line:
                return node
        # Must be on the same line. Otherwise we need to parse that bit.
        return None

    def _parse(self, until_line):
        """
        Parses at least until the given line, but might just parse more until a
        valid state is reached.
        """
        last_until_line = 0
        while until_line > self._nodes_tree.parsed_until_line:
            node = self._try_parse_part(until_line)
            nodes = node.children

            self._nodes_tree.add_parsed_nodes(nodes, self._keyword_token_indents)
            if self._replace_tos_indent is not None:
                self._nodes_tree.indents[-1] = self._replace_tos_indent

            LOG.debug(
                'parse_part from %s to %s (to %s in part parser)',
                nodes[0].get_start_pos_of_prefix()[0],
                self._nodes_tree.parsed_until_line,
                node.end_pos[0] - 1
            )
            # Since the tokenizer sometimes has bugs, we cannot be sure that
            # this loop terminates. Therefore assert that there's always a
            # change.
            assert last_until_line != self._nodes_tree.parsed_until_line, last_until_line
            last_until_line = self._nodes_tree.parsed_until_line

    def _try_parse_part(self, until_line):
        """
        Sets up a normal parser that uses a specialized tokenizer to only parse
        until a certain position (or a bit longer if the statement hasn't
        ended).
        """
        self._parser_count += 1
        # TODO speed up, shouldn't copy the whole list all the time.
        # memoryview?
        parsed_until_line = self._nodes_tree.parsed_until_line
        lines_after = self._parser_lines_new[parsed_until_line:]
        tokens = self._diff_tokenize(
            lines_after,
            until_line,
            line_offset=parsed_until_line
        )
        self._active_parser = Parser(
            self._pgen_grammar,
            error_recovery=True
        )
        return self._active_parser.parse(tokens=tokens)

    def _diff_tokenize(self, lines, until_line, line_offset=0):
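        # Wrap the tokenizer and cut the token stream short: once a position
        # past ``until_line`` is reached and the parser is in a state where it
        # may validly end (dedented back to the initial indentation, or a
        # valid suite/file_input), emit an ENDMARKER instead of continuing.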

        was_newline = False
        indents = self._nodes_tree.indents
        initial_indentation_count = len(indents)

        tokens = self._tokenizer(
            lines,
            start_pos=(line_offset + 1, 0),
            indents=indents,
            is_first_token=line_offset == 0,
        )
        stack = self._active_parser.stack
        self._replace_tos_indent = None
        self._keyword_token_indents = {}
        # print('start', line_offset + 1, indents)
        for token in tokens:
            # print(token, indents)
            typ = token.type
            if typ == DEDENT:
                if len(indents) < initial_indentation_count:
                    # We are done here, only thing that can come now is an
                    # endmarker or another dedented code block.
                    while True:
                        typ, string, start_pos, prefix = token = next(tokens)
                        if typ in (DEDENT, ERROR_DEDENT):
                            if typ == ERROR_DEDENT:
                                # We want to force an error dedent in the next
                                # parser/pass. To make this possible we just
                                # increase the location by one.
                                self._replace_tos_indent = start_pos[1] + 1
                                pass
                        else:
                            break

                    if '\n' in prefix or '\r' in prefix:
                        prefix = re.sub(r'[^\n\r]+\Z', '', prefix)
                    else:
                        assert start_pos[1] >= len(prefix), repr(prefix)
                        if start_pos[1] - len(prefix) == 0:
                            prefix = ''
                    yield PythonToken(
                        ENDMARKER, '',
                        start_pos,
                        prefix
                    )
                    break
            elif typ == NEWLINE and token.start_pos[0] >= until_line:
                was_newline = True
            elif was_newline:
                was_newline = False
                if len(indents) == initial_indentation_count:
                    # Check if the parser is actually in a valid suite state.
                    if _suite_or_file_input_is_valid(self._pgen_grammar, stack):
                        yield PythonToken(ENDMARKER, '', token.start_pos, '')
                        break

            if typ == NAME and token.string in ('class', 'def'):
                self._keyword_token_indents[token.start_pos] = list(indents)

            yield token


class _NodesTreeNode:
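    # Wraps one node of the parser tree and collects groups of children (plus
    # line offsets) that are attached to ``tree_node`` once ``finish()`` is
    # called at the end of a diff parsing run.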

    _ChildrenGroup = namedtuple(
        '_ChildrenGroup',
        'prefix children line_offset last_line_offset_leaf')

    def __init__(self, tree_node, parent=None, indentation=0):
        self.tree_node = tree_node
        self._children_groups = []
        self.parent = parent
        self._node_children = []
        self.indentation = indentation

    def finish(self):
        children = []
        for prefix, children_part, line_offset, last_line_offset_leaf in self._children_groups:
            first_leaf = _get_next_leaf_if_indentation(
                children_part[0].get_first_leaf()
            )

            first_leaf.prefix = prefix + first_leaf.prefix
            if line_offset != 0:
                try:
                    _update_positions(
                        children_part, line_offset, last_line_offset_leaf)
                except _PositionUpdatingFinished:
                    pass
            children += children_part
        self.tree_node.children = children
        # Reset the parents
        for node in children:
            node.parent = self.tree_node

        for node_child in self._node_children:
            node_child.finish()

    def add_child_node(self, child_node):
        self._node_children.append(child_node)

    def add_tree_nodes(self, prefix, children, line_offset=0,
                       last_line_offset_leaf=None):
        if last_line_offset_leaf is None:
            last_line_offset_leaf = children[-1].get_last_leaf()
        group = self._ChildrenGroup(
            prefix, children, line_offset, last_line_offset_leaf
        )
        self._children_groups.append(group)

    def get_last_line(self, suffix):
        line = 0
        if self._children_groups:
            children_group = self._children_groups[-1]
            last_leaf = _get_previous_leaf_if_indentation(
                children_group.last_line_offset_leaf
            )

            line = last_leaf.end_pos[0] + children_group.line_offset

            # Newlines end on the next line, which means that they would cover
            # the next line. That line is not fully parsed at this point.
            if _ends_with_newline(last_leaf, suffix):
                line -= 1
        line += len(split_lines(suffix)) - 1

        if suffix and not suffix.endswith('\n') and not suffix.endswith('\r'):
            # This is the end of a file (that doesn't end with a newline).
            line += 1

        if self._node_children:
            return max(line, self._node_children[-1].get_last_line(suffix))
        return line

    def __repr__(self):
        return '<%s: %s>' % (self.__class__.__name__, self.tree_node)


class _NodesTree:
    def __init__(self, module):
        self._base_node = _NodesTreeNode(module)
        self._working_stack = [self._base_node]
        self._module = module
        self._prefix_remainder = ''
        self.prefix = ''
        self.indents = [0]

    @property
    def parsed_until_line(self):
        return self._working_stack[-1].get_last_line(self.prefix)

    def _update_insertion_node(self, indentation):
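        # Pop finished (more indented) nodes from the working stack until we
        # reach the node that new nodes with this indentation belong to, and
        # return it.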

        for node in reversed(list(self._working_stack)):
            if node.indentation < indentation or node is self._working_stack[0]:
                return node
            self._working_stack.pop()

    def add_parsed_nodes(self, tree_nodes, keyword_token_indents):
        old_prefix = self.prefix
        tree_nodes = self._remove_endmarker(tree_nodes)
        if not tree_nodes:
            self.prefix = old_prefix + self.prefix
            return

        assert tree_nodes[0].type != 'newline'

        node = self._update_insertion_node(tree_nodes[0].start_pos[1])
        assert node.tree_node.type in ('suite', 'file_input')
        node.add_tree_nodes(old_prefix, tree_nodes)
        # tos = Top of stack
        self._update_parsed_node_tos(tree_nodes[-1], keyword_token_indents)

    def _update_parsed_node_tos(self, tree_node, keyword_token_indents):
        if tree_node.type == 'suite':
            def_leaf = tree_node.parent.children[0]
            new_tos = _NodesTreeNode(
                tree_node,
                indentation=keyword_token_indents[def_leaf.start_pos][-1],
            )
            new_tos.add_tree_nodes('', list(tree_node.children))

            self._working_stack[-1].add_child_node(new_tos)
            self._working_stack.append(new_tos)

            self._update_parsed_node_tos(tree_node.children[-1], keyword_token_indents)
        elif _func_or_class_has_suite(tree_node):
            self._update_parsed_node_tos(tree_node.children[-1], keyword_token_indents)

    def _remove_endmarker(self, tree_nodes):
        """
        Helps clean up the tree nodes that get inserted.
        """
        last_leaf = tree_nodes[-1].get_last_leaf()
        is_endmarker = last_leaf.type == 'endmarker'
        self._prefix_remainder = ''
        if is_endmarker:
            prefix = last_leaf.prefix
            separation = max(prefix.rfind('\n'), prefix.rfind('\r'))
            if separation > -1:
                # Remove the whitespace part of the prefix after a newline.
                # That is not relevant if parentheses were opened. Always parse
                # until the end of a line.
                last_leaf.prefix, self._prefix_remainder = \
                    last_leaf.prefix[:separation + 1], last_leaf.prefix[separation + 1:]

        self.prefix = ''

        if is_endmarker:
            self.prefix = last_leaf.prefix

            tree_nodes = tree_nodes[:-1]
        return tree_nodes

    def _get_matching_indent_nodes(self, tree_nodes, is_new_suite):
        # There might be a random dedent where we have to stop copying.
        # Invalid indents are ok, because the parser handled that
        # properly before. An invalid dedent can happen, because a few
        # lines above there was an invalid indent.
        node_iterator = iter(tree_nodes)
        if is_new_suite:
            yield next(node_iterator)

        first_node = next(node_iterator)
        indent = _get_indentation(first_node)
        if not is_new_suite and indent not in self.indents:
            return
        yield first_node

        for n in node_iterator:
            if _get_indentation(n) != indent:
                return
            yield n

    def copy_nodes(self, tree_nodes, until_line, line_offset):
        """
        Copies tree nodes from the old parser tree.

        Returns the list of copied tree nodes.
        """
        if tree_nodes[0].type in ('error_leaf', 'error_node'):
            # Avoid copying errors in the beginning. Can lead to a lot of
            # issues.
            return []

        indentation = _get_indentation(tree_nodes[0])
        old_working_stack = list(self._working_stack)
        old_prefix = self.prefix
        old_indents = self.indents
        self.indents = [i for i in self.indents if i <= indentation]

        self._update_insertion_node(indentation)

        new_nodes, self._working_stack, self.prefix, added_indents = self._copy_nodes(
            list(self._working_stack),
            tree_nodes,
            until_line,
            line_offset,
            self.prefix,
        )
        if new_nodes:
            self.indents += added_indents
        else:
            self._working_stack = old_working_stack
            self.prefix = old_prefix
            self.indents = old_indents
        return new_nodes

    def _copy_nodes(self, working_stack, nodes, until_line, line_offset,
                    prefix='', is_nested=False):
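        # Collect as many of ``nodes`` as can be reused up to ``until_line``,
        # dropping trailing error nodes and flow statements, and recursing into
        # a trailing function/class suite so that it can be copied partially.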

        new_nodes = []
        added_indents = []

        nodes = list(self._get_matching_indent_nodes(
            nodes,
            is_new_suite=is_nested,
        ))

        new_prefix = ''
        for node in nodes:
            if node.start_pos[0] > until_line:
                break

            if node.type == 'endmarker':
                break

            if node.type == 'error_leaf' and node.token_type in ('DEDENT', 'ERROR_DEDENT'):
                break
            # TODO this check might take a bit of time for large files. We
            # might want to change this to do more intelligent guessing or
            # binary search.
            if _get_last_line(node) > until_line:
                # We can split up functions and classes later.
                if _func_or_class_has_suite(node):
                    new_nodes.append(node)
                break
            try:
                c = node.children
            except AttributeError:
                pass
            else:
                # This case basically appears with error recovery of one line
                # suites like `def foo(): bar.-`. In this case we might not
                # include a newline in the statement and we need to take care
                # of that.
                n = node
                if n.type == 'decorated':
                    n = n.children[-1]
                if n.type in ('async_funcdef', 'async_stmt'):
                    n = n.children[-1]
                if n.type in ('classdef', 'funcdef'):
                    suite_node = n.children[-1]
                else:
                    suite_node = c[-1]

                if suite_node.type in ('error_leaf', 'error_node'):
                    break

            new_nodes.append(node)

        # Pop error nodes at the end from the list
        if new_nodes:
            while new_nodes:
                last_node = new_nodes[-1]
                if (last_node.type in ('error_leaf', 'error_node')
                        or _is_flow_node(new_nodes[-1])):
                    # Error leafs/nodes don't have a defined start/end. Error
                    # nodes might not end with a newline (e.g. if there's an
                    # open `(`). Therefore ignore all of them unless they are
                    # followed by a valid parser state.
                    # If we copy flows at the end, they might be continued
                    # after the copy limit (in the new parser).
                    # In this while loop we try to remove until we find a newline.
                    new_prefix = ''
                    new_nodes.pop()
                    while new_nodes:
                        last_node = new_nodes[-1]
                        if last_node.get_last_leaf().type == 'newline':
                            break
                        new_nodes.pop()
                    continue
                if len(new_nodes) > 1 and new_nodes[-2].type == 'error_node':
                    # The problem here is that Parso error recovery sometimes
                    # influences nodes before this node.
                    # Since the new last node is an error node this will get
                    # cleaned up in the next while iteration.
                    new_nodes.pop()
                    continue
                break

        if not new_nodes:
            return [], working_stack, prefix, added_indents

        tos = working_stack[-1]
        last_node = new_nodes[-1]
        had_valid_suite_last = False
        # Pop incomplete suites from the list
        if _func_or_class_has_suite(last_node):
            suite = last_node
            while suite.type != 'suite':
                suite = suite.children[-1]

            indent = _get_suite_indentation(suite)
            added_indents.append(indent)

            suite_tos = _NodesTreeNode(suite, indentation=_get_indentation(last_node))
            # Don't need to pass line_offset here, it's already done by the
            # parent.
            suite_nodes, new_working_stack, new_prefix, ai = self._copy_nodes(
                working_stack + [suite_tos], suite.children, until_line, line_offset,
                is_nested=True,
            )
            added_indents += ai
            if len(suite_nodes) < 2:
                # A suite only with newline is not valid.
                new_nodes.pop()
                new_prefix = ''
            else:
                assert new_nodes
                tos.add_child_node(suite_tos)
                working_stack = new_working_stack
                had_valid_suite_last = True

        if new_nodes:
            if not _ends_with_newline(new_nodes[-1].get_last_leaf()) and not had_valid_suite_last:
                p = new_nodes[-1].get_next_leaf().prefix
                # We are not allowed to remove the newline at the end of the
                # line, otherwise it's going to be missing. This happens e.g.
                # if an open bracket before this point moves newlines into
                # prefixes.
                new_prefix = split_lines(p, keepends=True)[0]

            if had_valid_suite_last:
                last = new_nodes[-1]
                if last.type == 'decorated':
                    last = last.children[-1]
                if last.type in ('async_funcdef', 'async_stmt'):
                    last = last.children[-1]
                last_line_offset_leaf = last.children[-2].get_last_leaf()
                assert last_line_offset_leaf == ':'
            else:
                last_line_offset_leaf = new_nodes[-1].get_last_leaf()
            tos.add_tree_nodes(
                prefix, new_nodes, line_offset, last_line_offset_leaf,
            )
            prefix = new_prefix
            self._prefix_remainder = ''

        return new_nodes, working_stack, prefix, added_indents

    def close(self):
        self._base_node.finish()

        # Add an endmarker.
        try:
            last_leaf = self._module.get_last_leaf()
        except IndexError:
            end_pos = [1, 0]
        else:
            last_leaf = _skip_dedent_error_leaves(last_leaf)
            end_pos = list(last_leaf.end_pos)
        lines = split_lines(self.prefix)
        assert len(lines) > 0
        if len(lines) == 1:
            if lines[0].startswith(BOM_UTF8_STRING) and end_pos == [1, 0]:
                end_pos[1] -= 1
            end_pos[1] += len(lines[0])
        else:
            end_pos[0] += len(lines) - 1
            end_pos[1] = len(lines[-1])

        endmarker = EndMarker('', tuple(end_pos), self.prefix + self._prefix_remainder)
        endmarker.parent = self._module
        self._module.children.append(endmarker)