1"""pytokens - A Fast, spec compliant Python 3.12+ tokenizer that runs on older Pythons.""" 

2 

3from __future__ import annotations 

4 

5from dataclasses import dataclass, field 

6import enum 

7import string 

8from typing import Iterator, NewType 

9 

10 

11class TokenizeError(Exception): ... 

12 

13 

14class IndentationError(TokenizeError): ... 

15 

16 

17class InconsistentUseOfTabsAndSpaces(IndentationError): ... 

18 

19 

20class DedentDoesNotMatchAnyOuterIndent(IndentationError): ... 

21 

22 

23class UnterminatedString(TokenizeError): ... 

24 

25 

26class UnexpectedEOF(TokenizeError): ... 

27 

28 

29class UnexpectedCharacterAfterBackslash(TokenizeError): ... 

30 

31 

32class NotAnIndent(AssertionError): ... 

33 

34 

35class Underflow(AssertionError): ... 

36 

37 

class TokenType(enum.IntEnum):
    whitespace = 1
    indent = 2
    dedent = 3
    newline = 4  # semantically meaningful newline
    nl = 5  # non-meaningful newline
    comment = 6

    _op_start = 7  # marker used to check if a token is an operator
    semicolon = 8
    lparen = 9
    rparen = 10
    lbracket = 11
    rbracket = 12
    lbrace = 13
    rbrace = 14
    colon = 15
    op = 16
    _op_end = 17  # marker used to check if a token is an operator

    identifier = 18
    number = 19
    string = 20
    fstring_start = 21
    fstring_middle = 22
    fstring_end = 23

    endmarker = 24

    errortoken = 25

    def __repr__(self) -> str:
        return f"TokenType.{self.name}"

    def to_python_token(self) -> str:
        if self.name == "identifier":
            return "NAME"

        if self.is_operator():
            return "OP"

        return self.name.upper()

    def is_operator(self) -> bool:
        return TokenType._op_start < self < TokenType._op_end
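
# A quick sanity sketch of the mappings above (hypothetical REPL session,
# derived from the definitions in this class):
#
#     >>> TokenType.lparen.is_operator()
#     True
#     >>> TokenType.lparen.to_python_token()
#     'OP'
#     >>> TokenType.identifier.to_python_token()
#     'NAME'
#     >>> TokenType.fstring_start.to_python_token()
#     'FSTRING_START'
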

@dataclass
class Token:
    type: TokenType
    # Byte offsets in the file
    start_index: int
    end_index: int
    start_line: int
    # 0-indexed offset from start of line
    start_col: int
    end_line: int
    end_col: int

    def to_byte_slice(self, source: str) -> str:
        # Newline at end of file may not exist in the file
        if (
            (self.type == TokenType.newline or self.type == TokenType.nl)
            and self.start_index == len(source)
            and self.end_index == len(source) + 1
        ):
            return ""

        # Dedents at end of file also may not exist in the file
        if (
            self.type == TokenType.dedent
            and self.start_index == len(source) + 1
            and self.end_index == len(source) + 1
        ):
            return ""

        # Endmarkers are out of bounds too
        if self.type == TokenType.endmarker:
            return ""

        return source[self.start_index : self.end_index]
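
# An illustrative note on the slicing above: tokenizing source = "x = 1\n"
# yields an identifier token with start_index 0 and end_index 1, so
# token.to_byte_slice(source) == "x". The synthetic NEWLINE/NL, DEDENT and
# ENDMARKER tokens emitted at end of file point past the source text, which
# is why the special cases return "" for them.
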

class FStringState:
    State = NewType("State", int)

    not_fstring = State(1)
    at_fstring_middle = State(2)
    at_fstring_lbrace = State(3)
    in_fstring_expr = State(4)
    in_fstring_expr_modifier = State(5)
    at_fstring_end = State(6)

    def __init__(self) -> None:
        self.state = FStringState.not_fstring
        self.stack: list[FStringState.State] = []

    def enter_fstring(self) -> None:
        self.stack.append(self.state)
        self.state = FStringState.at_fstring_middle

    def leave_fstring(self) -> None:
        assert self.state == FStringState.at_fstring_end
        self.state = self.stack.pop()

    def consume_fstring_middle_for_lbrace(self) -> None:
        if self.state == FStringState.in_fstring_expr_modifier:
            self.stack.append(self.state)

        self.state = FStringState.at_fstring_lbrace

    def consume_fstring_middle_for_end(self) -> None:
        self.state = FStringState.at_fstring_end

    def consume_lbrace(self) -> None:
        self.state = FStringState.in_fstring_expr

    def consume_rbrace(self) -> None:
        assert (
            self.state == FStringState.in_fstring_expr
            or self.state == FStringState.in_fstring_expr_modifier
        )

        if (
            len(self.stack) > 0
            and self.stack[-1] == FStringState.in_fstring_expr_modifier
        ):
            self.state = self.stack.pop()
        else:
            self.state = FStringState.at_fstring_middle

    def consume_colon(self) -> None:
        assert self.state == FStringState.in_fstring_expr
        self.state = FStringState.in_fstring_expr_modifier
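
# A walk-through of the state machine above for the source f"a{x:>2}b"
# (a sketch derived from the methods in this class):
#
#   enter_fstring()                      not_fstring -> at_fstring_middle
#   consume_fstring_middle_for_lbrace()  after "a"   -> at_fstring_lbrace
#   consume_lbrace()                     after "{"   -> in_fstring_expr
#   consume_colon()                      after ":"   -> in_fstring_expr_modifier
#   consume_rbrace()                     after "}"   -> at_fstring_middle
#   consume_fstring_middle_for_end()     after "b"   -> at_fstring_end
#   leave_fstring()                      pops back   -> not_fstring
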

@dataclass
class TokenIterator:
    source: str
    issue_128233_handling: bool

    current_index: int = 0
    prev_index: int = 0
    line_number: int = 1
    prev_line_number: int = 1
    byte_offset: int = 0
    prev_byte_offset: int = 0
    all_whitespace_on_this_line: bool = True

    bracket_level: int = 0
    bracket_level_stack: list[int] = field(default_factory=list)
    prev_token: Token | None = None

    indent_stack: list[str] = field(default_factory=list)
    dedent_counter: int = 0

    # f-string state
    fstring_state: FStringState = field(default_factory=FStringState)
    fstring_prefix_quote_stack: list[tuple[str, str]] = field(default_factory=list)
    fstring_prefix: str | None = None
    fstring_quote: str | None = None

    # CPython has a weird bug where every time a bare \r is present,
    # the next token becomes an OP, regardless of what it is.
    weird_op_case: bool = False
    weird_op_case_nl: bool = False

    weird_whitespace_case: bool = False

    def is_in_bounds(self) -> bool:
        return self.current_index < len(self.source)

    def peek(self) -> str:
        assert self.is_in_bounds()
        return self.source[self.current_index]

    def peek_next(self) -> str:
        assert self.current_index + 1 < len(self.source)
        return self.source[self.current_index + 1]

    def advance(self) -> None:
        self.current_index += 1
        self.byte_offset += 1

    def advance_by(self, count: int) -> None:
        self.current_index += count
        self.byte_offset += count

    def next_line(self) -> None:
        self.line_number += 1
        self.byte_offset = 0
        self.all_whitespace_on_this_line = True

    def advance_check_newline(self) -> None:
        if self.source[self.current_index] == "\n":
            self.current_index += 1
            self.next_line()
        else:
            self.advance()

    def match(self, *options: str, ignore_case: bool = False) -> bool:
        for option in options:
            if self.current_index + len(option) > len(self.source):
                continue
            snippet = self.source[self.current_index : self.current_index + len(option)]
            if ignore_case:
                option = option.lower()
                snippet = snippet.lower()

            if option == snippet:
                return True

        return False
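
    # Illustrative example: with source "Rb'x'" and current_index 0,
    # match("rb'", ignore_case=True) returns True, while match("b'") returns
    # False; options are only compared at the current position, never at
    # later offsets.
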

    def make_token(self, tok_type: TokenType) -> Token:
        token_type = (
            TokenType.op
            if self.weird_op_case
            and not tok_type.is_operator()
            and tok_type not in (TokenType.number, TokenType.string)
            else tok_type
        )
        if self.weird_op_case:
            # And we have another weird case INSIDE the weird case.
            # For some reason, when CPython accidentally captures a space
            # as the next character, i.e. when the token is '\r ',
            # it DOESN'T see it as whitespace, so in that specific case
            # we shouldn't set all_whitespace_on_this_line.
            # I think this is because CPython never expected to have a
            # ' ' token in it anyway, so it doesn't classify it as
            # whitespace. So it becomes non-whitespace.
            # Removing this if stmt breaks test 1001 right now.
            token_str = self.source[self.prev_index : self.current_index]
            if token_str == "\r ":
                self.all_whitespace_on_this_line = False
                self.weird_op_case = False

        token = Token(
            type=token_type,
            start_index=self.prev_index,
            end_index=self.current_index,
            start_line=self.prev_line_number,
            start_col=self.prev_byte_offset,
            end_line=self.line_number,
            end_col=self.byte_offset,
        )
        if tok_type == TokenType.newline or tok_type == TokenType.nl:
            self.next_line()
        elif tok_type == TokenType.whitespace or tok_type == TokenType.comment:
            pass
        else:
            self.all_whitespace_on_this_line = False

        self.prev_token = token
        self.prev_index = self.current_index
        self.prev_line_number = self.line_number
        self.prev_byte_offset = self.byte_offset
        self.weird_op_case = False

        return token

    def push_fstring_prefix_quote(self, prefix: str, quote: str) -> None:
        if self.fstring_prefix is not None:
            assert self.fstring_quote is not None
            self.fstring_prefix_quote_stack.append(
                (self.fstring_prefix, self.fstring_quote)
            )

        self.fstring_prefix = prefix
        self.fstring_quote = quote

    def pop_fstring_quote(self) -> None:
        if self.fstring_prefix is None:
            assert self.fstring_quote is None
            raise Underflow

        self.fstring_prefix, self.fstring_quote = (
            (None, None)
            if len(self.fstring_prefix_quote_stack) == 0
            else self.fstring_prefix_quote_stack.pop()
        )

    def newline(self) -> Token:
        if self.is_in_bounds() and self.source[self.current_index] == "\r":
            self.advance()
        self.advance()
        token_type = (
            TokenType.nl
            if (
                self.weird_op_case_nl
                or self.bracket_level > 0
                or self.fstring_state.state == FStringState.in_fstring_expr
                or self.all_whitespace_on_this_line
            )
            else TokenType.newline
        )
        token = self.make_token(token_type)
        self.weird_op_case_nl = False
        return token

    def endmarker(self) -> Token:
        if self.bracket_level != 0:
            raise UnexpectedEOF

        if len(self.indent_stack) > 0:
            _ = self.indent_stack.pop()
            return self.make_token(TokenType.dedent)

        return self.make_token(TokenType.endmarker)

    def decimal(self) -> Token:
        digit_before_decimal = False
        if self.source[self.current_index].isdigit():
            digit_before_decimal = True
            self.advance()

        # TODO: this is too lax; 1__2 tokenizes successfully
        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or self.source[self.current_index] == "_"
        ):
            self.advance()

        if self.is_in_bounds() and self.source[self.current_index] == ".":
            self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or (
                self.source[self.current_index] == "_"
                and self.source[self.current_index - 1].isdigit()
            )
        ):
            self.advance()
        # Before advancing over the 'e', ensure that there has been at least 1 digit before the 'e'
        if self.current_index + 1 < len(self.source) and (
            (digit_before_decimal or self.source[self.current_index - 1].isdigit())
            and (
                self.source[self.current_index] == "e"
                or self.source[self.current_index] == "E"
            )
            and (
                self.source[self.current_index + 1].isdigit()
                or (
                    self.current_index + 2 < len(self.source)
                    and (
                        self.source[self.current_index + 1] == "+"
                        or self.source[self.current_index + 1] == "-"
                    )
                    and self.source[self.current_index + 2].isdigit()
                )
            )
        ):
            self.advance()
            self.advance()
            # optional third advance not necessary as it'll get advanced just below

        # TODO: this is too lax; 1__2 tokenizes successfully
        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or (
                (digit_before_decimal or self.source[self.current_index - 1].isdigit())
                and self.source[self.current_index] == "_"
            )
        ):
            self.advance()

        # Complex numbers end in a `j`. But ensure at least 1 digit before it
        if self.is_in_bounds() and (
            (digit_before_decimal or self.source[self.current_index - 1].isdigit())
            and (
                self.source[self.current_index] == "j"
                or self.source[self.current_index] == "J"
            )
        ):
            self.advance()
        # If all of this resulted in just a dot, return an operator
        if (
            self.current_index - self.prev_index == 1
            and self.source[self.current_index - 1] == "."
        ):
            # Ellipsis check
            if (
                self.current_index + 2 <= len(self.source)
                and self.source[self.current_index : self.current_index + 2] == ".."
            ):
                self.advance()
                self.advance()

            return self.make_token(TokenType.op)

        return self.make_token(TokenType.number)

    def binary(self) -> Token:
        # jump over `0b`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] == "0"
            or self.source[self.current_index] == "1"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index] == "0"
            or self.source[self.current_index] == "1"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        return self.make_token(TokenType.number)

    def octal(self) -> Token:
        # jump over `0o`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] >= "0"
            and self.source[self.current_index] <= "7"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index] >= "0"
            and self.source[self.current_index] <= "7"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        return self.make_token(TokenType.number)

    def hexadecimal(self) -> Token:
        # jump over `0x`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] in string.hexdigits
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index] in string.hexdigits
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        return self.make_token(TokenType.number)
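
    # A few examples of literals the scanners above accept (a sketch; as the
    # TODO comments note, the scanning is laxer than the Python grammar, so
    # some ill-formed spellings such as `1__2` are accepted too):
    #
    #   decimal():     123   1_000   3.14   .5   1e-3   2.5E+10   4j
    #   binary():      0b1010   0B1_0
    #   octal():       0o755   0O7_5
    #   hexadecimal(): 0xDEAD_beef   0Xff
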

    def find_opening_quote(self) -> int:
        # Quotes should always be within 3 chars of the beginning of the string token
        for offset in range(3):
            char = self.source[self.current_index + offset]
            if char == '"' or char == "'":
                return self.current_index + offset

        raise AssertionError("Quote not found somehow")

    def string_prefix_and_quotes(self) -> tuple[str, str]:
        quote_index = self.find_opening_quote()
        prefix = self.source[self.current_index : quote_index]
        quote_char = self.source[quote_index]

        # Check for triple quotes
        quote = (
            self.source[quote_index : quote_index + 3]
            if (
                quote_index + 3 <= len(self.source)
                and self.source[quote_index + 1] == quote_char
                and self.source[quote_index + 2] == quote_char
            )
            else self.source[quote_index : quote_index + 1]
        )
        return prefix, quote
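
    # Illustrative examples (assuming current_index sits at the start of a
    # string token): for source `rb"""x"""` this returns ("rb", '"""'); for
    # source `f'{x}'` it returns ("f", "'").
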

    def fstring(self) -> Token:
        if self.fstring_state.state in (
            FStringState.not_fstring,
            FStringState.in_fstring_expr,
        ):
            prefix, quote = self.string_prefix_and_quotes()
            self.push_fstring_prefix_quote(prefix, quote)
            for _ in range(len(prefix)):
                self.advance()
            for _ in range(len(quote)):
                self.advance()
            self.fstring_state.enter_fstring()
            return self.make_token(TokenType.fstring_start)

        if self.fstring_state.state == FStringState.at_fstring_middle:
            assert self.fstring_quote is not None
            is_single_quote = len(self.fstring_quote) == 1
            start_index = self.current_index
            while self.is_in_bounds():
                char = self.source[self.current_index]
                # For single quotes, bail on newlines
                if char == "\n" and is_single_quote:
                    raise UnterminatedString

                # Handle escapes
                if char == "\\":
                    self.advance()
                    # But don't escape a `\{` or `\}` in f-strings,
                    # DO escape `\N{` in f-strings (that's for unicode characters),
                    # but DON'T escape `\N{` in raw f-strings.
                    assert self.fstring_prefix is not None
                    if (
                        "r" not in self.fstring_prefix.lower()
                        and self.current_index + 1 < len(self.source)
                        and self.peek() == "N"
                        and self.peek_next() == "{"
                    ):
                        self.advance()
                        self.advance()

                    if self.is_in_bounds() and not (
                        self.peek() == "{" or self.peek() == "}"
                    ):
                        self.advance_check_newline()

                    continue

                # Find opening / closing quote
                if char == "{":
                    if self.peek_next() == "{":
                        self.advance()
                        self.advance()
                        continue
                    else:
                        self.fstring_state.consume_fstring_middle_for_lbrace()
                        # If fstring-middle is empty, skip it by returning the next step token
                        if self.current_index == start_index:
                            return self.fstring()

                        return self.make_token(TokenType.fstring_middle)

                assert self.fstring_quote is not None
                if self.match(self.fstring_quote):
                    self.fstring_state.consume_fstring_middle_for_end()
                    # If fstring-middle is empty, skip it by returning the next step token
                    if self.current_index == start_index:
                        return self.fstring()

                    return self.make_token(TokenType.fstring_middle)

                self.advance_check_newline()

            raise UnexpectedEOF

        if self.fstring_state.state == FStringState.at_fstring_lbrace:
            self.advance()
            self.bracket_level_stack.append(self.bracket_level)
            self.bracket_level = 0
            self.fstring_state.consume_lbrace()
            return self.make_token(TokenType.lbrace)

        if self.fstring_state.state == FStringState.at_fstring_end:
            assert self.fstring_quote is not None
            for _ in range(len(self.fstring_quote)):
                self.advance()
            self.pop_fstring_quote()
            self.fstring_state.leave_fstring()
            return self.make_token(TokenType.fstring_end)

        if self.fstring_state.state == FStringState.in_fstring_expr_modifier:
            start_index = self.current_index
            while self.is_in_bounds():
                char = self.source[self.current_index]
                assert self.fstring_quote is not None
                if (char == "\n" or char == "{") and len(self.fstring_quote) == 1:
                    if char == "{":
                        self.fstring_state.consume_fstring_middle_for_lbrace()
                    else:
                        # TODO: why?
                        self.fstring_state.state = FStringState.in_fstring_expr

                    # If fstring-middle is empty, skip it by returning the next step token
                    if self.current_index == start_index:
                        return self.fstring()

                    return self.make_token(TokenType.fstring_middle)
                elif char == "}":
                    self.fstring_state.state = FStringState.in_fstring_expr
                    return self.make_token(TokenType.fstring_middle)

                self.advance_check_newline()

            raise UnexpectedEOF

        raise AssertionError("Unhandled f-string state")

    def string(self) -> Token:
        prefix, quote = self.string_prefix_and_quotes()
        if prefix and self.weird_op_case:
            self.advance()
            return self.make_token(tok_type=TokenType.op)

        for char in prefix:
            if char == "f" or char == "F":
                return self.fstring()

        for _ in range(len(prefix)):
            self.advance()
        for _ in range(len(quote)):
            self.advance()

        is_single_quote = len(quote) == 1

        while self.is_in_bounds():
            char = self.source[self.current_index]
            # For single quotes, bail on newlines
            if char == "\n" and is_single_quote:
                raise UnterminatedString

            # Handle escapes
            if char == "\\":
                self.advance()
                self.advance_check_newline()
                continue

            # Find closing quote
            if self.match(quote):
                for _ in range(len(quote)):
                    self.advance()
                return self.make_token(TokenType.string)

            self.advance_check_newline()

        raise UnexpectedEOF

    def indent(self) -> Token:
        start_index = self.current_index
        saw_whitespace = False
        saw_tab_or_space = False
        while self.is_in_bounds():
            char = self.source[self.current_index]
            if self.is_whitespace():
                self.advance()
                saw_whitespace = True
                if char == " " or char == "\t":
                    saw_tab_or_space = True
            else:
                break

        if not self.is_in_bounds():
            # File ends with no whitespace after newline, don't return indent
            if self.current_index == start_index:
                raise NotAnIndent
            # If we reached the end of the file, don't return an indent
            return self.make_token(TokenType.whitespace)

        # If the line is preceded by just linefeeds/CR/etc.,
        # treat it as whitespace.
        if saw_whitespace and not saw_tab_or_space:
            self.weird_whitespace_case = True
            return self.make_token(TokenType.whitespace)

        # For lines that are just leading whitespace and a backslash or a comment,
        # don't return indents
        next_char = self.peek()
        if next_char == "#" or next_char == "\\" or self.is_newline():
            return self.make_token(TokenType.whitespace)

        new_indent = self.source[start_index : self.current_index]
        current_indent = "" if len(self.indent_stack) == 0 else self.indent_stack[-1]

        if len(new_indent) == len(current_indent):
            if len(new_indent) == 0:
                raise NotAnIndent

            if new_indent != current_indent:
                raise InconsistentUseOfTabsAndSpaces
            return self.make_token(TokenType.whitespace)
        elif len(new_indent) > len(current_indent):
            if len(current_indent) > 0 and current_indent not in new_indent:
                raise InconsistentUseOfTabsAndSpaces
            self.indent_stack.append(new_indent)
            return self.make_token(TokenType.indent)
        else:
            while len(self.indent_stack) > 0:
                top_indent = self.indent_stack[-1]
                if len(top_indent) < len(new_indent):
                    raise DedentDoesNotMatchAnyOuterIndent

                if len(top_indent) == len(new_indent):
                    break

                _ = self.indent_stack.pop()
                self.dedent_counter += 1

            # Let the dedent counter make the dedents. They must be length zero
            return self.make_token(TokenType.whitespace)

    def is_whitespace(self) -> bool:
        if self.is_newline():
            return False

        char = self.source[self.current_index]
        return (
            char == " "
            or char == "\r"
            or char == "\t"
            or char == "\x0b"
            or char == "\x0c"
        )

    def is_newline(self) -> bool:
        if self.source[self.current_index] == "\n":
            return True
        if (
            self.source[self.current_index] == "\r"
            and self.current_index + 1 < len(self.source)
            and self.source[self.current_index + 1] == "\n"
        ):
            return True

        return False

    def name(self) -> Token:
        if self.weird_op_case:
            self.advance()
            return self.make_token(TokenType.identifier)

        # According to PEP 3131, any non-ascii character is valid in a NAME token.
        # But if we see any non-identifier ASCII character, we should stop.
        remaining = self.source[self.current_index :]
        for index, char in enumerate(remaining):
            if ord(char) < 128 and not str.isalnum(char) and char != "_":
                length = index
                break
        else:
            length = len(remaining)

        self.advance_by(length)
        return self.make_token(TokenType.identifier)
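
    # A sketch of the rule above: in the source `café = 1`, name() consumes
    # `café` as a single identifier token, since the non-ASCII `é` never
    # triggers the ASCII stop condition; the space after it does, ending the
    # token after four characters.
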

    def __iter__(self) -> TokenIterator:
        return self

    def __next__(self) -> Token:
        if self.prev_token is not None and self.prev_token.type == TokenType.endmarker:
            raise StopIteration

        # EOF checks
        if self.current_index == len(self.source):
            if self.prev_token is None:
                return self.endmarker()

            if self.prev_token.type in {
                TokenType.newline,
                TokenType.nl,
                TokenType.dedent,
            }:
                return self.endmarker()
            else:
                return self.newline()

        if self.current_index > len(self.source):
            return self.endmarker()

        # f-string check
        if (
            self.fstring_state.state != FStringState.not_fstring
            and self.fstring_state.state != FStringState.in_fstring_expr
        ):
            return self.fstring()

        current_char = self.source[self.current_index]

        # A bare \r, in certain cases, gets merged with the next char.
        # It's probably a bug: https://github.com/python/cpython/issues/128233
        # 'issue_128233_handling=True' works around this bug; if it's False,
        # we produce tokens identical to CPython's.
        if not self.issue_128233_handling and current_char == "\r":
            self.advance()
            if not self.is_in_bounds():
                return self.newline()

            current_char = self.source[self.current_index]
            if current_char != "\n":
                self.weird_op_case = True
                if (
                    self.prev_token is not None
                    and self.prev_token.type == TokenType.comment
                ):
                    self.weird_op_case_nl = True

        # Comment check
        if current_char == "#":
            if self.weird_op_case:
                self.advance()
                return self.make_token(TokenType.comment)

            while self.is_in_bounds() and not self.is_newline():
                if (
                    not self.issue_128233_handling
                    and self.source[self.current_index] == "\r"
                ):
                    break
                self.advance()
            return self.make_token(TokenType.comment)

        # Empty the dedent counter
        if self.dedent_counter > 0:
            self.dedent_counter -= 1
            return self.make_token(TokenType.dedent)

        # Newline check
        if self.is_newline():
            return self.newline()

        # \<newline> check
        if current_char == "\\":
            self.advance()
            if not self.is_in_bounds():
                raise UnexpectedEOF

            # Consume all whitespace on this line and the next.
            found_whitespace = False
            seen_newline = False
            while self.is_in_bounds():
                if self.is_whitespace():
                    self.advance()
                    found_whitespace = True
                elif not seen_newline and self.is_newline():
                    char = self.source[self.current_index]
                    if char == "\r":
                        self.advance()
                    self.advance()
                    found_whitespace = True
                    seen_newline = True
                    # Move to next line without creating a newline token. But,
                    # if the previous line was all whitespace, whitespace on
                    # the next line is still valid indentation. Avoid consuming it.
                    if self.all_whitespace_on_this_line:
                        self.next_line()
                        break
                    else:
                        self.next_line()
                        # Preserve this boolean; we're on the same line semantically
                        self.all_whitespace_on_this_line = False

                else:
                    break

            if not found_whitespace:
                raise UnexpectedCharacterAfterBackslash

            return self.make_token(TokenType.whitespace)

        # Indent / dedent checks
        if (
            (self.byte_offset == 0 or self.weird_whitespace_case)
            and self.bracket_level == 0
            and self.fstring_state.state == FStringState.not_fstring
        ):
            self.weird_whitespace_case = False
            try:
                indent_token = self.indent()
            except NotAnIndent:
                indent_token = None

            if indent_token is not None:
                return indent_token

        if self.is_whitespace():
            while self.is_in_bounds() and self.is_whitespace():
                self.advance()
            return self.make_token(TokenType.whitespace)

        if current_char in ("+", "&", "|", "^", "@", "%", "=", "!", "~"):
            self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "<":
            self.advance()
            if self.peek() == ">":
                # Barry as FLUFL easter egg
                self.advance()
                return self.make_token(TokenType.op)

            if self.peek() == "<":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == ">":
            self.advance()
            if self.peek() == ">":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "/":
            self.advance()
            if self.peek() == "/":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "*":
            self.advance()
            if self.peek() == "*":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "-":
            self.advance()
            # -> operator
            if self.peek() == ">":
                self.advance()
                return self.make_token(TokenType.op)

            # -= operator
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char in (",", ";"):
            self.advance()
            return self.make_token(TokenType.op)

        # This one is not used in Python 3, but still exists
        # for backwards compatibility, I guess.
        if current_char == "`":
            self.advance()
            return self.make_token(TokenType.op)

        if current_char == "(":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lparen)

        if current_char == ")":
            self.advance()
            self.bracket_level -= 1
            if self.bracket_level < 0:
                self.bracket_level = 0
            return self.make_token(TokenType.rparen)

        if current_char == "[":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lbracket)

        if current_char == "]":
            self.advance()
            self.bracket_level -= 1
            if self.bracket_level < 0:
                self.bracket_level = 0
            return self.make_token(TokenType.rbracket)

        if current_char == "{":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lbrace)

        if current_char == "}":
            self.advance()
            if (
                self.bracket_level == 0
                and self.fstring_state.state == FStringState.in_fstring_expr
            ):
                self.fstring_state.consume_rbrace()
                self.bracket_level = self.bracket_level_stack.pop()
            else:
                self.bracket_level -= 1
                if self.bracket_level < 0:
                    self.bracket_level = 0

            return self.make_token(TokenType.rbrace)

        if current_char == ":":
            self.advance()
            if (
                self.bracket_level == 0
                and self.fstring_state.state == FStringState.in_fstring_expr
            ):
                self.fstring_state.state = FStringState.in_fstring_expr_modifier
                return self.make_token(TokenType.op)
            else:
                if self.peek() == "=":
                    self.advance()
                return self.make_token(TokenType.op)

        if current_char in ".0123456789":
            if self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0b", "0B"):
                return self.binary()
            elif self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0o", "0O"):
                return self.octal()
            elif self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0x", "0X"):
                return self.hexadecimal()
            else:
                return self.decimal()

        if (
            (self.current_index + 1 <= len(self.source) and self.match('"', "'"))
            or (
                self.current_index + 2 <= len(self.source)
                and self.match(
                    'b"',
                    "b'",
                    'r"',
                    "r'",
                    'f"',
                    "f'",
                    'u"',
                    "u'",
                    ignore_case=True,
                )
            )
            or (
                self.current_index + 3 <= len(self.source)
                and self.match(
                    'br"',
                    "br'",
                    'rb"',
                    "rb'",
                    'fr"',
                    "fr'",
                    'rf"',
                    "rf'",
                    ignore_case=True,
                )
            )
        ):
            return self.string()

        return self.name()


def tokenize(
    source: str,
    *,
    fstring_tokens: bool = True,
    issue_128233_handling: bool = True,
) -> Iterator[Token]:
    token_iterator = TokenIterator(source, issue_128233_handling=issue_128233_handling)
    if fstring_tokens:
        return iter(token_iterator)

    return merge_fstring_tokens(token_iterator)


def merge_fstring_tokens(token_iterator: TokenIterator) -> Iterator[Token]:
    """Turn post-Python-3.12 FSTRING-* tokens back to a single STRING token."""
    for token in token_iterator:
        if token.type != TokenType.fstring_start:
            yield token
            continue

        start_token = token
        end_token = token

        fstring_starts = 1
        fstring_ends = 0
        for token in token_iterator:
            if token.type == TokenType.fstring_start:
                fstring_starts += 1
            if token.type == TokenType.fstring_end:
                fstring_ends += 1

            if fstring_starts == fstring_ends:
                end_token = token
                break

        yield Token(
            type=TokenType.string,
            start_index=start_token.start_index,
            start_line=start_token.start_line,
            start_col=start_token.start_col,
            end_index=end_token.end_index,
            end_line=end_token.end_line,
            end_col=end_token.end_col,
        )
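

# Minimal demo sketch (not part of the library API): tokenize a small
# snippet and print each token's type and source text.
if __name__ == "__main__":
    demo_source = 'x = f"{1 + 2}"\n'
    for demo_token in tokenize(demo_source):
        print(demo_token.type, repr(demo_token.to_byte_slice(demo_source)))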