
1"""pytokens - A Fast, spec compliant Python 3.12+ tokenizer that runs on older Pythons.""" 

2 

3from __future__ import annotations 

4 

5from dataclasses import dataclass, field 

6import enum 

7import string 

8from typing import Iterator, NewType 

9 

10 

11class TokenizeError(Exception): ... 

12 

13 

14class IndentationError(TokenizeError): ... 

15 

16 

17class InconsistentUseOfTabsAndSpaces(IndentationError): ... 

18 

19 

20class DedentDoesNotMatchAnyOuterIndent(IndentationError): ... 

21 

22 

23class UnterminatedString(TokenizeError): ... 

24 

25 

26class UnexpectedEOF(TokenizeError): ... 

27 

28 

29class UnexpectedCharacterAfterBackslash(TokenizeError): ... 

30 

31 

32class NotAnIndent(AssertionError): ... 

33 

34 

35class Underflow(AssertionError): ... 

36 

37 

38class TokenType(enum.IntEnum): 

39 whitespace = 1 

40 indent = 2 

41 dedent = 3 

42 newline = 4 # semantically meaningful newline 

43 nl = 5 # non meaningful newline 

44 comment = 6 

45 

46 _op_start = 7 # marker used to check if a token is an operator 

47 semicolon = 8 

48 lparen = 9 

49 rparen = 10 

50 lbracket = 11 

51 rbracket = 12 

52 lbrace = 13 

53 rbrace = 14 

54 colon = 15 

55 op = 16 

56 _op_end = 17 # marker used to check if a token is an operator 

57 

58 identifier = 18 

59 number = 19 

60 string = 20 

61 fstring_start = 21 

62 fstring_middle = 22 

63 fstring_end = 23 

64 

65 tstring_start = 24 

66 tstring_middle = 25 

67 tstring_end = 26 

68 

69 endmarker = 27 

70 

71 errortoken = 28 

72 

73 def __repr__(self) -> str: 

74 return f"TokenType.{self.name}" 

75 

76 def to_python_token(self) -> str: 

77 if self.name == "identifier": 

78 return "NAME" 

79 

80 if self.is_operator(): 

81 return "OP" 

82 

83 return self.name.upper() 

84 

85 def is_operator(self) -> bool: 

86 return TokenType._op_start < self < TokenType._op_end 

87 
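
# For example (illustrative, not part of the original file):
#
#     >>> TokenType.identifier.to_python_token()
#     'NAME'
#     >>> TokenType.lparen.to_python_token()  # falls in the _op_start.._op_end range
#     'OP'
#     >>> TokenType.fstring_start.to_python_token()
#     'FSTRING_START'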


@dataclass
class Token:
    type: TokenType
    # Byte offsets in the file
    start_index: int
    end_index: int
    start_line: int
    # 0-indexed offset from start of line
    start_col: int
    end_line: int
    end_col: int

    def to_byte_slice(self, source: str) -> str:
        # Newline at end of file may not exist in the file
        if (
            (self.type == TokenType.newline or self.type == TokenType.nl)
            and self.start_index == len(source)
            and self.end_index == len(source) + 1
        ):
            return ""

        # Dedents at end of file also may not exist in the file
        if (
            self.type == TokenType.dedent
            and self.start_index == len(source) + 1
            and self.end_index == len(source) + 1
        ):
            return ""

        # Endmarkers are out of bound too
        if self.type == TokenType.endmarker:
            return ""

        return source[self.start_index : self.end_index]
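
# For example (illustrative, not part of the original file): a NAME token
# spanning byte offsets 0..5 of the source "hello = 1" satisfies
# token.to_byte_slice("hello = 1") == "hello".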


class FStringState:
    State = NewType("State", int)

    not_fstring = State(1)
    at_fstring_middle = State(2)
    at_fstring_lbrace = State(3)
    in_fstring_expr = State(4)
    in_fstring_expr_modifier = State(5)
    at_fstring_end = State(6)

    def __init__(self) -> None:
        self.state = FStringState.not_fstring
        self.stack: list[FStringState.State] = []

    def enter_fstring(self) -> None:
        self.stack.append(self.state)
        self.state = FStringState.at_fstring_middle

    def leave_fstring(self) -> None:
        assert self.state == FStringState.at_fstring_end
        self.state = self.stack.pop()

    def consume_fstring_middle_for_lbrace(self) -> None:
        if self.state == FStringState.in_fstring_expr_modifier:
            self.stack.append(self.state)

        self.state = FStringState.at_fstring_lbrace

    def consume_fstring_middle_for_end(self) -> None:
        self.state = FStringState.at_fstring_end

    def consume_lbrace(self) -> None:
        self.state = FStringState.in_fstring_expr

    def consume_rbrace(self) -> None:
        assert (
            self.state == FStringState.in_fstring_expr
            or self.state == FStringState.in_fstring_expr_modifier
        )

        if (
            len(self.stack) > 0
            and self.stack[-1] == FStringState.in_fstring_expr_modifier
        ):
            self.state = self.stack.pop()
        else:
            self.state = FStringState.at_fstring_middle

    def consume_colon(self) -> None:
        assert self.state == FStringState.in_fstring_expr
        self.state = FStringState.in_fstring_expr_modifier
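
# A rough walk-through (illustrative, not part of the original file) of the
# states this machine passes through for the source f"a{b:>2}c":
#
#   not_fstring              -> at_fstring_middle          enter_fstring, FSTRING_START 'f"'
#   at_fstring_middle        -> at_fstring_lbrace          FSTRING_MIDDLE 'a'
#   at_fstring_lbrace        -> in_fstring_expr            consume_lbrace, LBRACE '{'
#   in_fstring_expr          -> in_fstring_expr_modifier   NAME 'b', then OP ':'
#   in_fstring_expr_modifier -> in_fstring_expr            FSTRING_MIDDLE '>2'
#   in_fstring_expr          -> at_fstring_middle          consume_rbrace, RBRACE '}'
#   at_fstring_middle        -> at_fstring_end             FSTRING_MIDDLE 'c'
#   at_fstring_end           -> not_fstring                leave_fstring, FSTRING_END '"'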


@dataclass
class TokenIterator:
    source: str
    issue_128233_handling: bool

    current_index: int = 0
    prev_index: int = 0
    line_number: int = 1
    prev_line_number: int = 1
    byte_offset: int = 0
    prev_byte_offset: int = 0
    all_whitespace_on_this_line: bool = True

    bracket_level: int = 0
    bracket_level_stack: list[int] = field(default_factory=list)
    prev_token: Token | None = None

    indent_stack: list[str] = field(default_factory=list)
    dedent_counter: int = 0

    # f-string state
    fstring_state: FStringState = field(default_factory=FStringState)
    fstring_prefix_quote_stack: list[tuple[str, str]] = field(default_factory=list)
    fstring_prefix: str | None = None
    fstring_quote: str | None = None

    # CPython has a weird bug where every time a bare \r is
    # present, the next token becomes an OP, regardless of what it is.
    weird_op_case: bool = False
    weird_op_case_nl: bool = False

    weird_whitespace_case: bool = False

    def is_in_bounds(self) -> bool:
        return self.current_index < len(self.source)

    def peek(self) -> str:
        assert self.is_in_bounds()
        return self.source[self.current_index]

    def peek_next(self) -> str:
        assert self.current_index + 1 < len(self.source)
        return self.source[self.current_index + 1]

    def advance(self) -> None:
        self.current_index += 1
        self.byte_offset += 1

    def advance_by(self, count: int) -> None:
        self.current_index += count
        self.byte_offset += count

    def next_line(self) -> None:
        self.line_number += 1
        self.byte_offset = 0
        self.all_whitespace_on_this_line = True

    def advance_check_newline(self) -> None:
        if self.source[self.current_index] == "\n":
            self.current_index += 1
            self.next_line()
        else:
            self.advance()

    def match(self, *options: str, ignore_case: bool = False) -> bool:
        for option in options:
            if self.current_index + len(option) > len(self.source):
                continue
            snippet = self.source[self.current_index : self.current_index + len(option)]
            if ignore_case:
                option = option.lower()
                snippet = snippet.lower()

            if option == snippet:
                return True

        return False
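
    # For example (illustrative, not part of the original file): with the
    # source "0XAB" at current_index 0, self.match("0x", "0X") is True, as is
    # self.match("0x", ignore_case=True).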

    def make_token(self, tok_type: TokenType) -> Token:
        if self.fstring_prefix is not None and "t" in self.fstring_prefix:
            if tok_type == TokenType.fstring_start:
                tok_type = TokenType.tstring_start
            elif tok_type == TokenType.fstring_middle:
                tok_type = TokenType.tstring_middle
            elif tok_type == TokenType.fstring_end:
                tok_type = TokenType.tstring_end

        token_type = (
            TokenType.op
            if self.weird_op_case
            and not tok_type.is_operator()
            and tok_type not in (TokenType.number, TokenType.string)
            else tok_type
        )
        if self.weird_op_case:
            # And we have another weird case INSIDE the weird case.
            # For some reason, when CPython accidentally captures a space
            # as the next character, i.e. when the token is '\r ',
            # it DOESN'T see it as whitespace, so in that specific case
            # we shouldn't set all_whitespace_on_this_line.
            # I think this is because CPython never expected to have a
            # ' ' token in it anyway, so it doesn't classify it as
            # whitespace. So it becomes non-whitespace.
            # Removing this if stmt breaks test 1001 right now.
            token_str = self.source[self.prev_index : self.current_index]
            if token_str == "\r ":
                self.all_whitespace_on_this_line = False
            self.weird_op_case = False

        token = Token(
            type=token_type,
            start_index=self.prev_index,
            end_index=self.current_index,
            start_line=self.prev_line_number,
            start_col=self.prev_byte_offset,
            end_line=self.line_number,
            end_col=self.byte_offset,
        )
        if tok_type == TokenType.newline or tok_type == TokenType.nl:
            self.next_line()
        elif tok_type == TokenType.whitespace or tok_type == TokenType.comment:
            pass
        else:
            self.all_whitespace_on_this_line = False

        self.prev_token = token
        self.prev_index = self.current_index
        self.prev_line_number = self.line_number
        self.prev_byte_offset = self.byte_offset
        self.weird_op_case = False

        return token

    def push_fstring_prefix_quote(self, prefix: str, quote: str) -> None:
        if self.fstring_prefix is not None:
            assert self.fstring_quote is not None
            self.fstring_prefix_quote_stack.append(
                (self.fstring_prefix, self.fstring_quote)
            )

        self.fstring_prefix = prefix
        self.fstring_quote = quote

    def pop_fstring_quote(self) -> None:
        if self.fstring_prefix is None:
            assert self.fstring_quote is None
            raise Underflow

        self.fstring_prefix, self.fstring_quote = (
            (None, None)
            if len(self.fstring_prefix_quote_stack) == 0
            else self.fstring_prefix_quote_stack.pop()
        )

    def newline(self) -> Token:
        if self.is_in_bounds() and self.source[self.current_index] == "\r":
            self.advance()
        self.advance()
        token_type = (
            TokenType.nl
            if (
                self.weird_op_case_nl
                or self.bracket_level > 0
                or self.fstring_state.state == FStringState.in_fstring_expr
                or self.all_whitespace_on_this_line
            )
            else TokenType.newline
        )
        token = self.make_token(token_type)
        self.weird_op_case_nl = False
        return token
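
    # Note (illustrative, not part of the original file): newline() emits NL
    # (an insignificant newline) instead of NEWLINE when the line break sits
    # inside brackets, inside an f-string expression, or on a line that held
    # only whitespace and comments; otherwise it ends a logical line.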

    def endmarker(self) -> Token:
        if self.bracket_level != 0:
            raise UnexpectedEOF

        if len(self.indent_stack) > 0:
            _ = self.indent_stack.pop()
            return self.make_token(TokenType.dedent)

        return self.make_token(TokenType.endmarker)

    def decimal(self) -> Token:
        digit_before_decimal = False
        if self.source[self.current_index].isdigit():
            digit_before_decimal = True
            self.advance()

        # TODO: this is too lax; 1__2 tokenizes successfully
        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or self.source[self.current_index] == "_"
        ):
            self.advance()

        if self.is_in_bounds() and self.source[self.current_index] == ".":
            self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or (
                self.source[self.current_index] == "_"
                and self.source[self.current_index - 1].isdigit()
            )
        ):
            self.advance()
        # Before advancing over the 'e', ensure that there has been at least 1 digit before the 'e'
        if self.current_index + 1 < len(self.source) and (
            (digit_before_decimal or self.source[self.current_index - 1].isdigit())
            and (
                self.source[self.current_index] == "e"
                or self.source[self.current_index] == "E"
            )
            and (
                self.source[self.current_index + 1].isdigit()
                or (
                    self.current_index + 2 < len(self.source)
                    and (
                        self.source[self.current_index + 1] == "+"
                        or self.source[self.current_index + 1] == "-"
                    )
                    and self.source[self.current_index + 2].isdigit()
                )
            )
        ):
            self.advance()
            self.advance()
            # optional third advance not necessary as it'll get advanced just below

        # TODO: this is too lax; 1__2 tokenizes successfully
        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or (
                (digit_before_decimal or self.source[self.current_index - 1].isdigit())
                and self.source[self.current_index] == "_"
            )
        ):
            self.advance()

        # Complex numbers end in a `j`. But ensure at least 1 digit before it
        if self.is_in_bounds() and (
            (digit_before_decimal or self.source[self.current_index - 1].isdigit())
            and (
                self.source[self.current_index] == "j"
                or self.source[self.current_index] == "J"
            )
        ):
            self.advance()
        # If all of this resulted in just a dot, return an operator
        if (
            self.current_index - self.prev_index == 1
            and self.source[self.current_index - 1] == "."
        ):
            # Ellipsis check
            if (
                self.current_index + 2 <= len(self.source)
                and self.source[self.current_index : self.current_index + 2] == ".."
            ):
                self.advance()
                self.advance()

            return self.make_token(TokenType.op)

        return self.make_token(TokenType.number)
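
    # Forms accepted by decimal() (illustrative, not part of the original
    # file): "123", "1_000", "3.14", ".5", "1e10", "1E-5", "2j". A lone "."
    # (or "...", via the ellipsis check) falls through to an OP token instead.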

    def binary(self) -> Token:
        # jump over `0b`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] == "0"
            or self.source[self.current_index] == "1"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index] == "0"
            or self.source[self.current_index] == "1"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        return self.make_token(TokenType.number)

    def octal(self) -> Token:
        # jump over `0o`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] >= "0"
            and self.source[self.current_index] <= "7"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index] >= "0"
            and self.source[self.current_index] <= "7"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        return self.make_token(TokenType.number)

    def hexadecimal(self) -> Token:
        # jump over `0x`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] in string.hexdigits
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index] in string.hexdigits
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        return self.make_token(TokenType.number)
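
    # Note (illustrative, not part of the original file): like decimal(), the
    # binary/octal/hex scanners are laxer than Python's grammar: underscores
    # are accepted anywhere in the digit run, and an exponent-looking suffix
    # such as "e-1" is consumed into the same number token.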

    def find_opening_quote(self) -> int:
        # Quotes should always be within 3 chars of the beginning of the string token
        for offset in range(3):
            char = self.source[self.current_index + offset]
            if char == '"' or char == "'":
                return self.current_index + offset

        raise AssertionError("Quote not found somehow")

    def string_prefix_and_quotes(self) -> tuple[str, str]:
        quote_index = self.find_opening_quote()
        prefix = self.source[self.current_index : quote_index]
        quote_char = self.source[quote_index]

        # Check for triple quotes
        quote = (
            self.source[quote_index : quote_index + 3]
            if (
                quote_index + 3 <= len(self.source)
                and self.source[quote_index + 1] == quote_char
                and self.source[quote_index + 2] == quote_char
            )
            else self.source[quote_index : quote_index + 1]
        )
        return prefix, quote
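
    # For example (illustrative, not part of the original file): positioned at
    # the start of rb"""...""" this returns ("rb", '"""'), and at the start
    # of 'x' it returns ("", "'").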

    def fstring(self) -> Token:
        if self.fstring_state.state in (
            FStringState.not_fstring,
            FStringState.in_fstring_expr,
        ):
            prefix, quote = self.string_prefix_and_quotes()

            self.push_fstring_prefix_quote(prefix, quote)
            for _ in range(len(prefix)):
                self.advance()
            for _ in range(len(quote)):
                self.advance()
            self.fstring_state.enter_fstring()
            return self.make_token(TokenType.fstring_start)

        if self.fstring_state.state == FStringState.at_fstring_middle:
            assert self.fstring_quote is not None
            is_single_quote = len(self.fstring_quote) == 1
            start_index = self.current_index
            while self.is_in_bounds():
                char = self.source[self.current_index]
                # For single quotes, bail on newlines
                if char == "\n" and is_single_quote:
                    raise UnterminatedString

                # Handle escapes
                if char == "\\":
                    self.advance()
                    # But don't escape a `\{` or `\}` in f-strings
                    # but DO escape `\N{` in f-strings, that's for unicode characters
                    # but DON'T escape `\N{` in raw f-strings.
                    assert self.fstring_prefix is not None
                    if (
                        "r" not in self.fstring_prefix.lower()
                        and self.current_index + 1 < len(self.source)
                        and self.peek() == "N"
                        and self.peek_next() == "{"
                    ):
                        self.advance()
                        self.advance()

                    if self.is_in_bounds() and not (
                        self.peek() == "{" or self.peek() == "}"
                    ):
                        self.advance_check_newline()

                    continue

                # Find opening / closing quote
                if char == "{":
                    if self.peek_next() == "{":
                        self.advance()
                        self.advance()
                        continue
                    else:
                        self.fstring_state.consume_fstring_middle_for_lbrace()
                        # If fstring-middle is empty, skip it by returning the next step token
                        if self.current_index == start_index:
                            return self.fstring()

                        return self.make_token(TokenType.fstring_middle)

                assert self.fstring_quote is not None
                if self.match(self.fstring_quote):
                    self.fstring_state.consume_fstring_middle_for_end()
                    # If fstring-middle is empty, skip it by returning the next step token
                    if self.current_index == start_index:
                        return self.fstring()

                    return self.make_token(TokenType.fstring_middle)

                self.advance_check_newline()

            raise UnexpectedEOF

        if self.fstring_state.state == FStringState.at_fstring_lbrace:
            self.advance()
            self.bracket_level_stack.append(self.bracket_level)
            self.bracket_level = 0
            self.fstring_state.consume_lbrace()
            return self.make_token(TokenType.lbrace)

        if self.fstring_state.state == FStringState.at_fstring_end:
            assert self.fstring_quote is not None
            for _ in range(len(self.fstring_quote)):
                self.advance()
            token = self.make_token(TokenType.fstring_end)
            self.pop_fstring_quote()
            self.fstring_state.leave_fstring()
            return token

        if self.fstring_state.state == FStringState.in_fstring_expr_modifier:
            start_index = self.current_index
            while self.is_in_bounds():
                char = self.source[self.current_index]
                assert self.fstring_quote is not None
                if (char == "\n" or char == "{") and len(self.fstring_quote) == 1:
                    if char == "{":
                        self.fstring_state.consume_fstring_middle_for_lbrace()
                    else:
                        # TODO: why?
                        self.fstring_state.state = FStringState.in_fstring_expr

                    # If fstring-middle is empty, skip it by returning the next step token
                    if self.current_index == start_index:
                        return self.fstring()

                    return self.make_token(TokenType.fstring_middle)
                elif char == "}":
                    self.fstring_state.state = FStringState.in_fstring_expr
                    return self.make_token(TokenType.fstring_middle)

                self.advance_check_newline()

            raise UnexpectedEOF

        raise AssertionError("Unhandled f-string state")

    def string(self) -> Token:
        prefix, quote = self.string_prefix_and_quotes()
        if prefix and self.weird_op_case:
            self.advance()
            return self.make_token(tok_type=TokenType.op)

        for char in prefix:
            if char in ("f", "F", "t", "T"):
                return self.fstring()

        for _ in range(len(prefix)):
            self.advance()
        for _ in range(len(quote)):
            self.advance()

        is_single_quote = len(quote) == 1

        while self.is_in_bounds():
            char = self.source[self.current_index]
            # For single quotes, bail on newlines
            if char == "\n" and is_single_quote:
                raise UnterminatedString

            # Handle escapes
            if char == "\\":
                self.advance()
                self.advance_check_newline()
                continue

            # Find closing quote
            if self.match(quote):
                for _ in range(len(quote)):
                    self.advance()
                return self.make_token(TokenType.string)

            self.advance_check_newline()

        raise UnexpectedEOF
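
    # Note (illustrative, not part of the original file): single-quoted
    # strings abort at a raw newline with UnterminatedString, triple-quoted
    # strings may span lines, and a backslash always consumes the following
    # character (including an escaped newline, via advance_check_newline).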

    def indent(self) -> Token:
        start_index = self.current_index
        saw_whitespace = False
        saw_tab_or_space = False
        while self.is_in_bounds():
            char = self.source[self.current_index]
            if self.is_whitespace():
                self.advance()
                saw_whitespace = True
                if char == " " or char == "\t":
                    saw_tab_or_space = True
            else:
                break

        if not self.is_in_bounds():
            # File ends with no whitespace after newline, don't return indent
            if self.current_index == start_index:
                raise NotAnIndent
            # If reached the end of the file, don't return an indent
            return self.make_token(TokenType.whitespace)

        # If the line is preceded by just linefeeds/CR/etc.,
        # treat it as whitespace.
        if saw_whitespace and not saw_tab_or_space:
            self.weird_whitespace_case = True
            return self.make_token(TokenType.whitespace)

        # For lines that are just leading whitespace and a backslash or a comment,
        # don't return indents
        next_char = self.peek()
        if next_char == "#" or next_char == "\\" or self.is_newline():
            return self.make_token(TokenType.whitespace)

        new_indent = self.source[start_index : self.current_index]
        current_indent = "" if len(self.indent_stack) == 0 else self.indent_stack[-1]

        if len(new_indent) == len(current_indent):
            if len(new_indent) == 0:
                raise NotAnIndent

            if new_indent != current_indent:
                raise InconsistentUseOfTabsAndSpaces
            return self.make_token(TokenType.whitespace)
        elif len(new_indent) > len(current_indent):
            if len(current_indent) > 0 and current_indent not in new_indent:
                raise InconsistentUseOfTabsAndSpaces
            self.indent_stack.append(new_indent)
            return self.make_token(TokenType.indent)
        else:
            while len(self.indent_stack) > 0:
                top_indent = self.indent_stack[-1]
                if len(top_indent) < len(new_indent):
                    raise DedentDoesNotMatchAnyOuterIndent

                if len(top_indent) == len(new_indent):
                    break

                _ = self.indent_stack.pop()
                self.dedent_counter += 1

            # Let the dedent counter make the dedents. They must be length zero
            return self.make_token(TokenType.whitespace)
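
    # Sketch of the algorithm (illustrative, not part of the original file):
    # an indent longer than the top of indent_stack pushes and emits one
    # INDENT; a shorter one pops entries, counting one DEDENT per pop (emitted
    # later as zero-length tokens); an equal-length indent that differs
    # textually raises InconsistentUseOfTabsAndSpaces.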

    def is_whitespace(self) -> bool:
        if self.is_newline():
            return False

        char = self.source[self.current_index]
        return (
            char == " "
            or char == "\r"
            or char == "\t"
            or char == "\x0b"
            or char == "\x0c"
        )

    def is_newline(self) -> bool:
        if self.source[self.current_index] == "\n":
            return True
        if (
            self.source[self.current_index] == "\r"
            and self.current_index + 1 < len(self.source)
            and self.source[self.current_index + 1] == "\n"
        ):
            return True

        return False
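
    # Note (illustrative, not part of the original file): a lone "\r" counts
    # as whitespace here, not as a newline; only "\n" and "\r\n" are newlines.
    # Bare "\r" gets its own special handling in __next__ below.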

    def name(self) -> Token:
        if self.weird_op_case:
            self.advance()
            return self.make_token(TokenType.identifier)

        # According to PEP 3131, any non-ascii character is valid in a NAME token.
        # But if we see any non-identifier ASCII character we should stop.
        remaining = self.source[self.current_index :]
        for index, char in enumerate(remaining):
            if ord(char) < 128 and not str.isalnum(char) and char != "_":
                length = index
                break
        else:
            length = len(remaining)

        self.advance_by(length)
        return self.make_token(TokenType.identifier)

    def __iter__(self) -> TokenIterator:
        return self

    def __next__(self) -> Token:
        if self.prev_token is not None and self.prev_token.type == TokenType.endmarker:
            raise StopIteration

        # EOF checks
        if self.current_index == len(self.source):
            if self.prev_token is None:
                return self.endmarker()

            if self.prev_token.type in {
                TokenType.newline,
                TokenType.nl,
                TokenType.dedent,
            }:
                return self.endmarker()
            else:
                return self.newline()

        if self.current_index > len(self.source):
            return self.endmarker()

        # f-string check
        if (
            self.fstring_state.state != FStringState.not_fstring
            and self.fstring_state.state != FStringState.in_fstring_expr
        ):
            return self.fstring()

        current_char = self.source[self.current_index]

        # \r on its own: in certain cases it gets merged with the next char.
        # It's probably a bug: https://github.com/python/cpython/issues/128233
        # 'issue_128233_handling=True' works around this bug, but if it's False
        # then we produce identical tokens to CPython.
        if not self.issue_128233_handling and current_char == "\r":
            self.advance()
            if not self.is_in_bounds():
                return self.newline()

            current_char = self.source[self.current_index]
            if current_char != "\n":
                self.weird_op_case = True
                if (
                    self.prev_token is not None
                    and self.prev_token.type == TokenType.comment
                ):
                    self.weird_op_case_nl = True

        # Comment check
        if current_char == "#":
            if self.weird_op_case:
                self.advance()
                return self.make_token(TokenType.comment)

            while self.is_in_bounds() and not self.is_newline():
                if (
                    not self.issue_128233_handling
                    and self.source[self.current_index] == "\r"
                ):
                    break
                self.advance()
            return self.make_token(TokenType.comment)

        # Empty the dedent counter
        if self.dedent_counter > 0:
            self.dedent_counter -= 1
            return self.make_token(TokenType.dedent)

        # Newline check
        if self.is_newline():
            return self.newline()

        # \<newline> check
        if current_char == "\\":
            self.advance()
            if not self.is_in_bounds():
                raise UnexpectedEOF

            # Consume all whitespace on this line and the next.
            found_whitespace = False
            seen_newline = False
            while self.is_in_bounds():
                if self.is_whitespace():
                    self.advance()
                    found_whitespace = True
                elif not seen_newline and self.is_newline():
                    char = self.source[self.current_index]
                    if char == "\r":
                        self.advance()
                    self.advance()
                    found_whitespace = True
                    seen_newline = True
                    # Move to the next line without creating a newline token.
                    # But if the previous line was all whitespace, whitespace
                    # on the next line is still valid indentation. Avoid
                    # consuming it.
                    if self.all_whitespace_on_this_line:
                        self.next_line()
                        break
                    else:
                        self.next_line()
                        # Preserve this boolean, we're on the same line semantically
                        self.all_whitespace_on_this_line = False

                else:
                    break

            if not found_whitespace:
                raise UnexpectedCharacterAfterBackslash

            return self.make_token(TokenType.whitespace)

        # Indent / dedent checks
        if (
            (self.byte_offset == 0 or self.weird_whitespace_case)
            and self.bracket_level == 0
            and self.fstring_state.state == FStringState.not_fstring
        ):
            self.weird_whitespace_case = False
            try:
                indent_token = self.indent()
            except NotAnIndent:
                indent_token = None

            if indent_token is not None:
                return indent_token

        if self.is_whitespace():
            while self.is_in_bounds() and self.is_whitespace():
                self.advance()
            return self.make_token(TokenType.whitespace)

        if current_char in ("+", "&", "|", "^", "@", "%", "=", "!", "~"):
            self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "<":
            self.advance()
            if self.peek() == ">":
                # Barry as FLUFL easter egg
                self.advance()
                return self.make_token(TokenType.op)

            if self.peek() == "<":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == ">":
            self.advance()
            if self.peek() == ">":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "/":
            self.advance()
            if self.peek() == "/":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "*":
            self.advance()
            if self.peek() == "*":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "-":
            self.advance()
            # -> operator
            if self.peek() == ">":
                self.advance()
                return self.make_token(TokenType.op)

            # -= operator
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char in (",", ";"):
            self.advance()
            return self.make_token(TokenType.op)

        # This guy is not used in Python 3, but still exists
        # for backwards compatibility, I guess.
        if current_char == "`":
            self.advance()
            return self.make_token(TokenType.op)

        if current_char == "(":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lparen)

        if current_char == ")":
            self.advance()
            self.bracket_level -= 1
            if self.bracket_level < 0:
                self.bracket_level = 0
            return self.make_token(TokenType.rparen)

        if current_char == "[":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lbracket)

        if current_char == "]":
            self.advance()
            self.bracket_level -= 1
            if self.bracket_level < 0:
                self.bracket_level = 0
            return self.make_token(TokenType.rbracket)

        if current_char == "{":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lbrace)

        if current_char == "}":
            self.advance()
            if (
                self.bracket_level == 0
                and self.fstring_state.state == FStringState.in_fstring_expr
            ):
                self.fstring_state.consume_rbrace()
                self.bracket_level = self.bracket_level_stack.pop()
            else:
                self.bracket_level -= 1
                if self.bracket_level < 0:
                    self.bracket_level = 0

            return self.make_token(TokenType.rbrace)

        if current_char == ":":
            self.advance()
            if (
                self.bracket_level == 0
                and self.fstring_state.state == FStringState.in_fstring_expr
            ):
                self.fstring_state.state = FStringState.in_fstring_expr_modifier
                return self.make_token(TokenType.op)
            else:
                if self.peek() == "=":
                    self.advance()
                return self.make_token(TokenType.op)

        if current_char in ".0123456789":
            if self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0b", "0B"):
                return self.binary()
            elif self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0o", "0O"):
                return self.octal()
            elif self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0x", "0X"):
                return self.hexadecimal()
            else:
                return self.decimal()

        if (
            (self.current_index + 1 <= len(self.source) and self.match('"', "'"))
            or (
                self.current_index + 2 <= len(self.source)
                and self.match(
                    'b"',
                    "b'",
                    'r"',
                    "r'",
                    'f"',
                    "f'",
                    'u"',
                    "u'",
                    "t'",
                    't"',
                    ignore_case=True,
                )
            )
            or (
                self.current_index + 3 <= len(self.source)
                and self.match(
                    'br"',
                    "br'",
                    'rb"',
                    "rb'",
                    'fr"',
                    "fr'",
                    'rf"',
                    "rf'",
                    "tr'",
                    'tr"',
                    "rt'",
                    'rt"',
                    ignore_case=True,
                )
            )
        ):
            return self.string()

        return self.name()


def tokenize(
    source: str,
    *,
    fstring_tokens: bool = True,
    issue_128233_handling: bool = True,
) -> Iterator[Token]:
    token_iterator = TokenIterator(source, issue_128233_handling=issue_128233_handling)
    if fstring_tokens:
        return iter(token_iterator)

    return merge_fstring_tokens(token_iterator)
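
# Note (illustrative, not part of the original file): passing
# fstring_tokens=False collapses every FSTRING_START ... FSTRING_END run into
# a single pre-3.12-style STRING token via merge_fstring_tokens() below.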


def merge_fstring_tokens(token_iterator: TokenIterator) -> Iterator[Token]:
    """Turn post-Python-3.12 FSTRING-* tokens back to a single STRING token."""
    for token in token_iterator:
        if token.type not in (TokenType.fstring_start, TokenType.tstring_start):
            yield token
            continue

        start_token = token
        end_token = token

        fstring_starts = 1
        fstring_ends = 0
        for token in token_iterator:
            if token.type in (TokenType.fstring_start, TokenType.tstring_start):
                fstring_starts += 1
            if token.type in (TokenType.fstring_end, TokenType.tstring_end):
                fstring_ends += 1

            if fstring_starts == fstring_ends:
                end_token = token
                break

        yield Token(
            type=TokenType.string,
            start_index=start_token.start_index,
            start_line=start_token.start_line,
            start_col=start_token.start_col,
            end_index=end_token.end_index,
            end_line=end_token.end_line,
            end_col=end_token.end_col,
        )
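

# A minimal usage sketch (illustrative, not part of the original module):
# tokenize a small snippet and print each token alongside its source text.
if __name__ == "__main__":
    demo_source = 'x = f"{1 + 2}"\n'
    for tok in tokenize(demo_source):
        print(tok, repr(tok.to_byte_slice(demo_source)))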