1"""pytokens - A Fast, spec compliant Python 3.12+ tokenizer that runs on older Pythons."""
2
3from __future__ import annotations
4
5from dataclasses import dataclass, field
6import enum
7import string
8from typing import Iterator, NewType
9
10
11class TokenizeError(Exception): ...
12
13
14class IndentationError(TokenizeError): ...
15
16
17class InconsistentUseOfTabsAndSpaces(IndentationError): ...
18
19
20class DedentDoesNotMatchAnyOuterIndent(IndentationError): ...
21
22
23class UnterminatedString(TokenizeError): ...
24
25
26class UnexpectedEOF(TokenizeError): ...
27
28
29class UnexpectedCharacterAfterBackslash(TokenizeError): ...
30
31
32class NotAnIndent(AssertionError): ...
33
34
35class Underflow(AssertionError): ...
36
37
38class TokenType(enum.IntEnum):
39 whitespace = 1
40 indent = 2
41 dedent = 3
42 newline = 4 # semantically meaningful newline
43 nl = 5 # non meaningful newline
44 comment = 6
45
46 _op_start = 7 # marker used to check if a token is an operator
47 semicolon = 8
48 lparen = 9
49 rparen = 10
50 lbracket = 11
51 rbracket = 12
52 lbrace = 13
53 rbrace = 14
54 colon = 15
55 op = 16
56 _op_end = 17 # marker used to check if a token is an operator
57
58 identifier = 18
59 number = 19
60 string = 20
61 fstring_start = 21
62 fstring_middle = 22
63 fstring_end = 23
64
65 endmarker = 24
66
67 errortoken = 25
68
69 def __repr__(self) -> str:
70 return f"TokenType.{self.name}"
71
72 def to_python_token(self) -> str:
73 if self.name == "identifier":
74 return "NAME"
75
76 if self.is_operator():
77 return "OP"
78
79 return self.name.upper()
80
81 def is_operator(self) -> bool:
82 return TokenType._op_start < self < TokenType._op_end
83
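
# A quick illustration (comments only, not executed): every token type in the
# operator range collapses to CPython's "OP" name, identifiers map to "NAME",
# and everything else maps to its own upper-cased name:
#
#     TokenType.lparen.is_operator()             # True
#     TokenType.lparen.to_python_token()         # "OP"
#     TokenType.identifier.to_python_token()     # "NAME"
#     TokenType.fstring_start.to_python_token()  # "FSTRING_START"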

@dataclass
class Token:
    type: TokenType
    # Byte offsets in the file
    start_index: int
    end_index: int
    start_line: int
    # 0-indexed offset from start of line
    start_col: int
    end_line: int
    end_col: int

    def to_byte_slice(self, source: str) -> str:
        # Newline at end of file may not exist in the file
        if (
            (self.type == TokenType.newline or self.type == TokenType.nl)
            and self.start_index == len(source)
            and self.end_index == len(source) + 1
        ):
            return ""

        # Dedents at end of file also may not exist in the file
        if (
            self.type == TokenType.dedent
            and self.start_index == len(source) + 1
            and self.end_index == len(source) + 1
        ):
            return ""

        # Endmarkers are out of bounds too
        if self.type == TokenType.endmarker:
            return ""

        return source[self.start_index : self.end_index]
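
# Illustrative example: with `source = "x = 1\n"`, the NAME token for `x`
# spans bytes 0..1, so `token.to_byte_slice(source) == "x"`. Synthetic tokens
# (the ENDMARKER, or a NEWLINE/DEDENT placed past the end of the file) return
# "" since they have no backing bytes in the source.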

class FStringState:
    State = NewType("State", int)

    not_fstring = State(1)
    at_fstring_middle = State(2)
    at_fstring_lbrace = State(3)
    in_fstring_expr = State(4)
    in_fstring_expr_modifier = State(5)
    at_fstring_end = State(6)

    def __init__(self) -> None:
        self.state = FStringState.not_fstring
        self.stack: list[FStringState.State] = []

    def enter_fstring(self) -> None:
        self.stack.append(self.state)
        self.state = FStringState.at_fstring_middle

    def leave_fstring(self) -> None:
        assert self.state == FStringState.at_fstring_end
        self.state = self.stack.pop()

    def consume_fstring_middle_for_lbrace(self) -> None:
        if self.state == FStringState.in_fstring_expr_modifier:
            self.stack.append(self.state)

        self.state = FStringState.at_fstring_lbrace

    def consume_fstring_middle_for_end(self) -> None:
        self.state = FStringState.at_fstring_end

    def consume_lbrace(self) -> None:
        self.state = FStringState.in_fstring_expr

    def consume_rbrace(self) -> None:
        assert (
            self.state == FStringState.in_fstring_expr
            or self.state == FStringState.in_fstring_expr_modifier
        )

        if (
            len(self.stack) > 0
            and self.stack[-1] == FStringState.in_fstring_expr_modifier
        ):
            self.state = self.stack.pop()
        else:
            self.state = FStringState.at_fstring_middle

    def consume_colon(self) -> None:
        assert self.state == FStringState.in_fstring_expr
        self.state = FStringState.in_fstring_expr_modifier
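
# Illustrative state walk-through while tokenizing f"a{x}b":
#
#   enter_fstring()                      -> at_fstring_middle  (before "a")
#   consume_fstring_middle_for_lbrace()  -> at_fstring_lbrace  (at "{")
#   consume_lbrace()                     -> in_fstring_expr    (at "x")
#   consume_rbrace()                     -> at_fstring_middle  (before "b")
#   consume_fstring_middle_for_end()     -> at_fstring_end     (at the closing quote)
#   leave_fstring()                      -> not_fstring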

@dataclass
class TokenIterator:
    source: str
    issue_128233_handling: bool

    current_index: int = 0
    prev_index: int = 0
    line_number: int = 1
    prev_line_number: int = 1
    byte_offset: int = 0
    prev_byte_offset: int = 0
    all_whitespace_on_this_line: bool = True

    bracket_level: int = 0
    bracket_level_stack: list[int] = field(default_factory=list)
    prev_token: Token | None = None

    indent_stack: list[str] = field(default_factory=list)
    dedent_counter: int = 0

    # f-string state
    fstring_state: FStringState = field(default_factory=FStringState)
    fstring_prefix_quote_stack: list[tuple[str, str]] = field(default_factory=list)
    fstring_prefix: str | None = None
    fstring_quote: str | None = None

    # CPython has a weird bug where every time a bare \r is present,
    # the next token becomes an OP, regardless of what it is.
    weird_op_case: bool = False
    weird_op_case_nl: bool = False

    weird_whitespace_case: bool = False

    def is_in_bounds(self) -> bool:
        return self.current_index < len(self.source)

    def peek(self) -> str:
        assert self.is_in_bounds()
        return self.source[self.current_index]

    def peek_next(self) -> str:
        assert self.current_index + 1 < len(self.source)
        return self.source[self.current_index + 1]

    def advance(self) -> None:
        self.current_index += 1
        self.byte_offset += 1

    def advance_by(self, count: int) -> None:
        self.current_index += count
        self.byte_offset += count

    def next_line(self) -> None:
        self.line_number += 1
        self.byte_offset = 0
        self.all_whitespace_on_this_line = True

    def advance_check_newline(self) -> None:
        if self.source[self.current_index] == "\n":
            self.current_index += 1
            self.next_line()
        else:
            self.advance()

    def match(self, *options: str, ignore_case: bool = False) -> bool:
        for option in options:
            if self.current_index + len(option) > len(self.source):
                continue
            snippet = self.source[self.current_index : self.current_index + len(option)]
            if ignore_case:
                option = option.lower()
                snippet = snippet.lower()

            if option == snippet:
                return True

        return False

    def make_token(self, tok_type: TokenType) -> Token:
        token_type = (
            TokenType.op
            if self.weird_op_case
            and not tok_type.is_operator()
            and tok_type not in (TokenType.number, TokenType.string)
            else tok_type
        )
        if self.weird_op_case:
            # And we have another weird case INSIDE the weird case.
            # For some reason, when CPython accidentally captures a space
            # as the next character, i.e. when the token is '\r ',
            # it DOESN'T see it as whitespace. So in that specific case,
            # we shouldn't set all_whitespace_on_this_line.
            # I think this is because CPython never expected to have a
            # ' ' token anyway, so it doesn't classify it as whitespace,
            # and it becomes non-whitespace.
            # Removing this if stmt breaks test 1001 right now.
            token_str = self.source[self.prev_index : self.current_index]
            if token_str == "\r ":
                self.all_whitespace_on_this_line = False
            self.weird_op_case = False

        token = Token(
            type=token_type,
            start_index=self.prev_index,
            end_index=self.current_index,
            start_line=self.prev_line_number,
            start_col=self.prev_byte_offset,
            end_line=self.line_number,
            end_col=self.byte_offset,
        )
        if tok_type == TokenType.newline or tok_type == TokenType.nl:
            self.next_line()
        elif tok_type == TokenType.whitespace or tok_type == TokenType.comment:
            pass
        else:
            self.all_whitespace_on_this_line = False

        self.prev_token = token
        self.prev_index = self.current_index
        self.prev_line_number = self.line_number
        self.prev_byte_offset = self.byte_offset
        self.weird_op_case = False

        return token

    def push_fstring_prefix_quote(self, prefix: str, quote: str) -> None:
        if self.fstring_prefix is not None:
            assert self.fstring_quote is not None
            self.fstring_prefix_quote_stack.append(
                (self.fstring_prefix, self.fstring_quote)
            )

        self.fstring_prefix = prefix
        self.fstring_quote = quote

    def pop_fstring_quote(self) -> None:
        if self.fstring_prefix is None:
            assert self.fstring_quote is None
            raise Underflow

        self.fstring_prefix, self.fstring_quote = (
            (None, None)
            if len(self.fstring_prefix_quote_stack) == 0
            else self.fstring_prefix_quote_stack.pop()
        )
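
    # The prefix/quote stack above exists because f-strings nest: while
    # tokenizing f"outer {f'inner'} done", the outer ('f', '"') pair is
    # pushed aside when the inner f'...' string starts, and restored by
    # pop_fstring_quote() once the inner string ends (illustrative note,
    # not executed).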

    def newline(self) -> Token:
        if self.is_in_bounds() and self.source[self.current_index] == "\r":
            self.advance()
        self.advance()
        token_type = (
            TokenType.nl
            if (
                self.weird_op_case_nl
                or self.bracket_level > 0
                or self.fstring_state.state == FStringState.in_fstring_expr
                or self.all_whitespace_on_this_line
            )
            else TokenType.newline
        )
        token = self.make_token(token_type)
        self.weird_op_case_nl = False
        return token

    def endmarker(self) -> Token:
        if self.bracket_level != 0:
            raise UnexpectedEOF

        if len(self.indent_stack) > 0:
            _ = self.indent_stack.pop()
            return self.make_token(TokenType.dedent)

        return self.make_token(TokenType.endmarker)

    def decimal(self) -> Token:
        digit_before_decimal = False
        if self.source[self.current_index].isdigit():
            digit_before_decimal = True
            self.advance()

        # TODO: this is too lax; 1__2 tokenizes successfully
        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or self.source[self.current_index] == "_"
        ):
            self.advance()

        if self.is_in_bounds() and self.source[self.current_index] == ".":
            self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or (
                self.source[self.current_index] == "_"
                and self.source[self.current_index - 1].isdigit()
            )
        ):
            self.advance()

        # Before advancing over the 'e', ensure that there has been at least
        # 1 digit before the 'e'
        if self.current_index + 1 < len(self.source) and (
            (digit_before_decimal or self.source[self.current_index - 1].isdigit())
            and (
                self.source[self.current_index] == "e"
                or self.source[self.current_index] == "E"
            )
            and (
                self.source[self.current_index + 1].isdigit()
                or (
                    self.current_index + 2 < len(self.source)
                    and (
                        self.source[self.current_index + 1] == "+"
                        or self.source[self.current_index + 1] == "-"
                    )
                    and self.source[self.current_index + 2].isdigit()
                )
            )
        ):
            self.advance()
            self.advance()
            # The optional third advance isn't necessary, as it'll be
            # consumed by the loop just below

        # TODO: this is too lax; 1__2 tokenizes successfully
        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or (
                (digit_before_decimal or self.source[self.current_index - 1].isdigit())
                and self.source[self.current_index] == "_"
            )
        ):
            self.advance()

        # Imaginary literals end in a `j`. But ensure there's at least
        # 1 digit before it
        if self.is_in_bounds() and (
            (digit_before_decimal or self.source[self.current_index - 1].isdigit())
            and (
                self.source[self.current_index] == "j"
                or self.source[self.current_index] == "J"
            )
        ):
            self.advance()

        # If all of this resulted in just a dot, return an operator
        if (
            self.current_index - self.prev_index == 1
            and self.source[self.current_index - 1] == "."
        ):
            # Ellipsis check
            if (
                self.current_index + 2 <= len(self.source)
                and self.source[self.current_index : self.current_index + 2] == ".."
            ):
                self.advance()
                self.advance()

            return self.make_token(TokenType.op)

        return self.make_token(TokenType.number)

    def binary(self) -> Token:
        # jump over `0b`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] == "0"
            or self.source[self.current_index] == "1"
            or self.source[self.current_index] == "_"
        ):
            self.advance()

        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

            while self.is_in_bounds() and (
                self.source[self.current_index] == "0"
                or self.source[self.current_index] == "1"
                or self.source[self.current_index] == "_"
            ):
                self.advance()

        return self.make_token(TokenType.number)

    def octal(self) -> Token:
        # jump over `0o`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] >= "0"
            and self.source[self.current_index] <= "7"
            or self.source[self.current_index] == "_"
        ):
            self.advance()

        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

            while self.is_in_bounds() and (
                self.source[self.current_index] >= "0"
                and self.source[self.current_index] <= "7"
                or self.source[self.current_index] == "_"
            ):
                self.advance()

        return self.make_token(TokenType.number)

    def hexadecimal(self) -> Token:
        # jump over `0x`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] in string.hexdigits
            or self.source[self.current_index] == "_"
        ):
            self.advance()

        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

            while self.is_in_bounds() and (
                self.source[self.current_index] in string.hexdigits
                or self.source[self.current_index] == "_"
            ):
                self.advance()

        return self.make_token(TokenType.number)
    def find_opening_quote(self) -> int:
        # Quotes should always be within 3 chars of the beginning of the string token
        for offset in range(3):
            char = self.source[self.current_index + offset]
            if char == '"' or char == "'":
                return self.current_index + offset

        raise AssertionError("Quote not found somehow")

    def string_prefix_and_quotes(self) -> tuple[str, str]:
        quote_index = self.find_opening_quote()
        prefix = self.source[self.current_index : quote_index]
        quote_char = self.source[quote_index]

        # Check for triple quotes
        quote = (
            self.source[quote_index : quote_index + 3]
            if (
                quote_index + 3 <= len(self.source)
                and self.source[quote_index + 1] == quote_char
                and self.source[quote_index + 2] == quote_char
            )
            else self.source[quote_index : quote_index + 1]
        )
        return prefix, quote
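
    # Illustrative results (not executed): at the start of `rb'''...` this
    # returns ("rb", "'''"); at `f"x"` it returns ("f", '"'); at a plain
    # '...' it returns ("", "'").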

    def fstring(self) -> Token:
        if self.fstring_state.state in (
            FStringState.not_fstring,
            FStringState.in_fstring_expr,
        ):
            prefix, quote = self.string_prefix_and_quotes()
            self.push_fstring_prefix_quote(prefix, quote)
            for _ in range(len(prefix)):
                self.advance()
            for _ in range(len(quote)):
                self.advance()
            self.fstring_state.enter_fstring()
            return self.make_token(TokenType.fstring_start)

        if self.fstring_state.state == FStringState.at_fstring_middle:
            assert self.fstring_quote is not None
            is_single_quote = len(self.fstring_quote) == 1
            start_index = self.current_index
            while self.is_in_bounds():
                char = self.source[self.current_index]
                # For single quotes, bail on newlines
                if char == "\n" and is_single_quote:
                    raise UnterminatedString

                # Handle escapes
                if char == "\\":
                    self.advance()
                    # Don't escape a `\{` or `\}` in f-strings,
                    # but DO escape `\N{` in f-strings (it starts a named
                    # unicode character), and DON'T escape `\N{` in raw
                    # f-strings.
                    assert self.fstring_prefix is not None
                    if (
                        "r" not in self.fstring_prefix.lower()
                        and self.current_index + 1 < len(self.source)
                        and self.peek() == "N"
                        and self.peek_next() == "{"
                    ):
                        self.advance()
                        self.advance()

                    if self.is_in_bounds() and not (
                        self.peek() == "{" or self.peek() == "}"
                    ):
                        self.advance_check_newline()

                    continue

                # Find opening / closing quote
                if char == "{":
                    # Guard the lookahead: a `{` as the very last character
                    # in the file would otherwise crash `peek_next()`.
                    if (
                        self.current_index + 1 < len(self.source)
                        and self.peek_next() == "{"
                    ):
                        self.advance()
                        self.advance()
                        continue
                    else:
                        self.fstring_state.consume_fstring_middle_for_lbrace()
                        # If fstring-middle is empty, skip it by returning
                        # the next step's token
                        if self.current_index == start_index:
                            return self.fstring()

                        return self.make_token(TokenType.fstring_middle)

                assert self.fstring_quote is not None
                if self.match(self.fstring_quote):
                    self.fstring_state.consume_fstring_middle_for_end()
                    # If fstring-middle is empty, skip it by returning
                    # the next step's token
                    if self.current_index == start_index:
                        return self.fstring()

                    return self.make_token(TokenType.fstring_middle)

                self.advance_check_newline()

            raise UnexpectedEOF

        if self.fstring_state.state == FStringState.at_fstring_lbrace:
            self.advance()
            self.bracket_level_stack.append(self.bracket_level)
            self.bracket_level = 0
            self.fstring_state.consume_lbrace()
            return self.make_token(TokenType.lbrace)

        if self.fstring_state.state == FStringState.at_fstring_end:
            assert self.fstring_quote is not None
            for _ in range(len(self.fstring_quote)):
                self.advance()
            self.pop_fstring_quote()
            self.fstring_state.leave_fstring()
            return self.make_token(TokenType.fstring_end)

        if self.fstring_state.state == FStringState.in_fstring_expr_modifier:
            start_index = self.current_index
            while self.is_in_bounds():
                char = self.source[self.current_index]
                assert self.fstring_quote is not None
                if (char == "\n" or char == "{") and len(self.fstring_quote) == 1:
                    if char == "{":
                        self.fstring_state.consume_fstring_middle_for_lbrace()
                    else:
                        # TODO: why?
                        self.fstring_state.state = FStringState.in_fstring_expr

                    # If fstring-middle is empty, skip it by returning
                    # the next step's token
                    if self.current_index == start_index:
                        return self.fstring()

                    return self.make_token(TokenType.fstring_middle)
                elif char == "}":
                    self.fstring_state.state = FStringState.in_fstring_expr
                    return self.make_token(TokenType.fstring_middle)

                self.advance_check_newline()

            raise UnexpectedEOF

        raise AssertionError("Unhandled f-string state")

    def string(self) -> Token:
        prefix, quote = self.string_prefix_and_quotes()
        if prefix and self.weird_op_case:
            self.advance()
            return self.make_token(tok_type=TokenType.op)

        for char in prefix:
            if char == "f" or char == "F":
                return self.fstring()

        for _ in range(len(prefix)):
            self.advance()
        for _ in range(len(quote)):
            self.advance()

        is_single_quote = len(quote) == 1

        while self.is_in_bounds():
            char = self.source[self.current_index]
            # For single quotes, bail on newlines
            if char == "\n" and is_single_quote:
                raise UnterminatedString

            # Handle escapes
            if char == "\\":
                self.advance()
                self.advance_check_newline()
                continue

            # Find closing quote
            if self.match(quote):
                for _ in range(len(quote)):
                    self.advance()
                return self.make_token(TokenType.string)

            self.advance_check_newline()

        raise UnexpectedEOF

    def indent(self) -> Token:
        start_index = self.current_index
        saw_whitespace = False
        saw_tab_or_space = False
        while self.is_in_bounds():
            char = self.source[self.current_index]
            if self.is_whitespace():
                self.advance()
                saw_whitespace = True
                if char == " " or char == "\t":
                    saw_tab_or_space = True
            else:
                break

        if not self.is_in_bounds():
            # File ends with no whitespace after the newline; not an indent
            if self.current_index == start_index:
                raise NotAnIndent
            # Reached the end of the file; don't return an indent
            return self.make_token(TokenType.whitespace)

        # If the line is preceded by just linefeeds/CR/etc.,
        # treat it as whitespace.
        if saw_whitespace and not saw_tab_or_space:
            self.weird_whitespace_case = True
            return self.make_token(TokenType.whitespace)

        # For lines that are just leading whitespace and a backslash or a
        # comment, don't return indents
        next_char = self.peek()
        if next_char == "#" or next_char == "\\" or self.is_newline():
            return self.make_token(TokenType.whitespace)

        new_indent = self.source[start_index : self.current_index]
        current_indent = "" if len(self.indent_stack) == 0 else self.indent_stack[-1]

        if len(new_indent) == len(current_indent):
            if len(new_indent) == 0:
                raise NotAnIndent

            if new_indent != current_indent:
                raise InconsistentUseOfTabsAndSpaces
            return self.make_token(TokenType.whitespace)
        elif len(new_indent) > len(current_indent):
            if len(current_indent) > 0 and current_indent not in new_indent:
                raise InconsistentUseOfTabsAndSpaces
            self.indent_stack.append(new_indent)
            return self.make_token(TokenType.indent)
        else:
            while len(self.indent_stack) > 0:
                top_indent = self.indent_stack[-1]
                if len(top_indent) < len(new_indent):
                    raise DedentDoesNotMatchAnyOuterIndent

                if len(top_indent) == len(new_indent):
                    break

                _ = self.indent_stack.pop()
                self.dedent_counter += 1

            # Let the dedent counter emit the dedents; they must be
            # zero-length tokens
            return self.make_token(TokenType.whitespace)
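
    # Illustrative indent-stack evolution (dots stand for spaces):
    #
    #     if a:              stack: []
    #     ....if b:          stack: ["...."]              -> INDENT
    #     ........pass       stack: ["....", "........"]  -> INDENT
    #     pass               stack: []                    -> 2 DEDENTs, emitted
    #                                                        via dedent_counter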

    def is_whitespace(self) -> bool:
        if self.is_newline():
            return False

        char = self.source[self.current_index]
        return (
            char == " "
            or char == "\r"
            or char == "\t"
            or char == "\x0b"
            or char == "\x0c"
        )

    def is_newline(self) -> bool:
        if self.source[self.current_index] == "\n":
            return True

        if (
            self.source[self.current_index] == "\r"
            and self.current_index + 1 < len(self.source)
            and self.source[self.current_index + 1] == "\n"
        ):
            return True

        return False

    def name(self) -> Token:
        if self.weird_op_case:
            self.advance()
            return self.make_token(TokenType.identifier)

        # According to PEP 3131, any non-ASCII character is valid in a NAME
        # token. But if we see any non-identifier ASCII character, we should
        # stop.
        remaining = self.source[self.current_index :]
        for index, char in enumerate(remaining):
            if ord(char) < 128 and not str.isalnum(char) and char != "_":
                length = index
                break
        else:
            length = len(remaining)

        self.advance_by(length)
        return self.make_token(TokenType.identifier)

    def __iter__(self) -> TokenIterator:
        return self

    def __next__(self) -> Token:
        if self.prev_token is not None and self.prev_token.type == TokenType.endmarker:
            raise StopIteration

        # EOF checks
        if self.current_index == len(self.source):
            if self.prev_token is None:
                return self.endmarker()

            if self.prev_token.type in {
                TokenType.newline,
                TokenType.nl,
                TokenType.dedent,
            }:
                return self.endmarker()
            else:
                return self.newline()

        if self.current_index > len(self.source):
            return self.endmarker()

        # f-string check
        if (
            self.fstring_state.state != FStringState.not_fstring
            and self.fstring_state.state != FStringState.in_fstring_expr
        ):
            return self.fstring()

        current_char = self.source[self.current_index]

        # A bare \r, in certain cases, gets merged with the next char.
        # It's probably a bug: https://github.com/python/cpython/issues/128233
        # 'issue_128233_handling=True' works around this bug, but if it's
        # False then we produce identical tokens to CPython.
        if not self.issue_128233_handling and current_char == "\r":
            self.advance()
            if not self.is_in_bounds():
                return self.newline()

            current_char = self.source[self.current_index]
            if current_char != "\n":
                self.weird_op_case = True
                if (
                    self.prev_token is not None
                    and self.prev_token.type == TokenType.comment
                ):
                    self.weird_op_case_nl = True

        # Comment check
        if current_char == "#":
            if self.weird_op_case:
                self.advance()
                return self.make_token(TokenType.comment)

            while self.is_in_bounds() and not self.is_newline():
                if (
                    not self.issue_128233_handling
                    and self.source[self.current_index] == "\r"
                ):
                    break
                self.advance()
            return self.make_token(TokenType.comment)

        # Empty the dedent counter
        if self.dedent_counter > 0:
            self.dedent_counter -= 1
            return self.make_token(TokenType.dedent)

        # Newline check
        if self.is_newline():
            return self.newline()

        # \<newline> check
        if current_char == "\\":
            self.advance()
            if not self.is_in_bounds():
                raise UnexpectedEOF

            # Consume all whitespace on this line and the next.
            found_whitespace = False
            seen_newline = False
            while self.is_in_bounds():
                if self.is_whitespace():
                    self.advance()
                    found_whitespace = True
                elif not seen_newline and self.is_newline():
                    char = self.source[self.current_index]
                    if char == "\r":
                        self.advance()
                    self.advance()
                    found_whitespace = True
                    seen_newline = True
                    # Move to the next line without creating a newline token.
                    # But if the previous line was all whitespace, whitespace
                    # on the next line is still valid indentation; avoid
                    # consuming it here.
                    if self.all_whitespace_on_this_line:
                        self.next_line()
                        break
                    else:
                        self.next_line()
                        # Preserve this boolean; we're on the same line
                        # semantically
                        self.all_whitespace_on_this_line = False
                else:
                    break

            if not found_whitespace:
                raise UnexpectedCharacterAfterBackslash

            return self.make_token(TokenType.whitespace)

        # Indent / dedent checks
        if (
            (self.byte_offset == 0 or self.weird_whitespace_case)
            and self.bracket_level == 0
            and self.fstring_state.state == FStringState.not_fstring
        ):
            self.weird_whitespace_case = False
            try:
                indent_token = self.indent()
            except NotAnIndent:
                indent_token = None

            if indent_token is not None:
                return indent_token

        if self.is_whitespace():
            while self.is_in_bounds() and self.is_whitespace():
                self.advance()
            return self.make_token(TokenType.whitespace)

        if current_char in ("+", "&", "|", "^", "@", "%", "=", "!", "~"):
            self.advance()
            # Don't peek past the end of the file (e.g. a trailing "+")
            if self.is_in_bounds() and self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "<":
            self.advance()
            if self.is_in_bounds() and self.peek() == ">":
                # Barry as FLUFL easter egg
                self.advance()
                return self.make_token(TokenType.op)

            if self.is_in_bounds() and self.peek() == "<":
                self.advance()
            if self.is_in_bounds() and self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == ">":
            self.advance()
            if self.is_in_bounds() and self.peek() == ">":
                self.advance()
            if self.is_in_bounds() and self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "/":
            self.advance()
            if self.is_in_bounds() and self.peek() == "/":
                self.advance()
            if self.is_in_bounds() and self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "*":
            self.advance()
            if self.is_in_bounds() and self.peek() == "*":
                self.advance()
            if self.is_in_bounds() and self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "-":
            self.advance()
            # -> operator
            if self.is_in_bounds() and self.peek() == ">":
                self.advance()
                return self.make_token(TokenType.op)

            # -= operator
            if self.is_in_bounds() and self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char in (",", ";"):
            self.advance()
            return self.make_token(TokenType.op)

        # The backtick is not used in Python 3, but it's still tokenized,
        # for backwards compatibility I guess.
        if current_char == "`":
            self.advance()
            return self.make_token(TokenType.op)

        if current_char == "(":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lparen)

        if current_char == ")":
            self.advance()
            self.bracket_level -= 1
            if self.bracket_level < 0:
                self.bracket_level = 0
            return self.make_token(TokenType.rparen)

        if current_char == "[":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lbracket)

        if current_char == "]":
            self.advance()
            self.bracket_level -= 1
            if self.bracket_level < 0:
                self.bracket_level = 0
            return self.make_token(TokenType.rbracket)

        if current_char == "{":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lbrace)

        if current_char == "}":
            self.advance()
            if (
                self.bracket_level == 0
                and self.fstring_state.state == FStringState.in_fstring_expr
            ):
                self.fstring_state.consume_rbrace()
                self.bracket_level = self.bracket_level_stack.pop()
            else:
                self.bracket_level -= 1
                if self.bracket_level < 0:
                    self.bracket_level = 0

            return self.make_token(TokenType.rbrace)

        if current_char == ":":
            self.advance()
            if (
                self.bracket_level == 0
                and self.fstring_state.state == FStringState.in_fstring_expr
            ):
                self.fstring_state.state = FStringState.in_fstring_expr_modifier
                return self.make_token(TokenType.op)
            else:
                if self.is_in_bounds() and self.peek() == "=":
                    self.advance()
                return self.make_token(TokenType.op)

        if current_char in ".0123456789":
            if self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0b", "0B"):
                return self.binary()
            elif self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0o", "0O"):
                return self.octal()
            elif self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0x", "0X"):
                return self.hexadecimal()
            else:
                return self.decimal()

        if (
            (self.current_index + 1 <= len(self.source) and self.match('"', "'"))
            or (
                self.current_index + 2 <= len(self.source)
                and self.match(
                    'b"',
                    "b'",
                    'r"',
                    "r'",
                    'f"',
                    "f'",
                    'u"',
                    "u'",
                    ignore_case=True,
                )
            )
            or (
                self.current_index + 3 <= len(self.source)
                and self.match(
                    'br"',
                    "br'",
                    'rb"',
                    "rb'",
                    'fr"',
                    "fr'",
                    'rf"',
                    "rf'",
                    ignore_case=True,
                )
            )
        ):
            return self.string()

        return self.name()


def tokenize(
    source: str,
    *,
    fstring_tokens: bool = True,
    issue_128233_handling: bool = True,
) -> Iterator[Token]:
    token_iterator = TokenIterator(source, issue_128233_handling=issue_128233_handling)
    if fstring_tokens:
        return iter(token_iterator)

    return merge_fstring_tokens(token_iterator)


def merge_fstring_tokens(token_iterator: TokenIterator) -> Iterator[Token]:
    """Turn post-Python-3.12 FSTRING-* tokens back to a single STRING token."""
    for token in token_iterator:
        if token.type != TokenType.fstring_start:
            yield token
            continue

        start_token = token
        end_token = token

        fstring_starts = 1
        fstring_ends = 0
        for token in token_iterator:
            if token.type == TokenType.fstring_start:
                fstring_starts += 1
            if token.type == TokenType.fstring_end:
                fstring_ends += 1

            if fstring_starts == fstring_ends:
                end_token = token
                break

        yield Token(
            type=TokenType.string,
            start_index=start_token.start_index,
            start_line=start_token.start_line,
            start_col=start_token.start_col,
            end_index=end_token.end_index,
            end_line=end_token.end_line,
            end_col=end_token.end_col,
        )
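

# A minimal usage sketch (illustrative; running this module directly is an
# assumption of this demo, not part of the library's API):
if __name__ == "__main__":
    code = 'x = f"{1 + 2}"\n'
    for tok in tokenize(code):
        print(tok.type.to_python_token(), repr(tok.to_byte_slice(code)))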