1"""pytokens - A Fast, spec compliant Python 3.12+ tokenizer that runs on older Pythons."""

from __future__ import annotations

from dataclasses import dataclass, field
import enum
import string
from typing import Iterator, NewType


class TokenizeError(Exception): ...


class IndentationError(TokenizeError): ...


class InconsistentUseOfTabsAndSpaces(IndentationError): ...


class DedentDoesNotMatchAnyOuterIndent(IndentationError): ...


class UnterminatedString(TokenizeError): ...


class UnexpectedEOF(TokenizeError): ...


class UnexpectedCharacterAfterBackslash(TokenizeError): ...


class NotAnIndent(AssertionError): ...


class Underflow(AssertionError): ...


class TokenType(enum.IntEnum):
    whitespace = 1
    indent = 2
    dedent = 3
    newline = 4  # semantically meaningful newline
    nl = 5  # non meaningful newline
    comment = 6

    _op_start = 7  # marker used to check if a token is an operator
    semicolon = 8
    lparen = 9
    rparen = 10
    lbracket = 11
    rbracket = 12
    lbrace = 13
    rbrace = 14
    colon = 15
    op = 16
    _op_end = 17  # marker used to check if a token is an operator

    identifier = 18
    number = 19
    string = 20
    fstring_start = 21
    fstring_middle = 22
    fstring_end = 23

    tstring_start = 24
    tstring_middle = 25
    tstring_end = 26

    endmarker = 27

    errortoken = 28

    def __repr__(self) -> str:
        return f"TokenType.{self.name}"

    def to_python_token(self) -> str:
        if self.name == "identifier":
            return "NAME"

        if self.is_operator():
            return "OP"

        return self.name.upper()

    def is_operator(self) -> bool:
        return TokenType._op_start < self < TokenType._op_end
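

# Illustrative only (not part of the original source): since TokenType is an
# IntEnum, is_operator() is a simple range check between the _op_start and
# _op_end markers, e.g.:
#
#     TokenType.lparen.is_operator()          # True  (7 < 9 < 17)
#     TokenType.identifier.is_operator()      # False (18 > 17)
#     TokenType.identifier.to_python_token()  # "NAME"
#     TokenType.lbrace.to_python_token()      # "OP"
#     TokenType.number.to_python_token()      # "NUMBER"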


@dataclass
class Token:
    type: TokenType
    # Byte offsets in the file
    start_index: int
    end_index: int
    start_line: int
    # 0-indexed offset from start of line
    start_col: int
    end_line: int
    end_col: int

    def to_byte_slice(self, source: str) -> str:
        # The newline at the end of the file may not exist in the file
        if (
            (self.type == TokenType.newline or self.type == TokenType.nl)
            and self.start_index == len(source)
            and self.end_index == len(source) + 1
        ):
            return ""

        # Dedents at the end of the file also may not exist in the file
        if (
            self.type == TokenType.dedent
            and self.start_index == len(source) + 1
            and self.end_index == len(source) + 1
        ):
            return ""

        # Endmarkers are out of bounds too
        if self.type == TokenType.endmarker:
            return ""

        return source[self.start_index : self.end_index]
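

# Illustrative only (not part of the original source): for source = "x = 1\n",
# the first token is the identifier spanning bytes 0..1, so
#
#     Token(TokenType.identifier, 0, 1, 1, 0, 1, 1).to_byte_slice("x = 1\n")
#
# returns "x". The synthetic out-of-bounds tokens (a trailing NEWLINE/NL past
# EOF, DEDENTs past EOF, and the ENDMARKER) return "" instead of raising.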


class FStringState:
    State = NewType("State", int)

    not_fstring = State(1)
    at_fstring_middle = State(2)
    at_fstring_lbrace = State(3)
    in_fstring_expr = State(4)
    in_fstring_expr_modifier = State(5)
    at_fstring_end = State(6)

    def __init__(self) -> None:
        self.state = FStringState.not_fstring
        self.stack: list[FStringState.State] = []

    def enter_fstring(self) -> None:
        self.stack.append(self.state)
        self.state = FStringState.at_fstring_middle

    def leave_fstring(self) -> None:
        assert self.state == FStringState.at_fstring_end
        self.state = self.stack.pop()

    def consume_fstring_middle_for_lbrace(self) -> None:
        if self.state == FStringState.in_fstring_expr_modifier:
            self.stack.append(self.state)

        self.state = FStringState.at_fstring_lbrace

    def consume_fstring_middle_for_end(self) -> None:
        self.state = FStringState.at_fstring_end

    def consume_lbrace(self) -> None:
        self.state = FStringState.in_fstring_expr

    def consume_rbrace(self) -> None:
        assert (
            self.state == FStringState.in_fstring_expr
            or self.state == FStringState.in_fstring_expr_modifier
        )

        if (
            len(self.stack) > 0
            and self.stack[-1] == FStringState.in_fstring_expr_modifier
        ):
            self.state = self.stack.pop()
        else:
            self.state = FStringState.at_fstring_middle

    def consume_colon(self) -> None:
        assert self.state == FStringState.in_fstring_expr
        self.state = FStringState.in_fstring_expr_modifier
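

# Illustrative walkthrough (not part of the original source): tokenizing
# f"a{x}b" drives FStringState through these transitions:
#
#     enter_fstring()                      not_fstring        -> at_fstring_middle
#     consume_fstring_middle_for_lbrace()  at_fstring_middle  -> at_fstring_lbrace
#     consume_lbrace()                     at_fstring_lbrace  -> in_fstring_expr
#     consume_rbrace()                     in_fstring_expr    -> at_fstring_middle
#     consume_fstring_middle_for_end()     at_fstring_middle  -> at_fstring_end
#     leave_fstring()                      at_fstring_end     -> not_fstring
#
# The stack exists because f-strings nest (inside expressions and inside
# format-spec modifiers), so leave_fstring() restores whatever state the
# enclosing string was in.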


@dataclass
class TokenIterator:
    source: str
    issue_128233_handling: bool

    current_index: int = 0
    prev_index: int = 0
    line_number: int = 1
    prev_line_number: int = 1
    byte_offset: int = 0
    prev_byte_offset: int = 0
    all_whitespace_on_this_line: bool = True

    bracket_level: int = 0
    bracket_level_stack: list[int] = field(default_factory=list)
    prev_token: Token | None = None

    indent_stack: list[str] = field(default_factory=list)
    dedent_counter: int = 0

    # f-string state
    fstring_state: FStringState = field(default_factory=FStringState)
    fstring_prefix_quote_stack: list[tuple[str, str]] = field(default_factory=list)
    fstring_prefix: str | None = None
    fstring_quote: str | None = None

    # CPython has a weird bug where every time a bare \r is present,
    # the next token becomes an OP, regardless of what it is.
    weird_op_case: bool = False
    weird_op_case_nl: bool = False

    weird_whitespace_case: bool = False

    def is_in_bounds(self) -> bool:
        return self.current_index < len(self.source)

    def peek(self) -> str:
        assert self.is_in_bounds()
        return self.source[self.current_index]

    def peek_next(self) -> str:
        assert self.current_index + 1 < len(self.source)
        return self.source[self.current_index + 1]

    def advance(self) -> None:
        self.current_index += 1
        self.byte_offset += 1

    def advance_by(self, count: int) -> None:
        self.current_index += count
        self.byte_offset += count

    def next_line(self) -> None:
        self.line_number += 1
        self.byte_offset = 0
        self.all_whitespace_on_this_line = True

    def advance_check_newline(self) -> None:
        if self.source[self.current_index] == "\n":
            self.current_index += 1
            self.next_line()
        else:
            self.advance()

    def match(self, *options: str, ignore_case: bool = False) -> bool:
        for option in options:
            if self.current_index + len(option) > len(self.source):
                continue
            snippet = self.source[self.current_index : self.current_index + len(option)]
            if ignore_case:
                option = option.lower()
                snippet = snippet.lower()

            if option == snippet:
                return True

        return False

    def make_token(self, tok_type: TokenType) -> Token:
        if self.fstring_prefix is not None and "t" in self.fstring_prefix:
            if tok_type == TokenType.fstring_start:
                tok_type = TokenType.tstring_start
            elif tok_type == TokenType.fstring_middle:
                tok_type = TokenType.tstring_middle
            elif tok_type == TokenType.fstring_end:
                tok_type = TokenType.tstring_end

        token_type = (
            TokenType.op
            if self.weird_op_case
            and not tok_type.is_operator()
            and tok_type not in (TokenType.number, TokenType.string)
            else tok_type
        )
        if self.weird_op_case:
            # And we have another weird case INSIDE the weird case.
            # For some reason, when CPython accidentally captures a space
            # as the next character, i.e. when the token is '\r ',
            # it DOESN'T see it as whitespace, so in that specific case
            # we shouldn't set all_whitespace_on_this_line.
            # I think this is because CPython never expected to have a
            # ' ' token anyway, so it doesn't classify it as whitespace.
            # So it becomes non-whitespace.
            # Removing this if stmt breaks test 1001 right now.
            token_str = self.source[self.prev_index : self.current_index]
            if token_str == "\r ":
                self.all_whitespace_on_this_line = False
            self.weird_op_case = False

        token = Token(
            type=token_type,
            start_index=self.prev_index,
            end_index=self.current_index,
            start_line=self.prev_line_number,
            start_col=self.prev_byte_offset,
            end_line=self.line_number,
            end_col=self.byte_offset,
        )
        if tok_type == TokenType.newline or tok_type == TokenType.nl:
            self.next_line()
        elif tok_type == TokenType.whitespace or tok_type == TokenType.comment:
            pass
        else:
            self.all_whitespace_on_this_line = False

        self.prev_token = token
        self.prev_index = self.current_index
        self.prev_line_number = self.line_number
        self.prev_byte_offset = self.byte_offset
        self.weird_op_case = False

        return token

    def push_fstring_prefix_quote(self, prefix: str, quote: str) -> None:
        if self.fstring_prefix is not None:
            assert self.fstring_quote is not None
            self.fstring_prefix_quote_stack.append(
                (self.fstring_prefix, self.fstring_quote)
            )

        self.fstring_prefix = prefix
        self.fstring_quote = quote

    def pop_fstring_quote(self) -> None:
        if self.fstring_prefix is None:
            assert self.fstring_quote is None
            raise Underflow

        self.fstring_prefix, self.fstring_quote = (
            (None, None)
            if len(self.fstring_prefix_quote_stack) == 0
            else self.fstring_prefix_quote_stack.pop()
        )

    def newline(self) -> Token:
        if self.is_in_bounds() and self.source[self.current_index] == "\r":
            self.advance()
        self.advance()
        token_type = (
            TokenType.nl
            if (
                self.weird_op_case_nl
                or self.bracket_level > 0
                or self.fstring_state.state == FStringState.in_fstring_expr
                or self.all_whitespace_on_this_line
            )
            else TokenType.newline
        )
        token = self.make_token(token_type)
        self.weird_op_case_nl = False
        return token

    def endmarker(self) -> Token:
        if self.bracket_level != 0:
            raise UnexpectedEOF

        if len(self.indent_stack) > 0:
            _ = self.indent_stack.pop()
            return self.make_token(TokenType.dedent)

        return self.make_token(TokenType.endmarker)

    def decimal(self) -> Token:
        digit_before_decimal = False
        if self.source[self.current_index].isdigit():
            digit_before_decimal = True
            self.advance()

        # TODO: this is too lax; 1__2 tokenizes successfully
        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or self.source[self.current_index] == "_"
        ):
            self.advance()

        if self.is_in_bounds() and self.source[self.current_index] == ".":
            self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or (
                self.source[self.current_index] == "_"
                and self.source[self.current_index - 1].isdigit()
            )
        ):
            self.advance()
        # Before advancing over the 'e', ensure that there has been at least 1 digit before it
        if self.current_index + 1 < len(self.source) and (
            (digit_before_decimal or self.source[self.current_index - 1].isdigit())
            and (
                self.source[self.current_index] == "e"
                or self.source[self.current_index] == "E"
            )
            and (
                self.source[self.current_index + 1].isdigit()
                or (
                    self.current_index + 2 < len(self.source)
                    and (
                        self.source[self.current_index + 1] == "+"
                        or self.source[self.current_index + 1] == "-"
                    )
                    and self.source[self.current_index + 2].isdigit()
                )
            )
        ):
            self.advance()
            self.advance()
            # An optional third advance isn't necessary, as it'll get advanced just below

        # TODO: this is too lax; 1__2 tokenizes successfully
        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or (
                (digit_before_decimal or self.source[self.current_index - 1].isdigit())
                and self.source[self.current_index] == "_"
            )
        ):
            self.advance()

        # Complex numbers end in a `j`. But ensure at least 1 digit before it
        if self.is_in_bounds() and (
            (digit_before_decimal or self.source[self.current_index - 1].isdigit())
            and (
                self.source[self.current_index] == "j"
                or self.source[self.current_index] == "J"
            )
        ):
            self.advance()
        # If all of this resulted in just a dot, return an operator
        if (
            self.current_index - self.prev_index == 1
            and self.source[self.current_index - 1] == "."
        ):
            # Ellipsis check
            if (
                self.current_index + 2 <= len(self.source)
                and self.source[self.current_index : self.current_index + 2] == ".."
            ):
                self.advance()
                self.advance()

            return self.make_token(TokenType.op)

        return self.make_token(TokenType.number)
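
    # Illustrative only (not part of the original source): decimal() accepts
    # "123", "1_000", "3.14", "1e10", "1.5e-3" and "2j" as single NUMBER
    # tokens; as the TODOs above note, it is too lax, so malformed spellings
    # like "1__2" also tokenize. A lone "." falls through all the digit loops
    # and is returned as an OP token ("..." becomes a single ellipsis OP).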

    def binary(self) -> Token:
        # jump over `0b`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] == "0"
            or self.source[self.current_index] == "1"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index] == "0"
            or self.source[self.current_index] == "1"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        return self.make_token(TokenType.number)

    def octal(self) -> Token:
        # jump over `0o`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] >= "0"
            and self.source[self.current_index] <= "7"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index] >= "0"
            and self.source[self.current_index] <= "7"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        return self.make_token(TokenType.number)

    def hexadecimal(self) -> Token:
        # jump over `0x`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] in string.hexdigits
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index] in string.hexdigits
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        return self.make_token(TokenType.number)

    def find_opening_quote(self) -> int:
        # Quotes should always be within 3 chars of the beginning of the string token
        for offset in range(3):
            char = self.source[self.current_index + offset]
            if char == '"' or char == "'":
                return self.current_index + offset

        raise AssertionError("Quote not found somehow")

    def string_prefix_and_quotes(self) -> tuple[str, str]:
        quote_index = self.find_opening_quote()
        prefix = self.source[self.current_index : quote_index]
        quote_char = self.source[quote_index]

        # Check for triple quotes
        quote = (
            self.source[quote_index : quote_index + 3]
            if (
                quote_index + 3 <= len(self.source)
                and self.source[quote_index + 1] == quote_char
                and self.source[quote_index + 2] == quote_char
            )
            else self.source[quote_index : quote_index + 1]
        )
        return prefix, quote
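
    # Illustrative only (not part of the original source): with current_index
    # at the start of the literal,
    #
    #     'hello'      -> prefix "",   quote "'"
    #     rb'...'      -> prefix "rb", quote "'"
    #     f"""text"""  -> prefix "f",  quote '"""'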

    def fstring(self) -> Token:
        if self.fstring_state.state in (
            FStringState.not_fstring,
            FStringState.in_fstring_expr,
        ):
            prefix, quote = self.string_prefix_and_quotes()

            self.push_fstring_prefix_quote(prefix, quote)
            for _ in range(len(prefix)):
                self.advance()
            for _ in range(len(quote)):
                self.advance()
            self.fstring_state.enter_fstring()
            return self.make_token(TokenType.fstring_start)

        if self.fstring_state.state == FStringState.at_fstring_middle:
            assert self.fstring_quote is not None
            is_single_quote = len(self.fstring_quote) == 1
            start_index = self.current_index
            while self.is_in_bounds():
                char = self.source[self.current_index]
                # For single quotes, bail on newlines
                if char == "\n" and is_single_quote:
                    raise UnterminatedString

                # Handle escapes
                if char == "\\":
                    self.advance()
                    # But don't escape a `\{` or `\}` in f-strings,
                    # DO escape `\N{` in f-strings (that's for unicode characters),
                    # but DON'T escape `\N{` in raw f-strings.
                    assert self.fstring_prefix is not None
                    if (
                        "r" not in self.fstring_prefix.lower()
                        and self.current_index + 1 < len(self.source)
                        and self.peek() == "N"
                        and self.peek_next() == "{"
                    ):
                        self.advance()
                        self.advance()

                    if self.is_in_bounds() and not (
                        self.peek() == "{" or self.peek() == "}"
                    ):
                        self.advance_check_newline()

                    continue

                # Find opening / closing quote
                if char == "{":
                    if self.peek_next() == "{":
                        self.advance()
                        self.advance()
                        continue
                    else:
                        self.fstring_state.consume_fstring_middle_for_lbrace()
                        # If the fstring-middle is empty, skip it by returning the next step's token
                        if self.current_index == start_index:
                            return self.fstring()

                        return self.make_token(TokenType.fstring_middle)

                assert self.fstring_quote is not None
                if self.match(self.fstring_quote):
                    self.fstring_state.consume_fstring_middle_for_end()
                    # If the fstring-middle is empty, skip it by returning the next step's token
                    if self.current_index == start_index:
                        return self.fstring()

                    return self.make_token(TokenType.fstring_middle)

                self.advance_check_newline()

            raise UnexpectedEOF

        if self.fstring_state.state == FStringState.at_fstring_lbrace:
            self.advance()
            self.bracket_level_stack.append(self.bracket_level)
            self.bracket_level = 0
            self.fstring_state.consume_lbrace()
            return self.make_token(TokenType.lbrace)

        if self.fstring_state.state == FStringState.at_fstring_end:
            assert self.fstring_quote is not None
            for _ in range(len(self.fstring_quote)):
                self.advance()
            token = self.make_token(TokenType.fstring_end)
            self.pop_fstring_quote()
            self.fstring_state.leave_fstring()
            return token

        if self.fstring_state.state == FStringState.in_fstring_expr_modifier:
            start_index = self.current_index
            while self.is_in_bounds():
                char = self.source[self.current_index]
                assert self.fstring_quote is not None
                if (char == "\n" or char == "{") and len(self.fstring_quote) == 1:
                    if char == "{":
                        self.fstring_state.consume_fstring_middle_for_lbrace()
                    else:
                        # TODO: why?
                        self.fstring_state.state = FStringState.in_fstring_expr

                    # If the fstring-middle is empty, skip it by returning the next step's token
                    if self.current_index == start_index:
                        return self.fstring()

                    return self.make_token(TokenType.fstring_middle)
                elif char == "}":
                    self.fstring_state.state = FStringState.in_fstring_expr
                    return self.make_token(TokenType.fstring_middle)

                self.advance_check_newline()

            raise UnexpectedEOF

        raise AssertionError("Unhandled f-string state")

    def string(self) -> Token:
        prefix, quote = self.string_prefix_and_quotes()
        if prefix and self.weird_op_case:
            self.advance()
            return self.make_token(tok_type=TokenType.op)

        for char in prefix:
            if char in ("f", "F", "t", "T"):
                return self.fstring()

        for _ in range(len(prefix)):
            self.advance()
        for _ in range(len(quote)):
            self.advance()

        is_single_quote = len(quote) == 1

        while self.is_in_bounds():
            char = self.source[self.current_index]
            # For single quotes, bail on newlines
            if char == "\n" and is_single_quote:
                raise UnterminatedString

            # Handle escapes
            if char == "\\":
                self.advance()
                self.advance_check_newline()
                continue

            # Find closing quote
            if self.match(quote):
                for _ in range(len(quote)):
                    self.advance()
                return self.make_token(TokenType.string)

            self.advance_check_newline()

        raise UnexpectedEOF

    def indent(self) -> Token:
        start_index = self.current_index
        saw_whitespace = False
        saw_tab_or_space = False
        while self.is_in_bounds():
            char = self.source[self.current_index]
            if self.is_whitespace():
                self.advance()
                saw_whitespace = True
                if char == " " or char == "\t":
                    saw_tab_or_space = True
            else:
                break

        if not self.is_in_bounds():
            # File ends with no whitespace after newline, don't return an indent
            if self.current_index == start_index:
                raise NotAnIndent
            # If we reached the end of the file, don't return an indent
            return self.make_token(TokenType.whitespace)

        # If the line is preceded by just linefeeds/CR/etc.,
        # treat it as whitespace.
        if saw_whitespace and not saw_tab_or_space:
            self.weird_whitespace_case = True
            return self.make_token(TokenType.whitespace)

        # For lines that are just leading whitespace and a backslash or a comment,
        # don't return indents
        next_char = self.peek()
        if next_char == "#" or next_char == "\\" or self.is_newline():
            return self.make_token(TokenType.whitespace)

        new_indent = self.source[start_index : self.current_index]
        current_indent = "" if len(self.indent_stack) == 0 else self.indent_stack[-1]

        if len(new_indent) == len(current_indent):
            if len(new_indent) == 0:
                raise NotAnIndent

            if new_indent != current_indent:
                raise InconsistentUseOfTabsAndSpaces
            return self.make_token(TokenType.whitespace)
        elif len(new_indent) > len(current_indent):
            if len(current_indent) > 0 and current_indent not in new_indent:
                raise InconsistentUseOfTabsAndSpaces
            self.indent_stack.append(new_indent)
            return self.make_token(TokenType.indent)
        else:
            while len(self.indent_stack) > 0:
                top_indent = self.indent_stack[-1]
                if len(top_indent) < len(new_indent):
                    raise DedentDoesNotMatchAnyOuterIndent

                if len(top_indent) == len(new_indent):
                    break

                _ = self.indent_stack.pop()
                self.dedent_counter += 1

            # Let the dedent counter make the dedents. They must be length zero
            return self.make_token(TokenType.whitespace)

    def is_whitespace(self) -> bool:
        if self.is_newline():
            return False

        char = self.source[self.current_index]
        return (
            char == " "
            or char == "\r"
            or char == "\t"
            or char == "\x0b"
            or char == "\x0c"
        )

    def is_newline(self) -> bool:
        if self.source[self.current_index] == "\n":
            return True
        if (
            self.source[self.current_index] == "\r"
            and self.current_index + 1 < len(self.source)
            and self.source[self.current_index + 1] == "\n"
        ):
            return True

        return False

    def name(self) -> Token:
        if self.weird_op_case:
            self.advance()
            return self.make_token(TokenType.identifier)

        # According to PEP 3131, any non-ASCII character is valid in a NAME token.
        # But if we see any non-identifier ASCII character, we should stop.
        remaining = self.source[self.current_index :]
        for index, char in enumerate(remaining):
            if ord(char) < 128 and not str.isalnum(char) and char != "_":
                length = index
                break
        else:
            length = len(remaining)

        self.advance_by(length)
        return self.make_token(TokenType.identifier)
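
    # Illustrative only (not part of the original source): this scan
    # deliberately over-accepts and defers validation to the parser. For the
    # input "café = 1" it stops at the first non-identifier ASCII character
    # (the space) and yields "café" as one NAME token; something like "λ²"
    # also comes back as a single NAME even though it isn't a valid Python
    # identifier.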

    def __iter__(self) -> TokenIterator:
        return self

    def __next__(self) -> Token:
        if self.prev_token is not None and self.prev_token.type == TokenType.endmarker:
            raise StopIteration

        # EOF checks
        if self.current_index == len(self.source):
            if self.prev_token is None:
                return self.endmarker()

            if self.prev_token.type in {
                TokenType.newline,
                TokenType.nl,
                TokenType.dedent,
            }:
                return self.endmarker()
            else:
                return self.newline()

        if self.current_index > len(self.source):
            return self.endmarker()

        # f-string check
        if (
            self.fstring_state.state != FStringState.not_fstring
            and self.fstring_state.state != FStringState.in_fstring_expr
        ):
            return self.fstring()

        current_char = self.source[self.current_index]

        # \r on its own, in certain cases, gets merged with the next char.
        # It's probably a bug: https://github.com/python/cpython/issues/128233
        # 'issue_128233_handling=True' works around this bug, but if it's False
        # then we produce identical tokens to CPython.
        if not self.issue_128233_handling and current_char == "\r":
            self.advance()
            if not self.is_in_bounds():
                return self.newline()

            current_char = self.source[self.current_index]
            if current_char != "\n":
                self.weird_op_case = True
                if (
                    self.prev_token is not None
                    and self.prev_token.type == TokenType.comment
                ):
                    self.weird_op_case_nl = True

        # Comment check
        if current_char == "#":
            if self.weird_op_case:
                self.advance()
                return self.make_token(TokenType.comment)

            while self.is_in_bounds() and not self.is_newline():
                if (
                    not self.issue_128233_handling
                    and self.source[self.current_index] == "\r"
                ):
                    break
                self.advance()
            return self.make_token(TokenType.comment)

        # Empty the dedent counter
        if self.dedent_counter > 0:
            self.dedent_counter -= 1
            return self.make_token(TokenType.dedent)

        # Newline check
        if self.is_newline():
            return self.newline()

        # \<newline> check
        if current_char == "\\":
            self.advance()
            if not self.is_in_bounds():
                raise UnexpectedEOF

            # Consume all whitespace on this line and the next.
            found_whitespace = False
            seen_newline = False
            while self.is_in_bounds():
                if self.is_whitespace():
                    self.advance()
                    found_whitespace = True
                elif not seen_newline and self.is_newline():
                    char = self.source[self.current_index]
                    if char == "\r":
                        self.advance()
                    self.advance()
                    found_whitespace = True
                    seen_newline = True
                    # Move to the next line without creating a newline token. But,
                    # if the previous line was all whitespace, whitespace on
                    # the next line is still valid indentation. Avoid consuming it.
                    if self.all_whitespace_on_this_line:
                        self.next_line()
                        break
                    else:
                        self.next_line()
                        # Preserve this boolean; we're on the same line semantically
                        self.all_whitespace_on_this_line = False

                else:
                    break

            if not found_whitespace:
                raise UnexpectedCharacterAfterBackslash

            return self.make_token(TokenType.whitespace)

        # Indent / dedent checks
        if (
            (self.byte_offset == 0 or self.weird_whitespace_case)
            and self.bracket_level == 0
            and self.fstring_state.state == FStringState.not_fstring
        ):
            self.weird_whitespace_case = False
            try:
                indent_token = self.indent()
            except NotAnIndent:
                indent_token = None

            if indent_token is not None:
                return indent_token

        if self.is_whitespace():
            while self.is_in_bounds() and self.is_whitespace():
                self.advance()
            return self.make_token(TokenType.whitespace)

        if current_char in ("+", "&", "|", "^", "@", "%", "=", "!", "~"):
            self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "<":
            self.advance()
            if self.peek() == ">":
                # Barry as FLUFL easter egg
                self.advance()
                return self.make_token(TokenType.op)

            if self.peek() == "<":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == ">":
            self.advance()
            if self.peek() == ">":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "/":
            self.advance()
            if self.peek() == "/":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "*":
            self.advance()
            if self.peek() == "*":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "-":
            self.advance()
            # -> operator
            if self.peek() == ">":
                self.advance()
                return self.make_token(TokenType.op)

            # -= operator
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char in (",", ";"):
            self.advance()
            return self.make_token(TokenType.op)

        # The backtick is not used in Python 3, but still exists
        # for backwards compatibility, I guess.
        if current_char == "`":
            self.advance()
            return self.make_token(TokenType.op)

        if current_char == "(":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lparen)

        if current_char == ")":
            self.advance()
            self.bracket_level -= 1
            if self.bracket_level < 0:
                self.bracket_level = 0
            return self.make_token(TokenType.rparen)

        if current_char == "[":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lbracket)

        if current_char == "]":
            self.advance()
            self.bracket_level -= 1
            if self.bracket_level < 0:
                self.bracket_level = 0
            return self.make_token(TokenType.rbracket)

        if current_char == "{":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lbrace)

        if current_char == "}":
            self.advance()
            if (
                self.bracket_level == 0
                and self.fstring_state.state == FStringState.in_fstring_expr
            ):
                self.fstring_state.consume_rbrace()
                self.bracket_level = self.bracket_level_stack.pop()
            else:
                self.bracket_level -= 1
                if self.bracket_level < 0:
                    self.bracket_level = 0

            return self.make_token(TokenType.rbrace)

        if current_char == ":":
            self.advance()
            if (
                self.bracket_level == 0
                and self.fstring_state.state == FStringState.in_fstring_expr
            ):
                self.fstring_state.state = FStringState.in_fstring_expr_modifier
                return self.make_token(TokenType.op)
            else:
                if self.peek() == "=":
                    self.advance()
                return self.make_token(TokenType.op)

        if current_char in ".0123456789":
            if self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0b", "0B"):
                return self.binary()
            elif self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0o", "0O"):
                return self.octal()
            elif self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0x", "0X"):
                return self.hexadecimal()
            else:
                return self.decimal()

        if (
            (self.current_index + 1 <= len(self.source) and self.match('"', "'"))
            or (
                self.current_index + 2 <= len(self.source)
                and self.match(
                    'b"',
                    "b'",
                    'r"',
                    "r'",
                    'f"',
                    "f'",
                    'u"',
                    "u'",
                    "t'",
                    't"',
                    ignore_case=True,
                )
            )
            or (
                self.current_index + 3 <= len(self.source)
                and self.match(
                    'br"',
                    "br'",
                    'rb"',
                    "rb'",
                    'fr"',
                    "fr'",
                    'rf"',
                    "rf'",
                    "tr'",
                    'tr"',
                    "rt'",
                    'rt"',
                    ignore_case=True,
                )
            )
        ):
            return self.string()

        return self.name()


def tokenize(
    source: str,
    *,
    fstring_tokens: bool = True,
    issue_128233_handling: bool = True,
) -> Iterator[Token]:
    token_iterator = TokenIterator(source, issue_128233_handling=issue_128233_handling)
    if fstring_tokens:
        return iter(token_iterator)

    return merge_fstring_tokens(token_iterator)
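

# Illustrative usage sketch (not part of the original source):
#
#     source = 'x = f"{x}!"\n'
#     for token in tokenize(source):
#         print(token.type, repr(token.to_byte_slice(source)))
#
# With the default fstring_tokens=True, f-strings come back as
# FSTRING_START/MIDDLE/END tokens as on Python 3.12+; pass
# fstring_tokens=False to merge each run back into a single STRING token,
# as older tokenizers produced.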


def merge_fstring_tokens(token_iterator: TokenIterator) -> Iterator[Token]:
    """Turn post-Python-3.12 FSTRING-* tokens back into a single STRING token."""
    for token in token_iterator:
        if token.type not in (TokenType.fstring_start, TokenType.tstring_start):
            yield token
            continue

        start_token = token
        end_token = token

        fstring_starts = 1
        fstring_ends = 0
        for token in token_iterator:
            if token.type in (TokenType.fstring_start, TokenType.tstring_start):
                fstring_starts += 1
            if token.type in (TokenType.fstring_end, TokenType.tstring_end):
                fstring_ends += 1

            if fstring_starts == fstring_ends:
                end_token = token
                break

        yield Token(
            type=TokenType.string,
            start_index=start_token.start_index,
            start_line=start_token.start_line,
            start_col=start_token.start_col,
            end_index=end_token.end_index,
            end_line=end_token.end_line,
            end_col=end_token.end_col,
        )
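

# Illustrative smoke test (not part of the original module); it only runs
# when this file is executed directly.
if __name__ == "__main__":
    demo_source = 'x = 1\nprint(f"{x}")\n'
    for tok in tokenize(demo_source):
        span = f"{tok.start_line}:{tok.start_col}-{tok.end_line}:{tok.end_col}"
        print(span, repr(tok.type), repr(tok.to_byte_slice(demo_source)))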