1"""pytokens - A Fast, spec compliant Python 3.12+ tokenizer that runs on older Pythons."""
2
3from __future__ import annotations
4
5from dataclasses import dataclass, field
6import enum
7import string
8from typing import Iterator, NewType
9
10
11class TokenizeError(Exception): ...
12
13
14class IndentationError(TokenizeError): ...
15
16
17class InconsistentUseOfTabsAndSpaces(IndentationError): ...
18
19
20class DedentDoesNotMatchAnyOuterIndent(IndentationError): ...
21
22
23class UnterminatedString(TokenizeError): ...
24
25
26class UnexpectedEOF(TokenizeError): ...
27
28
29class UnexpectedCharacterAfterBackslash(TokenizeError): ...
30
31
32class NotAnIndent(AssertionError): ...
33
34
35class Underflow(AssertionError): ...
36
37
38class TokenType(enum.IntEnum):
39 whitespace = 1
40 indent = 2
41 dedent = 3
42 newline = 4 # semantically meaningful newline
43 nl = 5 # non meaningful newline
44 comment = 6
45
46 _op_start = 7 # marker used to check if a token is an operator
47 semicolon = 8
48 lparen = 9
49 rparen = 10
50 lbracket = 11
51 rbracket = 12
52 lbrace = 13
53 rbrace = 14
54 colon = 15
55 op = 16
56 _op_end = 17 # marker used to check if a token is an operator
57
58 identifier = 18
59 number = 19
60 string = 20
61 fstring_start = 21
62 fstring_middle = 22
63 fstring_end = 23
64
65 endmarker = 24
66
67 errortoken = 25
68
69 def __repr__(self) -> str:
70 return f"TokenType.{self.name}"
71
72 def to_python_token(self) -> str:
73 if self.name == "identifier":
74 return "NAME"
75
76 if self.is_operator():
77 return "OP"
78
79 return self.name.upper()
80
81 def is_operator(self) -> bool:
82 return TokenType._op_start < self < TokenType._op_end
83
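
# A quick illustration (comments only, not executed): every token type in the
# operator range collapses to CPython's "OP" name, identifiers map to "NAME",
# and everything else maps to its own upper-cased name:
#
#     TokenType.lparen.is_operator()             # True
#     TokenType.lparen.to_python_token()         # "OP"
#     TokenType.identifier.to_python_token()     # "NAME"
#     TokenType.fstring_start.to_python_token()  # "FSTRING_START"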

@dataclass
class Token:
    type: TokenType
    # Byte offsets in the file
    start_index: int
    end_index: int
    start_line: int
    # 0-indexed offset from start of line
    start_col: int
    end_line: int
    end_col: int

    def to_byte_slice(self, source: str) -> str:
        # Newline at end of file may not exist in the file
        if (
            (self.type == TokenType.newline or self.type == TokenType.nl)
            and self.start_index == len(source)
            and self.end_index == len(source) + 1
        ):
            return ""

        # Dedents at end of file also may not exist in the file
        if (
            self.type == TokenType.dedent
            and self.start_index == len(source) + 1
            and self.end_index == len(source) + 1
        ):
            return ""

        # Endmarkers are out of bounds too
        if self.type == TokenType.endmarker:
            return ""

        return source[self.start_index : self.end_index]
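
# Illustrative example: with `source = "x = 1\n"`, the NAME token for `x`
# spans bytes 0..1, so `token.to_byte_slice(source) == "x"`. Synthetic tokens
# (the ENDMARKER, or a NEWLINE/DEDENT placed past the end of the file) return
# "" since they have no backing bytes in the source.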

class FStringState:
    State = NewType("State", int)

    not_fstring = State(1)
    at_fstring_middle = State(2)
    at_fstring_lbrace = State(3)
    in_fstring_expr = State(4)
    in_fstring_expr_modifier = State(5)
    at_fstring_end = State(6)

    def __init__(self) -> None:
        self.state = FStringState.not_fstring
        self.stack: list[FStringState.State] = []

    def enter_fstring(self) -> None:
        self.stack.append(self.state)
        self.state = FStringState.at_fstring_middle

    def leave_fstring(self) -> None:
        assert self.state == FStringState.at_fstring_end
        self.state = self.stack.pop()

    def consume_fstring_middle_for_lbrace(self) -> None:
        if self.state == FStringState.in_fstring_expr_modifier:
            self.stack.append(self.state)

        self.state = FStringState.at_fstring_lbrace

    def consume_fstring_middle_for_end(self) -> None:
        self.state = FStringState.at_fstring_end

    def consume_lbrace(self) -> None:
        self.state = FStringState.in_fstring_expr

    def consume_rbrace(self) -> None:
        assert (
            self.state == FStringState.in_fstring_expr
            or self.state == FStringState.in_fstring_expr_modifier
        )

        if (
            len(self.stack) > 0
            and self.stack[-1] == FStringState.in_fstring_expr_modifier
        ):
            self.state = self.stack.pop()
        else:
            self.state = FStringState.at_fstring_middle

    def consume_colon(self) -> None:
        assert self.state == FStringState.in_fstring_expr
        self.state = FStringState.in_fstring_expr_modifier
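
# Illustrative state walk-through while tokenizing f"a{x}b":
#
#   enter_fstring()                      -> at_fstring_middle  (before "a")
#   consume_fstring_middle_for_lbrace()  -> at_fstring_lbrace  (at "{")
#   consume_lbrace()                     -> in_fstring_expr    (at "x")
#   consume_rbrace()                     -> at_fstring_middle  (before "b")
#   consume_fstring_middle_for_end()     -> at_fstring_end     (at the closing quote)
#   leave_fstring()                      -> not_fstring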

@dataclass
class TokenIterator:
    source: str
    issue_128233_handling: bool

    current_index: int = 0
    prev_index: int = 0
    line_number: int = 1
    prev_line_number: int = 1
    byte_offset: int = 0
    prev_byte_offset: int = 0
    all_whitespace_on_this_line: bool = True

    bracket_level: int = 0
    bracket_level_stack: list[int] = field(default_factory=list)
    prev_token: Token | None = None

    indent_stack: list[str] = field(default_factory=list)
    dedent_counter: int = 0

    # f-string state
    fstring_state: FStringState = field(default_factory=FStringState)
    fstring_prefix_quote_stack: list[tuple[str, str]] = field(default_factory=list)
    fstring_prefix: str | None = None
    fstring_quote: str | None = None

    # CPython has a weird bug where every time a bare \r is present,
    # the next token becomes an OP, regardless of what it is.
    weird_op_case: bool = False
    weird_op_case_nl: bool = False

    weird_whitespace_case: bool = False

    def is_in_bounds(self) -> bool:
        return self.current_index < len(self.source)

    def peek(self) -> str:
        assert self.is_in_bounds()
        return self.source[self.current_index]

    def peek_next(self) -> str:
        assert self.current_index + 1 < len(self.source)
        return self.source[self.current_index + 1]

    def advance(self) -> None:
        self.current_index += 1
        self.byte_offset += 1

    def advance_by(self, count: int) -> None:
        self.current_index += count
        self.byte_offset += count

    def next_line(self) -> None:
        self.line_number += 1
        self.byte_offset = 0
        self.all_whitespace_on_this_line = True

    def advance_check_newline(self) -> None:
        if self.source[self.current_index] == "\n":
            self.current_index += 1
            self.next_line()
        else:
            self.advance()

    def match(self, *options: str, ignore_case: bool = False) -> bool:
        for option in options:
            if self.current_index + len(option) > len(self.source):
                continue
            snippet = self.source[self.current_index : self.current_index + len(option)]
            if ignore_case:
                option = option.lower()
                snippet = snippet.lower()

            if option == snippet:
                return True

        return False

    def make_token(self, tok_type: TokenType) -> Token:
        token_type = (
            TokenType.op
            if self.weird_op_case
            and not tok_type.is_operator()
            and tok_type not in (TokenType.number, TokenType.string)
            else tok_type
        )
        if self.weird_op_case:
            # And we have another weird case INSIDE the weird case.
            # For some reason, when CPython accidentally captures a space
            # as the next character, i.e. when the token is '\r ',
            # it DOESN'T see it as whitespace. So in that specific case,
            # we shouldn't set all_whitespace_on_this_line.
            # I think this is because CPython never expected to have a
            # ' ' token anyway, so it doesn't classify it as whitespace,
            # and it becomes non-whitespace.
            # Removing this if stmt breaks test 1001 right now.
            token_str = self.source[self.prev_index : self.current_index]
            if token_str == "\r ":
                self.all_whitespace_on_this_line = False
            self.weird_op_case = False

        token = Token(
            type=token_type,
            start_index=self.prev_index,
            end_index=self.current_index,
            start_line=self.prev_line_number,
            start_col=self.prev_byte_offset,
            end_line=self.line_number,
            end_col=self.byte_offset,
        )
        if tok_type == TokenType.newline or tok_type == TokenType.nl:
            self.next_line()
        elif tok_type == TokenType.whitespace or tok_type == TokenType.comment:
            pass
        else:
            self.all_whitespace_on_this_line = False

        self.prev_token = token
        self.prev_index = self.current_index
        self.prev_line_number = self.line_number
        self.prev_byte_offset = self.byte_offset
        self.weird_op_case = False

        return token

    def push_fstring_prefix_quote(self, prefix: str, quote: str) -> None:
        if self.fstring_prefix is not None:
            assert self.fstring_quote is not None
            self.fstring_prefix_quote_stack.append(
                (self.fstring_prefix, self.fstring_quote)
            )

        self.fstring_prefix = prefix
        self.fstring_quote = quote

    def pop_fstring_quote(self) -> None:
        if self.fstring_prefix is None:
            assert self.fstring_quote is None
            raise Underflow

        self.fstring_prefix, self.fstring_quote = (
            (None, None)
            if len(self.fstring_prefix_quote_stack) == 0
            else self.fstring_prefix_quote_stack.pop()
        )
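
    # The prefix/quote stack above exists because f-strings nest: while
    # tokenizing f"outer {f'inner'} done", the outer ('f', '"') pair is
    # pushed aside when the inner f'...' string starts, and restored by
    # pop_fstring_quote() once the inner string ends (illustrative note,
    # not executed).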

    def newline(self) -> Token:
        if self.is_in_bounds() and self.source[self.current_index] == "\r":
            self.advance()
        self.advance()
        token_type = (
            TokenType.nl
            if (
                self.weird_op_case_nl
                or self.bracket_level > 0
                or self.fstring_state.state == FStringState.in_fstring_expr
                or self.all_whitespace_on_this_line
            )
            else TokenType.newline
        )
        token = self.make_token(token_type)
        self.weird_op_case_nl = False
        return token

    def endmarker(self) -> Token:
        if self.bracket_level != 0:
            raise UnexpectedEOF

        if len(self.indent_stack) > 0:
            _ = self.indent_stack.pop()
            return self.make_token(TokenType.dedent)

        return self.make_token(TokenType.endmarker)

    def decimal(self) -> Token:
        digit_before_decimal = False
        if self.source[self.current_index].isdigit():
            digit_before_decimal = True
            self.advance()

        # TODO: this is too lax; 1__2 tokenizes successfully
        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or self.source[self.current_index] == "_"
        ):
            self.advance()

        if self.is_in_bounds() and self.source[self.current_index] == ".":
            self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or (
                self.source[self.current_index] == "_"
                and self.source[self.current_index - 1].isdigit()
            )
        ):
            self.advance()

        # Before advancing over the 'e', ensure that there has been at least
        # 1 digit before the 'e'
        if self.current_index + 1 < len(self.source) and (
            (digit_before_decimal or self.source[self.current_index - 1].isdigit())
            and (
                self.source[self.current_index] == "e"
                or self.source[self.current_index] == "E"
            )
            and (
                self.source[self.current_index + 1].isdigit()
                or (
                    self.current_index + 2 < len(self.source)
                    and (
                        self.source[self.current_index + 1] == "+"
                        or self.source[self.current_index + 1] == "-"
                    )
                    and self.source[self.current_index + 2].isdigit()
                )
            )
        ):
            self.advance()
            self.advance()
            # The optional third advance isn't necessary, as it'll be
            # consumed by the loop just below

        # TODO: this is too lax; 1__2 tokenizes successfully
        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or (
                (digit_before_decimal or self.source[self.current_index - 1].isdigit())
                and self.source[self.current_index] == "_"
            )
        ):
            self.advance()

        # Imaginary literals end in a `j`. But ensure there's at least
        # 1 digit before it
        if self.is_in_bounds() and (
            (digit_before_decimal or self.source[self.current_index - 1].isdigit())
            and (
                self.source[self.current_index] == "j"
                or self.source[self.current_index] == "J"
            )
        ):
            self.advance()

        # If all of this resulted in just a dot, return an operator
        if (
            self.current_index - self.prev_index == 1
            and self.source[self.current_index - 1] == "."
        ):
            # Ellipsis check
            if (
                self.current_index + 2 <= len(self.source)
                and self.source[self.current_index : self.current_index + 2] == ".."
            ):
                self.advance()
                self.advance()

            return self.make_token(TokenType.op)

        return self.make_token(TokenType.number)

    def binary(self) -> Token:
        # jump over `0b`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] == "0"
            or self.source[self.current_index] == "1"
            or self.source[self.current_index] == "_"
        ):
            self.advance()

        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

            while self.is_in_bounds() and (
                self.source[self.current_index] == "0"
                or self.source[self.current_index] == "1"
                or self.source[self.current_index] == "_"
            ):
                self.advance()

        return self.make_token(TokenType.number)

    def octal(self) -> Token:
        # jump over `0o`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] >= "0"
            and self.source[self.current_index] <= "7"
            or self.source[self.current_index] == "_"
        ):
            self.advance()

        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

            while self.is_in_bounds() and (
                self.source[self.current_index] >= "0"
                and self.source[self.current_index] <= "7"
                or self.source[self.current_index] == "_"
            ):
                self.advance()

        return self.make_token(TokenType.number)

    def hexadecimal(self) -> Token:
        # jump over `0x`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] in string.hexdigits
            or self.source[self.current_index] == "_"
        ):
            self.advance()

        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

            while self.is_in_bounds() and (
                self.source[self.current_index] in string.hexdigits
                or self.source[self.current_index] == "_"
            ):
                self.advance()

        return self.make_token(TokenType.number)
    def find_opening_quote(self) -> int:
        # Quotes should always be within 3 chars of the beginning of the string token
        for offset in range(3):
            char = self.source[self.current_index + offset]
            if char == '"' or char == "'":
                return self.current_index + offset

        raise AssertionError("Quote not found somehow")

    def string_prefix_and_quotes(self) -> tuple[str, str]:
        quote_index = self.find_opening_quote()
        prefix = self.source[self.current_index : quote_index]
        quote_char = self.source[quote_index]

        # Check for triple quotes
        quote = (
            self.source[quote_index : quote_index + 3]
            if (
                quote_index + 3 <= len(self.source)
                and self.source[quote_index + 1] == quote_char
                and self.source[quote_index + 2] == quote_char
            )
            else self.source[quote_index : quote_index + 1]
        )
        return prefix, quote
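
    # Illustrative results (not executed): at the start of `rb'''...` this
    # returns ("rb", "'''"); at `f"x"` it returns ("f", '"'); at a plain
    # '...' it returns ("", "'").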

    def fstring(self) -> Token:
        if self.fstring_state.state in (
            FStringState.not_fstring,
            FStringState.in_fstring_expr,
        ):
            prefix, quote = self.string_prefix_and_quotes()
            self.push_fstring_prefix_quote(prefix, quote)
            for _ in range(len(prefix)):
                self.advance()
            for _ in range(len(quote)):
                self.advance()
            self.fstring_state.enter_fstring()
            return self.make_token(TokenType.fstring_start)

        if self.fstring_state.state == FStringState.at_fstring_middle:
            assert self.fstring_quote is not None
            is_single_quote = len(self.fstring_quote) == 1
            start_index = self.current_index
            while self.is_in_bounds():
                char = self.source[self.current_index]
                # For single quotes, bail on newlines
                if char == "\n" and is_single_quote:
                    raise UnterminatedString

                # Handle escapes
                if char == "\\":
                    self.advance()
                    # Don't escape a `\{` or `\}` in f-strings,
                    # but DO escape `\N{` in f-strings (it starts a named
                    # unicode character), and DON'T escape `\N{` in raw
                    # f-strings.
                    assert self.fstring_prefix is not None
                    if (
                        "r" not in self.fstring_prefix.lower()
                        and self.current_index + 1 < len(self.source)
                        and self.peek() == "N"
                        and self.peek_next() == "{"
                    ):
                        self.advance()
                        self.advance()

                    if self.is_in_bounds() and not (
                        self.peek() == "{" or self.peek() == "}"
                    ):
                        self.advance_check_newline()

                    continue

                # Find opening / closing quote
                if char == "{":
                    # Guard the lookahead: a `{` as the very last character
                    # in the file would otherwise crash `peek_next()`.
                    if (
                        self.current_index + 1 < len(self.source)
                        and self.peek_next() == "{"
                    ):
                        self.advance()
                        self.advance()
                        continue
                    else:
                        self.fstring_state.consume_fstring_middle_for_lbrace()
                        # If fstring-middle is empty, skip it by returning
                        # the next step's token
                        if self.current_index == start_index:
                            return self.fstring()

                        return self.make_token(TokenType.fstring_middle)

                assert self.fstring_quote is not None
                if self.match(self.fstring_quote):
                    self.fstring_state.consume_fstring_middle_for_end()
                    # If fstring-middle is empty, skip it by returning
                    # the next step's token
                    if self.current_index == start_index:
                        return self.fstring()

                    return self.make_token(TokenType.fstring_middle)

                self.advance_check_newline()

            raise UnexpectedEOF

        if self.fstring_state.state == FStringState.at_fstring_lbrace:
            self.advance()
            self.bracket_level_stack.append(self.bracket_level)
            self.bracket_level = 0
            self.fstring_state.consume_lbrace()
            return self.make_token(TokenType.lbrace)

        if self.fstring_state.state == FStringState.at_fstring_end:
            assert self.fstring_quote is not None
            for _ in range(len(self.fstring_quote)):
                self.advance()
            self.pop_fstring_quote()
            self.fstring_state.leave_fstring()
            return self.make_token(TokenType.fstring_end)

        if self.fstring_state.state == FStringState.in_fstring_expr_modifier:
            start_index = self.current_index
            while self.is_in_bounds():
                char = self.source[self.current_index]
                assert self.fstring_quote is not None
                if (char == "\n" or char == "{") and len(self.fstring_quote) == 1:
                    if char == "{":
                        self.fstring_state.consume_fstring_middle_for_lbrace()
                    else:
                        # TODO: why?
                        self.fstring_state.state = FStringState.in_fstring_expr

                    # If fstring-middle is empty, skip it by returning
                    # the next step's token
                    if self.current_index == start_index:
                        return self.fstring()

                    return self.make_token(TokenType.fstring_middle)
                elif char == "}":
                    self.fstring_state.state = FStringState.in_fstring_expr
                    return self.make_token(TokenType.fstring_middle)

                self.advance_check_newline()

            raise UnexpectedEOF

        raise AssertionError("Unhandled f-string state")

    def string(self) -> Token:
        prefix, quote = self.string_prefix_and_quotes()
        if prefix and self.weird_op_case:
            self.advance()
            return self.make_token(tok_type=TokenType.op)

        for char in prefix:
            if char == "f" or char == "F":
                return self.fstring()

        for _ in range(len(prefix)):
            self.advance()
        for _ in range(len(quote)):
            self.advance()

        is_single_quote = len(quote) == 1

        while self.is_in_bounds():
            char = self.source[self.current_index]
            # For single quotes, bail on newlines
            if char == "\n" and is_single_quote:
                raise UnterminatedString

            # Handle escapes
            if char == "\\":
                self.advance()
                self.advance_check_newline()
                continue

            # Find closing quote
            if self.match(quote):
                for _ in range(len(quote)):
                    self.advance()
                return self.make_token(TokenType.string)

            self.advance_check_newline()

        raise UnexpectedEOF

    def indent(self) -> Token:
        start_index = self.current_index
        saw_whitespace = False
        saw_tab_or_space = False
        while self.is_in_bounds():
            char = self.source[self.current_index]
            if self.is_whitespace():
                self.advance()
                saw_whitespace = True
                if char == " " or char == "\t":
                    saw_tab_or_space = True
            else:
                break

        if not self.is_in_bounds():
            # File ends with no whitespace after the newline; not an indent
            if self.current_index == start_index:
                raise NotAnIndent
            # Reached the end of the file; don't return an indent
            return self.make_token(TokenType.whitespace)

        # If the line is preceded by just linefeeds/CR/etc.,
        # treat it as whitespace.
        if saw_whitespace and not saw_tab_or_space:
            self.weird_whitespace_case = True
            return self.make_token(TokenType.whitespace)

        # For lines that are just leading whitespace and a backslash or a
        # comment, don't return indents
        next_char = self.peek()
        if next_char == "#" or next_char == "\\" or self.is_newline():
            return self.make_token(TokenType.whitespace)

        new_indent = self.source[start_index : self.current_index]
        current_indent = "" if len(self.indent_stack) == 0 else self.indent_stack[-1]

        if len(new_indent) == len(current_indent):
            if len(new_indent) == 0:
                raise NotAnIndent

            if new_indent != current_indent:
                raise InconsistentUseOfTabsAndSpaces
            return self.make_token(TokenType.whitespace)
        elif len(new_indent) > len(current_indent):
            if len(current_indent) > 0 and current_indent not in new_indent:
                raise InconsistentUseOfTabsAndSpaces
            self.indent_stack.append(new_indent)
            return self.make_token(TokenType.indent)
        else:
            while len(self.indent_stack) > 0:
                top_indent = self.indent_stack[-1]
                if len(top_indent) < len(new_indent):
                    raise DedentDoesNotMatchAnyOuterIndent

                if len(top_indent) == len(new_indent):
                    break

                _ = self.indent_stack.pop()
                self.dedent_counter += 1

            # Let the dedent counter emit the dedents; they must be
            # zero-length tokens
            return self.make_token(TokenType.whitespace)
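
    # Illustrative indent-stack evolution (dots stand for spaces):
    #
    #     if a:              stack: []
    #     ....if b:          stack: ["...."]              -> INDENT
    #     ........pass       stack: ["....", "........"]  -> INDENT
    #     pass               stack: []                    -> 2 DEDENTs, emitted
    #                                                        via dedent_counter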

    def is_whitespace(self) -> bool:
        if self.is_newline():
            return False

        char = self.source[self.current_index]
        return (
            char == " "
            or char == "\r"
            or char == "\t"
            or char == "\x0b"
            or char == "\x0c"
        )

    def is_newline(self) -> bool:
        if self.source[self.current_index] == "\n":
            return True

        if (
            self.source[self.current_index] == "\r"
            and self.current_index + 1 < len(self.source)
            and self.source[self.current_index + 1] == "\n"
        ):
            return True

        return False

    def name(self) -> Token:
        if self.weird_op_case:
            self.advance()
            return self.make_token(TokenType.identifier)

        # According to PEP 3131, any non-ASCII character is valid in a NAME
        # token. But if we see any non-identifier ASCII character, we should
        # stop.
        remaining = self.source[self.current_index :]
        for index, char in enumerate(remaining):
            if ord(char) < 128 and not str.isalnum(char) and char != "_":
                length = index
                break
        else:
            length = len(remaining)

        self.advance_by(length)
        return self.make_token(TokenType.identifier)

    def __iter__(self) -> TokenIterator:
        return self

    def __next__(self) -> Token:
        if self.prev_token is not None and self.prev_token.type == TokenType.endmarker:
            raise StopIteration

        # EOF checks
        if self.current_index == len(self.source):
            if self.prev_token is None:
                return self.endmarker()

            if self.prev_token.type in {
                TokenType.newline,
                TokenType.nl,
                TokenType.dedent,
            }:
                return self.endmarker()
            else:
                return self.newline()

        if self.current_index > len(self.source):
            return self.endmarker()

        # f-string check
        if (
            self.fstring_state.state != FStringState.not_fstring
            and self.fstring_state.state != FStringState.in_fstring_expr
        ):
            return self.fstring()

        current_char = self.source[self.current_index]

        # A bare \r, in certain cases, gets merged with the next char.
        # It's probably a bug: https://github.com/python/cpython/issues/128233
        # 'issue_128233_handling=True' works around this bug, but if it's
        # False then we produce identical tokens to CPython.
        if not self.issue_128233_handling and current_char == "\r":
            self.advance()
            if not self.is_in_bounds():
                return self.newline()

            current_char = self.source[self.current_index]
            if current_char != "\n":
                self.weird_op_case = True
                if (
                    self.prev_token is not None
                    and self.prev_token.type == TokenType.comment
                ):
                    self.weird_op_case_nl = True

        # Comment check
        if current_char == "#":
            if self.weird_op_case:
                self.advance()
                return self.make_token(TokenType.comment)

            while self.is_in_bounds() and not self.is_newline():
                if (
                    not self.issue_128233_handling
                    and self.source[self.current_index] == "\r"
                ):
                    break
                self.advance()
            return self.make_token(TokenType.comment)

        # Empty the dedent counter
        if self.dedent_counter > 0:
            self.dedent_counter -= 1
            return self.make_token(TokenType.dedent)

        # Newline check
        if self.is_newline():
            return self.newline()

        # \<newline> check
        if current_char == "\\":
            self.advance()
            if not self.is_in_bounds():
                raise UnexpectedEOF

            # Consume all whitespace on this line and the next.
            found_whitespace = False
            seen_newline = False
            while self.is_in_bounds():
                if self.is_whitespace():
                    self.advance()
                    found_whitespace = True
                elif not seen_newline and self.is_newline():
                    char = self.source[self.current_index]
                    if char == "\r":
                        self.advance()
                    self.advance()
                    found_whitespace = True
                    seen_newline = True
                    # Move to the next line without creating a newline token.
                    # But if the previous line was all whitespace, whitespace
                    # on the next line is still valid indentation; avoid
                    # consuming it here.
                    if self.all_whitespace_on_this_line:
                        self.next_line()
                        break
                    else:
                        self.next_line()
                        # Preserve this boolean; we're on the same line
                        # semantically
                        self.all_whitespace_on_this_line = False
                else:
                    break

            if not found_whitespace:
                raise UnexpectedCharacterAfterBackslash

            return self.make_token(TokenType.whitespace)

        # Indent / dedent checks
        if (
            (self.byte_offset == 0 or self.weird_whitespace_case)
            and self.bracket_level == 0
            and self.fstring_state.state == FStringState.not_fstring
        ):
            self.weird_whitespace_case = False
            try:
                indent_token = self.indent()
            except NotAnIndent:
                indent_token = None

            if indent_token is not None:
                return indent_token

        if self.is_whitespace():
            while self.is_in_bounds() and self.is_whitespace():
                self.advance()
            return self.make_token(TokenType.whitespace)

        if current_char in ("+", "&", "|", "^", "@", "%", "=", "!", "~"):
            self.advance()
            # Don't peek past the end of the file (e.g. a trailing "+")
            if self.is_in_bounds() and self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "<":
            self.advance()
            if self.is_in_bounds() and self.peek() == ">":
                # Barry as FLUFL easter egg
                self.advance()
                return self.make_token(TokenType.op)

            if self.is_in_bounds() and self.peek() == "<":
                self.advance()
            if self.is_in_bounds() and self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == ">":
            self.advance()
            if self.is_in_bounds() and self.peek() == ">":
                self.advance()
            if self.is_in_bounds() and self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "/":
            self.advance()
            if self.is_in_bounds() and self.peek() == "/":
                self.advance()
            if self.is_in_bounds() and self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "*":
            self.advance()
            if self.is_in_bounds() and self.peek() == "*":
                self.advance()
            if self.is_in_bounds() and self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "-":
            self.advance()
            # -> operator
            if self.is_in_bounds() and self.peek() == ">":
                self.advance()
                return self.make_token(TokenType.op)

            # -= operator
            if self.is_in_bounds() and self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char in (",", ";"):
            self.advance()
            return self.make_token(TokenType.op)

        # The backtick is not used in Python 3, but it's still tokenized,
        # for backwards compatibility I guess.
        if current_char == "`":
            self.advance()
            return self.make_token(TokenType.op)

        if current_char == "(":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lparen)

        if current_char == ")":
            self.advance()
            self.bracket_level -= 1
            if self.bracket_level < 0:
                self.bracket_level = 0
            return self.make_token(TokenType.rparen)

        if current_char == "[":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lbracket)

        if current_char == "]":
            self.advance()
            self.bracket_level -= 1
            if self.bracket_level < 0:
                self.bracket_level = 0
            return self.make_token(TokenType.rbracket)

        if current_char == "{":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lbrace)

        if current_char == "}":
            self.advance()
            if (
                self.bracket_level == 0
                and self.fstring_state.state == FStringState.in_fstring_expr
            ):
                self.fstring_state.consume_rbrace()
                self.bracket_level = self.bracket_level_stack.pop()
            else:
                self.bracket_level -= 1
                if self.bracket_level < 0:
                    self.bracket_level = 0

            return self.make_token(TokenType.rbrace)

        if current_char == ":":
            self.advance()
            if (
                self.bracket_level == 0
                and self.fstring_state.state == FStringState.in_fstring_expr
            ):
                self.fstring_state.state = FStringState.in_fstring_expr_modifier
                return self.make_token(TokenType.op)
            else:
                if self.is_in_bounds() and self.peek() == "=":
                    self.advance()
                return self.make_token(TokenType.op)

        if current_char in ".0123456789":
            if self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0b", "0B"):
                return self.binary()
            elif self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0o", "0O"):
                return self.octal()
            elif self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0x", "0X"):
                return self.hexadecimal()
            else:
                return self.decimal()

        if (
            (self.current_index + 1 <= len(self.source) and self.match('"', "'"))
            or (
                self.current_index + 2 <= len(self.source)
                and self.match(
                    'b"',
                    "b'",
                    'r"',
                    "r'",
                    'f"',
                    "f'",
                    'u"',
                    "u'",
                    ignore_case=True,
                )
            )
            or (
                self.current_index + 3 <= len(self.source)
                and self.match(
                    'br"',
                    "br'",
                    'rb"',
                    "rb'",
                    'fr"',
                    "fr'",
                    'rf"',
                    "rf'",
                    ignore_case=True,
                )
            )
        ):
            return self.string()

        return self.name()


def tokenize(
    source: str,
    *,
    fstring_tokens: bool = True,
    issue_128233_handling: bool = True,
) -> Iterator[Token]:
    token_iterator = TokenIterator(source, issue_128233_handling=issue_128233_handling)
    if fstring_tokens:
        return iter(token_iterator)

    return merge_fstring_tokens(token_iterator)


def merge_fstring_tokens(token_iterator: TokenIterator) -> Iterator[Token]:
    """Turn post-Python-3.12 FSTRING-* tokens back to a single STRING token."""
    for token in token_iterator:
        if token.type != TokenType.fstring_start:
            yield token
            continue

        start_token = token
        end_token = token

        fstring_starts = 1
        fstring_ends = 0
        for token in token_iterator:
            if token.type == TokenType.fstring_start:
                fstring_starts += 1
            if token.type == TokenType.fstring_end:
                fstring_ends += 1

            if fstring_starts == fstring_ends:
                end_token = token
                break

        yield Token(
            type=TokenType.string,
            start_index=start_token.start_index,
            start_line=start_token.start_line,
            start_col=start_token.start_col,
            end_index=end_token.end_index,
            end_line=end_token.end_line,
            end_col=end_token.end_col,
        )
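

# A minimal usage sketch (illustrative; running this module directly is an
# assumption of this demo, not part of the library's API):
if __name__ == "__main__":
    code = 'x = f"{1 + 2}"\n'
    for tok in tokenize(code):
        print(tok.type.to_python_token(), repr(tok.to_byte_slice(code)))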