1"""pytokens - A Fast, spec compliant Python 3.12+ tokenizer that runs on older Pythons."""

from __future__ import annotations

from dataclasses import dataclass, field
import enum
import string
from typing import Iterator, NewType


class TokenizeError(Exception): ...


class IndentationError(TokenizeError): ...


class InconsistentUseOfTabsAndSpaces(IndentationError): ...


class DedentDoesNotMatchAnyOuterIndent(IndentationError): ...


class UnterminatedString(TokenizeError): ...


class UnexpectedEOF(TokenizeError): ...


class UnexpectedCharacterAfterBackslash(TokenizeError): ...


class NotAnIndent(AssertionError): ...


class Underflow(AssertionError): ...


class TokenType(enum.IntEnum):
    whitespace = 1
    indent = 2
    dedent = 3
    newline = 4  # semantically meaningful newline
    nl = 5  # non meaningful newline
    comment = 6

    _op_start = 7  # marker used to check if a token is an operator
    semicolon = 8
    lparen = 9
    rparen = 10
    lbracket = 11
    rbracket = 12
    lbrace = 13
    rbrace = 14
    colon = 15
    op = 16
    _op_end = 17  # marker used to check if a token is an operator

    identifier = 18
    number = 19
    string = 20
    fstring_start = 21
    fstring_middle = 22
    fstring_end = 23

    tstring_start = 24
    tstring_middle = 25
    tstring_end = 26

    endmarker = 27

    errortoken = 28

    def __repr__(self) -> str:
        return f"TokenType.{self.name}"

    def to_python_token(self) -> str:
        if self.name == "identifier":
            return "NAME"

        if self.is_operator():
            return "OP"

        return self.name.upper()

    def is_operator(self) -> bool:
        return TokenType._op_start < self < TokenType._op_end
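

# Illustrative only (not part of the original source): since TokenType is an
# IntEnum, is_operator() is a simple range check between the _op_start and
# _op_end markers, e.g.:
#
#     TokenType.lparen.is_operator()          # True  (7 < 9 < 17)
#     TokenType.identifier.is_operator()      # False (18 > 17)
#     TokenType.identifier.to_python_token()  # "NAME"
#     TokenType.lbrace.to_python_token()      # "OP"
#     TokenType.number.to_python_token()      # "NUMBER"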


@dataclass
class Token:
    type: TokenType
    # Byte offsets in the file
    start_index: int
    end_index: int
    start_line: int
    # 0-indexed offset from start of line
    start_col: int
    end_line: int
    end_col: int

    def to_byte_slice(self, source: str) -> str:
        # The newline at the end of the file may not exist in the file
        if (
            (self.type == TokenType.newline or self.type == TokenType.nl)
            and self.start_index == len(source)
            and self.end_index == len(source) + 1
        ):
            return ""

        # Dedents at the end of the file also may not exist in the file
        if (
            self.type == TokenType.dedent
            and self.start_index == len(source) + 1
            and self.end_index == len(source) + 1
        ):
            return ""

        # Endmarkers are out of bounds too
        if self.type == TokenType.endmarker:
            return ""

        return source[self.start_index : self.end_index]
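

# Illustrative only (not part of the original source): for source = "x = 1\n",
# the first token is the identifier spanning bytes 0..1, so
#
#     Token(TokenType.identifier, 0, 1, 1, 0, 1, 1).to_byte_slice("x = 1\n")
#
# returns "x". The synthetic out-of-bounds tokens (a trailing NEWLINE/NL past
# EOF, DEDENTs past EOF, and the ENDMARKER) return "" instead of raising.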


class FStringState:
    State = NewType("State", int)

    not_fstring = State(1)
    at_fstring_middle = State(2)
    at_fstring_lbrace = State(3)
    in_fstring_expr = State(4)
    in_fstring_expr_modifier = State(5)
    at_fstring_end = State(6)

    def __init__(self) -> None:
        self.state = FStringState.not_fstring
        self.stack: list[FStringState.State] = []

    def enter_fstring(self) -> None:
        self.stack.append(self.state)
        self.state = FStringState.at_fstring_middle

    def leave_fstring(self) -> None:
        assert self.state == FStringState.at_fstring_end
        self.state = self.stack.pop()

    def consume_fstring_middle_for_lbrace(self) -> None:
        if self.state == FStringState.in_fstring_expr_modifier:
            self.stack.append(self.state)

        self.state = FStringState.at_fstring_lbrace

    def consume_fstring_middle_for_end(self) -> None:
        self.state = FStringState.at_fstring_end

    def consume_lbrace(self) -> None:
        self.state = FStringState.in_fstring_expr

    def consume_rbrace(self) -> None:
        assert (
            self.state == FStringState.in_fstring_expr
            or self.state == FStringState.in_fstring_expr_modifier
        )

        if (
            len(self.stack) > 0
            and self.stack[-1] == FStringState.in_fstring_expr_modifier
        ):
            self.state = self.stack.pop()
        else:
            self.state = FStringState.at_fstring_middle

    def consume_colon(self) -> None:
        assert self.state == FStringState.in_fstring_expr
        self.state = FStringState.in_fstring_expr_modifier
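

# Illustrative walkthrough (not part of the original source): tokenizing
# f"a{x}b" drives FStringState through these transitions:
#
#     enter_fstring()                      not_fstring        -> at_fstring_middle
#     consume_fstring_middle_for_lbrace()  at_fstring_middle  -> at_fstring_lbrace
#     consume_lbrace()                     at_fstring_lbrace  -> in_fstring_expr
#     consume_rbrace()                     in_fstring_expr    -> at_fstring_middle
#     consume_fstring_middle_for_end()     at_fstring_middle  -> at_fstring_end
#     leave_fstring()                      at_fstring_end     -> not_fstring
#
# The stack exists because f-strings nest (inside expressions and inside
# format-spec modifiers), so leave_fstring() restores whatever state the
# enclosing string was in.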


@dataclass
class TokenIterator:
    source: str
    issue_128233_handling: bool

    current_index: int = 0
    prev_index: int = 0
    line_number: int = 1
    prev_line_number: int = 1
    byte_offset: int = 0
    prev_byte_offset: int = 0
    all_whitespace_on_this_line: bool = True

    bracket_level: int = 0
    bracket_level_stack: list[int] = field(default_factory=list)
    prev_token: Token | None = None

    indent_stack: list[str] = field(default_factory=list)
    dedent_counter: int = 0

    # f-string state
    fstring_state: FStringState = field(default_factory=FStringState)
    fstring_prefix_quote_stack: list[tuple[str, str]] = field(default_factory=list)
    fstring_prefix: str | None = None
    fstring_quote: str | None = None

    # CPython has a weird bug where every time a bare \r is present,
    # the next token becomes an OP, regardless of what it is.
    weird_op_case: bool = False
    weird_op_case_nl: bool = False

    weird_whitespace_case: bool = False

    def is_in_bounds(self) -> bool:
        return self.current_index < len(self.source)

    def peek(self) -> str:
        assert self.is_in_bounds()
        return self.source[self.current_index]

    def peek_next(self) -> str:
        assert self.current_index + 1 < len(self.source)
        return self.source[self.current_index + 1]

    def advance(self) -> None:
        self.current_index += 1
        self.byte_offset += 1

    def advance_by(self, count: int) -> None:
        self.current_index += count
        self.byte_offset += count

    def next_line(self) -> None:
        self.line_number += 1
        self.byte_offset = 0
        self.all_whitespace_on_this_line = True

    def advance_check_newline(self) -> None:
        if self.source[self.current_index] == "\n":
            self.current_index += 1
            self.next_line()
        else:
            self.advance()

    def match(self, *options: str, ignore_case: bool = False) -> bool:
        for option in options:
            if self.current_index + len(option) > len(self.source):
                continue
            snippet = self.source[self.current_index : self.current_index + len(option)]
            if ignore_case:
                option = option.lower()
                snippet = snippet.lower()

            if option == snippet:
                return True

        return False

    def make_token(self, tok_type: TokenType) -> Token:
        if self.fstring_prefix is not None and "t" in self.fstring_prefix:
            if tok_type == TokenType.fstring_start:
                tok_type = TokenType.tstring_start
            elif tok_type == TokenType.fstring_middle:
                tok_type = TokenType.tstring_middle
            elif tok_type == TokenType.fstring_end:
                tok_type = TokenType.tstring_end

        token_type = (
            TokenType.op
            if self.weird_op_case
            and not tok_type.is_operator()
            and tok_type not in (TokenType.number, TokenType.string)
            else tok_type
        )
        if self.weird_op_case:
            # And we have another weird case INSIDE the weird case.
            # For some reason, when CPython accidentally captures a space
            # as the next character, i.e. when the token is '\r ',
            # it DOESN'T see it as whitespace, so in that specific case
            # we shouldn't set all_whitespace_on_this_line.
            # I think this is because CPython never expected to have a
            # ' ' token anyway, so it doesn't classify it as whitespace.
            # So it becomes non-whitespace.
            # Removing this if stmt breaks test 1001 right now.
            token_str = self.source[self.prev_index : self.current_index]
            if token_str == "\r ":
                self.all_whitespace_on_this_line = False
            self.weird_op_case = False

        token = Token(
            type=token_type,
            start_index=self.prev_index,
            end_index=self.current_index,
            start_line=self.prev_line_number,
            start_col=self.prev_byte_offset,
            end_line=self.line_number,
            end_col=self.byte_offset,
        )
        if tok_type == TokenType.newline or tok_type == TokenType.nl:
            self.next_line()
        elif tok_type == TokenType.whitespace or tok_type == TokenType.comment:
            pass
        else:
            self.all_whitespace_on_this_line = False

        self.prev_token = token
        self.prev_index = self.current_index
        self.prev_line_number = self.line_number
        self.prev_byte_offset = self.byte_offset
        self.weird_op_case = False

        return token

    def push_fstring_prefix_quote(self, prefix: str, quote: str) -> None:
        if self.fstring_prefix is not None:
            assert self.fstring_quote is not None
            self.fstring_prefix_quote_stack.append(
                (self.fstring_prefix, self.fstring_quote)
            )

        self.fstring_prefix = prefix
        self.fstring_quote = quote

    def pop_fstring_quote(self) -> None:
        if self.fstring_prefix is None:
            assert self.fstring_quote is None
            raise Underflow

        self.fstring_prefix, self.fstring_quote = (
            (None, None)
            if len(self.fstring_prefix_quote_stack) == 0
            else self.fstring_prefix_quote_stack.pop()
        )

    def newline(self) -> Token:
        if self.is_in_bounds() and self.source[self.current_index] == "\r":
            self.advance()
        self.advance()
        token_type = (
            TokenType.nl
            if (
                self.weird_op_case_nl
                or self.bracket_level > 0
                or self.fstring_state.state == FStringState.in_fstring_expr
                or self.all_whitespace_on_this_line
            )
            else TokenType.newline
        )
        token = self.make_token(token_type)
        self.weird_op_case_nl = False
        return token

    def endmarker(self) -> Token:
        if self.bracket_level != 0:
            raise UnexpectedEOF

        if len(self.indent_stack) > 0:
            _ = self.indent_stack.pop()
            return self.make_token(TokenType.dedent)

        return self.make_token(TokenType.endmarker)

    def decimal(self) -> Token:
        digit_before_decimal = False
        if self.source[self.current_index].isdigit():
            digit_before_decimal = True
            self.advance()

        # TODO: this is too lax; 1__2 tokenizes successfully
        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or self.source[self.current_index] == "_"
        ):
            self.advance()

        if self.is_in_bounds() and self.source[self.current_index] == ".":
            self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or (
                self.source[self.current_index] == "_"
                and self.source[self.current_index - 1].isdigit()
            )
        ):
            self.advance()
        # Before advancing over the 'e', ensure that there has been at least 1 digit before it
        if self.current_index + 1 < len(self.source) and (
            (digit_before_decimal or self.source[self.current_index - 1].isdigit())
            and (
                self.source[self.current_index] == "e"
                or self.source[self.current_index] == "E"
            )
            and (
                self.source[self.current_index + 1].isdigit()
                or (
                    self.current_index + 2 < len(self.source)
                    and (
                        self.source[self.current_index + 1] == "+"
                        or self.source[self.current_index + 1] == "-"
                    )
                    and self.source[self.current_index + 2].isdigit()
                )
            )
        ):
            self.advance()
            self.advance()
            # An optional third advance isn't necessary, as it'll get advanced just below

        # TODO: this is too lax; 1__2 tokenizes successfully
        while self.is_in_bounds() and (
            self.source[self.current_index].isdigit()
            or (
                (digit_before_decimal or self.source[self.current_index - 1].isdigit())
                and self.source[self.current_index] == "_"
            )
        ):
            self.advance()

        # Complex numbers end in a `j`. But ensure at least 1 digit before it
        if self.is_in_bounds() and (
            (digit_before_decimal or self.source[self.current_index - 1].isdigit())
            and (
                self.source[self.current_index] == "j"
                or self.source[self.current_index] == "J"
            )
        ):
            self.advance()
        # If all of this resulted in just a dot, return an operator
        if (
            self.current_index - self.prev_index == 1
            and self.source[self.current_index - 1] == "."
        ):
            # Ellipsis check
            if (
                self.current_index + 2 <= len(self.source)
                and self.source[self.current_index : self.current_index + 2] == ".."
            ):
                self.advance()
                self.advance()

            return self.make_token(TokenType.op)

        return self.make_token(TokenType.number)
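
    # Illustrative only (not part of the original source): decimal() accepts
    # "123", "1_000", "3.14", "1e10", "1.5e-3" and "2j" as single NUMBER
    # tokens; as the TODOs above note, it is too lax, so malformed spellings
    # like "1__2" also tokenize. A lone "." falls through all the digit loops
    # and is returned as an OP token ("..." becomes a single ellipsis OP).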

    def binary(self) -> Token:
        # jump over `0b`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] == "0"
            or self.source[self.current_index] == "1"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index] == "0"
            or self.source[self.current_index] == "1"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        return self.make_token(TokenType.number)

    def octal(self) -> Token:
        # jump over `0o`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] >= "0"
            and self.source[self.current_index] <= "7"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index] >= "0"
            and self.source[self.current_index] <= "7"
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        return self.make_token(TokenType.number)

    def hexadecimal(self) -> Token:
        # jump over `0x`
        self.advance()
        self.advance()
        while self.is_in_bounds() and (
            self.source[self.current_index] in string.hexdigits
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        if self.is_in_bounds() and (
            self.source[self.current_index] == "e"
            or self.source[self.current_index] == "E"
        ):
            self.advance()
            if self.is_in_bounds() and self.source[self.current_index] == "-":
                self.advance()

        while self.is_in_bounds() and (
            self.source[self.current_index] in string.hexdigits
            or self.source[self.current_index] == "_"
        ):
            self.advance()
        return self.make_token(TokenType.number)

    def find_opening_quote(self) -> int:
        # Quotes should always be within 3 chars of the beginning of the string token
        for offset in range(3):
            char = self.source[self.current_index + offset]
            if char == '"' or char == "'":
                return self.current_index + offset

        raise AssertionError("Quote not found somehow")

    def string_prefix_and_quotes(self) -> tuple[str, str]:
        quote_index = self.find_opening_quote()
        prefix = self.source[self.current_index : quote_index]
        quote_char = self.source[quote_index]

        # Check for triple quotes
        quote = (
            self.source[quote_index : quote_index + 3]
            if (
                quote_index + 3 <= len(self.source)
                and self.source[quote_index + 1] == quote_char
                and self.source[quote_index + 2] == quote_char
            )
            else self.source[quote_index : quote_index + 1]
        )
        return prefix, quote
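
    # Illustrative only (not part of the original source): with current_index
    # at the start of the literal,
    #
    #     'hello'      -> prefix "",   quote "'"
    #     rb'...'      -> prefix "rb", quote "'"
    #     f"""text"""  -> prefix "f",  quote '"""'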

    def fstring(self) -> Token:
        if self.fstring_state.state in (
            FStringState.not_fstring,
            FStringState.in_fstring_expr,
        ):
            prefix, quote = self.string_prefix_and_quotes()

            self.push_fstring_prefix_quote(prefix, quote)
            for _ in range(len(prefix)):
                self.advance()
            for _ in range(len(quote)):
                self.advance()
            self.fstring_state.enter_fstring()
            return self.make_token(TokenType.fstring_start)

        if self.fstring_state.state == FStringState.at_fstring_middle:
            assert self.fstring_quote is not None
            is_single_quote = len(self.fstring_quote) == 1
            start_index = self.current_index
            while self.is_in_bounds():
                char = self.source[self.current_index]
                # For single quotes, bail on newlines
                if char == "\n" and is_single_quote:
                    raise UnterminatedString

                # Handle escapes
                if char == "\\":
                    self.advance()
                    # But don't escape a `\{` or `\}` in f-strings,
                    # DO escape `\N{` in f-strings (that's for unicode characters),
                    # but DON'T escape `\N{` in raw f-strings.
                    assert self.fstring_prefix is not None
                    if (
                        "r" not in self.fstring_prefix.lower()
                        and self.current_index + 1 < len(self.source)
                        and self.peek() == "N"
                        and self.peek_next() == "{"
                    ):
                        self.advance()
                        self.advance()

                    if self.is_in_bounds() and not (
                        self.peek() == "{" or self.peek() == "}"
                    ):
                        self.advance_check_newline()

                    continue

                # Find opening / closing quote
                if char == "{":
                    if self.peek_next() == "{":
                        self.advance()
                        self.advance()
                        continue
                    else:
                        self.fstring_state.consume_fstring_middle_for_lbrace()
                        # If the fstring-middle is empty, skip it by returning the next step's token
                        if self.current_index == start_index:
                            return self.fstring()

                        return self.make_token(TokenType.fstring_middle)

                assert self.fstring_quote is not None
                if self.match(self.fstring_quote):
                    self.fstring_state.consume_fstring_middle_for_end()
                    # If the fstring-middle is empty, skip it by returning the next step's token
                    if self.current_index == start_index:
                        return self.fstring()

                    return self.make_token(TokenType.fstring_middle)

                self.advance_check_newline()

            raise UnexpectedEOF

        if self.fstring_state.state == FStringState.at_fstring_lbrace:
            self.advance()
            self.bracket_level_stack.append(self.bracket_level)
            self.bracket_level = 0
            self.fstring_state.consume_lbrace()
            return self.make_token(TokenType.lbrace)

        if self.fstring_state.state == FStringState.at_fstring_end:
            assert self.fstring_quote is not None
            for _ in range(len(self.fstring_quote)):
                self.advance()
            token = self.make_token(TokenType.fstring_end)
            self.pop_fstring_quote()
            self.fstring_state.leave_fstring()
            return token

        if self.fstring_state.state == FStringState.in_fstring_expr_modifier:
            start_index = self.current_index
            while self.is_in_bounds():
                char = self.source[self.current_index]
                assert self.fstring_quote is not None
                if (char == "\n" or char == "{") and len(self.fstring_quote) == 1:
                    if char == "{":
                        self.fstring_state.consume_fstring_middle_for_lbrace()
                    else:
                        # TODO: why?
                        self.fstring_state.state = FStringState.in_fstring_expr

                    # If the fstring-middle is empty, skip it by returning the next step's token
                    if self.current_index == start_index:
                        return self.fstring()

                    return self.make_token(TokenType.fstring_middle)
                elif char == "}":
                    self.fstring_state.state = FStringState.in_fstring_expr
                    return self.make_token(TokenType.fstring_middle)

                self.advance_check_newline()

            raise UnexpectedEOF

        raise AssertionError("Unhandled f-string state")

    def string(self) -> Token:
        prefix, quote = self.string_prefix_and_quotes()
        if prefix and self.weird_op_case:
            self.advance()
            return self.make_token(tok_type=TokenType.op)

        for char in prefix:
            if char in ("f", "F", "t", "T"):
                return self.fstring()

        for _ in range(len(prefix)):
            self.advance()
        for _ in range(len(quote)):
            self.advance()

        is_single_quote = len(quote) == 1

        while self.is_in_bounds():
            char = self.source[self.current_index]
            # For single quotes, bail on newlines
            if char == "\n" and is_single_quote:
                raise UnterminatedString

            # Handle escapes
            if char == "\\":
                self.advance()
                self.advance_check_newline()
                continue

            # Find closing quote
            if self.match(quote):
                for _ in range(len(quote)):
                    self.advance()
                return self.make_token(TokenType.string)

            self.advance_check_newline()

        raise UnexpectedEOF

    def indent(self) -> Token:
        start_index = self.current_index
        saw_whitespace = False
        saw_tab_or_space = False
        while self.is_in_bounds():
            char = self.source[self.current_index]
            if self.is_whitespace():
                self.advance()
                saw_whitespace = True
                if char == " " or char == "\t":
                    saw_tab_or_space = True
            else:
                break

        if not self.is_in_bounds():
            # File ends with no whitespace after newline, don't return an indent
            if self.current_index == start_index:
                raise NotAnIndent
            # If we reached the end of the file, don't return an indent
            return self.make_token(TokenType.whitespace)

        # If the line is preceded by just linefeeds/CR/etc.,
        # treat it as whitespace.
        if saw_whitespace and not saw_tab_or_space:
            self.weird_whitespace_case = True
            return self.make_token(TokenType.whitespace)

        # For lines that are just leading whitespace and a backslash or a comment,
        # don't return indents
        next_char = self.peek()
        if next_char == "#" or next_char == "\\" or self.is_newline():
            return self.make_token(TokenType.whitespace)

        new_indent = self.source[start_index : self.current_index]
        current_indent = "" if len(self.indent_stack) == 0 else self.indent_stack[-1]

        if len(new_indent) == len(current_indent):
            if len(new_indent) == 0:
                raise NotAnIndent

            if new_indent != current_indent:
                raise InconsistentUseOfTabsAndSpaces
            return self.make_token(TokenType.whitespace)
        elif len(new_indent) > len(current_indent):
            if len(current_indent) > 0 and current_indent not in new_indent:
                raise InconsistentUseOfTabsAndSpaces
            self.indent_stack.append(new_indent)
            return self.make_token(TokenType.indent)
        else:
            while len(self.indent_stack) > 0:
                top_indent = self.indent_stack[-1]
                if len(top_indent) < len(new_indent):
                    raise DedentDoesNotMatchAnyOuterIndent

                if len(top_indent) == len(new_indent):
                    break

                _ = self.indent_stack.pop()
                self.dedent_counter += 1

            # Let the dedent counter make the dedents. They must be length zero
            return self.make_token(TokenType.whitespace)

    def is_whitespace(self) -> bool:
        if self.is_newline():
            return False

        char = self.source[self.current_index]
        return (
            char == " "
            or char == "\r"
            or char == "\t"
            or char == "\x0b"
            or char == "\x0c"
        )

    def is_newline(self) -> bool:
        if self.source[self.current_index] == "\n":
            return True
        if (
            self.source[self.current_index] == "\r"
            and self.current_index + 1 < len(self.source)
            and self.source[self.current_index + 1] == "\n"
        ):
            return True

        return False

    def name(self) -> Token:
        if self.weird_op_case:
            self.advance()
            return self.make_token(TokenType.identifier)

        # According to PEP 3131, any non-ASCII character is valid in a NAME token.
        # But if we see any non-identifier ASCII character, we should stop.
        remaining = self.source[self.current_index :]
        for index, char in enumerate(remaining):
            if ord(char) < 128 and not str.isalnum(char) and char != "_":
                length = index
                break
        else:
            length = len(remaining)

        self.advance_by(length)
        return self.make_token(TokenType.identifier)
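
    # Illustrative only (not part of the original source): this scan
    # deliberately over-accepts and defers validation to the parser. For the
    # input "café = 1" it stops at the first non-identifier ASCII character
    # (the space) and yields "café" as one NAME token; something like "λ²"
    # also comes back as a single NAME even though it isn't a valid Python
    # identifier.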

    def __iter__(self) -> TokenIterator:
        return self

    def __next__(self) -> Token:
        if self.prev_token is not None and self.prev_token.type == TokenType.endmarker:
            raise StopIteration

        # EOF checks
        if self.current_index == len(self.source):
            if self.prev_token is None:
                return self.endmarker()

            if self.prev_token.type in {
                TokenType.newline,
                TokenType.nl,
                TokenType.dedent,
            }:
                return self.endmarker()
            else:
                return self.newline()

        if self.current_index > len(self.source):
            return self.endmarker()

        # f-string check
        if (
            self.fstring_state.state != FStringState.not_fstring
            and self.fstring_state.state != FStringState.in_fstring_expr
        ):
            return self.fstring()

        current_char = self.source[self.current_index]

        # \r on its own, in certain cases, gets merged with the next char.
        # It's probably a bug: https://github.com/python/cpython/issues/128233
        # 'issue_128233_handling=True' works around this bug, but if it's False
        # then we produce identical tokens to CPython.
        if not self.issue_128233_handling and current_char == "\r":
            self.advance()
            if not self.is_in_bounds():
                return self.newline()

            current_char = self.source[self.current_index]
            if current_char != "\n":
                self.weird_op_case = True
                if (
                    self.prev_token is not None
                    and self.prev_token.type == TokenType.comment
                ):
                    self.weird_op_case_nl = True

        # Comment check
        if current_char == "#":
            if self.weird_op_case:
                self.advance()
                return self.make_token(TokenType.comment)

            while self.is_in_bounds() and not self.is_newline():
                if (
                    not self.issue_128233_handling
                    and self.source[self.current_index] == "\r"
                ):
                    break
                self.advance()
            return self.make_token(TokenType.comment)

        # Empty the dedent counter
        if self.dedent_counter > 0:
            self.dedent_counter -= 1
            return self.make_token(TokenType.dedent)

        # Newline check
        if self.is_newline():
            return self.newline()

        # \<newline> check
        if current_char == "\\":
            self.advance()
            if not self.is_in_bounds():
                raise UnexpectedEOF

            # Consume all whitespace on this line and the next.
            found_whitespace = False
            seen_newline = False
            while self.is_in_bounds():
                if self.is_whitespace():
                    self.advance()
                    found_whitespace = True
                elif not seen_newline and self.is_newline():
                    char = self.source[self.current_index]
                    if char == "\r":
                        self.advance()
                    self.advance()
                    found_whitespace = True
                    seen_newline = True
                    # Move to the next line without creating a newline token. But,
                    # if the previous line was all whitespace, whitespace on
                    # the next line is still valid indentation. Avoid consuming it.
                    if self.all_whitespace_on_this_line:
                        self.next_line()
                        break
                    else:
                        self.next_line()
                        # Preserve this boolean; we're on the same line semantically
                        self.all_whitespace_on_this_line = False

                else:
                    break

            if not found_whitespace:
                raise UnexpectedCharacterAfterBackslash

            return self.make_token(TokenType.whitespace)

        # Indent / dedent checks
        if (
            (self.byte_offset == 0 or self.weird_whitespace_case)
            and self.bracket_level == 0
            and self.fstring_state.state == FStringState.not_fstring
        ):
            self.weird_whitespace_case = False
            try:
                indent_token = self.indent()
            except NotAnIndent:
                indent_token = None

            if indent_token is not None:
                return indent_token

        if self.is_whitespace():
            while self.is_in_bounds() and self.is_whitespace():
                self.advance()
            return self.make_token(TokenType.whitespace)

        if current_char in ("+", "&", "|", "^", "@", "%", "=", "!", "~"):
            self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "<":
            self.advance()
            if self.peek() == ">":
                # Barry as FLUFL easter egg
                self.advance()
                return self.make_token(TokenType.op)

            if self.peek() == "<":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == ">":
            self.advance()
            if self.peek() == ">":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "/":
            self.advance()
            if self.peek() == "/":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "*":
            self.advance()
            if self.peek() == "*":
                self.advance()
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char == "-":
            self.advance()
            # -> operator
            if self.peek() == ">":
                self.advance()
                return self.make_token(TokenType.op)

            # -= operator
            if self.peek() == "=":
                self.advance()
            return self.make_token(TokenType.op)

        if current_char in (",", ";"):
            self.advance()
            return self.make_token(TokenType.op)

        # The backtick is not used in Python 3, but still exists
        # for backwards compatibility, I guess.
        if current_char == "`":
            self.advance()
            return self.make_token(TokenType.op)

        if current_char == "(":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lparen)

        if current_char == ")":
            self.advance()
            self.bracket_level -= 1
            if self.bracket_level < 0:
                self.bracket_level = 0
            return self.make_token(TokenType.rparen)

        if current_char == "[":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lbracket)

        if current_char == "]":
            self.advance()
            self.bracket_level -= 1
            if self.bracket_level < 0:
                self.bracket_level = 0
            return self.make_token(TokenType.rbracket)

        if current_char == "{":
            self.advance()
            self.bracket_level += 1
            return self.make_token(TokenType.lbrace)

        if current_char == "}":
            self.advance()
            if (
                self.bracket_level == 0
                and self.fstring_state.state == FStringState.in_fstring_expr
            ):
                self.fstring_state.consume_rbrace()
                self.bracket_level = self.bracket_level_stack.pop()
            else:
                self.bracket_level -= 1
                if self.bracket_level < 0:
                    self.bracket_level = 0

            return self.make_token(TokenType.rbrace)

        if current_char == ":":
            self.advance()
            if (
                self.bracket_level == 0
                and self.fstring_state.state == FStringState.in_fstring_expr
            ):
                self.fstring_state.state = FStringState.in_fstring_expr_modifier
                return self.make_token(TokenType.op)
            else:
                if self.peek() == "=":
                    self.advance()
                return self.make_token(TokenType.op)

        if current_char in ".0123456789":
            if self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0b", "0B"):
                return self.binary()
            elif self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0o", "0O"):
                return self.octal()
            elif self.current_index + 2 <= len(self.source) and self.source[
                self.current_index : self.current_index + 2
            ] in ("0x", "0X"):
                return self.hexadecimal()
            else:
                return self.decimal()

        if (
            (self.current_index + 1 <= len(self.source) and self.match('"', "'"))
            or (
                self.current_index + 2 <= len(self.source)
                and self.match(
                    'b"',
                    "b'",
                    'r"',
                    "r'",
                    'f"',
                    "f'",
                    'u"',
                    "u'",
                    "t'",
                    't"',
                    ignore_case=True,
                )
            )
            or (
                self.current_index + 3 <= len(self.source)
                and self.match(
                    'br"',
                    "br'",
                    'rb"',
                    "rb'",
                    'fr"',
                    "fr'",
                    'rf"',
                    "rf'",
                    "tr'",
                    'tr"',
                    "rt'",
                    'rt"',
                    ignore_case=True,
                )
            )
        ):
            return self.string()

        return self.name()


def tokenize(
    source: str,
    *,
    fstring_tokens: bool = True,
    issue_128233_handling: bool = True,
) -> Iterator[Token]:
    token_iterator = TokenIterator(source, issue_128233_handling=issue_128233_handling)
    if fstring_tokens:
        return iter(token_iterator)

    return merge_fstring_tokens(token_iterator)
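

# Illustrative usage sketch (not part of the original source):
#
#     source = 'x = f"{x}!"\n'
#     for token in tokenize(source):
#         print(token.type, repr(token.to_byte_slice(source)))
#
# With the default fstring_tokens=True, f-strings come back as
# FSTRING_START/MIDDLE/END tokens as on Python 3.12+; pass
# fstring_tokens=False to merge each run back into a single STRING token,
# as older tokenizers produced.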


def merge_fstring_tokens(token_iterator: TokenIterator) -> Iterator[Token]:
    """Turn post-Python-3.12 FSTRING-* tokens back into a single STRING token."""
    for token in token_iterator:
        if token.type not in (TokenType.fstring_start, TokenType.tstring_start):
            yield token
            continue

        start_token = token
        end_token = token

        fstring_starts = 1
        fstring_ends = 0
        for token in token_iterator:
            if token.type in (TokenType.fstring_start, TokenType.tstring_start):
                fstring_starts += 1
            if token.type in (TokenType.fstring_end, TokenType.tstring_end):
                fstring_ends += 1

            if fstring_starts == fstring_ends:
                end_token = token
                break

        yield Token(
            type=TokenType.string,
            start_index=start_token.start_index,
            start_line=start_token.start_line,
            start_col=start_token.start_col,
            end_index=end_token.end_index,
            end_line=end_token.end_line,
            end_col=end_token.end_col,
        )
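

# Illustrative smoke test (not part of the original module); it only runs
# when this file is executed directly.
if __name__ == "__main__":
    demo_source = 'x = 1\nprint(f"{x}")\n'
    for tok in tokenize(demo_source):
        span = f"{tok.start_line}:{tok.start_col}-{tok.end_line}:{tok.end_col}"
        print(span, repr(tok.type), repr(tok.to_byte_slice(demo_source)))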