# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
#
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is different from pgen2, now.
#
# A fork of `parso.python.tokenize`.
# https://github.com/davidhalter/parso/blob/master/parso/python/tokenize.py
#
# The following changes were made:
# - Changes to be compatible with PythonTokenTypes
# - Removed main section
# - Applied type stubs directly
# - Removed Python 2 shims
# - Added support for Python 3.6 ASYNC/AWAIT hacks
#
# -*- coding: utf-8 -*-
# This tokenizer has been copied from the ``tokenize.py`` standard library
# tokenizer. The reason was simple: The standard library tokenizer fails
# if the indentation is not right. To make it possible to do error recovery the
# tokenizer needed to be rewritten.
#
# Basically this is a stripped down version of the standard library module, so
# you can read the documentation there. Additionally we included some speed and
# memory optimizations here.
# pyre-unsafe
from __future__ import absolute_import

import itertools as _itertools
import re
import sys
from codecs import BOM_UTF8
from collections import namedtuple
from dataclasses import dataclass
from typing import Dict, Generator, Iterable, Optional, Pattern, Set, Tuple

from libcst import CSTLogicError
from libcst._parser.parso.python.token import PythonTokenTypes
from libcst._parser.parso.utils import PythonVersionInfo, split_lines

# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
MAX_UNICODE = "\U0010ffff"
BOM_UTF8_STRING = BOM_UTF8.decode("utf-8")

STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
OP = PythonTokenTypes.OP
NEWLINE = PythonTokenTypes.NEWLINE
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ASYNC = PythonTokenTypes.ASYNC
AWAIT = PythonTokenTypes.AWAIT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


@dataclass(frozen=True)
class TokenCollection:
    pseudo_token: Pattern
    single_quoted: Set[str]
    triple_quoted: Set[str]
    endpats: Dict[str, Pattern]
    whitespace: Pattern
    fstring_pattern_map: Dict[str, str]
    always_break_tokens: Set[str]


_token_collection_cache: Dict[PythonVersionInfo, TokenCollection] = {}


def group(*choices: str, **kwargs: object) -> str:
    capture = kwargs.pop("capture", False)  # Python 2, arrghhhhh :(
    assert not kwargs

    start = "("
    if not capture:
        start += "?:"
    return start + "|".join(choices) + ")"


def maybe(*choices: str) -> str:
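    # For example, maybe("abc") produces "(?:abc)?"; group("a", "b") produces
    # the non-capturing alternation "(?:a|b)", or "(a|b)" with capture=True.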
    return group(*choices) + "?"


# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes(
    version_info: PythonVersionInfo,
    include_fstring: bool = False,
    only_fstring: bool = False,
) -> Set[str]:
    def different_case_versions(prefix):
        for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
            yield "".join(s)

    # The valid string prefixes. Only the lower case versions are listed
    # here, and no permutations (e.g. 'fr' but not 'rf'); the various
    # permutations will be generated below.
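    # For example, the prefix "br" expands to "br", "bR", "Br", "BR",
    # "rb", "rB", "Rb" and "RB".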
    valid_string_prefixes = ["b", "r"]
    if version_info >= (3, 0):
        valid_string_prefixes.append("br")
    if version_info < (3, 0) or version_info >= (3, 3):
        valid_string_prefixes.append("u")

    result = {""}
    if version_info >= (3, 6) and include_fstring:
        f = ["f", "fr"]
        if only_fstring:
            valid_string_prefixes = f
            result = set()
        else:
            valid_string_prefixes += f
    elif only_fstring:
        return set()

    # if we add binary f-strings, add: ['fb', 'fbr']
    for prefix in valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            result.update(different_case_versions(t))
    if version_info <= (2, 7):
        # In Python 2 the order cannot just be random.
        result.update(different_case_versions("ur"))
        result.update(different_case_versions("br"))
    return result


def _compile(expr: str) -> Pattern:
    return re.compile(expr, re.UNICODE)


def _get_token_collection(version_info: PythonVersionInfo) -> TokenCollection:
    try:
        return _token_collection_cache[version_info]
    except KeyError:
        _token_collection_cache[version_info] = result = _create_token_collection(
            version_info
        )
        return result


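# Patterns for the literal (non-expression) parts of an f-string. Doubled
# braces ("{{" and "}}") are literal text; a single "{" or "}" ends the match
# so that the expression / format spec handling below can take over.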
fstring_raw_string = _compile(r"(?:[^{}]+|\{\{|\}\})+")

unicode_character_name = r"[A-Za-z0-9\-]+(?: [A-Za-z0-9\-]+)*"
fstring_string_single_line = _compile(
    r"(?:\{\{|\}\}|\\N\{"
    + unicode_character_name
    + r"\}|\\(?:\r\n?|\n)|\\[^\r\nN]|[^{}\r\n\\])+"
)
fstring_string_multi_line = _compile(
    r"(?:\{\{|\}\}|\\N\{" + unicode_character_name + r"\}|\\[^N]|[^{}\\])+"
)

fstring_format_spec_single_line = _compile(r"(?:\\(?:\r\n?|\n)|[^{}\r\n])+")
fstring_format_spec_multi_line = _compile(r"[^{}]+")


def _create_token_collection(  # noqa: C901
    version_info: PythonVersionInfo,
) -> TokenCollection:
    # Note: we use unicode matching for names ("\w") but ascii matching for
    # number literals.
    Whitespace = r"[ \f\t]*"
    Comment = r"#[^\r\n]*"
    # Python 2 is pretty much not working properly anymore; we just skip
    # parsing unicode properly, which is fine, I guess.
    if version_info.major == 2:
        Name = r"([A-Za-z_0-9]+)"
    elif sys.version_info[0] == 2:
        # Unfortunately the regex engine cannot deal with the regex below, so
        # just use this one.
        Name = r"(\w+)"
    else:
        Name = "([A-Za-z_0-9\u0080-" + MAX_UNICODE + "]+)"

    if version_info >= (3, 6):
        Hexnumber = r"0[xX](?:_?[0-9a-fA-F])+"
        Binnumber = r"0[bB](?:_?[01])+"
        Octnumber = r"0[oO](?:_?[0-7])+"
        Decnumber = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)"
        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
        Exponent = r"[eE][-+]?[0-9](?:_?[0-9])*"
        Pointfloat = group(
            r"[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?", r"\.[0-9](?:_?[0-9])*"
        ) + maybe(Exponent)
        Expfloat = r"[0-9](?:_?[0-9])*" + Exponent
        Floatnumber = group(Pointfloat, Expfloat)
        Imagnumber = group(r"[0-9](?:_?[0-9])*[jJ]", Floatnumber + r"[jJ]")
    else:
        Hexnumber = r"0[xX][0-9a-fA-F]+"
        Binnumber = r"0[bB][01]+"
        if version_info >= (3, 0):
            Octnumber = r"0[oO][0-7]+"
        else:
            Octnumber = "0[oO]?[0-7]+"
        Decnumber = r"(?:0+|[1-9][0-9]*)"
        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
        if version_info.major < 3:
            Intnumber += "[lL]?"
        Exponent = r"[eE][-+]?[0-9]+"
        Pointfloat = group(r"[0-9]+\.[0-9]*", r"\.[0-9]+") + maybe(Exponent)
        Expfloat = r"[0-9]+" + Exponent
        Floatnumber = group(Pointfloat, Expfloat)
        Imagnumber = group(r"[0-9]+[jJ]", Floatnumber + r"[jJ]")
    Number = group(Imagnumber, Floatnumber, Intnumber)

    # Note that since _all_string_prefixes includes the empty string,
    # StringPrefix can be the empty string (making it optional).
    possible_prefixes = _all_string_prefixes(version_info)
    StringPrefix = group(*possible_prefixes)
    StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True))
    fstring_prefixes = _all_string_prefixes(
        version_info, include_fstring=True, only_fstring=True
    )
    FStringStart = group(*fstring_prefixes)

    # Tail end of ' string.
    Single = r"(?:\\.|[^'\\])*'"
    # Tail end of " string.
    Double = r'(?:\\.|[^"\\])*"'
    # Tail end of ''' string.
    Single3 = r"(?:\\.|'(?!'')|[^'\\])*'''"
    # Tail end of """ string.
    Double3 = r'(?:\\.|"(?!"")|[^"\\])*"""'
    Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""')

    # Because of leftmost-then-longest match semantics, be sure to put the
    # longest operators first (e.g., if = came before ==, == would get
    # recognized as two instances of =).
    Operator = group(
        r"\*\*=?", r">>=?", r"<<=?", r"//=?", r"->", r"[+\-*/%&@`|^!=<>]=?", r"~"
    )

    Bracket = "[][(){}]"

    special_args = [r"\r\n?", r"\n", r"[;.,@]"]
    if version_info >= (3, 0):
        special_args.insert(0, r"\.\.\.")
    if version_info >= (3, 8):
        special_args.insert(0, ":=?")
    else:
        special_args.insert(0, ":")
    Special = group(*special_args)

    Funny = group(Operator, Bracket, Special)

    # First (or only) line of ' or " string.
    ContStr = group(
        StringPrefix
        + r"'[^\r\n'\\]*(?:\\.[^\r\n'\\]*)*"
        + group("'", r"\\(?:\r\n?|\n)"),
        StringPrefix
        + r'"[^\r\n"\\]*(?:\\.[^\r\n"\\]*)*'
        + group('"', r"\\(?:\r\n?|\n)"),
    )
    pseudo_extra_pool = [Comment, Triple]
    all_quotes = '"', "'", '"""', "'''"
    if fstring_prefixes:
        pseudo_extra_pool.append(FStringStart + group(*all_quotes))

    PseudoExtras = group(r"\\(?:\r\n?|\n)|\Z", *pseudo_extra_pool)
    PseudoToken = group(Whitespace, capture=True) + group(
        PseudoExtras, Number, Funny, ContStr, Name, capture=True
    )

    # For a given string prefix plus quotes, endpats maps it to a regex
    # to match the remainder of that string. _prefix can be empty, for
    # a normal single or triple quoted string (with no prefix).
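    # For example, endpats["r'"] is the pattern for the remainder of an
    # r'...' literal, up to and including the closing quote.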
    endpats = {}
    for _prefix in possible_prefixes:
        endpats[_prefix + "'"] = _compile(Single)
        endpats[_prefix + '"'] = _compile(Double)
        endpats[_prefix + "'''"] = _compile(Single3)
        endpats[_prefix + '"""'] = _compile(Double3)

    # A set of all of the single and triple quoted string prefixes,
    # including the opening quotes.
    single_quoted = set()
    triple_quoted = set()
    fstring_pattern_map = {}
    for t in possible_prefixes:
        for quote in '"', "'":
            single_quoted.add(t + quote)

        for quote in '"""', "'''":
            triple_quoted.add(t + quote)

    for t in fstring_prefixes:
        for quote in all_quotes:
            fstring_pattern_map[t + quote] = quote

    pseudo_token_compiled = _compile(PseudoToken)
    return TokenCollection(
        pseudo_token_compiled,
        single_quoted,
        triple_quoted,
        endpats,
        _compile(Whitespace),
        fstring_pattern_map,
        {
            ";",
            "import",
            "class",
            "def",
            "try",
            "except",
            "finally",
            "while",
            "with",
            "return",
        },
    )


class Token(namedtuple("Token", ["type", "string", "start_pos", "prefix"])):
    @property
    def end_pos(self):
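        # For a token spanning multiple lines (e.g. a triple-quoted string)
        # the end column is reported as 0 on the last line; otherwise it is
        # the start column plus the token length.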
        lines = split_lines(self.string)
        if len(lines) > 1:
            return self.start_pos[0] + len(lines) - 1, 0
        else:
            return self.start_pos[0], self.start_pos[1] + len(self.string)


class PythonToken(Token):
    def __repr__(self):
        return "TokenInfo(type=%s, string=%r, start_pos=%r, prefix=%r)" % self._replace(
            type=self.type.name
        )


class FStringNode:
    def __init__(self, quote, raw):
        self.quote = quote
        self.raw = raw
        self.parentheses_count = 0
        self.previous_lines = ""
        self.last_string_start_pos = None
        # In the syntax there can be multiple format_spec's nested:
        # {x:{y:3}}
        self.format_spec_count = 0

    def open_parentheses(self, character):
        self.parentheses_count += 1

    def close_parentheses(self, character):
        self.parentheses_count -= 1
        if self.parentheses_count == 0:
            # No parentheses means that the format spec is also finished.
            self.format_spec_count = 0

    def allow_multiline(self):
        return len(self.quote) == 3

    def is_in_expr(self):
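        # For example, in f"{a:{b}}", right after the inner "{" we have
        # parentheses_count == 2 and format_spec_count == 1, so we are back
        # inside an expression.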
        return self.parentheses_count > self.format_spec_count

    def is_in_format_spec(self):
        return not self.is_in_expr() and self.format_spec_count


def _close_fstring_if_necessary(fstring_stack, string, start_pos, additional_prefix):
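    # If the remaining text starts with the closing quote of any f-string on
    # the stack, emit FSTRING_END for it and drop that node (and everything
    # nested inside it) from the stack.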
    for fstring_stack_index, node in enumerate(fstring_stack):
        if string.startswith(node.quote):
            token = PythonToken(
                FSTRING_END, node.quote, start_pos, prefix=additional_prefix
            )
            additional_prefix = ""
            assert not node.previous_lines
            del fstring_stack[fstring_stack_index:]
            return token, "", len(node.quote)
    return None, additional_prefix, 0


def _find_fstring_string(endpats, fstring_stack, line, lnum, pos):
    tos = fstring_stack[-1]
    allow_multiline = tos.allow_multiline()
    if tos.is_in_format_spec():
        if allow_multiline:
            regex = fstring_format_spec_multi_line
        else:
            regex = fstring_format_spec_single_line
    else:
        if tos.raw:
            regex = fstring_raw_string
        elif allow_multiline:
            regex = fstring_string_multi_line
        else:
            regex = fstring_string_single_line

    match = regex.match(line, pos)
    if match is None:
        return tos.previous_lines, pos

    if not tos.previous_lines:
        tos.last_string_start_pos = (lnum, pos)

    string = match.group(0)
    for fstring_stack_node in fstring_stack:
        end_match = endpats[fstring_stack_node.quote].match(string)
        if end_match is not None:
            string = end_match.group(0)[: -len(fstring_stack_node.quote)]

    new_pos = pos
    new_pos += len(string)
    # even if allow_multiline is False, we still need to check for trailing
    # newlines, because a single-line f-string can contain line continuations
    if string.endswith("\n") or string.endswith("\r"):
        tos.previous_lines += string
        string = ""
    else:
        string = tos.previous_lines + string

    return string, new_pos


def tokenize(
    code: str, version_info: PythonVersionInfo, start_pos: Tuple[int, int] = (1, 0)
) -> Generator[PythonToken, None, None]:
    """Generate tokens from the source code (string)."""
    lines = split_lines(code, keepends=True)
    return tokenize_lines(lines, version_info, start_pos=start_pos)
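
# A minimal usage sketch (the version number is illustrative):
#
#   for t in tokenize("x = 1\n", PythonVersionInfo(3, 8)):
#       print(t.type, repr(t.string), t.start_pos, repr(t.prefix))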


def tokenize_lines(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    token_collection = _get_token_collection(version_info)
    if version_info >= PythonVersionInfo(3, 7):
        return _tokenize_lines_py37_or_above(
            lines, version_info, token_collection, start_pos=start_pos
        )
    else:
        return _tokenize_lines_py36_or_below(
            lines, version_info, token_collection, start_pos=start_pos
        )


def _tokenize_lines_py36_or_below(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    token_collection: TokenCollection,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    """
    A heavily modified Python standard library tokenizer.

    In addition to the default information, this also yields the prefix of
    each token. This idea comes from lib2to3. The prefix contains all the
    information that is irrelevant for the parser, such as newlines in
    parentheses or comments.
    """

    paren_level = 0  # count parentheses
    indents = [0]
    max = 0
    numchars = "0123456789"
    contstr = ""
    contline = None
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ""  # Should never be required, but here for safety
    endprog = None  # Should not be required, but here for lint
    contstr_start: Optional[Tuple[int, int]] = None
    additional_prefix = ""
    first = True
    lnum = start_pos[0] - 1
    fstring_stack = []
    # stash and async_* are used for async/await parsing
    stashed: Optional[PythonToken] = None
    async_def: bool = False
    async_def_indent: int = 0
    async_def_newline: bool = False

    def dedent_if_necessary(start):
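        # Pops indentation levels until indents[-1] <= start, yielding a DEDENT
        # for each popped level (flushing any stashed token first). If start
        # does not line up with an existing level, a single ERROR_DEDENT is
        # yielded instead.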
        nonlocal stashed
        nonlocal async_def
        nonlocal async_def_indent
        nonlocal async_def_newline

        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, "", (lnum, 0), "")
                break
            if stashed is not None:
                yield stashed
                stashed = None
            if async_def and async_def_newline and async_def_indent >= indents[-1]:
                # We exited an 'async def' block, so stop tracking for indents
                async_def = False
                async_def_newline = False
                async_def_indent = 0
            yield PythonToken(DEDENT, "", spos, "")
            indents.pop()

    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max = len(line)
        if first:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max = len(line)

            # Fake that the part before was already parsed.
            line = "^" * start_pos[1] + line
            pos = start_pos[1]
            max += start_pos[1]

            first = False

        if contstr:  # continued string
            if endprog is None:
                raise CSTLogicError("Logic error!")
            endmatch = endprog.match(line)
            if endmatch:
                pos = endmatch.end(0)
                if contstr_start is None:
                    raise CSTLogicError("Logic error!")
                if stashed is not None:
                    raise CSTLogicError("Logic error!")
                yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix)
                contstr = ""
                contline = None
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        while pos < max:
            if fstring_stack:
                tos = fstring_stack[-1]
                if not tos.is_in_expr():
                    string, pos = _find_fstring_string(
                        token_collection.endpats, fstring_stack, line, lnum, pos
                    )
                    if string:
                        if stashed is not None:
                            raise CSTLogicError("Logic error!")
                        yield PythonToken(
                            FSTRING_STRING,
                            string,
                            tos.last_string_start_pos,
                            # Never has a prefix because it can start anywhere and
                            # include whitespace.
                            prefix="",
                        )
                        tos.previous_lines = ""
                        continue
                    if pos == max:
                        break

                rest = line[pos:]
                (
                    fstring_end_token,
                    additional_prefix,
                    quote_length,
                ) = _close_fstring_if_necessary(
                    fstring_stack, rest, (lnum, pos), additional_prefix
                )
                pos += quote_length
                if fstring_end_token is not None:
                    if stashed is not None:
                        raise CSTLogicError("Logic error!")
                    yield fstring_end_token
                    continue

            pseudomatch = token_collection.pseudo_token.match(line, pos)
            if not pseudomatch:  # scan for tokens
                match = token_collection.whitespace.match(line, pos)
                if pos == 0:
                    # pyre-fixme[16]: `Optional` has no attribute `end`.
                    yield from dedent_if_necessary(match.end())
                pos = match.end()
                new_line = False
                yield PythonToken(
                    ERRORTOKEN,
                    line[pos],
                    (lnum, pos),
                    # pyre-fixme[16]: `Optional` has no attribute `group`.
                    additional_prefix + match.group(0),
                )
                additional_prefix = ""
                pos += 1
                continue

            prefix = additional_prefix + pseudomatch.group(1)
            additional_prefix = ""
            start, pos = pseudomatch.span(2)
            spos = (lnum, start)
            token = pseudomatch.group(2)
            if token == "":
                assert prefix
                additional_prefix = prefix
                # This means that we have a line with whitespace/comments at
                # the end, which just results in an endmarker.
                break
            initial = token[0]

            if new_line and initial not in "\r\n\\#":
                new_line = False
                if paren_level == 0 and not fstring_stack:
                    i = 0
                    indent_start = start
                    while line[i] == "\f":
                        i += 1
                        # TODO don't we need to change spos as well?
                        indent_start -= 1
                    if indent_start > indents[-1]:
                        if stashed is not None:
                            yield stashed
                            stashed = None
                        yield PythonToken(INDENT, "", spos, "")
                        indents.append(indent_start)
                    yield from dedent_if_necessary(indent_start)

            if initial in numchars or (  # ordinary number
                initial == "." and token != "." and token != "..."
            ):
                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(NUMBER, token, spos, prefix)
            elif pseudomatch.group(3) is not None:  # ordinary name
                if token in token_collection.always_break_tokens:
                    fstring_stack[:] = []
                    paren_level = 0
                    # We only want to dedent if the token is on a new line.
                    if re.match(r"[ \f\t]*$", line[:start]):
                        while True:
                            indent = indents.pop()
                            if indent > start:
                                if (
                                    async_def
                                    and async_def_newline
                                    and async_def_indent >= indent
                                ):
                                    # We dedented outside of an 'async def' block.
                                    async_def = False
                                    async_def_newline = False
                                    async_def_indent = 0
                                if stashed is not None:
                                    yield stashed
                                    stashed = None
                                yield PythonToken(DEDENT, "", spos, "")
                            else:
                                indents.append(indent)
                                break
                if str.isidentifier(token):
                    should_yield_identifier = True
                    if token in ("async", "await") and async_def:
                        # We're inside an 'async def' block, all async/await are
                        # tokens.
                        if token == "async":
                            yield PythonToken(ASYNC, token, spos, prefix)
                        else:
                            yield PythonToken(AWAIT, token, spos, prefix)
                        should_yield_identifier = False

                    # We are possibly starting an 'async def' section
                    elif token == "async" and not stashed:
                        stashed = PythonToken(NAME, token, spos, prefix)
                        should_yield_identifier = False

                    # We actually are starting an 'async def' section
                    elif (
                        token == "def"
                        and stashed is not None
                        and stashed[0] is NAME
                        and stashed[1] == "async"
                    ):
                        async_def = True
                        async_def_indent = indents[-1]
                        yield PythonToken(ASYNC, stashed[1], stashed[2], stashed[3])
                        stashed = None

                    # We are either not stashed, or we output an ASYNC token above.
                    elif stashed:
                        yield stashed
                        stashed = None

                    # If we didn't bail early due to possibly recognizing an 'async def',
                    # then we should yield this token as normal.
                    if should_yield_identifier:
                        yield PythonToken(NAME, token, spos, prefix)
                else:
                    yield from _split_illegal_unicode_name(token, spos, prefix)
            elif initial in "\r\n":
                if any(not f.allow_multiline() for f in fstring_stack):
                    # Would use fstring_stack.clear, but that's not available
                    # in Python 2.
                    fstring_stack[:] = []

                if not new_line and paren_level == 0 and not fstring_stack:
                    if async_def:
                        async_def_newline = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield PythonToken(NEWLINE, token, spos, prefix)
                else:
                    additional_prefix = prefix + token
                    new_line = True
            elif initial == "#":  # Comments
                assert not token.endswith("\n")
                additional_prefix = prefix + token
            elif token in token_collection.triple_quoted:
                endprog = token_collection.endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:  # all on one line
                    pos = endmatch.end(0)
                    token = line[start:pos]
                    if stashed is not None:
                        yield stashed
                        stashed = None
                    yield PythonToken(STRING, token, spos, prefix)
                else:
                    contstr_start = (lnum, start)  # multiple lines
                    contstr = line[start:]
                    contline = line
                    break

            # Check up to the first 3 chars of the token to see if
            # they're in the single_quoted set. If so, they start
            # a string.
            # We're using the first 3, because we're looking for
            # "rb'" (for example) at the start of the token. If
            # we switch to longer prefixes, this needs to be
            # adjusted.
            # Note that initial == token[:1].
            # Also note that single quote checking must come after
            # triple quote checking (above).
            elif (
                initial in token_collection.single_quoted
                or token[:2] in token_collection.single_quoted
                or token[:3] in token_collection.single_quoted
            ):
                if token[-1] in "\r\n":  # continued string
                    # This means that a single quoted string ends with a
                    # backslash and is continued.
                    contstr_start = lnum, start
                    endprog = (
                        token_collection.endpats.get(initial)
                        or token_collection.endpats.get(token[1])
                        or token_collection.endpats.get(token[2])
                    )
                    contstr = line[start:]
                    contline = line
                    break
                else:  # ordinary string
                    if stashed is not None:
                        yield stashed
                        stashed = None
                    yield PythonToken(STRING, token, spos, prefix)
            elif (
                token in token_collection.fstring_pattern_map
            ):  # The start of an fstring.
                fstring_stack.append(
                    FStringNode(
                        token_collection.fstring_pattern_map[token],
                        "r" in token or "R" in token,
                    )
                )
                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(FSTRING_START, token, spos, prefix)
            elif initial == "\\" and line[start:] in (
                "\\\n",
                "\\\r\n",
                "\\\r",
            ):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
            else:
                if token in "([{":
                    if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
                    else:
                        paren_level += 1
                elif token in ")]}":
                    if fstring_stack:
                        fstring_stack[-1].close_parentheses(token)
                    else:
                        if paren_level:
                            paren_level -= 1
                elif (
                    token == ":"
                    and fstring_stack
                    and fstring_stack[-1].parentheses_count
                    - fstring_stack[-1].format_spec_count
                    == 1
                ):
                    fstring_stack[-1].format_spec_count += 1

                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith("\n") or contstr.endswith("\r"):
            new_line = True

    if stashed is not None:
        yield stashed
        stashed = None

    end_pos = lnum, max
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        yield PythonToken(DEDENT, "", end_pos, "")
    yield PythonToken(ENDMARKER, "", end_pos, additional_prefix)


def _tokenize_lines_py37_or_above(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    token_collection: TokenCollection,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    """
    A heavily modified Python standard library tokenizer.

    In addition to the default information, this also yields the prefix of
    each token. This idea comes from lib2to3. The prefix contains all the
    information that is irrelevant for the parser, such as newlines in
    parentheses or comments.
    """

    def dedent_if_necessary(start):
        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, "", (lnum, 0), "")
                break
            yield PythonToken(DEDENT, "", spos, "")
            indents.pop()

    paren_level = 0  # count parentheses
    indents = [0]
    max = 0
    numchars = "0123456789"
    contstr = ""
    contline = None
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ""  # Should never be required, but here for safety
    endprog = None  # Should not be required, but here for lint
    contstr_start: Optional[Tuple[int, int]] = None
    additional_prefix = ""
    first = True
    lnum = start_pos[0] - 1
    fstring_stack = []
    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max = len(line)
        if first:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max = len(line)

            # Fake that the part before was already parsed.
            line = "^" * start_pos[1] + line
            pos = start_pos[1]
            max += start_pos[1]

            first = False

        if contstr:  # continued string
            if endprog is None:
                raise CSTLogicError("Logic error!")
            endmatch = endprog.match(line)
            if endmatch:
                pos = endmatch.end(0)
                if contstr_start is None:
                    raise CSTLogicError("Logic error!")
                yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix)
                contstr = ""
                contline = None
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        while pos < max:
            if fstring_stack:
                tos = fstring_stack[-1]
                if not tos.is_in_expr():
                    string, pos = _find_fstring_string(
                        token_collection.endpats, fstring_stack, line, lnum, pos
                    )
                    if string:
                        yield PythonToken(
                            FSTRING_STRING,
                            string,
                            tos.last_string_start_pos,
                            # Never has a prefix because it can start anywhere and
                            # include whitespace.
                            prefix="",
                        )
                        tos.previous_lines = ""
                        continue
                    if pos == max:
                        break

                rest = line[pos:]
                (
                    fstring_end_token,
                    additional_prefix,
                    quote_length,
                ) = _close_fstring_if_necessary(
                    fstring_stack, rest, (lnum, pos), additional_prefix
                )
                pos += quote_length
                if fstring_end_token is not None:
                    yield fstring_end_token
                    continue

            pseudomatch = token_collection.pseudo_token.match(line, pos)
            if not pseudomatch:  # scan for tokens
                match = token_collection.whitespace.match(line, pos)
                if pos == 0:
                    # pyre-fixme[16]: `Optional` has no attribute `end`.
                    for t in dedent_if_necessary(match.end()):
                        yield t
                pos = match.end()
                new_line = False
                yield PythonToken(
                    ERRORTOKEN,
                    line[pos],
                    (lnum, pos),
                    # pyre-fixme[16]: `Optional` has no attribute `group`.
                    additional_prefix + match.group(0),
                )
                additional_prefix = ""
                pos += 1
                continue

            prefix = additional_prefix + pseudomatch.group(1)
            additional_prefix = ""
            start, pos = pseudomatch.span(2)
            spos = (lnum, start)
            token = pseudomatch.group(2)
            if token == "":
                assert prefix
                additional_prefix = prefix
                # This means that we have a line with whitespace/comments at
                # the end, which just results in an endmarker.
                break
            initial = token[0]

            if new_line and initial not in "\r\n\\#":
                new_line = False
                if paren_level == 0 and not fstring_stack:
                    i = 0
                    indent_start = start
                    while line[i] == "\f":
                        i += 1
                        # TODO don't we need to change spos as well?
                        indent_start -= 1
                    if indent_start > indents[-1]:
                        yield PythonToken(INDENT, "", spos, "")
                        indents.append(indent_start)
                    for t in dedent_if_necessary(indent_start):
                        yield t

            if initial in numchars or (  # ordinary number
                initial == "." and token != "." and token != "..."
            ):
                yield PythonToken(NUMBER, token, spos, prefix)
            elif pseudomatch.group(3) is not None:  # ordinary name
                if token in token_collection.always_break_tokens:
                    fstring_stack[:] = []
                    paren_level = 0
                    # We only want to dedent if the token is on a new line.
                    if re.match(r"[ \f\t]*$", line[:start]):
                        while True:
                            indent = indents.pop()
                            if indent > start:
                                yield PythonToken(DEDENT, "", spos, "")
                            else:
                                indents.append(indent)
                                break
                if str.isidentifier(token):
                    # py37 doesn't need special tokens for async/await, and we could
                    # emit NAME, but then we'd need different grammar for py36 and py37.
                    if token == "async":
                        yield PythonToken(ASYNC, token, spos, prefix)
                    elif token == "await":
                        yield PythonToken(AWAIT, token, spos, prefix)
                    else:
                        yield PythonToken(NAME, token, spos, prefix)
                else:
                    for t in _split_illegal_unicode_name(token, spos, prefix):
                        yield t  # yield from Python 2
            elif initial in "\r\n":
                if any(not f.allow_multiline() for f in fstring_stack):
                    # Would use fstring_stack.clear, but that's not available
                    # in Python 2.
                    fstring_stack[:] = []

                if not new_line and paren_level == 0 and not fstring_stack:
                    yield PythonToken(NEWLINE, token, spos, prefix)
                else:
                    additional_prefix = prefix + token
                    new_line = True
            elif initial == "#":  # Comments
                assert not token.endswith("\n")
                additional_prefix = prefix + token
            elif token in token_collection.triple_quoted:
                endprog = token_collection.endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:  # all on one line
                    pos = endmatch.end(0)
                    token = line[start:pos]
                    yield PythonToken(STRING, token, spos, prefix)
                else:
                    contstr_start = (lnum, start)  # multiple lines
                    contstr = line[start:]
                    contline = line
                    break

            # Check up to the first 3 chars of the token to see if
            # they're in the single_quoted set. If so, they start
            # a string.
            # We're using the first 3, because we're looking for
            # "rb'" (for example) at the start of the token. If
            # we switch to longer prefixes, this needs to be
            # adjusted.
            # Note that initial == token[:1].
            # Also note that single quote checking must come after
            # triple quote checking (above).
            elif (
                initial in token_collection.single_quoted
                or token[:2] in token_collection.single_quoted
                or token[:3] in token_collection.single_quoted
            ):
                if token[-1] in "\r\n":  # continued string
                    # This means that a single quoted string ends with a
                    # backslash and is continued.
                    contstr_start = lnum, start
                    endprog = (
                        token_collection.endpats.get(initial)
                        or token_collection.endpats.get(token[1])
                        or token_collection.endpats.get(token[2])
                    )
                    contstr = line[start:]
                    contline = line
                    break
                else:  # ordinary string
                    yield PythonToken(STRING, token, spos, prefix)
            elif (
                token in token_collection.fstring_pattern_map
            ):  # The start of an fstring.
                fstring_stack.append(
                    FStringNode(
                        token_collection.fstring_pattern_map[token],
                        "r" in token or "R" in token,
                    )
                )
                yield PythonToken(FSTRING_START, token, spos, prefix)
            elif initial == "\\" and line[start:] in (
                "\\\n",
                "\\\r\n",
                "\\\r",
            ):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
            else:
                if token in "([{":
                    if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
                    else:
                        paren_level += 1
                elif token in ")]}":
                    if fstring_stack:
                        fstring_stack[-1].close_parentheses(token)
                    else:
                        if paren_level:
                            paren_level -= 1
                elif (
                    token == ":"
                    and fstring_stack
                    and fstring_stack[-1].parentheses_count
                    - fstring_stack[-1].format_spec_count
                    == 1
                ):
                    fstring_stack[-1].format_spec_count += 1

                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith("\n") or contstr.endswith("\r"):
            new_line = True

    end_pos = lnum, max
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        yield PythonToken(DEDENT, "", end_pos, "")
    yield PythonToken(ENDMARKER, "", end_pos, additional_prefix)


def _split_illegal_unicode_name(
    token: str, start_pos: Tuple[int, int], prefix: str
) -> Generator[PythonToken, None, None]:
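    # Splits a "name" that is not a valid identifier into alternating NAME and
    # ERRORTOKEN pieces, e.g. "abc$$def" yields NAME "abc", ERRORTOKEN "$$"
    # and NAME "def".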
    def create_token():
        return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix)

    found = ""
    is_illegal = False
    pos = start_pos
    for i, char in enumerate(token):
        if is_illegal:
            if str.isidentifier(char):
                yield create_token()
                found = char
                is_illegal = False
                prefix = ""
                pos = start_pos[0], start_pos[1] + i
            else:
                found += char
        else:
            new_found = found + char
            if str.isidentifier(new_found):
                found = new_found
            else:
                if found:
                    yield create_token()
                    prefix = ""
                    pos = start_pos[0], start_pos[1] + i
                found = char
                is_illegal = True

    if found:
        yield create_token()