# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
#
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is different from pgen2, now.
#
# A fork of `parso.python.tokenize`.
# https://github.com/davidhalter/parso/blob/master/parso/python/tokenize.py
#
# The following changes were made:
# - Changes to be compatible with PythonTokenTypes
# - Removed main section
# - Applied type stubs directly
# - Removed Python 2 shims
# - Added support for Python 3.6 ASYNC/AWAIT hacks
#
# -*- coding: utf-8 -*-
# This tokenizer has been copied from the ``tokenize.py`` standard library
# tokenizer. The reason was simple: The standard library tokenizer fails
# if the indentation is not right. To make it possible to do error recovery the
# tokenizer needed to be rewritten.
#
# Basically this is a stripped down version of the standard library module, so
# you can read the documentation there. Additionally we included some speed and
# memory optimizations here.
# pyre-unsafe
from __future__ import absolute_import

import itertools as _itertools
import re
import sys
from codecs import BOM_UTF8
from collections import namedtuple
from dataclasses import dataclass
from typing import Dict, Generator, Iterable, Optional, Pattern, Set, Tuple

from libcst._parser.parso.python.token import PythonTokenTypes
from libcst._parser.parso.utils import PythonVersionInfo, split_lines

# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
MAX_UNICODE = "\U0010ffff"
BOM_UTF8_STRING = BOM_UTF8.decode("utf-8")

STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
OP = PythonTokenTypes.OP
NEWLINE = PythonTokenTypes.NEWLINE
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ASYNC = PythonTokenTypes.ASYNC
AWAIT = PythonTokenTypes.AWAIT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


@dataclass(frozen=True)
class TokenCollection:
    pseudo_token: Pattern
    single_quoted: Set[str]
    triple_quoted: Set[str]
    endpats: Dict[str, Pattern]
    whitespace: Pattern
    fstring_pattern_map: Dict[str, str]
    always_break_tokens: Set[str]


_token_collection_cache: Dict[PythonVersionInfo, TokenCollection] = {}


def group(*choices: str, **kwargs: object) -> str:
    capture = kwargs.pop("capture", False)  # Python 2, arrghhhhh :(
    assert not kwargs

    start = "("
    if not capture:
        start += "?:"
    return start + "|".join(choices) + ")"


def maybe(*choices: str) -> str:
    return group(*choices) + "?"
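

# Illustrative sketch (not part of the original module): `group` and `maybe`
# are tiny regex builders used throughout this file. Given the definitions
# above, they behave like this:
#
#     group("a", "b")                # -> "(?:a|b)"   (non-capturing by default)
#     group("a", "b", capture=True)  # -> "(a|b)"
#     maybe(r"\d+")                  # -> "(?:\d+)?"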


# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes(
    version_info: PythonVersionInfo,
    include_fstring: bool = False,
    only_fstring: bool = False,
) -> Set[str]:
    def different_case_versions(prefix):
        for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
            yield "".join(s)

    # The valid string prefixes. Only contains the lower case versions,
    # and doesn't contain any permutations (includes 'fr', but not
    # 'rf'). The various permutations will be generated.
    valid_string_prefixes = ["b", "r"]
    if version_info >= (3, 0):
        valid_string_prefixes.append("br")
    if version_info < (3, 0) or version_info >= (3, 3):
        valid_string_prefixes.append("u")

    result = {""}
    if version_info >= (3, 6) and include_fstring:
        f = ["f", "fr"]
        if only_fstring:
            valid_string_prefixes = f
            result = set()
        else:
            valid_string_prefixes += f
    elif only_fstring:
        return set()

    # if we add binary f-strings, add: ['fb', 'fbr']
    for prefix in valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            result.update(different_case_versions(t))
    if version_info <= (2, 7):
        # In Python 2 the order cannot just be random.
        result.update(different_case_versions("ur"))
        result.update(different_case_versions("br"))
    return result
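

# Rough example of the generated prefix sets (a sketch, assuming a 3.8
# version_info; the exact contents depend on the version checks above):
#
#     _all_string_prefixes(PythonVersionInfo(3, 8))
#     # -> {'', 'b', 'B', 'r', 'R', 'u', 'U', 'br', 'bR', 'Br', 'BR',
#     #     'rb', 'rB', 'Rb', 'RB'}
#     _all_string_prefixes(PythonVersionInfo(3, 8), include_fstring=True,
#                          only_fstring=True)
#     # -> case/order permutations of 'f' and 'fr' ('f', 'F', 'fr', 'rf',
#     #    'fR', ...), without the empty string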


def _compile(expr: str) -> Pattern:
    return re.compile(expr, re.UNICODE)


def _get_token_collection(version_info: PythonVersionInfo) -> TokenCollection:
    try:
        return _token_collection_cache[version_info]
    except KeyError:
        _token_collection_cache[version_info] = result = _create_token_collection(
            version_info
        )
        return result


fstring_raw_string = _compile(r"(?:[^{}]+|\{\{|\}\})+")

unicode_character_name = r"[A-Za-z0-9\-]+(?: [A-Za-z0-9\-]+)*"
fstring_string_single_line = _compile(
    r"(?:\{\{|\}\}|\\N\{"
    + unicode_character_name
    + r"\}|\\(?:\r\n?|\n)|\\[^\r\nN]|[^{}\r\n\\])+"
)
fstring_string_multi_line = _compile(
    r"(?:\{\{|\}\}|\\N\{" + unicode_character_name + r"\}|\\[^N]|[^{}\\])+"
)

fstring_format_spec_single_line = _compile(r"(?:\\(?:\r\n?|\n)|[^{}\r\n])+")
fstring_format_spec_multi_line = _compile(r"[^{}]+")
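

# These patterns match the literal (non-expression) part of an f-string.  As an
# illustrative sketch (not from the original source): on the body
# 'hello {name!r} world' the single-line pattern matches 'hello ' and stops at
# the opening '{'; doubled braces ('{{', '}}') and \N{...} escapes are consumed
# as literal text instead of terminating the match.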


def _create_token_collection(  # noqa: C901
    version_info: PythonVersionInfo,
) -> TokenCollection:
    # Note: we use unicode matching for names ("\w") but ascii matching for
    # number literals.
    Whitespace = r"[ \f\t]*"
    Comment = r"#[^\r\n]*"
    # Python 2 is pretty much not working properly anymore, we just ignore
    # parsing unicode properly, which is fine, I guess.
    if version_info.major == 2:
        Name = r"([A-Za-z_0-9]+)"
    elif sys.version_info[0] == 2:
        # Unfortunately the regex engine cannot deal with the regex below, so
        # just use this one.
        Name = r"(\w+)"
    else:
        Name = "([A-Za-z_0-9\u0080-" + MAX_UNICODE + "]+)"

    if version_info >= (3, 6):
        Hexnumber = r"0[xX](?:_?[0-9a-fA-F])+"
        Binnumber = r"0[bB](?:_?[01])+"
        Octnumber = r"0[oO](?:_?[0-7])+"
        Decnumber = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)"
        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
        Exponent = r"[eE][-+]?[0-9](?:_?[0-9])*"
        Pointfloat = group(
            r"[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?", r"\.[0-9](?:_?[0-9])*"
        ) + maybe(Exponent)
        Expfloat = r"[0-9](?:_?[0-9])*" + Exponent
        Floatnumber = group(Pointfloat, Expfloat)
        Imagnumber = group(r"[0-9](?:_?[0-9])*[jJ]", Floatnumber + r"[jJ]")
    else:
        Hexnumber = r"0[xX][0-9a-fA-F]+"
        Binnumber = r"0[bB][01]+"
        if version_info >= (3, 0):
            Octnumber = r"0[oO][0-7]+"
        else:
            Octnumber = "0[oO]?[0-7]+"
        Decnumber = r"(?:0+|[1-9][0-9]*)"
        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
        if version_info.major < 3:
            Intnumber += "[lL]?"
        Exponent = r"[eE][-+]?[0-9]+"
        Pointfloat = group(r"[0-9]+\.[0-9]*", r"\.[0-9]+") + maybe(Exponent)
        Expfloat = r"[0-9]+" + Exponent
        Floatnumber = group(Pointfloat, Expfloat)
        Imagnumber = group(r"[0-9]+[jJ]", Floatnumber + r"[jJ]")
    Number = group(Imagnumber, Floatnumber, Intnumber)

    # Note that since _all_string_prefixes includes the empty string,
    # StringPrefix can be the empty string (making it optional).
    possible_prefixes = _all_string_prefixes(version_info)
    StringPrefix = group(*possible_prefixes)
    StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True))
    fstring_prefixes = _all_string_prefixes(
        version_info, include_fstring=True, only_fstring=True
    )
    FStringStart = group(*fstring_prefixes)

    # Tail end of ' string.
    Single = r"(?:\\.|[^'\\])*'"
    # Tail end of " string.
    Double = r'(?:\\.|[^"\\])*"'
    # Tail end of ''' string.
    Single3 = r"(?:\\.|'(?!'')|[^'\\])*'''"
    # Tail end of """ string.
    Double3 = r'(?:\\.|"(?!"")|[^"\\])*"""'
    Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""')

    # Because of leftmost-then-longest match semantics, be sure to put the
    # longest operators first (e.g., if = came before ==, == would get
    # recognized as two instances of =).
    Operator = group(
        r"\*\*=?", r">>=?", r"<<=?", r"//=?", r"->", r"[+\-*/%&@`|^!=<>]=?", r"~"
    )

    Bracket = "[][(){}]"

    special_args = [r"\r\n?", r"\n", r"[;.,@]"]
    if version_info >= (3, 0):
        special_args.insert(0, r"\.\.\.")
    if version_info >= (3, 8):
        special_args.insert(0, ":=?")
    else:
        special_args.insert(0, ":")
    Special = group(*special_args)

    Funny = group(Operator, Bracket, Special)

    # First (or only) line of ' or " string.
    ContStr = group(
        StringPrefix
        + r"'[^\r\n'\\]*(?:\\.[^\r\n'\\]*)*"
        + group("'", r"\\(?:\r\n?|\n)"),
        StringPrefix
        + r'"[^\r\n"\\]*(?:\\.[^\r\n"\\]*)*'
        + group('"', r"\\(?:\r\n?|\n)"),
    )
    pseudo_extra_pool = [Comment, Triple]
    all_quotes = '"', "'", '"""', "'''"
    if fstring_prefixes:
        pseudo_extra_pool.append(FStringStart + group(*all_quotes))

    PseudoExtras = group(r"\\(?:\r\n?|\n)|\Z", *pseudo_extra_pool)
    PseudoToken = group(Whitespace, capture=True) + group(
        PseudoExtras, Number, Funny, ContStr, Name, capture=True
    )

    # For a given string prefix plus quotes, endpats maps it to a regex
    # to match the remainder of that string. _prefix can be empty, for
    # a normal single or triple quoted string (with no prefix).
    endpats = {}
    for _prefix in possible_prefixes:
        endpats[_prefix + "'"] = _compile(Single)
        endpats[_prefix + '"'] = _compile(Double)
        endpats[_prefix + "'''"] = _compile(Single3)
        endpats[_prefix + '"""'] = _compile(Double3)

    # A set of all of the single and triple quoted string prefixes,
    # including the opening quotes.
    single_quoted = set()
    triple_quoted = set()
    fstring_pattern_map = {}
    for t in possible_prefixes:
        for quote in '"', "'":
            single_quoted.add(t + quote)

        for quote in '"""', "'''":
            triple_quoted.add(t + quote)

    for t in fstring_prefixes:
        for quote in all_quotes:
            fstring_pattern_map[t + quote] = quote

    pseudo_token_compiled = _compile(PseudoToken)
    return TokenCollection(
        pseudo_token_compiled,
        single_quoted,
        triple_quoted,
        endpats,
        _compile(Whitespace),
        fstring_pattern_map,
        {
            ";",
            "import",
            "class",
            "def",
            "try",
            "except",
            "finally",
            "while",
            "with",
            "return",
        },
    )
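

# A sketch of how the compiled pseudo_token pattern is consumed by the
# tokenizer loops below (illustrative, not part of the original module):
# group(1) is the whitespace prefix, group(2) is the token text, and group(3)
# is only set when the token matched the Name alternative.
#
#     collection = _get_token_collection(PythonVersionInfo(3, 8))
#     m = collection.pseudo_token.match("    foo = 1\n", 0)
#     m.group(1), m.group(2), m.group(3)  # -> ('    ', 'foo', 'foo')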


class Token(namedtuple("Token", ["type", "string", "start_pos", "prefix"])):
    @property
    def end_pos(self):
        lines = split_lines(self.string)
        if len(lines) > 1:
            return self.start_pos[0] + len(lines) - 1, 0
        else:
            return self.start_pos[0], self.start_pos[1] + len(self.string)
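

# A quick sketch of the end_pos computation above (illustrative only): a
# single-line token advances the column, while a multi-line token advances the
# row and resets the column to 0.
#
#     Token(NAME, "foo", (1, 4), "").end_pos           # -> (1, 7)
#     Token(STRING, "'''a\nb'''", (1, 0), "").end_pos  # -> (2, 0)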


class PythonToken(Token):
    def __repr__(self):
        return "TokenInfo(type=%s, string=%r, start_pos=%r, prefix=%r)" % self._replace(
            type=self.type.name
        )


class FStringNode:
    def __init__(self, quote, raw):
        self.quote = quote
        self.raw = raw
        self.parentheses_count = 0
        self.previous_lines = ""
        self.last_string_start_pos = None
        # In the syntax there can be multiple format_spec's nested:
        # {x:{y:3}}
        self.format_spec_count = 0

    def open_parentheses(self, character):
        self.parentheses_count += 1

    def close_parentheses(self, character):
        self.parentheses_count -= 1
        if self.parentheses_count == 0:
            # No parentheses means that the format spec is also finished.
            self.format_spec_count = 0

    def allow_multiline(self):
        return len(self.quote) == 3

    def is_in_expr(self):
        return self.parentheses_count > self.format_spec_count

    def is_in_format_spec(self):
        return not self.is_in_expr() and self.format_spec_count
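

# Illustrative walk-through (a sketch, not from the original source): while
# tokenizing f"{x:{y}}", the outer '{' bumps parentheses_count to 1, the ':'
# bumps format_spec_count to 1 (so is_in_format_spec() is truthy), and the
# inner '{' raises parentheses_count to 2, which makes is_in_expr() true again
# until the matching '}' closes it.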


def _close_fstring_if_necessary(fstring_stack, string, start_pos, additional_prefix):
    for fstring_stack_index, node in enumerate(fstring_stack):
        if string.startswith(node.quote):
            token = PythonToken(
                FSTRING_END, node.quote, start_pos, prefix=additional_prefix
            )
            additional_prefix = ""
            assert not node.previous_lines
            del fstring_stack[fstring_stack_index:]
            return token, "", len(node.quote)
    return None, additional_prefix, 0


def _find_fstring_string(endpats, fstring_stack, line, lnum, pos):
    tos = fstring_stack[-1]
    allow_multiline = tos.allow_multiline()
    if tos.is_in_format_spec():
        if allow_multiline:
            regex = fstring_format_spec_multi_line
        else:
            regex = fstring_format_spec_single_line
    else:
        if tos.raw:
            regex = fstring_raw_string
        elif allow_multiline:
            regex = fstring_string_multi_line
        else:
            regex = fstring_string_single_line

    match = regex.match(line, pos)
    if match is None:
        return tos.previous_lines, pos

    if not tos.previous_lines:
        tos.last_string_start_pos = (lnum, pos)

    string = match.group(0)
    for fstring_stack_node in fstring_stack:
        end_match = endpats[fstring_stack_node.quote].match(string)
        if end_match is not None:
            string = end_match.group(0)[: -len(fstring_stack_node.quote)]

    new_pos = pos
    new_pos += len(string)
    # even if allow_multiline is False, we still need to check for trailing
    # newlines, because a single-line f-string can contain line continuations
    if string.endswith("\n") or string.endswith("\r"):
        tos.previous_lines += string
        string = ""
    else:
        string = tos.previous_lines + string

    return string, new_pos


def tokenize(
    code: str, version_info: PythonVersionInfo, start_pos: Tuple[int, int] = (1, 0)
) -> Generator[PythonToken, None, None]:
    """Generate tokens from the source code (string)."""
    lines = split_lines(code, keepends=True)
    return tokenize_lines(lines, version_info, start_pos=start_pos)
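

# Example usage (a sketch, not part of the original module); the exact token
# stream depends on the grammar version passed in:
#
#     list(tokenize("f(x)\n", PythonVersionInfo(3, 8)))
#     # -> roughly: NAME 'f', OP '(', NAME 'x', OP ')', NEWLINE '\n',
#     #    ENDMARKER ''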


def tokenize_lines(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    token_collection = _get_token_collection(version_info)
    if version_info >= PythonVersionInfo(3, 7):
        return _tokenize_lines_py37_or_above(
            lines, version_info, token_collection, start_pos=start_pos
        )
    else:
        return _tokenize_lines_py36_or_below(
            lines, version_info, token_collection, start_pos=start_pos
        )


def _tokenize_lines_py36_or_below(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    token_collection: TokenCollection,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    """
    A heavily modified Python standard library tokenizer.

    In addition to the default information, it also yields the prefix of each
    token. This idea comes from lib2to3. The prefix contains all information
    that is irrelevant for the parser, like newlines in parentheses or comments.
    """
    paren_level = 0  # count parentheses
    indents = [0]
    max = 0
    numchars = "0123456789"
    contstr = ""
    contline = None
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ""  # Should never be required, but here for safety
    endprog = None  # Should not be required, but here for lint
    contstr_start: Optional[Tuple[int, int]] = None
    additional_prefix = ""
    first = True
    lnum = start_pos[0] - 1
    fstring_stack = []
    # stashed and async_* are used for async/await parsing
    stashed: Optional[PythonToken] = None
    async_def: bool = False
    async_def_indent: int = 0
    async_def_newline: bool = False

    def dedent_if_necessary(start):
        nonlocal stashed
        nonlocal async_def
        nonlocal async_def_indent
        nonlocal async_def_newline

        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, "", (lnum, 0), "")
                break
            if stashed is not None:
                yield stashed
                stashed = None
            if async_def and async_def_newline and async_def_indent >= indents[-1]:
                # We exited an 'async def' block, so stop tracking its indents.
                async_def = False
                async_def_newline = False
                async_def_indent = 0
            yield PythonToken(DEDENT, "", spos, "")
            indents.pop()

    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max = len(line)
        if first:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max = len(line)

            # Fake that the part before was already parsed.
            line = "^" * start_pos[1] + line
            pos = start_pos[1]
            max += start_pos[1]

            first = False

        if contstr:  # continued string
            if endprog is None:
                raise Exception("Logic error!")
            endmatch = endprog.match(line)
            if endmatch:
                pos = endmatch.end(0)
                if contstr_start is None:
                    raise Exception("Logic error!")
                if stashed is not None:
                    raise Exception("Logic error!")
                yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix)
                contstr = ""
                contline = None
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        while pos < max:
            if fstring_stack:
                tos = fstring_stack[-1]
                if not tos.is_in_expr():
                    string, pos = _find_fstring_string(
                        token_collection.endpats, fstring_stack, line, lnum, pos
                    )
                    if string:
                        if stashed is not None:
                            raise Exception("Logic error!")
                        yield PythonToken(
                            FSTRING_STRING,
                            string,
                            tos.last_string_start_pos,
                            # Never has a prefix because it can start anywhere and
                            # include whitespace.
                            prefix="",
                        )
                        tos.previous_lines = ""
                        continue
                    if pos == max:
                        break

                rest = line[pos:]
                (
                    fstring_end_token,
                    additional_prefix,
                    quote_length,
                ) = _close_fstring_if_necessary(
                    fstring_stack, rest, (lnum, pos), additional_prefix
                )
                pos += quote_length
                if fstring_end_token is not None:
                    if stashed is not None:
                        raise Exception("Logic error!")
                    yield fstring_end_token
                    continue

            pseudomatch = token_collection.pseudo_token.match(line, pos)
            if not pseudomatch:  # scan for tokens
                match = token_collection.whitespace.match(line, pos)
                if pos == 0:
                    # pyre-fixme[16]: `Optional` has no attribute `end`.
                    yield from dedent_if_necessary(match.end())
                pos = match.end()
                new_line = False
                yield PythonToken(
                    ERRORTOKEN,
                    line[pos],
                    (lnum, pos),
                    # pyre-fixme[16]: `Optional` has no attribute `group`.
                    additional_prefix + match.group(0),
                )
                additional_prefix = ""
                pos += 1
                continue

            prefix = additional_prefix + pseudomatch.group(1)
            additional_prefix = ""
            start, pos = pseudomatch.span(2)
            spos = (lnum, start)
            token = pseudomatch.group(2)
            if token == "":
                assert prefix
                additional_prefix = prefix
                # This means that we have a line with whitespace/comments at
                # the end, which just results in an endmarker.
                break
            initial = token[0]

            if new_line and initial not in "\r\n\\#":
                new_line = False
                if paren_level == 0 and not fstring_stack:
                    i = 0
                    indent_start = start
                    while line[i] == "\f":
                        i += 1
                        # TODO don't we need to change spos as well?
                        indent_start -= 1
                    if indent_start > indents[-1]:
                        if stashed is not None:
                            yield stashed
                            stashed = None
                        yield PythonToken(INDENT, "", spos, "")
                        indents.append(indent_start)
                    yield from dedent_if_necessary(indent_start)

            if initial in numchars or (  # ordinary number
                initial == "." and token != "." and token != "..."
            ):
                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(NUMBER, token, spos, prefix)
            elif pseudomatch.group(3) is not None:  # ordinary name
                if token in token_collection.always_break_tokens:
                    fstring_stack[:] = []
                    paren_level = 0
                    # We only want to dedent if the token is on a new line.
                    if re.match(r"[ \f\t]*$", line[:start]):
                        while True:
                            indent = indents.pop()
                            if indent > start:
                                if (
                                    async_def
                                    and async_def_newline
                                    and async_def_indent >= indent
                                ):
                                    # We dedented outside of an 'async def' block.
                                    async_def = False
                                    async_def_newline = False
                                    async_def_indent = 0
                                if stashed is not None:
                                    yield stashed
                                    stashed = None
                                yield PythonToken(DEDENT, "", spos, "")
                            else:
                                indents.append(indent)
                                break
                if str.isidentifier(token):
                    should_yield_identifier = True
                    if token in ("async", "await") and async_def:
                        # We're inside an 'async def' block; async/await are
                        # keyword tokens here.
                        if token == "async":
                            yield PythonToken(ASYNC, token, spos, prefix)
                        else:
                            yield PythonToken(AWAIT, token, spos, prefix)
                        should_yield_identifier = False

                    # We are possibly starting an 'async def' section
                    elif token == "async" and not stashed:
                        stashed = PythonToken(NAME, token, spos, prefix)
                        should_yield_identifier = False

                    # We actually are starting an 'async def' section
                    elif (
                        token == "def"
                        and stashed is not None
                        and stashed[0] is NAME
                        and stashed[1] == "async"
                    ):
                        async_def = True
                        async_def_indent = indents[-1]
                        yield PythonToken(ASYNC, stashed[1], stashed[2], stashed[3])
                        stashed = None

                    # We are either not stashed, or we output an ASYNC token above.
                    elif stashed:
                        yield stashed
                        stashed = None

                    # If we didn't bail early due to possibly recognizing an
                    # 'async def', then we should yield this token as normal.
                    if should_yield_identifier:
                        yield PythonToken(NAME, token, spos, prefix)
                else:
                    yield from _split_illegal_unicode_name(token, spos, prefix)
            elif initial in "\r\n":
                if any(not f.allow_multiline() for f in fstring_stack):
                    # Would use fstring_stack.clear, but that's not available
                    # in Python 2.
                    fstring_stack[:] = []

                if not new_line and paren_level == 0 and not fstring_stack:
                    if async_def:
                        async_def_newline = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield PythonToken(NEWLINE, token, spos, prefix)
                else:
                    additional_prefix = prefix + token
                    new_line = True
            elif initial == "#":  # Comments
                assert not token.endswith("\n")
                additional_prefix = prefix + token
            elif token in token_collection.triple_quoted:
                endprog = token_collection.endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:  # all on one line
                    pos = endmatch.end(0)
                    token = line[start:pos]
                    if stashed is not None:
                        yield stashed
                        stashed = None
                    yield PythonToken(STRING, token, spos, prefix)
                else:
                    contstr_start = (lnum, start)  # multiple lines
                    contstr = line[start:]
                    contline = line
                    break

            # Check up to the first 3 chars of the token to see if
            # they're in the single_quoted set. If so, they start
            # a string.
            # We're using the first 3, because we're looking for
            # "rb'" (for example) at the start of the token. If
            # we switch to longer prefixes, this needs to be
            # adjusted.
            # Note that initial == token[:1].
            # Also note that single quote checking must come after
            # triple quote checking (above).
            elif (
                initial in token_collection.single_quoted
                or token[:2] in token_collection.single_quoted
                or token[:3] in token_collection.single_quoted
            ):
                if token[-1] in "\r\n":  # continued string
                    # This means that a single quoted string ends with a
                    # backslash and is continued.
                    contstr_start = lnum, start
                    endprog = (
                        token_collection.endpats.get(initial)
                        or token_collection.endpats.get(token[1])
                        or token_collection.endpats.get(token[2])
                    )
                    contstr = line[start:]
                    contline = line
                    break
                else:  # ordinary string
                    if stashed is not None:
                        yield stashed
                        stashed = None
                    yield PythonToken(STRING, token, spos, prefix)
            elif (
                token in token_collection.fstring_pattern_map
            ):  # The start of an fstring.
                fstring_stack.append(
                    FStringNode(
                        token_collection.fstring_pattern_map[token],
                        "r" in token or "R" in token,
                    )
                )
                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(FSTRING_START, token, spos, prefix)
            elif initial == "\\" and line[start:] in (
                "\\\n",
                "\\\r\n",
                "\\\r",
            ):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
            else:
                if token in "([{":
                    if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
                    else:
                        paren_level += 1
                elif token in ")]}":
                    if fstring_stack:
                        fstring_stack[-1].close_parentheses(token)
                    else:
                        if paren_level:
                            paren_level -= 1
                elif (
                    token == ":"
                    and fstring_stack
                    and fstring_stack[-1].parentheses_count
                    - fstring_stack[-1].format_spec_count
                    == 1
                ):
                    fstring_stack[-1].format_spec_count += 1

                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith("\n") or contstr.endswith("\r"):
            new_line = True

    if stashed is not None:
        yield stashed
        stashed = None

    end_pos = lnum, max
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        yield PythonToken(DEDENT, "", end_pos, "")
    yield PythonToken(ENDMARKER, "", end_pos, additional_prefix)
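

# A sketch of why the pre-3.7 path needs the `stashed` bookkeeping above
# (illustrative, not from the original source): on 3.6 grammars, `async` is
# only a keyword when it starts an `async def`, so the token is held back
# until the following token is seen.
#
#     # "async def f(): ..."  -> ASYNC 'async', NAME 'def', NAME 'f', ...
#     # "async = 1"           -> NAME 'async', OP '=', NUMBER '1', ...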


def _tokenize_lines_py37_or_above(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    token_collection: TokenCollection,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    """
    A heavily modified Python standard library tokenizer.

    In addition to the default information, it also yields the prefix of each
    token. This idea comes from lib2to3. The prefix contains all information
    that is irrelevant for the parser, like newlines in parentheses or comments.
    """

    def dedent_if_necessary(start):
        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, "", (lnum, 0), "")
                break
            yield PythonToken(DEDENT, "", spos, "")
            indents.pop()

    paren_level = 0  # count parentheses
    indents = [0]
    max = 0
    numchars = "0123456789"
    contstr = ""
    contline = None
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ""  # Should never be required, but here for safety
    endprog = None  # Should not be required, but here for lint
    contstr_start: Optional[Tuple[int, int]] = None
    additional_prefix = ""
    first = True
    lnum = start_pos[0] - 1
    fstring_stack = []
    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max = len(line)
        if first:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max = len(line)

            # Fake that the part before was already parsed.
            line = "^" * start_pos[1] + line
            pos = start_pos[1]
            max += start_pos[1]

            first = False

        if contstr:  # continued string
            if endprog is None:
                raise Exception("Logic error!")
            endmatch = endprog.match(line)
            if endmatch:
                pos = endmatch.end(0)
                if contstr_start is None:
                    raise Exception("Logic error!")
                yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix)
                contstr = ""
                contline = None
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        while pos < max:
            if fstring_stack:
                tos = fstring_stack[-1]
                if not tos.is_in_expr():
                    string, pos = _find_fstring_string(
                        token_collection.endpats, fstring_stack, line, lnum, pos
                    )
                    if string:
                        yield PythonToken(
                            FSTRING_STRING,
                            string,
                            tos.last_string_start_pos,
                            # Never has a prefix because it can start anywhere and
                            # include whitespace.
                            prefix="",
                        )
                        tos.previous_lines = ""
                        continue
                    if pos == max:
                        break

                rest = line[pos:]
                (
                    fstring_end_token,
                    additional_prefix,
                    quote_length,
                ) = _close_fstring_if_necessary(
                    fstring_stack, rest, (lnum, pos), additional_prefix
                )
                pos += quote_length
                if fstring_end_token is not None:
                    yield fstring_end_token
                    continue

            pseudomatch = token_collection.pseudo_token.match(line, pos)
            if not pseudomatch:  # scan for tokens
                match = token_collection.whitespace.match(line, pos)
                if pos == 0:
                    # pyre-fixme[16]: `Optional` has no attribute `end`.
                    for t in dedent_if_necessary(match.end()):
                        yield t
                pos = match.end()
                new_line = False
                yield PythonToken(
                    ERRORTOKEN,
                    line[pos],
                    (lnum, pos),
                    # pyre-fixme[16]: `Optional` has no attribute `group`.
                    additional_prefix + match.group(0),
                )
                additional_prefix = ""
                pos += 1
                continue

            prefix = additional_prefix + pseudomatch.group(1)
            additional_prefix = ""
            start, pos = pseudomatch.span(2)
            spos = (lnum, start)
            token = pseudomatch.group(2)
            if token == "":
                assert prefix
                additional_prefix = prefix
                # This means that we have a line with whitespace/comments at
                # the end, which just results in an endmarker.
                break
            initial = token[0]

            if new_line and initial not in "\r\n\\#":
                new_line = False
                if paren_level == 0 and not fstring_stack:
                    i = 0
                    indent_start = start
                    while line[i] == "\f":
                        i += 1
                        # TODO don't we need to change spos as well?
                        indent_start -= 1
                    if indent_start > indents[-1]:
                        yield PythonToken(INDENT, "", spos, "")
                        indents.append(indent_start)
                    for t in dedent_if_necessary(indent_start):
                        yield t

            if initial in numchars or (  # ordinary number
                initial == "." and token != "." and token != "..."
            ):
                yield PythonToken(NUMBER, token, spos, prefix)
            elif pseudomatch.group(3) is not None:  # ordinary name
                if token in token_collection.always_break_tokens:
                    fstring_stack[:] = []
                    paren_level = 0
                    # We only want to dedent if the token is on a new line.
                    if re.match(r"[ \f\t]*$", line[:start]):
                        while True:
                            indent = indents.pop()
                            if indent > start:
                                yield PythonToken(DEDENT, "", spos, "")
                            else:
                                indents.append(indent)
                                break
                if str.isidentifier(token):
                    # py37 doesn't need special tokens for async/await, and we could
                    # emit NAME, but then we'd need different grammar for py36 and py37.
                    if token == "async":
                        yield PythonToken(ASYNC, token, spos, prefix)
                    elif token == "await":
                        yield PythonToken(AWAIT, token, spos, prefix)
                    else:
                        yield PythonToken(NAME, token, spos, prefix)
                else:
                    for t in _split_illegal_unicode_name(token, spos, prefix):
                        yield t  # yield from Python 2
            elif initial in "\r\n":
                if any(not f.allow_multiline() for f in fstring_stack):
                    # Would use fstring_stack.clear, but that's not available
                    # in Python 2.
                    fstring_stack[:] = []

                if not new_line and paren_level == 0 and not fstring_stack:
                    yield PythonToken(NEWLINE, token, spos, prefix)
                else:
                    additional_prefix = prefix + token
                    new_line = True
            elif initial == "#":  # Comments
                assert not token.endswith("\n")
                additional_prefix = prefix + token
            elif token in token_collection.triple_quoted:
                endprog = token_collection.endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:  # all on one line
                    pos = endmatch.end(0)
                    token = line[start:pos]
                    yield PythonToken(STRING, token, spos, prefix)
                else:
                    contstr_start = (lnum, start)  # multiple lines
                    contstr = line[start:]
                    contline = line
                    break

            # Check up to the first 3 chars of the token to see if
            # they're in the single_quoted set. If so, they start
            # a string.
            # We're using the first 3, because we're looking for
            # "rb'" (for example) at the start of the token. If
            # we switch to longer prefixes, this needs to be
            # adjusted.
            # Note that initial == token[:1].
            # Also note that single quote checking must come after
            # triple quote checking (above).
            elif (
                initial in token_collection.single_quoted
                or token[:2] in token_collection.single_quoted
                or token[:3] in token_collection.single_quoted
            ):
                if token[-1] in "\r\n":  # continued string
                    # This means that a single quoted string ends with a
                    # backslash and is continued.
                    contstr_start = lnum, start
                    endprog = (
                        token_collection.endpats.get(initial)
                        or token_collection.endpats.get(token[1])
                        or token_collection.endpats.get(token[2])
                    )
                    contstr = line[start:]
                    contline = line
                    break
                else:  # ordinary string
                    yield PythonToken(STRING, token, spos, prefix)
            elif (
                token in token_collection.fstring_pattern_map
            ):  # The start of an fstring.
                fstring_stack.append(
                    FStringNode(
                        token_collection.fstring_pattern_map[token],
                        "r" in token or "R" in token,
                    )
                )
                yield PythonToken(FSTRING_START, token, spos, prefix)
            elif initial == "\\" and line[start:] in (
                "\\\n",
                "\\\r\n",
                "\\\r",
            ):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
            else:
                if token in "([{":
                    if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
                    else:
                        paren_level += 1
                elif token in ")]}":
                    if fstring_stack:
                        fstring_stack[-1].close_parentheses(token)
                    else:
                        if paren_level:
                            paren_level -= 1
                elif (
                    token == ":"
                    and fstring_stack
                    and fstring_stack[-1].parentheses_count
                    - fstring_stack[-1].format_spec_count
                    == 1
                ):
                    fstring_stack[-1].format_spec_count += 1

                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith("\n") or contstr.endswith("\r"):
            new_line = True

    end_pos = lnum, max
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        yield PythonToken(DEDENT, "", end_pos, "")
    yield PythonToken(ENDMARKER, "", end_pos, additional_prefix)
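

# Note (editorial sketch, not from the original source): on 3.7+ grammars
# `async` and `await` are proper keywords, so the branch above emits
# ASYNC/AWAIT unconditionally and no `stashed` token bookkeeping is needed;
# e.g. "async = 1" starts with an ASYNC token on this path rather than a NAME.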


def _split_illegal_unicode_name(
    token: str, start_pos: Tuple[int, int], prefix: str
) -> Generator[PythonToken, None, None]:
    def create_token():
        return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix)

    found = ""
    is_illegal = False
    pos = start_pos
    for i, char in enumerate(token):
        if is_illegal:
            if str.isidentifier(char):
                yield create_token()
                found = char
                is_illegal = False
                prefix = ""
                pos = start_pos[0], start_pos[1] + i
            else:
                found += char
        else:
            new_found = found + char
            if str.isidentifier(new_found):
                found = new_found
            else:
                if found:
                    yield create_token()
                    prefix = ""
                    pos = start_pos[0], start_pos[1] + i
                found = char
                is_illegal = True

    if found:
        yield create_token()
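

# Illustrative sketch of the splitting behaviour above (not from the original
# source): characters that cannot continue an identifier are grouped into
# ERRORTOKEN chunks while the legal pieces stay NAME tokens.
#
#     list(_split_illegal_unicode_name("ab§cd", (1, 0), ""))
#     # -> roughly: NAME 'ab' at (1, 0), ERRORTOKEN '§' at (1, 2),
#     #    NAME 'cd' at (1, 3)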