Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/tomlkit/parser.py: 97%

1from __future__ import annotations

3import datetime

4import re

5import string

7from typing import Any

8from typing import Callable

10from tomlkit._compat import decode

11from tomlkit._utils import RFC_3339_LOOSE

12from tomlkit._utils import _escaped

13from tomlkit._utils import parse_rfc3339

14from tomlkit.container import Container

15from tomlkit.exceptions import EmptyKeyError

16from tomlkit.exceptions import EmptyTableNameError

17from tomlkit.exceptions import InternalParserError

18from tomlkit.exceptions import InvalidCharInStringError

19from tomlkit.exceptions import InvalidControlChar

20from tomlkit.exceptions import InvalidDateError

21from tomlkit.exceptions import InvalidDateTimeError

22from tomlkit.exceptions import InvalidNumberError

23from tomlkit.exceptions import InvalidTimeError

24from tomlkit.exceptions import InvalidUnicodeValueError

25from tomlkit.exceptions import ParseError

26from tomlkit.exceptions import UnexpectedCharError

27from tomlkit.exceptions import UnexpectedEofError

28from tomlkit.items import AoT

29from tomlkit.items import Array

30from tomlkit.items import Bool

31from tomlkit.items import BoolType

32from tomlkit.items import Comment

33from tomlkit.items import Date

34from tomlkit.items import DateTime

35from tomlkit.items import Float

36from tomlkit.items import InlineTable

37from tomlkit.items import Integer

38from tomlkit.items import Item

39from tomlkit.items import Key

40from tomlkit.items import KeyType

41from tomlkit.items import Null

42from tomlkit.items import SingleKey

43from tomlkit.items import String

44from tomlkit.items import StringType

45from tomlkit.items import Table

46from tomlkit.items import Time

47from tomlkit.items import Trivia

48from tomlkit.items import Whitespace

49from tomlkit.source import Source

50from tomlkit.source import _StateHandler

51from tomlkit.toml_document import TOMLDocument

# Code points of the control characters the parser treats specially.
CTRL_I = ord("\t")  # Tab
CTRL_J = ord("\n")  # Line feed
CTRL_M = ord("\r")  # Carriage return
CTRL_CHAR_LIMIT = 0x1F  # Highest code point of the C0 control range
CHR_DEL = 0x7F  # DEL

# TOML character classes, kept as frozensets so that membership tests are O(1).
# They also serve as the stop-sets handed to Source.advance_while /
# Source.advance_until, which scan a whole run of characters in one call
# instead of looping `while self._current in <set> and self.inc()`.
_SPACES = frozenset({" ", "\t"})
_NL = frozenset({"\n", "\r"})
_WS = frozenset(_SPACES | _NL)
_KV = _SPACES | {"="}
_BARE_KEY_OR_SPACE = (
    _SPACES | frozenset(string.ascii_letters) | frozenset(string.digits) | {"-", "_"}
)
_NUM_STOP = _WS | frozenset("#,]}")
# Same as _NUM_STOP, except that a plain space does not end the scan.
_DATE_TAIL_STOP = _NUM_STOP - {" "}
# Control characters that are invalid inside a single-line string: DEL plus
# everything up to CTRL_CHAR_LIMIT except tab. This is the same set for which
# the per-character string loop raises InvalidControlChar. The single-line
# fast-path stops its bulk scan at the first delimiter / backslash / control
# character and lets the main loop deal with that character.
_CTRL_SINGLE = (
    frozenset(map(chr, range(CTRL_CHAR_LIMIT + 1))) - {chr(CTRL_I)} | {chr(CHR_DEL)}
)
_SINGLE_LITERAL_STOP = _CTRL_SINGLE | {"'"}  # literal: only the closing quote
_SINGLE_BASIC_STOP = _CTRL_SINGLE | {'"', "\\"}  # basic: quote or escape

81class Parser:

82 """

83 Parser for TOML documents.

84 """

# Deeply nested documents would overflow the interpreter stack: arrays and
# inline tables are parsed recursively, and every fragment of a dotted key
# adds a level of nested containers. Refuse documents beyond this depth.
# The same limit is applied to the number of fragments in a dotted key and
# to the recursion depth of arrays / inline tables.
MAX_NESTING_DEPTH = 100

def __init__(self, string: str | bytes) -> None:
    """
    Prepares a parser for the given TOML input.

    The input is passed through ``decode`` and wrapped in a ``Source``,
    which every other method of the parser reads from.
    """
    # Current recursion depth of arrays / inline tables (see _parse_nested)
    self._nesting_depth = 0

    self._aot_stack: list[Key] = []

    # Input to parse
    text = decode(string)
    self._src = Source(text)

98 @property

99 def _state(self) -> _StateHandler:

100 return self._src.state

101

102 @property

103 def _idx(self) -> int:

104 return self._src.idx

105

106 @property

107 def _current(self) -> str:

108 return self._src.current

109

110 @property

111 def _marker(self) -> int:

112 return self._src.marker

113

114 def extract(self) -> str:

115 """

116 Extracts the value between marker and index

117 """

118 return self._src.extract()

119

120 def inc(self, exception: type[ParseError] | None = None) -> bool:

121 """

122 Increments the parser if the end of the input has not been reached.

123 Returns whether or not it was able to advance.

124 """

125 return self._src.inc(exception=exception)

126

127 def inc_n(self, n: int, exception: type[ParseError] | None = None) -> bool:

128 """

129 Increments the parser by n characters

130 if the end of the input has not been reached.

131 """

132 return self._src.inc_n(n=n, exception=exception)

133

134 def consume(self, chars: str, min: int = 0, max: int = -1) -> None:

135 """

136 Consume chars until min/max is satisfied is valid.

137 """

138 return self._src.consume(chars=chars, min=min, max=max)

139

140 def end(self) -> bool:

141 """

142 Returns True if the parser has reached the end of the input.

143 """

144 return self._src.end()

145

146 def mark(self) -> None:

147 """

148 Sets the marker to the index's current position

149 """

150 self._src.mark()

151

def parse_error(
    self,
    exception: type[ParseError] = ParseError,
    *args: Any,
    **kwargs: Any,
) -> ParseError:
    """
    Builds (but does not raise) a parse error through the underlying
    source; the exception type and any extra arguments are forwarded
    unchanged.
    """
    source = self._src
    return source.parse_error(exception, *args, **kwargs)

162

def parse(self) -> TOMLDocument:
    """
    Parses the whole input and returns the resulting document.

    Items found before the first table header are appended first, then
    every table (or array of tables) until the input is exhausted. Any
    exception raised while appending to the document is re-raised as a
    ParseError carrying the original message.
    """
    document = TOMLDocument(True)

    # Everything that precedes the first table / AoT header.
    while not self.end() and self._current != "[":
        parsed = self._parse_item()
        if not parsed:
            break

        key, value = parsed
        is_dotted = key is not None and key.is_multi()
        # Whitespace that could be merged into the previous whitespace
        # item needs no append of its own.
        if is_dotted or not self._merge_ws(value, document):
            try:
                document.append(key, value)
            except Exception as exc:
                raise self.parse_error(ParseError, str(exc)) from exc

        self.mark()

    # Tables and arrays of tables.
    while not self.end():
        key, value = self._parse_table()
        starts_aot = isinstance(value, Table) and value.is_aot_element()
        if starts_aot:
            # This is just the first table in an AoT. Parse the rest of
            # the array along with it.
            value = self._parse_aot(value, key)

        try:
            document.append(key, value)
        except Exception as exc:
            raise self.parse_error(ParseError, str(exc)) from exc

    document.parsing(False)

    return document

202

def _merge_ws(self, item: Item, container: Container) -> bool:
    """
    Folds the given Item into the last entry of the given Container when
    both of them are whitespace items.

    The merged whitespace is re-read from the source, spanning the
    combined length of both items and ending at the current index.

    Returns True if the items were merged.
    """
    previous = container.last_item()
    if not previous:
        return False

    if not (isinstance(item, Whitespace) and isinstance(previous, Whitespace)):
        return False

    stop = self._idx
    begin = stop - (len(previous.s) + len(item.s))
    last_key = container.body[-1][0]
    container.body[-1] = (last_key, Whitespace(self._src[begin:stop]))

    return True

224

225 def _is_child(self, parent: Key, child: Key) -> bool:

226 """

227 Returns whether a key is strictly a child of another key.

228 AoT siblings are not considered children of one another.

229 """

230 parent_parts = tuple(parent)

231 child_parts = tuple(child)

232

233 if parent_parts == child_parts:

234 return False

235

236 return parent_parts == child_parts[: len(parent_parts)]

237

def _parse_item(self) -> tuple[Key | None, Item] | None:
    """
    Attempts to parse the next item and returns it, along with its key
    if the item is value-like.

    Returns a (None, Whitespace) or (None, Comment) pair for trivia, the
    result of _parse_key_value() for a key/value pair, and None when a
    table header ("[") is reached, which is left for the caller to parse.

    Raises an InvalidControlChar parse error when a carriage return is not
    immediately followed by a line feed.
    """
    self.mark()
    with self._state as state:
        while True:
            c = self._current
            if c == "\n":
                # Found a newline; Return all whitespace found up to this point.
                self.inc()

                return None, Whitespace(self.extract())
            elif c in " \t\r":
                if c == "\r":
                    # Look one character ahead inside a restoring state
                    # scope: a bare "\r" (not followed by "\n") is rejected.
                    with self._state(restore=True):
                        if not self.inc() or self._current != "\n":
                            raise self.parse_error(
                                InvalidControlChar, CTRL_M, "documents"
                            )
                # Skip whitespace.
                if not self.inc():
                    # Input ended while still inside whitespace.
                    return None, Whitespace(self.extract())
            elif c == "#":
                # Found a comment, parse it
                indent = self.extract()
                cws, comment, trail = self._parse_comment_trail()

                return None, Comment(Trivia(indent, cws, comment, trail))
            elif c == "[":
                # Found a table, delegate to the calling function.
                return None
            else:
                # Beginning of a KV pair.
                # Return to beginning of whitespace so it gets included
                # as indentation for the KV about to be parsed.
                state.restore = True
                break

    # Only reached through the `break` above, once the state scope has exited.
    return self._parse_key_value(True)

279

def _parse_comment_trail(self, parse_trail: bool = True) -> tuple[str, str, str]:
    """
    Returns (comment_ws, comment, trail)
    If there is no comment, comment_ws and comment will
    simply be empty.

    When parse_trail is False no trailing whitespace / newline is
    consumed and trail is returned empty.

    Raises an InvalidControlChar parse error for a control character
    inside the comment or for a carriage return that is not followed by
    a line feed, and an UnexpectedCharError parse error for anything other
    than whitespace, "#" or a newline before the comment.
    """
    if self.end():
        return "", "", ""

    comment = ""
    comment_ws = ""
    self.mark()

    while True:
        c = self._current

        if c == "\n":
            break
        elif c == "#":
            # Everything between the marker and the "#" is the whitespace
            # that separates the comment from what precedes it.
            comment_ws = self.extract()

            self.mark()
            self.inc()  # Skip #

            # The comment itself
            while not self.end() and self._current not in _NL:
                code = ord(self._current)
                # DEL and every control character except tab are rejected.
                if code == CHR_DEL or (code <= CTRL_CHAR_LIMIT and code != CTRL_I):
                    raise self.parse_error(InvalidControlChar, code, "comments")

                if not self.inc():
                    break

            comment = self.extract()
            self.mark()

            break
        elif c in " \t\r":
            if c == "\r":
                # Peek one character ahead; the position is restored when
                # the scope exits.
                with self._state(restore=True):
                    if not self.inc() or self._current != "\n":
                        raise self.parse_error(
                            InvalidControlChar, CTRL_M, "comments"
                        )
            self.inc()
        else:
            raise self.parse_error(UnexpectedCharError, c)

        if self.end():
            break

    trail = ""
    if parse_trail:
        # Trailing spaces / tabs are skipped in one bulk scan.
        self._src.advance_while(_SPACES)

        if self._current == "\r":
            with self._state(restore=True):
                if not self.inc() or self._current != "\n":
                    raise self.parse_error(InvalidControlChar, CTRL_M, "documents")
            self.inc()

        if self._current == "\n":
            self.inc()

        # Extract the trail if anything was consumed since the last mark,
        # or if the parser still sits on a whitespace character.
        if self._idx != self._marker or self._current in _WS:
            trail = self.extract()

    return comment_ws, comment, trail

348

def _parse_key_value(self, parse_comment: bool = False) -> tuple[Key, Item]:
    """
    Parses one `key = value` pair and returns the key with its value.

    The leading indent is stored in the trivia of the value. When
    parse_comment is True the comment and trailing whitespace after the
    value are parsed into that trivia as well; otherwise the trail of the
    value is set to an empty string.

    Raises an UnexpectedCharError parse error when the "=" is missing or
    appears more than once.
    """
    # Leading indent
    self.mark()
    self._src.advance_while(_SPACES)
    indent = self.extract()

    # Key
    key = self._parse_key()

    # Separator: spaces / tabs around exactly one "="
    self.mark()
    seen_equals = self._current == "="
    while self._current in _KV and self.inc():
        if self._current != "=":
            continue
        if seen_equals:
            raise self.parse_error(UnexpectedCharError, "=")
        seen_equals = True

    if not seen_equals:
        raise self.parse_error(UnexpectedCharError, self._current)

    separator = self.extract()
    key.sep = key.sep + separator if key.sep else separator

    # Value
    val = self._parse_value()

    # Comment
    if not parse_comment:
        val.trivia.trail = ""
    else:
        cws, comment, trail = self._parse_comment_trail()
        meta = val.trivia
        # Keep any comment whitespace the value parser already recorded.
        if not meta.comment_ws:
            meta.comment_ws = cws

        meta.comment = comment
        meta.trail = trail

    val.trivia.indent = indent

    return key, val

394

def _parse_key(self) -> Key:
    """
    Parses a (possibly dotted) Key at the current position;
    WS before the key must be exhausted first at the callsite.

    Raises a ParseError when the key has more than MAX_NESTING_DEPTH
    fragments.
    """
    limit = self.MAX_NESTING_DEPTH
    result = self._parse_simple_key()
    count = 1

    while True:
        if self._current != ".":
            return result

        count += 1
        if count > limit:
            raise self.parse_error(
                ParseError,
                f"TOML key nested more than {limit} levels deep",
            )

        self.inc()  # Skip the dot
        fragment = self._parse_simple_key()
        result = result.concat(fragment)

413

def _parse_simple_key(self) -> Key:
    """
    Parses one key fragment (no dots), dispatching on whether it is quoted.
    """
    self.mark()
    # Leading spaces / tabs are skipped in one bulk scan; they stay
    # between the marker and the index for the fragment parser to extract.
    self._src.advance_while(_SPACES)

    is_quoted = self._current in "\"'"
    parse_fragment = self._parse_quoted_key if is_quoted else self._parse_bare_key
    return parse_fragment()

425

def _parse_quoted_key(self) -> Key:
    """
    Parses a key fragment wrapped in single or double quotes.

    Raises an UnexpectedCharError parse error when the quoted string turns
    out to be a multi-line string.
    """
    # Whitespace skipped by the caller since the last mark
    leading_ws = self.extract()
    quote = self._current

    for candidate in KeyType:
        if candidate.value == quote:
            key_type = candidate
            break
    else:
        raise RuntimeError("Should not have entered _parse_quoted_key()")

    if key_type == KeyType.Basic:
        string_type = StringType.SLB
    else:
        string_type = StringType.SLL

    parsed = self._parse_string(string_type)
    if parsed._t.is_multiline():
        raise self.parse_error(UnexpectedCharError, parsed._t.value)
    quoted = parsed.as_string()

    # Whitespace following the closing quote belongs to the key as well
    self.mark()
    self._src.advance_while(_SPACES)
    trailing_ws = self.extract()

    original = leading_ws + quoted + trailing_ws

    return SingleKey(str(parsed), t=key_type, sep="", original=original)

449

def _parse_bare_key(self) -> Key:
    """
    Parses an unquoted key fragment.

    Raises an EmptyKeyError parse error when nothing but whitespace was
    found, and a ParseError when whitespace appears inside the key.
    """
    # Bulk-scan key characters together with any surrounding spaces / tabs
    self._src.advance_while(_BARE_KEY_OR_SPACE)

    original = self.extract()
    name = original.strip()

    if not name:
        # Empty key
        raise self.parse_error(EmptyKeyError)

    if any(blank in name for blank in (" ", "\t")):
        # Bare key with whitespace in it
        raise self.parse_error(ParseError, f'Invalid key "{name}"')

    return SingleKey(name, KeyType.Bare, "", original)

467

def _parse_value(self) -> Item:
    """
    Attempts to parse a value at the current position, dispatching on its
    first character.

    Raises a parse error of type InvalidNumberError, InvalidDateError,
    InvalidDateTimeError or InvalidTimeError for malformed numbers and
    dates, and UnexpectedCharError when no value can start here.
    """
    self.mark()
    first = self._current
    trivia = Trivia()

    def make_datetime(moment: datetime.datetime, text: str) -> DateTime:
        # Builds the DateTime item for a parsed datetime and its raw text.
        return DateTime(
            moment.year,
            moment.month,
            moment.day,
            moment.hour,
            moment.minute,
            moment.second,
            moment.microsecond,
            moment.tzinfo,
            trivia,
            text,
        )

    if first == StringType.SLB.value:
        return self._parse_basic_string()

    if first == StringType.SLL.value:
        return self._parse_literal_string()

    if first == BoolType.TRUE.value[0]:
        return self._parse_true()

    if first == BoolType.FALSE.value[0]:
        return self._parse_false()

    if first == "[":
        return self._parse_nested(self._parse_array)

    if first == "{":
        return self._parse_nested(self._parse_inline_table)

    if first in "+-" or self._peek(4) in {
        "+inf",
        "-inf",
        "inf",
        "+nan",
        "-nan",
        "nan",
    }:
        # Number
        self._src.advance_until(_NUM_STOP)

        number = self._parse_number(self.extract(), trivia)
        if number is None:
            raise self.parse_error(InvalidNumberError)

        return number

    if first not in string.digits:
        raise self.parse_error(UnexpectedCharError, first)

    # Integer, Float, Date, Time or DateTime
    self._src.advance_until(_NUM_STOP)

    raw = self.extract()

    matched = RFC_3339_LOOSE.match(raw)
    if matched:
        date_text = matched.group("date")
        time_text = matched.group("time")

        if date_text and time_text:
            # datetime
            try:
                moment = parse_rfc3339(raw)
                assert isinstance(moment, datetime.datetime)
                return make_datetime(moment, raw)
            except ValueError:
                raise self.parse_error(InvalidDateTimeError) from None

        if date_text:
            try:
                day = parse_rfc3339(raw)
                assert isinstance(day, datetime.date)
                date_item = Date(day.year, day.month, day.day, trivia, raw)

                # The number scan stops at a space, so a time part may
                # still follow the date; scan on to the end of the value.
                self.mark()
                self._src.advance_until(_DATE_TAIL_STOP)

                tail = self.extract()
                time_part = tail.rstrip()
                trivia.comment_ws = tail[len(time_part) :]
                if not time_part:
                    return date_item

                combined = raw + time_part
                moment = parse_rfc3339(combined)
                assert isinstance(moment, datetime.datetime)
                return make_datetime(moment, combined)
            except ValueError:
                raise self.parse_error(InvalidDateError) from None

        if time_text:
            try:
                clock = parse_rfc3339(raw)
                assert isinstance(clock, datetime.time)
                return Time(
                    clock.hour,
                    clock.minute,
                    clock.second,
                    clock.microsecond,
                    clock.tzinfo,
                    trivia,
                    raw,
                )
            except ValueError:
                raise self.parse_error(InvalidTimeError) from None

    number = self._parse_number(raw, trivia)
    if number is None:
        raise self.parse_error(InvalidNumberError)

    return number

588

def _parse_true(self) -> Bool:
    """
    Parses the boolean literal described by BoolType.TRUE.
    """
    style = BoolType.TRUE
    return self._parse_bool(style)

591

def _parse_false(self) -> Bool:
    """
    Parses the boolean literal described by BoolType.FALSE.
    """
    style = BoolType.FALSE
    return self._parse_bool(style)

594

def _parse_bool(self, style: BoolType) -> Bool:
    """
    Parses a boolean of the given style inside a state scope.

    Every element yielded by iterating the style must be consumed
    exactly once, in order.
    """
    with self._state:
        kind = BoolType(style)

        # only keep parsing for bool if the characters match the style
        for expected in kind:
            self.consume(expected, min=1, max=1)

        return Bool(kind, Trivia())

605

606 def _parse_nested(self, parse: Callable[[], Item]) -> Item:

607 """

608 Parses an array or inline table, enforcing the nesting depth limit.

609 """

610 self._nesting_depth += 1

611 if self._nesting_depth > self.MAX_NESTING_DEPTH:

612 raise self.parse_error(

613 ParseError,

614 f"TOML value nested more than {self.MAX_NESTING_DEPTH} levels deep",

615 )

616 try:

617 return parse()

618 finally:

619 self._nesting_depth -= 1

620

def _parse_array(self) -> Array:
    """
    Parses an array, starting on its opening bracket.

    Whitespace, comments and commas are kept as elements next to the
    values. Raises an UnexpectedCharError parse error for anything that
    does not belong where it appears, and a ParseError when the Array
    cannot be built from the collected elements.
    """
    # Consume opening bracket, EOF here is an issue (middle of array)
    self.inc(exception=UnexpectedEofError)

    items: list[Item] = []
    have_value = False
    while True:
        # consume whitespace
        start = self._idx
        self.consume(" \t\n\r")
        indent = self._src[start : self._idx]
        if any(ch in _NL for ch in indent):
            items.append(Whitespace(indent))
            continue

        # consume comment
        if self._current == "#":
            cws, comment, trail = self._parse_comment_trail(parse_trail=False)
            items.append(Comment(Trivia(indent, cws, comment, trail)))
            continue

        # consume indent
        if indent:
            items.append(Whitespace(indent))
            continue

        # consume value
        if not have_value:
            try:
                value = self._parse_value()
            except UnexpectedCharError:
                # Not a value; it may still be a closing bracket.
                pass
            else:
                items.append(value)
                have_value = True
                continue

        # consume comma
        if have_value and self._current == ",":
            self.inc(exception=UnexpectedEofError)
            # If the previous item is Whitespace, add to it
            last = items[-1]
            if isinstance(last, Whitespace):
                last._s = last.s + ","
            else:
                items.append(Whitespace(","))
            have_value = False
            continue

        # consume closing bracket
        if self._current == "]":
            # consume closing bracket, EOF here doesn't matter
            self.inc()
            break

        raise self.parse_error(UnexpectedCharError, self._current)

    try:
        array = Array(items, Trivia())
    except ValueError:
        pass
    else:
        return array

    raise self.parse_error(ParseError, "Failed to parse array")

684

def _parse_inline_table(self) -> InlineTable:
    """
    Parses an inline table, starting on its opening brace.

    Whitespace, comments and commas are stored in the container next to
    the key/value pairs. Raises an UnexpectedCharError parse error when a
    comma shows up where a key is expected, or when something other than
    a comma follows a key/value pair.
    """
    # consume opening brace, EOF here is an issue (middle of inline table)
    self.inc(exception=UnexpectedEofError)

    body = Container(True)
    want_key = True
    while True:
        # consume whitespace, newlines and comments
        while True:
            start = self._idx
            self.consume(" \t\n\r")
            blank = self._src[start : self._idx]
            if blank:
                body.add(Whitespace(blank))

            if self._current != "#":
                break

            cws, comment, trail = self._parse_comment_trail(parse_trail=False)
            body.add(Comment(Trivia("", cws, comment, trail)))

        ch = self._current

        if ch == "}":
            # consume closing brace, EOF here doesn't matter
            self.inc()
            break

        if want_key:
            if ch == ",":
                raise self.parse_error(UnexpectedCharError, ch)
            key, val = self._parse_key_value(False)
            body.add(key, val)
            want_key = False
        elif ch == ",":
            body.add(Whitespace(","))
            # consume comma, EOF here is an issue (middle of inline table)
            self.inc(exception=UnexpectedEofError)
            want_key = True
        else:
            raise self.parse_error(UnexpectedCharError, ch)

    return InlineTable(body, Trivia())

728

729 def _parse_number(self, raw: str, trivia: Trivia) -> Item | None:

730 # Leading zeros are not allowed

731 sign = ""

732 if raw.startswith(("+", "-")):

733 sign = raw[0]

734 raw = raw[1:]

735

736 if len(raw) > 1 and (

737 (raw.startswith("0") and not raw.startswith(("0.", "0o", "0x", "0b", "0e")))

738 or (sign and raw.startswith("."))

739 ):

740 return None

741

742 if raw.startswith(("0o", "0x", "0b")) and sign:

743 return None

744

745 digits = "[0-9]"

746 base = 10

747 if raw.startswith("0b"):

748 digits = "[01]"

749 base = 2

750 elif raw.startswith("0o"):

751 digits = "[0-7]"

752 base = 8

753 elif raw.startswith("0x"):

754 digits = "[0-9a-f]"

755 base = 16

756

757 # Underscores should be surrounded by digits

758 clean = re.sub(f"(?i)(?<={digits})_(?={digits})", "", raw).lower()

759

760 if "_" in clean:

761 return None

762

763 if clean.endswith(".") or (

764 not clean.startswith("0x") and clean.split("e", 1)[0].endswith(".")

765 ):

766 return None

767

768 try:

769 return Integer(int(sign + clean, base), trivia, sign + raw)

770 except ValueError:

771 try:

772 return Float(float(sign + clean), trivia, sign + raw)

773 except ValueError:

774 return None

775

def _parse_literal_string(self) -> String:
    """
    Parses a string opened by a single quote, inside a state scope.
    """
    kind = StringType.SLL
    with self._state:
        return self._parse_string(kind)

779

def _parse_basic_string(self) -> String:
    """
    Parses a string opened by a double quote, inside a state scope.
    """
    kind = StringType.SLB
    with self._state:
        return self._parse_string(kind)

783

def _parse_escaped_char(self, multiline: bool) -> str:
    """
    Parses the character(s) following a backslash and returns the text
    the escape sequence stands for.

    Raises a parse error of type InvalidCharInStringError for an unknown
    escape (or a line-ending backslash followed by text on the same
    line), and InvalidUnicodeValueError for a malformed unicode / hex
    escape.
    """
    if multiline and self._current in _WS:
        # When the last non-whitespace character on a line is
        # a \, it will be trimmed along with all whitespace
        # (including newlines) up to the next non-whitespace
        # character or closing delimiter.
        # """\
        #     hello \
        #     world"""
        skipped = ""
        while self._current in _WS:
            skipped += self._current
            # consume the whitespace, EOF here is an issue
            # (middle of string)
            self.inc(exception=UnexpectedEofError)

        # the escape followed by whitespace must have a newline
        # before any other chars
        if "\n" in skipped:
            return ""

        raise self.parse_error(InvalidCharInStringError, self._current)

    if self._current in _escaped:
        replacement = _escaped[self._current]

        # consume this char, EOF here is an issue (middle of string)
        self.inc(exception=UnexpectedEofError)

        return replacement

    if self._current in {"u", "U"}:
        # this needs to be a unicode
        char, digits = self._peek_unicode(self._current == "U")
        if char is None:
            raise self.parse_error(InvalidUnicodeValueError)

        assert digits is not None
        # consume the U char and the unicode value
        self.inc_n(len(digits) + 1)

        return char

    if self._current == "x":
        char, digits = self._peek_hex()
        if char is None:
            raise self.parse_error(InvalidUnicodeValueError)

        assert digits is not None
        # consume the x char and the hex value
        self.inc_n(len(digits) + 1)

        return char

    raise self.parse_error(InvalidCharInStringError, self._current)

839

def _parse_string(self, delim: StringType) -> String:
    """
    Parses a string of the given type, starting on its opening delimiter.

    Three delimiters in a row switch delim to its multi-line counterpart
    (through delim.toggle()); two delimiters followed by anything else
    form an empty string.

    Raises a parse error of type InternalParserError when the current
    character is not the delimiter, InvalidControlChar for a forbidden
    control character, InvalidCharInStringError for six or more
    consecutive delimiters in a multi-line string, and UnexpectedEofError
    when the input ends in the middle of the string.
    """
    # only keep parsing for string if the current character matches the delim
    if self._current != delim.unit:
        raise self.parse_error(
            InternalParserError,
            f"Invalid character for string type {delim}",
        )

    # consume the opening/first delim, EOF here is an issue
    # (middle of string or middle of delim)
    self.inc(exception=UnexpectedEofError)

    if self._current == delim.unit:
        # consume the closing/second delim, we do not care if EOF occurs as
        # that would simply imply an empty single line string
        if not self.inc() or self._current != delim.unit:
            # Empty string
            return String(delim, "", "", Trivia())

        # consume the third delim, EOF here is an issue (middle of string)
        self.inc(exception=UnexpectedEofError)

        delim = delim.toggle()  # convert delim to multi delim

    self.mark()  # to extract the original string with whitespace and all
    value = ""

    # A newline immediately following the opening delimiter will be trimmed.
    if delim.is_multiline():
        if self._current == "\n":
            # consume the newline, EOF here is an issue (middle of string)
            self.inc(exception=UnexpectedEofError)
        else:
            # Peek one character ahead (the position is restored when the
            # scope exits) to detect a leading "\r\n", which is trimmed too.
            cur: str = self._current
            with self._state(restore=True):
                if self.inc():
                    cur += self._current
            if cur == "\r\n":
                self.inc_n(2, exception=UnexpectedEofError)

    # PERF: stop-set for the single-line string-body bulk fast-path (None for
    # multiline, which keeps the per-char loop because of \r\n handling).
    src = self._src
    single_stop = None
    if delim.is_singleline():
        single_stop = (
            _SINGLE_BASIC_STOP if delim.is_basic() else _SINGLE_LITERAL_STOP
        )

    escaped = False  # whether the previous key was ESCAPE
    while True:
        code = ord(self._current)
        # Unescaped control characters are rejected: DEL and everything up
        # to CTRL_CHAR_LIMIT except tab and, in multi-line strings only,
        # line feed and carriage return.
        if (
            delim.is_singleline()
            and not escaped
            and (code == CHR_DEL or (code <= CTRL_CHAR_LIMIT and code != CTRL_I))
        ) or (
            delim.is_multiline()
            and not escaped
            and (
                code == CHR_DEL
                or (
                    code <= CTRL_CHAR_LIMIT and code not in [CTRL_I, CTRL_J, CTRL_M]
                )
            )
        ):
            raise self.parse_error(InvalidControlChar, code, "strings")
        elif delim.is_multiline() and not escaped and self._current == "\r":
            # A carriage return is only accepted as part of "\r\n"; the
            # lookahead below does not move the parser.
            with self._state(restore=True):
                if not self.inc() or self._current != "\n":
                    raise self.parse_error(InvalidControlChar, CTRL_M, "strings")
            value += self._current
            self.inc(exception=UnexpectedEofError)
        elif not escaped and self._current == delim.unit:
            # try to process current as a closing delim
            original = self.extract()

            close = ""
            if delim.is_multiline():
                # Consume the delimiters to see if we are at the end of the string
                close = ""
                while self._current == delim.unit:
                    close += self._current
                    self.inc()

                if len(close) < 3:
                    # Not a triple quote, leave in result as-is.
                    # Adding back the characters we already consumed
                    value += close
                    continue

                if len(close) == 3:
                    # We are at the end of the string
                    return String(delim, value, original, Trivia())

                if len(close) >= 6:
                    raise self.parse_error(InvalidCharInStringError, self._current)

                # Four or five delimiters: the last three close the string,
                # the ones before them belong to its content.
                value += close[:-3]
                original += close[:-3]

                return String(delim, value, original, Trivia())
            else:
                # consume the closing delim, we do not care if EOF occurs as
                # that would simply imply the end of self._src
                self.inc()

                return String(delim, value, original, Trivia())
        elif delim.is_basic() and escaped:
            # attempt to parse the current char as an escaped value, an exception
            # is raised if this fails
            value += self._parse_escaped_char(delim.is_multiline())

            # no longer escaped
            escaped = False
        elif delim.is_basic() and self._current == "\\":
            # the next char is being escaped
            escaped = True

            # consume this char, EOF here is an issue (middle of string)
            self.inc(exception=UnexpectedEofError)
        else:
            # this is either a literal string where we keep everything as is,
            # or this is not a special escaped char in a basic string
            if single_stop is not None:
                # PERF fast-path: bulk-append the run of ordinary characters
                # up to the next delimiter / backslash / control char, instead
                # of one `value += cur; inc()` iteration per character. The
                # stop char is then handled by the branches above on the next
                # iteration (single-line only; multiline keeps the per-char
                # loop for CRLF handling).
                run_start = src._idx
                src.advance_until(single_stop)
                if src.end():
                    # mid-string EOF — same error as the per-char inc()
                    raise self.parse_error(UnexpectedEofError)
                value += src[run_start : src._idx]
            else:
                value += self._current

                # consume this char, EOF here is an issue (middle of string)
                self.inc(exception=UnexpectedEofError)

982

def _parse_table(
    self, parent_name: Key | None = None, parent: Table | None = None
) -> tuple[Key, Table | AoT]:
    """
    Parses a table element, starting on the opening bracket of its header.

    parent_name is the full key of the enclosing table when this table is
    parsed as one of its children; only the fragments after that prefix
    are used to name the table. parent is accepted for the recursive
    calls but is not read here.

    Returns the key the table is to be stored under together with the
    table (or array of tables) itself.

    Raises a parse error of type InternalParserError when not called on a
    bracket, EmptyTableNameError for an empty name, UnexpectedEofError /
    UnexpectedCharError for a malformed header (including an array of
    tables header that lacks its second closing bracket), and ParseError
    for a bare name containing a space.
    """
    if self._current != "[":
        raise self.parse_error(
            InternalParserError, "_parse_table() called on non-bracket character."
        )

    indent = self.extract()
    self.inc()  # Skip opening bracket

    if self.end():
        raise self.parse_error(UnexpectedEofError)

    is_aot = False
    if self._current == "[":
        if not self.inc():
            raise self.parse_error(UnexpectedEofError)

        is_aot = True
    try:
        key = self._parse_key()
    except EmptyKeyError:
        raise self.parse_error(EmptyTableNameError) from None
    if self.end():
        raise self.parse_error(UnexpectedEofError)
    elif self._current != "]":
        raise self.parse_error(UnexpectedCharError, self._current)

    key.sep = ""
    full_key = key
    name_parts = tuple(key)
    if any(" " in part.key.strip() and part.is_bare() for part in name_parts):
        raise self.parse_error(
            ParseError, f'Invalid table name "{full_key.as_string()}"'
        )

    missing_table = False
    if parent_name:
        parent_name_parts = tuple(parent_name)
    else:
        parent_name_parts = ()

    # More than one fragment below the parent means that intermediate
    # tables were never declared.
    if len(name_parts) > len(parent_name_parts) + 1:
        missing_table = True

    name_parts = name_parts[len(parent_name_parts) :]

    values = Container(True)

    self.inc()  # Skip closing bracket
    if is_aot:
        # An array of tables header is closed by two adjacent brackets;
        # make sure the second one is really there before skipping it.
        if self.end():
            raise self.parse_error(UnexpectedEofError)
        if self._current != "]":
            raise self.parse_error(UnexpectedCharError, self._current)
        self.inc()

    cws, comment, trail = self._parse_comment_trail()

    result: Table | AoT = Null()  # type: ignore[assignment]
    table = Table(
        values,
        Trivia(indent, cws, comment, trail),
        is_aot,
        name=name_parts[0].key if name_parts else key.key,
        display_name=full_key.as_string(),
        is_super_table=False,
    )

    if len(name_parts) > 1:
        if missing_table:
            # Missing super table
            # i.e. a table initialized like this: [foo.bar]
            # without initializing [foo]
            #
            # So we have to create the parent tables
            table = Table(
                Container(True),
                Trivia("", cws, comment, trail),
                is_aot and name_parts[0] in self._aot_stack,
                is_super_table=True,
                name=name_parts[0].key,
            )

        result = table
        key = name_parts[0]

        # Walk down the remaining fragments, creating one table per
        # fragment; only the last one is the table declared by the header.
        for i, _name in enumerate(name_parts[1:]):
            child = table.get(
                _name,
                Table(
                    Container(True),
                    Trivia(indent, cws, comment, trail),
                    is_aot and i == len(name_parts) - 2,
                    is_super_table=i < len(name_parts) - 2,
                    name=_name.key,
                    display_name=(
                        full_key.as_string() if i == len(name_parts) - 2 else None
                    ),
                ),
            )

            if is_aot and i == len(name_parts) - 2:
                table.raw_append(_name, AoT([child], name=table.name, parsed=True))
            else:
                table.raw_append(_name, child)

            table = child
            values = table.value
    else:
        if name_parts:
            key = name_parts[0]

    # Body of the table: items up to the next header that is not a child.
    while not self.end():
        parsed = self._parse_item()
        if parsed:
            _key, _val = parsed
            if not self._merge_ws(_val, values):
                table.raw_append(_key, _val)
        else:
            if self._current == "[":
                _, key_next = self._peek_table()

                if self._is_child(full_key, key_next):
                    key_next, table_next = self._parse_table(full_key, table)

                    table.raw_append(key_next, table_next)

                    # Picking up any sibling
                    while not self.end():
                        _, key_next = self._peek_table()

                        if not self._is_child(full_key, key_next):
                            break

                        key_next, table_next = self._parse_table(full_key, table)

                        table.raw_append(key_next, table_next)

                break
            else:
                raise self.parse_error(
                    InternalParserError,
                    "_parse_item() returned None on a non-bracket character.",
                )
    table.value._validate_out_of_order_table()
    if isinstance(result, Null):
        result = table

        if is_aot and (not self._aot_stack or full_key != self._aot_stack[-1]):
            result = self._parse_aot(result, full_key)

    return key, result

1137

def _peek_table(self) -> tuple[bool, Key]:
    """
    Look at the upcoming table header without consuming it.

    All reads happen inside a state scope that is always restored on
    exit, so the parser is left where it started.

    Returns an ``(is_aot, key)`` pair: ``is_aot`` is True when the header
    opens with two brackets, and ``key`` is whatever ``_parse_key()``
    reads right after the opening bracket(s).

    Raises the error built by ``parse_error`` from ``InternalParserError``
    when the current character is not an opening bracket, and the one
    built from ``EmptyTableNameError`` when ``_parse_key()`` raises
    ``EmptyKeyError``.
    """
    # we always want to restore after exiting this scope
    with self._state(save_marker=True, restore=True):
        if self._current != "[":
            raise self.parse_error(
                InternalParserError,
                "_peek_table() entered on non-bracket character",
            )

        # Step over the first bracket; a second one marks an AoT header.
        self.inc()
        double_bracket = self._current == "["
        if double_bracket:
            self.inc()

        try:
            table_key = self._parse_key()
        except EmptyKeyError:
            raise self.parse_error(EmptyTableNameError) from None

        return double_bracket, table_key

1164

def _parse_aot(self, first: Table, name_first: Key) -> AoT:
    """
    Gather the tables that follow ``first`` under the same AoT name
    and bundle them, together with ``first``, into a single AoT.

    ``name_first`` is pushed onto ``_aot_stack`` while the siblings are
    being collected and popped again before returning.
    """
    tables: list[Table] = [first]
    self._aot_stack.append(name_first)

    while not self.end():
        next_is_aot, next_name = self._peek_table()

        # Stop at the first header that is not another entry of this AoT.
        if not (next_is_aot and next_name == name_first):
            break

        _, sibling = self._parse_table(name_first)
        assert isinstance(sibling, Table)
        tables.append(sibling)

    self._aot_stack.pop()

    return AoT(tables, parsed=True)

1184

def _peek(self, n: int) -> str:
    """
    Read ahead at most n characters and hand them back as a string.

    Reading stops early at a space, tab, newline or carriage return,
    at any of ``#``, ``,``, ``]`` or ``}``, or at the source's EOF
    marker; the stopping character is not included.
    """
    # we always want to restore after exiting this scope
    with self._state(restore=True):
        collected = ""
        for _ in range(n):
            char = self._current
            if char in " \t\n\r#,]}" + self._src.EOF:
                break

            collected += char
            self.inc()

        return collected

1202

def _peek_unicode(self, is_long: bool) -> tuple[str | None, str | None]:
    """
    Look ahead at a unicode escape without consuming it; the parser
    state is restored when the scope exits.

    Must be entered on the ``u`` or ``U`` prefix. Eight digits are read
    when ``is_long`` is true, four otherwise.

    Returns ``(value, extracted)``: ``value`` is the decoded character,
    or None when the digits are not valid hexadecimal, name a surrogate,
    or are rejected by ``chr``; ``extracted`` is the raw digit text.
    Both are None when the requested number of characters could not
    be read.
    """
    # we always want to restore after exiting this scope
    with self._state(save_marker=True, restore=True):
        if self._current not in {"u", "U"}:
            raise self.parse_error(
                InternalParserError, "_peek_unicode() entered on non-unicode value"
            )

        self.inc()  # Dropping prefix
        self.mark()

        digit_count = 8 if is_long else 4
        if not self.inc_n(digit_count):
            return None, None

        extracted = self.extract()

        # Anything left after stripping hex digits is an invalid character.
        if extracted.strip("0123456789abcdefABCDEF"):
            return None, extracted

        codepoint = int(extracted, 16)

        # Unicode scalar values exclude the surrogate range
        # (U+D800 to U+DFFF). The 8-digit \U form reaches this range
        # with leading zeros, so it must be checked on the value itself.
        if 0xD800 <= codepoint <= 0xDFFF:
            return None, extracted

        try:
            value = chr(codepoint)
        except (ValueError, OverflowError):
            value = None

        return value, extracted

1247

def _peek_hex(self) -> tuple[str | None, str | None]:
    """
    Look ahead at a two-digit hex escape without consuming it; the
    parser state is restored when the scope exits.

    Must be entered on the ``x`` prefix. Returns ``(value, extracted)``
    where ``value`` is the decoded character and ``extracted`` the two
    digits read. Returns ``(None, None)`` when two characters could not
    be read or when they are not both hexadecimal digits.
    """
    with self._state(save_marker=True, restore=True):
        if self._current != "x":
            raise self.parse_error(
                InternalParserError, "_peek_hex() entered on non-hex value"
            )

        self.inc()  # Dropping prefix
        self.mark()

        if self.inc_n(2):
            digits = self.extract()

            # An empty remainder means every character was a hex digit.
            if not digits.strip("0123456789abcdefABCDEF"):
                try:
                    char = chr(int(digits, 16))
                except (ValueError, OverflowError):
                    char = None

                return char, digits

        return None, None