Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/tomlkit/parser.py: 97%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from __future__ import annotations
3import datetime
4import re
5import string
7from typing import Any
8from typing import Callable
10from tomlkit._compat import decode
11from tomlkit._utils import RFC_3339_LOOSE
12from tomlkit._utils import _escaped
13from tomlkit._utils import parse_rfc3339
14from tomlkit.container import Container
15from tomlkit.exceptions import EmptyKeyError
16from tomlkit.exceptions import EmptyTableNameError
17from tomlkit.exceptions import InternalParserError
18from tomlkit.exceptions import InvalidCharInStringError
19from tomlkit.exceptions import InvalidControlChar
20from tomlkit.exceptions import InvalidDateError
21from tomlkit.exceptions import InvalidDateTimeError
22from tomlkit.exceptions import InvalidNumberError
23from tomlkit.exceptions import InvalidTimeError
24from tomlkit.exceptions import InvalidUnicodeValueError
25from tomlkit.exceptions import ParseError
26from tomlkit.exceptions import UnexpectedCharError
27from tomlkit.exceptions import UnexpectedEofError
28from tomlkit.items import AoT
29from tomlkit.items import Array
30from tomlkit.items import Bool
31from tomlkit.items import BoolType
32from tomlkit.items import Comment
33from tomlkit.items import Date
34from tomlkit.items import DateTime
35from tomlkit.items import Float
36from tomlkit.items import InlineTable
37from tomlkit.items import Integer
38from tomlkit.items import Item
39from tomlkit.items import Key
40from tomlkit.items import KeyType
41from tomlkit.items import Null
42from tomlkit.items import SingleKey
43from tomlkit.items import String
44from tomlkit.items import StringType
45from tomlkit.items import Table
46from tomlkit.items import Time
47from tomlkit.items import Trivia
48from tomlkit.items import Whitespace
49from tomlkit.source import Source
50from tomlkit.source import _StateHandler
51from tomlkit.toml_document import TOMLDocument
# Code points of the control characters the parser singles out.
CTRL_I = 0x09  # Tab
CTRL_J = 0x0A  # Line feed
CTRL_M = 0x0D  # Carriage return
CTRL_CHAR_LIMIT = 0x1F  # Highest code point of the C0 control range
CHR_DEL = 0x7F

# TOML character classes (formerly the `TOMLChar` constants), kept as
# frozensets so membership tests are O(1). They double as the stop-sets handed
# to Source.advance_while / Source.advance_until, whose bulk scans replace the
# per-character `while self._current in <set> and self.inc()` loops.
_SPACES = frozenset({" ", "\t"})
_NL = frozenset({"\n", "\r"})
_WS = _SPACES.union(_NL)
_KV = _SPACES.union("=")
_BARE_KEY_OR_SPACE = _SPACES.union(string.ascii_letters, string.digits, "-_")
_NUM_STOP = _WS.union("#,]}")
_DATE_TAIL_STOP = _NUM_STOP.difference(" ")

# Control characters that are invalid inside a single-line string: DEL plus
# everything up to 0x1F except tab. This is exactly the set for which the
# per-character string loop raises InvalidControlChar. The single-line
# fast-path stops its bulk scan at the first delimiter / backslash / control
# character and leaves that character to the main loop's existing branches.
_CTRL_SINGLE = frozenset(
    map(
        chr,
        (*range(CTRL_I), *range(CTRL_I + 1, CTRL_CHAR_LIMIT + 1), CHR_DEL),
    )
)
_SINGLE_LITERAL_STOP = _CTRL_SINGLE.union("'")  # literal: only the closing quote
_SINGLE_BASIC_STOP = _CTRL_SINGLE.union('"', "\\")  # basic: quote or escape
81class Parser:
82 """
83 Parser for TOML documents.
84 """
86 # Deeply nested documents would overflow the interpreter stack: arrays and
87 # inline tables are parsed recursively, and every fragment of a dotted key
88 # adds a level of nested containers. Refuse documents beyond this depth.
89 MAX_NESTING_DEPTH = 100
91 def __init__(self, string: str | bytes) -> None:
92 # Input to parse
93 self._src = Source(decode(string))
95 self._aot_stack: list[Key] = []
96 self._nesting_depth = 0
98 @property
99 def _state(self) -> _StateHandler:
100 return self._src.state
102 @property
103 def _idx(self) -> int:
104 return self._src.idx
106 @property
107 def _current(self) -> str:
108 return self._src.current
110 @property
111 def _marker(self) -> int:
112 return self._src.marker
114 def extract(self) -> str:
115 """
116 Extracts the value between marker and index
117 """
118 return self._src.extract()
120 def inc(self, exception: type[ParseError] | None = None) -> bool:
121 """
122 Increments the parser if the end of the input has not been reached.
123 Returns whether or not it was able to advance.
124 """
125 return self._src.inc(exception=exception)
127 def inc_n(self, n: int, exception: type[ParseError] | None = None) -> bool:
128 """
129 Increments the parser by n characters
130 if the end of the input has not been reached.
131 """
132 return self._src.inc_n(n=n, exception=exception)
134 def consume(self, chars: str, min: int = 0, max: int = -1) -> None:
135 """
136 Consume chars until min/max is satisfied is valid.
137 """
138 return self._src.consume(chars=chars, min=min, max=max)
140 def end(self) -> bool:
141 """
142 Returns True if the parser has reached the end of the input.
143 """
144 return self._src.end()
146 def mark(self) -> None:
147 """
148 Sets the marker to the index's current position
149 """
150 self._src.mark()
152 def parse_error(
153 self,
154 exception: type[ParseError] = ParseError,
155 *args: Any,
156 **kwargs: Any,
157 ) -> ParseError:
158 """
159 Creates a generic "parse error" at the current position.
160 """
161 return self._src.parse_error(exception, *args, **kwargs)
163 def parse(self) -> TOMLDocument:
164 body = TOMLDocument(True)
166 # Take all keyvals outside of tables/AoT's.
167 while not self.end():
168 # Break out if a table is found
169 if self._current == "[":
170 break
172 # Otherwise, take and append one KV
173 item = self._parse_item()
174 if not item:
175 break
177 key, value = item
178 if (key is not None and key.is_multi()) or not self._merge_ws(value, body):
179 # We actually have a table
180 try:
181 body.append(key, value)
182 except Exception as e:
183 raise self.parse_error(ParseError, str(e)) from e
185 self.mark()
187 while not self.end():
188 key, value = self._parse_table()
189 if isinstance(value, Table) and value.is_aot_element():
190 # This is just the first table in an AoT. Parse the rest of the array
191 # along with it.
192 value = self._parse_aot(value, key)
194 try:
195 body.append(key, value)
196 except Exception as e:
197 raise self.parse_error(ParseError, str(e)) from e
199 body.parsing(False)
201 return body
203 def _merge_ws(self, item: Item, container: Container) -> bool:
204 """
205 Merges the given Item with the last one currently in the given Container if
206 both are whitespace items.
208 Returns True if the items were merged.
209 """
210 last = container.last_item()
211 if not last:
212 return False
214 if not isinstance(item, Whitespace) or not isinstance(last, Whitespace):
215 return False
217 start = self._idx - (len(last.s) + len(item.s))
218 container.body[-1] = (
219 container.body[-1][0],
220 Whitespace(self._src[start : self._idx]),
221 )
223 return True
225 def _is_child(self, parent: Key, child: Key) -> bool:
226 """
227 Returns whether a key is strictly a child of another key.
228 AoT siblings are not considered children of one another.
229 """
230 parent_parts = tuple(parent)
231 child_parts = tuple(child)
233 if parent_parts == child_parts:
234 return False
236 return parent_parts == child_parts[: len(parent_parts)]
    def _parse_item(self) -> tuple[Key | None, Item] | None:
        """
        Attempts to parse the next item and returns it, along with its key
        if the item is value-like.

        Returns one of:
          - (None, Whitespace) for a run of whitespace ended by a newline
            or by the end of the input,
          - (None, Comment) for a comment line,
          - None when a "[" is found, so the caller can parse the table,
          - the result of _parse_key_value(True) for anything else.

        Raises an InvalidControlChar parse error for a carriage return that
        is not followed by a line feed.
        """
        self.mark()
        with self._state as state:
            while True:
                c = self._current
                if c == "\n":
                    # Found a newline; Return all whitespace found up to this point.
                    self.inc()

                    return None, Whitespace(self.extract())
                elif c in " \t\r":
                    if c == "\r":
                        # Look one character ahead (the position is restored
                        # afterwards): a lone "\r" is not allowed.
                        with self._state(restore=True):
                            if not self.inc() or self._current != "\n":
                                raise self.parse_error(
                                    InvalidControlChar, CTRL_M, "documents"
                                )
                    # Skip whitespace.
                    if not self.inc():
                        # End of input: what was read is trailing whitespace.
                        return None, Whitespace(self.extract())
                elif c == "#":
                    # Found a comment, parse it
                    indent = self.extract()
                    cws, comment, trail = self._parse_comment_trail()

                    return None, Comment(Trivia(indent, cws, comment, trail))
                elif c == "[":
                    # Found a table, delegate to the calling function.
                    return None
                else:
                    # Beginning of a KV pair.
                    # Return to beginning of whitespace so it gets included
                    # as indentation for the KV about to be parsed.
                    state.restore = True
                    break

        return self._parse_key_value(True)
    def _parse_comment_trail(self, parse_trail: bool = True) -> tuple[str, str, str]:
        """
        Returns (comment_ws, comment, trail)
        If there is no comment, comment_ws and comment will
        simply be empty.

        comment_ws is the whitespace found before the "#", comment is the
        text from the "#" up to (not including) the end of the line, and
        trail is what follows, including the line ending. The trail is only
        read when parse_trail is True; otherwise it is returned empty.

        Raises an InvalidControlChar parse error for a control character in
        the comment or for a carriage return not followed by a line feed, and
        an UnexpectedCharError parse error for any character other than
        whitespace, "#" or a newline before the comment.
        """
        if self.end():
            return "", "", ""

        comment = ""
        comment_ws = ""
        self.mark()

        while True:
            c = self._current

            if c == "\n":
                break
            elif c == "#":
                comment_ws = self.extract()

                self.mark()
                self.inc()  # Skip #

                # The comment itself
                while not self.end() and self._current not in _NL:
                    code = ord(self._current)
                    # DEL and every control character except tab are rejected
                    if code == CHR_DEL or (code <= CTRL_CHAR_LIMIT and code != CTRL_I):
                        raise self.parse_error(InvalidControlChar, code, "comments")

                    if not self.inc():
                        break

                comment = self.extract()
                self.mark()

                break
            elif c in " \t\r":
                if c == "\r":
                    # Look one character ahead (the position is restored
                    # afterwards): a lone "\r" is not allowed.
                    with self._state(restore=True):
                        if not self.inc() or self._current != "\n":
                            raise self.parse_error(
                                InvalidControlChar, CTRL_M, "comments"
                            )
                self.inc()
            else:
                raise self.parse_error(UnexpectedCharError, c)

            if self.end():
                break

        trail = ""
        if parse_trail:
            # Spaces and tabs after the comment (bulk scan)
            self._src.advance_while(_SPACES)

            if self._current == "\r":
                # Same lone "\r" check as above, then step over the "\r"
                with self._state(restore=True):
                    if not self.inc() or self._current != "\n":
                        raise self.parse_error(InvalidControlChar, CTRL_M, "documents")
                self.inc()

            if self._current == "\n":
                self.inc()

            # Only extract when something was consumed since the last mark or
            # the current character is still whitespace.
            if self._idx != self._marker or self._current in _WS:
                trail = self.extract()

        return comment_ws, comment, trail
349 def _parse_key_value(self, parse_comment: bool = False) -> tuple[Key, Item]:
350 # Leading indent
351 self.mark()
353 self._src.advance_while(_SPACES)
355 indent = self.extract()
357 # Key
358 key = self._parse_key()
360 self.mark()
362 found_equals = self._current == "="
363 while self._current in _KV and self.inc():
364 if self._current == "=":
365 if found_equals:
366 raise self.parse_error(UnexpectedCharError, "=")
367 else:
368 found_equals = True
369 if not found_equals:
370 raise self.parse_error(UnexpectedCharError, self._current)
372 if not key.sep:
373 key.sep = self.extract()
374 else:
375 key.sep += self.extract()
377 # Value
378 val = self._parse_value()
379 # Comment
380 if parse_comment:
381 cws, comment, trail = self._parse_comment_trail()
382 meta = val.trivia
383 if not meta.comment_ws:
384 meta.comment_ws = cws
386 meta.comment = comment
387 meta.trail = trail
388 else:
389 val.trivia.trail = ""
391 val.trivia.indent = indent
393 return key, val
395 def _parse_key(self) -> Key:
396 """
397 Parses a Key at the current position;
398 WS before the key must be exhausted first at the callsite.
399 """
400 key = self._parse_simple_key()
401 fragments = 1
402 while self._current == ".":
403 fragments += 1
404 if fragments > self.MAX_NESTING_DEPTH:
405 raise self.parse_error(
406 ParseError,
407 f"TOML key nested more than {self.MAX_NESTING_DEPTH} levels deep",
408 )
409 self.inc()
410 key = key.concat(self._parse_simple_key())
412 return key
414 def _parse_simple_key(self) -> Key:
415 """
416 Parses a single (non-dotted) key fragment.
417 """
418 self.mark()
419 # Skip any leading whitespace (bulk scan)
420 self._src.advance_while(_SPACES)
421 if self._current in "\"'":
422 return self._parse_quoted_key()
423 else:
424 return self._parse_bare_key()
426 def _parse_quoted_key(self) -> Key:
427 """
428 Parses a key enclosed in either single or double quotes.
429 """
430 # Extract the leading whitespace
431 original = self.extract()
432 quote_style = self._current
433 key_type = next((t for t in KeyType if t.value == quote_style), None)
435 if key_type is None:
436 raise RuntimeError("Should not have entered _parse_quoted_key()")
438 key_str = self._parse_string(
439 StringType.SLB if key_type == KeyType.Basic else StringType.SLL
440 )
441 if key_str._t.is_multiline():
442 raise self.parse_error(UnexpectedCharError, key_str._t.value)
443 original += key_str.as_string()
444 self.mark()
445 self._src.advance_while(_SPACES)
446 original += self.extract()
448 return SingleKey(str(key_str), t=key_type, sep="", original=original)
450 def _parse_bare_key(self) -> Key:
451 """
452 Parses a bare key.
453 """
454 self._src.advance_while(_BARE_KEY_OR_SPACE)
456 original = self.extract()
457 key_s = original.strip()
458 if not key_s:
459 # Empty key
460 raise self.parse_error(EmptyKeyError)
462 if " " in key_s or "\t" in key_s:
463 # Bare key with whitespace in it
464 raise self.parse_error(ParseError, f'Invalid key "{key_s}"')
466 return SingleKey(key_s, KeyType.Bare, "", original)
    def _parse_value(self) -> Item:
        """
        Attempts to parse a value at the current position.

        The kind of value is chosen from its first character: a quote starts
        a string, the first letter of a boolean starts a boolean, "[" an
        array, "{" an inline table, a sign (or inf/nan) a number, and a digit
        an integer, float, date, time or datetime.

        Raises a parse error with InvalidNumberError, InvalidDateTimeError,
        InvalidDateError or InvalidTimeError when the text cannot be
        converted, and with UnexpectedCharError when no value can start with
        the current character.
        """
        self.mark()
        c = self._current
        trivia = Trivia()

        if c == StringType.SLB.value:
            return self._parse_basic_string()
        elif c == StringType.SLL.value:
            return self._parse_literal_string()
        elif c == BoolType.TRUE.value[0]:
            return self._parse_true()
        elif c == BoolType.FALSE.value[0]:
            return self._parse_false()
        elif c == "[":
            # Arrays and inline tables go through the nesting depth guard
            return self._parse_nested(self._parse_array)
        elif c == "{":
            return self._parse_nested(self._parse_inline_table)
        elif c in "+-" or self._peek(4) in {
            "+inf",
            "-inf",
            "inf",
            "+nan",
            "-nan",
            "nan",
        }:
            # Number
            # (bulk scan up to the first character that can end a number)
            self._src.advance_until(_NUM_STOP)

            raw = self.extract()

            item = self._parse_number(raw, trivia)
            if item is not None:
                return item

            raise self.parse_error(InvalidNumberError)
        elif c in string.digits:
            # Integer, Float, Date, Time or DateTime
            self._src.advance_until(_NUM_STOP)

            raw = self.extract()

            m = RFC_3339_LOOSE.match(raw)
            if m:
                if m.group("date") and m.group("time"):
                    # datetime
                    try:
                        dt = parse_rfc3339(raw)
                        assert isinstance(dt, datetime.datetime)
                        return DateTime(
                            dt.year,
                            dt.month,
                            dt.day,
                            dt.hour,
                            dt.minute,
                            dt.second,
                            dt.microsecond,
                            dt.tzinfo,
                            trivia,
                            raw,
                        )
                    except ValueError:
                        raise self.parse_error(InvalidDateTimeError) from None

                if m.group("date"):
                    try:
                        dt = parse_rfc3339(raw)
                        assert isinstance(dt, datetime.date)
                        date = Date(dt.year, dt.month, dt.day, trivia, raw)
                        # The number scan stops at a space, so a time written
                        # after a space may still follow the date: scan on,
                        # this time without stopping at spaces.
                        self.mark()
                        self._src.advance_until(_DATE_TAIL_STOP)

                        time_raw = self.extract()
                        time_part = time_raw.rstrip()
                        # Whatever trails the time part is kept as the
                        # whitespace preceding a comment.
                        trivia.comment_ws = time_raw[len(time_part) :]
                        if not time_part:
                            return date

                        dt = parse_rfc3339(raw + time_part)
                        assert isinstance(dt, datetime.datetime)
                        return DateTime(
                            dt.year,
                            dt.month,
                            dt.day,
                            dt.hour,
                            dt.minute,
                            dt.second,
                            dt.microsecond,
                            dt.tzinfo,
                            trivia,
                            raw + time_part,
                        )
                    except ValueError:
                        raise self.parse_error(InvalidDateError) from None

                if m.group("time"):
                    try:
                        t = parse_rfc3339(raw)
                        assert isinstance(t, datetime.time)
                        return Time(
                            t.hour,
                            t.minute,
                            t.second,
                            t.microsecond,
                            t.tzinfo,
                            trivia,
                            raw,
                        )
                    except ValueError:
                        raise self.parse_error(InvalidTimeError) from None

            # Not a date or a time: it has to be a number
            item = self._parse_number(raw, trivia)
            if item is not None:
                return item

            raise self.parse_error(InvalidNumberError)
        else:
            raise self.parse_error(UnexpectedCharError, c)
589 def _parse_true(self) -> Bool:
590 return self._parse_bool(BoolType.TRUE)
592 def _parse_false(self) -> Bool:
593 return self._parse_bool(BoolType.FALSE)
595 def _parse_bool(self, style: BoolType) -> Bool:
596 with self._state:
597 style = BoolType(style)
599 # only keep parsing for bool if the characters match the style
600 # try consuming rest of chars in style
601 for c in style:
602 self.consume(c, min=1, max=1)
604 return Bool(style, Trivia())
606 def _parse_nested(self, parse: Callable[[], Item]) -> Item:
607 """
608 Parses an array or inline table, enforcing the nesting depth limit.
609 """
610 self._nesting_depth += 1
611 if self._nesting_depth > self.MAX_NESTING_DEPTH:
612 raise self.parse_error(
613 ParseError,
614 f"TOML value nested more than {self.MAX_NESTING_DEPTH} levels deep",
615 )
616 try:
617 return parse()
618 finally:
619 self._nesting_depth -= 1
    def _parse_array(self) -> Array:
        """
        Parses an array, the current character being its opening bracket.

        Whitespace, comments and commas are kept as elements next to the
        values. Raises an UnexpectedCharError parse error for a character
        that is neither expected nor the closing bracket, and a ParseError
        when the Array cannot be built from the collected elements.
        """
        # Consume opening bracket, EOF here is an issue (middle of array)
        self.inc(exception=UnexpectedEofError)

        elems: list[Item] = []
        # Truthy once a value was read and no comma has followed it yet
        prev_value = None
        while True:
            # consume whitespace
            mark = self._idx
            self.consume(" \t\n\r")
            indent = self._src[mark : self._idx]
            newline = _NL & set(indent)
            if newline:
                elems.append(Whitespace(indent))
                continue

            # consume comment
            if self._current == "#":
                cws, comment, trail = self._parse_comment_trail(parse_trail=False)
                elems.append(Comment(Trivia(indent, cws, comment, trail)))
                continue

            # consume indent
            if indent:
                elems.append(Whitespace(indent))
                continue

            # consume value
            if not prev_value:
                try:
                    elems.append(self._parse_value())
                    prev_value = True
                    continue
                except UnexpectedCharError:
                    # Not a value: fall through to the comma / closing
                    # bracket checks below, which raise if nothing matches.
                    pass

            # consume comma
            if prev_value and self._current == ",":
                self.inc(exception=UnexpectedEofError)
                # If the previous item is Whitespace, add to it
                if isinstance(elems[-1], Whitespace):
                    elems[-1]._s = elems[-1].s + ","
                else:
                    elems.append(Whitespace(","))
                prev_value = False
                continue

            # consume closing bracket
            if self._current == "]":
                # consume closing bracket, EOF here doesn't matter
                self.inc()
                break

            raise self.parse_error(UnexpectedCharError, self._current)

        try:
            res = Array(elems, Trivia())
        except ValueError:
            # Reported below as a parse error at the current position
            pass
        else:
            return res

        raise self.parse_error(ParseError, "Failed to parse array")
    def _parse_inline_table(self) -> InlineTable:
        """
        Parses an inline table, the current character being its opening brace.

        Whitespace, comments and commas are added to the container next to
        the key/values. Raises an UnexpectedCharError parse error when a
        comma appears where a key is expected, or when something other than
        a comma follows a key/value.
        """
        # consume opening bracket, EOF here is an issue (middle of inline table)
        self.inc(exception=UnexpectedEofError)

        elems = Container(True)
        # True when the next element has to be a key/value (or the closing brace)
        expect_key = True
        while True:
            while True:
                # consume whitespace and newlines
                mark = self._idx
                self.consume(" \t\n\r")
                raw = self._src[mark : self._idx]
                if raw:
                    elems.add(Whitespace(raw))

                if self._current != "#":
                    break

                # consume comment, then look for more whitespace/comments
                cws, comment, trail = self._parse_comment_trail(parse_trail=False)
                elems.add(Comment(Trivia("", cws, comment, trail)))

            if self._current == "}":
                # consume closing bracket, EOF here doesn't matter
                self.inc()
                break

            if expect_key:
                if self._current == ",":
                    raise self.parse_error(UnexpectedCharError, self._current)
                key, val = self._parse_key_value(False)
                elems.add(key, val)
                expect_key = False
                continue

            if self._current != ",":
                raise self.parse_error(UnexpectedCharError, self._current)

            elems.add(Whitespace(","))
            # consume comma, EOF here is an issue (middle of inline table)
            self.inc(exception=UnexpectedEofError)
            expect_key = True

        return InlineTable(elems, Trivia())
729 def _parse_number(self, raw: str, trivia: Trivia) -> Item | None:
730 # Leading zeros are not allowed
731 sign = ""
732 if raw.startswith(("+", "-")):
733 sign = raw[0]
734 raw = raw[1:]
736 if len(raw) > 1 and (
737 (raw.startswith("0") and not raw.startswith(("0.", "0o", "0x", "0b", "0e")))
738 or (sign and raw.startswith("."))
739 ):
740 return None
742 if raw.startswith(("0o", "0x", "0b")) and sign:
743 return None
745 digits = "[0-9]"
746 base = 10
747 if raw.startswith("0b"):
748 digits = "[01]"
749 base = 2
750 elif raw.startswith("0o"):
751 digits = "[0-7]"
752 base = 8
753 elif raw.startswith("0x"):
754 digits = "[0-9a-f]"
755 base = 16
757 # Underscores should be surrounded by digits
758 clean = re.sub(f"(?i)(?<={digits})_(?={digits})", "", raw).lower()
760 if "_" in clean:
761 return None
763 if clean.endswith(".") or (
764 not clean.startswith("0x") and clean.split("e", 1)[0].endswith(".")
765 ):
766 return None
768 try:
769 return Integer(int(sign + clean, base), trivia, sign + raw)
770 except ValueError:
771 try:
772 return Float(float(sign + clean), trivia, sign + raw)
773 except ValueError:
774 return None
776 def _parse_literal_string(self) -> String:
777 with self._state:
778 return self._parse_string(StringType.SLL)
780 def _parse_basic_string(self) -> String:
781 with self._state:
782 return self._parse_string(StringType.SLB)
784 def _parse_escaped_char(self, multiline: bool) -> str:
785 if multiline and self._current in _WS:
786 # When the last non-whitespace character on a line is
787 # a \, it will be trimmed along with all whitespace
788 # (including newlines) up to the next non-whitespace
789 # character or closing delimiter.
790 # """\
791 # hello \
792 # world"""
793 tmp = ""
794 while self._current in _WS:
795 tmp += self._current
796 # consume the whitespace, EOF here is an issue
797 # (middle of string)
798 self.inc(exception=UnexpectedEofError)
799 continue
801 # the escape followed by whitespace must have a newline
802 # before any other chars
803 if "\n" not in tmp:
804 raise self.parse_error(InvalidCharInStringError, self._current)
806 return ""
808 if self._current in _escaped:
809 c = _escaped[self._current]
811 # consume this char, EOF here is an issue (middle of string)
812 self.inc(exception=UnexpectedEofError)
814 return c
816 if self._current in {"u", "U"}:
817 # this needs to be a unicode
818 u, ue = self._peek_unicode(self._current == "U")
819 if u is not None:
820 assert ue is not None
821 # consume the U char and the unicode value
822 self.inc_n(len(ue) + 1)
824 return u
826 raise self.parse_error(InvalidUnicodeValueError)
828 if self._current == "x":
829 h, he = self._peek_hex()
830 if h is not None:
831 assert he is not None
832 # consume the x char and the hex value
833 self.inc_n(len(he) + 1)
834 return h
836 raise self.parse_error(InvalidUnicodeValueError)
838 raise self.parse_error(InvalidCharInStringError, self._current)
    def _parse_string(self, delim: StringType) -> String:
        """
        Parses a string, the current character being its opening delimiter.

        `delim` is the single-line type to start from; it is toggled to its
        multiline counterpart when the delimiter is found three times in a
        row. Two delimiters in a row followed by anything else are an empty
        string.

        Raises a parse error with InternalParserError when the current
        character is not the delimiter, InvalidControlChar for a forbidden
        control character, InvalidCharInStringError for too many closing
        delimiters, and UnexpectedEofError when the input ends inside the
        string.
        """
        # only keep parsing for string if the current character matches the delim
        if self._current != delim.unit:
            raise self.parse_error(
                InternalParserError,
                f"Invalid character for string type {delim}",
            )

        # consume the opening/first delim, EOF here is an issue
        # (middle of string or middle of delim)
        self.inc(exception=UnexpectedEofError)

        if self._current == delim.unit:
            # consume the closing/second delim, we do not care if EOF occurs as
            # that would simply imply an empty single line string
            if not self.inc() or self._current != delim.unit:
                # Empty string
                return String(delim, "", "", Trivia())

            # consume the third delim, EOF here is an issue (middle of string)
            self.inc(exception=UnexpectedEofError)

            delim = delim.toggle()  # convert delim to multi delim

        self.mark()  # to extract the original string with whitespace and all
        value = ""

        # A newline immediately following the opening delimiter will be trimmed.
        if delim.is_multiline():
            if self._current == "\n":
                # consume the newline, EOF here is an issue (middle of string)
                self.inc(exception=UnexpectedEofError)
            else:
                # Peek at the next two characters (the position is restored)
                # to trim a "\r\n" as well.
                cur: str = self._current
                with self._state(restore=True):
                    if self.inc():
                        cur += self._current
                if cur == "\r\n":
                    self.inc_n(2, exception=UnexpectedEofError)

        # PERF: stop-set for the single-line string-body bulk fast-path (None for
        # multiline, which keeps the per-char loop because of \r\n handling).
        src = self._src
        single_stop = None
        if delim.is_singleline():
            single_stop = (
                _SINGLE_BASIC_STOP if delim.is_basic() else _SINGLE_LITERAL_STOP
            )

        escaped = False  # whether the previous key was ESCAPE
        while True:
            code = ord(self._current)
            # Control characters: DEL and everything up to 0x1F are rejected,
            # except tab and, in multiline strings, line feed and carriage
            # return (the latter is checked in the next branch).
            if (
                delim.is_singleline()
                and not escaped
                and (code == CHR_DEL or (code <= CTRL_CHAR_LIMIT and code != CTRL_I))
            ) or (
                delim.is_multiline()
                and not escaped
                and (
                    code == CHR_DEL
                    or (
                        code <= CTRL_CHAR_LIMIT and code not in [CTRL_I, CTRL_J, CTRL_M]
                    )
                )
            ):
                raise self.parse_error(InvalidControlChar, code, "strings")
            elif delim.is_multiline() and not escaped and self._current == "\r":
                # A carriage return is only valid as part of "\r\n"
                with self._state(restore=True):
                    if not self.inc() or self._current != "\n":
                        raise self.parse_error(InvalidControlChar, CTRL_M, "strings")
                value += self._current
                self.inc(exception=UnexpectedEofError)
            elif not escaped and self._current == delim.unit:
                # try to process current as a closing delim
                original = self.extract()

                close = ""
                if delim.is_multiline():
                    # Consume the delimiters to see if we are at the end of the string
                    close = ""
                    while self._current == delim.unit:
                        close += self._current
                        self.inc()

                    if len(close) < 3:
                        # Not a triple quote, leave in result as-is.
                        # Adding back the characters we already consumed
                        value += close
                        continue

                    if len(close) == 3:
                        # We are at the end of the string
                        return String(delim, value, original, Trivia())

                    if len(close) >= 6:
                        raise self.parse_error(InvalidCharInStringError, self._current)

                    # Four or five delimiters: the last three close the
                    # string, the ones before them are part of its content.
                    value += close[:-3]
                    original += close[:-3]

                    return String(delim, value, original, Trivia())
                else:
                    # consume the closing delim, we do not care if EOF occurs as
                    # that would simply imply the end of self._src
                    self.inc()

                return String(delim, value, original, Trivia())
            elif delim.is_basic() and escaped:
                # attempt to parse the current char as an escaped value, an exception
                # is raised if this fails
                value += self._parse_escaped_char(delim.is_multiline())

                # no longer escaped
                escaped = False
            elif delim.is_basic() and self._current == "\\":
                # the next char is being escaped
                escaped = True

                # consume this char, EOF here is an issue (middle of string)
                self.inc(exception=UnexpectedEofError)
            else:
                # this is either a literal string where we keep everything as is,
                # or this is not a special escaped char in a basic string
                if single_stop is not None:
                    # PERF fast-path: bulk-append the run of ordinary characters
                    # up to the next delimiter / backslash / control char, instead
                    # of one `value += cur; inc()` iteration per character. The
                    # stop char is then handled by the branches above on the next
                    # iteration (single-line only; multiline keeps the per-char
                    # loop for CRLF handling).
                    run_start = src._idx
                    src.advance_until(single_stop)
                    if src.end():
                        # mid-string EOF — same error as the per-char inc()
                        raise self.parse_error(UnexpectedEofError)
                    value += src[run_start : src._idx]
                else:
                    value += self._current

                    # consume this char, EOF here is an issue (middle of string)
                    self.inc(exception=UnexpectedEofError)
983 def _parse_table(
984 self, parent_name: Key | None = None, parent: Table | None = None
985 ) -> tuple[Key, Table | AoT]:
986 """
987 Parses a table element.
988 """
989 if self._current != "[":
990 raise self.parse_error(
991 InternalParserError, "_parse_table() called on non-bracket character."
992 )
994 indent = self.extract()
995 self.inc() # Skip opening bracket
997 if self.end():
998 raise self.parse_error(UnexpectedEofError)
1000 is_aot = False
1001 if self._current == "[":
1002 if not self.inc():
1003 raise self.parse_error(UnexpectedEofError)
1005 is_aot = True
1006 try:
1007 key = self._parse_key()
1008 except EmptyKeyError:
1009 raise self.parse_error(EmptyTableNameError) from None
1010 if self.end():
1011 raise self.parse_error(UnexpectedEofError)
1012 elif self._current != "]":
1013 raise self.parse_error(UnexpectedCharError, self._current)
1015 key.sep = ""
1016 full_key = key
1017 name_parts = tuple(key)
1018 if any(" " in part.key.strip() and part.is_bare() for part in name_parts):
1019 raise self.parse_error(
1020 ParseError, f'Invalid table name "{full_key.as_string()}"'
1021 )
1023 missing_table = False
1024 if parent_name:
1025 parent_name_parts = tuple(parent_name)
1026 else:
1027 parent_name_parts = ()
1029 if len(name_parts) > len(parent_name_parts) + 1:
1030 missing_table = True
1032 name_parts = name_parts[len(parent_name_parts) :]
1034 values = Container(True)
1036 self.inc() # Skip closing bracket
1037 if is_aot:
1038 # TODO: Verify close bracket
1039 self.inc()
1041 cws, comment, trail = self._parse_comment_trail()
1043 result: Table | AoT = Null() # type: ignore[assignment]
1044 table = Table(
1045 values,
1046 Trivia(indent, cws, comment, trail),
1047 is_aot,
1048 name=name_parts[0].key if name_parts else key.key,
1049 display_name=full_key.as_string(),
1050 is_super_table=False,
1051 )
1053 if len(name_parts) > 1:
1054 if missing_table:
1055 # Missing super table
1056 # i.e. a table initialized like this: [foo.bar]
1057 # without initializing [foo]
1058 #
1059 # So we have to create the parent tables
1060 table = Table(
1061 Container(True),
1062 Trivia("", cws, comment, trail),
1063 is_aot and name_parts[0] in self._aot_stack,
1064 is_super_table=True,
1065 name=name_parts[0].key,
1066 )
1068 result = table
1069 key = name_parts[0]
1071 for i, _name in enumerate(name_parts[1:]):
1072 child = table.get(
1073 _name,
1074 Table(
1075 Container(True),
1076 Trivia(indent, cws, comment, trail),
1077 is_aot and i == len(name_parts) - 2,
1078 is_super_table=i < len(name_parts) - 2,
1079 name=_name.key,
1080 display_name=(
1081 full_key.as_string() if i == len(name_parts) - 2 else None
1082 ),
1083 ),
1084 )
1086 if is_aot and i == len(name_parts) - 2:
1087 table.raw_append(_name, AoT([child], name=table.name, parsed=True))
1088 else:
1089 table.raw_append(_name, child)
1091 table = child
1092 values = table.value
1093 else:
1094 if name_parts:
1095 key = name_parts[0]
1097 while not self.end():
1098 parsed = self._parse_item()
1099 if parsed:
1100 _key, _val = parsed
1101 if not self._merge_ws(_val, values):
1102 table.raw_append(_key, _val)
1103 else:
1104 if self._current == "[":
1105 _, key_next = self._peek_table()
1107 if self._is_child(full_key, key_next):
1108 key_next, table_next = self._parse_table(full_key, table)
1110 table.raw_append(key_next, table_next)
1112 # Picking up any sibling
1113 while not self.end():
1114 _, key_next = self._peek_table()
1116 if not self._is_child(full_key, key_next):
1117 break
1119 key_next, table_next = self._parse_table(full_key, table)
1121 table.raw_append(key_next, table_next)
1123 break
1124 else:
1125 raise self.parse_error(
1126 InternalParserError,
1127 "_parse_item() returned None on a non-bracket character.",
1128 )
1129 table.value._validate_out_of_order_table()
1130 if isinstance(result, Null):
1131 result = table
1133 if is_aot and (not self._aot_stack or full_key != self._aot_stack[-1]):
1134 result = self._parse_aot(result, full_key)
1136 return key, result
1138 def _peek_table(self) -> tuple[bool, Key]:
1139 """
1140 Peeks ahead non-intrusively by cloning then restoring the
1141 initial state of the parser.
1143 Returns the name of the table about to be parsed,
1144 as well as whether it is part of an AoT.
1145 """
1146 # we always want to restore after exiting this scope
1147 with self._state(save_marker=True, restore=True):
1148 if self._current != "[":
1149 raise self.parse_error(
1150 InternalParserError,
1151 "_peek_table() entered on non-bracket character",
1152 )
1154 # AoT
1155 self.inc()
1156 is_aot = False
1157 if self._current == "[":
1158 self.inc()
1159 is_aot = True
1160 try:
1161 return is_aot, self._parse_key()
1162 except EmptyKeyError:
1163 raise self.parse_error(EmptyTableNameError) from None
1165 def _parse_aot(self, first: Table, name_first: Key) -> AoT:
1166 """
1167 Parses all siblings of the provided table first and bundles them into
1168 an AoT.
1169 """
1170 payload: list[Table] = [first]
1171 self._aot_stack.append(name_first)
1172 while not self.end():
1173 is_aot_next, name_next = self._peek_table()
1174 if is_aot_next and name_next == name_first:
1175 _, table = self._parse_table(name_first)
1176 assert isinstance(table, Table)
1177 payload.append(table)
1178 else:
1179 break
1181 self._aot_stack.pop()
1183 return AoT(payload, parsed=True)
1185 def _peek(self, n: int) -> str:
1186 """
1187 Peeks ahead n characters.
1189 n is the max number of characters that will be peeked.
1190 """
1191 # we always want to restore after exiting this scope
1192 with self._state(restore=True):
1193 buf = ""
1194 for _ in range(n):
1195 if self._current not in " \t\n\r#,]}" + self._src.EOF:
1196 buf += self._current
1197 self.inc()
1198 continue
1200 break
1201 return buf
1203 def _peek_unicode(self, is_long: bool) -> tuple[str | None, str | None]:
1204 """
1205 Peeks ahead non-intrusively by cloning then restoring the
1206 initial state of the parser.
1208 Returns the unicode value is it's a valid one else None.
1209 """
1210 # we always want to restore after exiting this scope
1211 with self._state(save_marker=True, restore=True):
1212 if self._current not in {"u", "U"}:
1213 raise self.parse_error(
1214 InternalParserError, "_peek_unicode() entered on non-unicode value"
1215 )
1217 self.inc() # Dropping prefix
1218 self.mark()
1220 if is_long:
1221 chars = 8
1222 else:
1223 chars = 4
1225 if not self.inc_n(chars):
1226 value, extracted = None, None
1227 else:
1228 extracted = self.extract()
1230 if extracted.strip("0123456789abcdefABCDEF"):
1231 return None, extracted
1233 codepoint = int(extracted, 16)
1235 # Unicode scalar values exclude the surrogate range
1236 # (U+D800 to U+DFFF). The 8-digit \U form reaches this range
1237 # with leading zeros, so it must be checked on the value itself.
1238 if 0xD800 <= codepoint <= 0xDFFF:
1239 return None, extracted
1241 try:
1242 value = chr(codepoint)
1243 except (ValueError, OverflowError):
1244 value = None
1246 return value, extracted
1248 def _peek_hex(self) -> tuple[str | None, str | None]:
1249 with self._state(save_marker=True, restore=True):
1250 if self._current != "x":
1251 raise self.parse_error(
1252 InternalParserError, "_peek_hex() entered on non-hex value"
1253 )
1255 self.inc() # Dropping prefix
1256 self.mark()
1258 if not self.inc_n(2):
1259 return None, None
1261 extracted = self.extract()
1262 if extracted.strip("0123456789abcdefABCDEF"):
1263 return None, None
1265 try:
1266 value = chr(int(extracted, 16))
1267 except (ValueError, OverflowError):
1268 value = None
1270 return value, extracted