1"""Parser for attributes::
2
3 attributes { id = "foo", class = "bar baz",
4 key1 = "val1", key2 = "val2" }
5
6Adapted from:
7https://github.com/jgm/djot/blob/fae7364b86bfce69bc6d5b5eede1f5196d845fd6/djot/attributes.lua#L1
8
9syntax:
10
11attributes <- '{' whitespace* attribute (whitespace attribute)* whitespace* '}'
12attribute <- identifier | class | keyval
13identifier <- '#' name
14class <- '.' name
15name <- (nonspace, nonpunctuation other than ':', '_', '-')+
16keyval <- key '=' val
17key <- (ASCII_ALPHANUM | ':' | '_' | '-')+
18val <- bareval | quotedval
19bareval <- (ASCII_ALPHANUM | ':' | '_' | '-')+
quotedval <- '"' ([^"] | '\"')* '"'
21"""
22
23from __future__ import annotations
24
25from enum import Enum
26import re
27from typing import Callable
28
29
class State(Enum):
    """States of the character-by-character attribute scanner."""

    # Nothing consumed yet; the opening "{" is expected next.
    START = 0
    # Inside the braces, between attributes.
    SCANNING = 1
    # Reading the name that follows "#".
    SCANNING_ID = 2
    # Reading the name that follows ".".
    SCANNING_CLASS = 3
    # Reading a key, up to its "=".
    SCANNING_KEY = 4
    # Just past "="; the first character of the value decides its kind.
    SCANNING_VALUE = 5
    # Reading an unquoted value.
    SCANNING_BARE_VALUE = 6
    # Reading a double-quoted value.
    SCANNING_QUOTED_VALUE = 7
    # Inside a "%"-delimited comment.
    SCANNING_COMMENT = 8
    # The character right after a backslash in a quoted value.
    SCANNING_ESCAPED = 9
    # The closing "}" has been consumed.
    DONE = 10
42
43
# One whitespace character.
REGEX_SPACE = re.compile(r"\s")
# One whitespace or ASCII punctuation character. ":", "_", "-" and the
# backslash are deliberately absent from the class, so they do not end a name.
REGEX_SPACE_PUNCTUATION = re.compile(r"[\s!\"#$%&'()*+,./;<=>?@[\]^`{|}~]")
# One character allowed in a key or in a bare (unquoted) value.
REGEX_KEY_CHARACTERS = re.compile(r"[a-zA-Z\d_:-]")
47
48
class TokenState:
    """Collects the ``(start, end, type)`` spans produced while scanning."""

    def __init__(self) -> None:
        # Finished tokens, in the order they were recorded.
        self._tokens: list[tuple[int, int, str]] = []
        # Position at which the token currently being scanned began.
        self.start: int = 0

    def set_start(self, start: int) -> None:
        """Remember where the token now being scanned begins."""
        self.start = start

    def append(self, start: int, end: int, ttype: str) -> None:
        """Record a finished token spanning ``[start, end)`` of type ``ttype``."""
        self._tokens.append((start, end, ttype))

    def compile(self, string: str) -> dict[str, str]:
        """Turn the recorded tokens into an attribute dictionary.

        :param string: the text the token offsets refer to
        :returns: mapping of attribute name to value; every class (whether
            given as a "class" token or as a ``class`` key) is gathered and
            joined with single spaces under the ``"class"`` entry
        """
        result: dict[str, str] = {}
        class_names: list[str] = []
        # Key seen on the previous token, still waiting for its value.
        awaiting_value = None

        for begin, stop, kind in self._tokens:
            text = string[begin:stop]
            key, awaiting_value = awaiting_value, None

            if key is not None and kind == "value":
                if key == "class":
                    class_names.append(text)
                else:
                    result[key] = text
            elif kind == "id":
                result["id"] = text
            elif kind == "class":
                class_names.append(text)
            elif kind == "key":
                awaiting_value = text
            # Any other token (e.g. a value with no key before it) is skipped.

        if class_names:
            result["class"] = " ".join(class_names)
        return result

    def __str__(self) -> str:
        return str(self._tokens)

    def __repr__(self) -> str:
        return repr(self._tokens)
91
92
class ParseError(Exception):
    """Error raised when the input is not a valid attribute block.

    The failing offset is stored on ``pos`` and appended to the message.
    """

    def __init__(self, msg: str, pos: int) -> None:
        located = msg + " at position " + str(pos)
        super().__init__(located)
        self.pos = pos
97
98
def parse(string: str) -> tuple[int, dict[str, str]]:
    """Parse attributes from start of string.

    The string is fed one character at a time to the handler registered for
    the current state, until a handler reports ``State.DONE``.

    :param string: text that must begin with an attribute block
    :returns: (index of the closing ``}`` within ``string``, dict of attributes)
    :raises ParseError: if a handler rejects a character, or if the string
        ends before the closing ``}`` is reached
    """
    state: State = State.START
    tokens = TokenState()
    for pos, char in enumerate(string):
        state = HANDLERS[state](char, pos, tokens)
        if state == State.DONE:
            return pos, tokens.compile(string)

    # Input exhausted without a closing brace: this is not a complete
    # attribute block, so report it instead of returning a partial result.
    if state == State.START:
        raise ParseError("Attributes must start with '{'", len(string))
    raise ParseError("Attributes must end with '}'", len(string))
114
115
def handle_start(char: str, pos: int, tokens: TokenState) -> State:
    """Consume the very first character, which has to open the block.

    :raises ParseError: if ``char`` is not ``{``
    """
    if char != "{":
        raise ParseError("Attributes must start with '{'", pos)
    return State.SCANNING
120
121
def handle_scanning(char: str, pos: int, tokens: TokenState) -> State:
    """Handle a character found between attributes.

    Blanks are skipped, ``}`` finishes the block, and any character that can
    begin an id, comment, class or key marks the token start and switches to
    the matching state.

    :raises ParseError: if ``char`` cannot begin anything
    """
    if char in (" ", "\t", "\n", "\r"):
        return State.SCANNING
    if char == "}":
        return State.DONE

    prefixed = {
        "#": State.SCANNING_ID,
        "%": State.SCANNING_COMMENT,
        ".": State.SCANNING_CLASS,
    }
    following = prefixed.get(char)
    if following is None:
        if REGEX_KEY_CHARACTERS.fullmatch(char) is None:
            raise ParseError(f"Unexpected character whilst scanning: {char}", pos)
        following = State.SCANNING_KEY

    tokens.set_start(pos)
    return following
141
142
def handle_scanning_comment(char: str, pos: int, tokens: TokenState) -> State:
    """Skip comment text; a ``%`` closes the comment."""
    return State.SCANNING if char == "%" else State.SCANNING_COMMENT
148
149
def handle_scanning_id(char: str, pos: int, tokens: TokenState) -> State:
    """Handle a character while reading the name after ``#``.

    The name ends at whitespace or at ``}``; an "id" token is recorded only
    when at least one name character was read.

    :raises ParseError: on any other punctuation character
    """
    if REGEX_SPACE_PUNCTUATION.fullmatch(char) is None:
        # Still part of the name.
        return State.SCANNING_ID

    closes_block = char == "}"
    if not closes_block and REGEX_SPACE.fullmatch(char) is None:
        raise ParseError(f"Unexpected character whilst scanning id: {char}", pos)

    # tokens.start points at the "#", so the name itself begins one later.
    name_start = tokens.start + 1
    if pos > name_start:
        tokens.append(name_start, pos, "id")
    return State.DONE if closes_block else State.SCANNING
165
166
def handle_scanning_class(char: str, pos: int, tokens: TokenState) -> State:
    """Handle a character while reading the name after ``.``.

    The name ends at whitespace or at ``}``; a "class" token is recorded only
    when at least one name character was read.

    :raises ParseError: on any other punctuation character
    """
    if REGEX_SPACE_PUNCTUATION.fullmatch(char) is None:
        # Still part of the name.
        return State.SCANNING_CLASS

    closes_block = char == "}"
    if not closes_block and REGEX_SPACE.fullmatch(char) is None:
        raise ParseError(f"Unexpected character whilst scanning class: {char}", pos)

    # tokens.start points at the ".", so the name itself begins one later.
    name_start = tokens.start + 1
    if pos > name_start:
        tokens.append(name_start, pos, "class")
    return State.DONE if closes_block else State.SCANNING
182
183
def handle_scanning_key(char: str, pos: int, tokens: TokenState) -> State:
    """Handle a character while reading a key.

    ``=`` ends the key and records it as a "key" token.

    :raises ParseError: if ``char`` is neither ``=`` nor a key character
    """
    if char != "=":
        if REGEX_KEY_CHARACTERS.fullmatch(char) is None:
            raise ParseError(f"Unexpected character whilst scanning key: {char}", pos)
        return State.SCANNING_KEY

    tokens.append(tokens.start, pos, "key")
    return State.SCANNING_VALUE
193
194
def handle_scanning_value(char: str, pos: int, tokens: TokenState) -> State:
    """Handle the first character after ``=``, which decides the value kind.

    A double quote starts a quoted value, a key character starts a bare one;
    in both cases the token start is set to ``pos``.

    :raises ParseError: if ``char`` can start neither kind of value
    """
    if char == '"':
        following = State.SCANNING_QUOTED_VALUE
    elif REGEX_KEY_CHARACTERS.fullmatch(char):
        following = State.SCANNING_BARE_VALUE
    else:
        raise ParseError(f"Unexpected character whilst scanning value: {char}", pos)

    tokens.set_start(pos)
    return following
205
206
def handle_scanning_bare_value(char: str, pos: int, tokens: TokenState) -> State:
    """Handle a character while reading an unquoted value.

    The value ends at ``}`` or at whitespace and is recorded as a "value"
    token.

    :raises ParseError: if ``char`` is none of the above nor a key character
    """
    if REGEX_KEY_CHARACTERS.fullmatch(char):
        return State.SCANNING_BARE_VALUE

    if char == "}":
        following = State.DONE
    elif REGEX_SPACE.fullmatch(char):
        following = State.SCANNING
    else:
        raise ParseError(
            f"Unexpected character whilst scanning bare value: {char}", pos
        )

    tokens.append(tokens.start, pos, "value")
    return following
220
221
def handle_scanning_escaped(char: str, pos: int, tokens: TokenState) -> State:
    """Accept the character after a backslash, whatever it is.

    No token is recorded; scanning of the quoted value simply resumes.
    """
    return State.SCANNING_QUOTED_VALUE
224
225
def handle_scanning_quoted_value(char: str, pos: int, tokens: TokenState) -> State:
    """Handle a character while reading a double-quoted value.

    The closing quote records everything between the quotes as one "value"
    token. A backslash hands the next character to the escaped state, so an
    escaped quote does not end the value.

    :raises ParseError: if an unescaped ``{`` or ``}`` appears in the value
    """
    if char == '"':
        # tokens.start points at the opening quote; the text starts after it.
        tokens.append(tokens.start + 1, pos, "value")
        return State.SCANNING

    if char == "\\":
        return State.SCANNING_ESCAPED

    if char in ("{", "}"):
        raise ParseError(
            f"Unexpected character whilst scanning quoted value: {char}", pos
        )

    # A newline is ordinary content. Recording a partial token here would make
    # the value end at the first line break, because only the first "value"
    # token after a key is used when the tokens are compiled.
    return State.SCANNING_QUOTED_VALUE
244
245
# Handler to call for each scanner state. Every handler receives
# (character, position, token collector) and returns the next state.
# State.DONE has no entry: no handler is looked up once it is reached.
HANDLERS: dict[State, Callable[[str, int, TokenState], State]] = {
    State.START: handle_start,
    State.SCANNING: handle_scanning,
    State.SCANNING_COMMENT: handle_scanning_comment,
    State.SCANNING_ID: handle_scanning_id,
    State.SCANNING_CLASS: handle_scanning_class,
    State.SCANNING_KEY: handle_scanning_key,
    State.SCANNING_VALUE: handle_scanning_value,
    State.SCANNING_BARE_VALUE: handle_scanning_bare_value,
    State.SCANNING_QUOTED_VALUE: handle_scanning_quoted_value,
    State.SCANNING_ESCAPED: handle_scanning_escaped,
}