Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/mdit_py_plugins/attrs/parse.py: 99%

158 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:15 +0000

"""Parser for attributes::

    attributes { id = "foo", class = "bar baz",
                 key1 = "val1", key2 = "val2" }

Adapted from:
https://github.com/jgm/djot/blob/fae7364b86bfce69bc6d5b5eede1f5196d845fd6/djot/attributes.lua#L1

syntax:

attributes <- '{' whitespace* attribute (whitespace attribute)* whitespace* '}'
attribute <- identifier | class | keyval
identifier <- '#' name
class <- '.' name
name <- (nonspace, nonpunctuation other than ':', '_', '-')+
keyval <- key '=' val
key <- (ASCII_ALPHANUM | ':' | '_' | '-')+
val <- bareval | quotedval
bareval <- (ASCII_ALPHANUM | ':' | '_' | '-')+
quotedval <- '"' ([^"] | '\"')* '"'
"""

22from __future__ import annotations 

23 

24from enum import Enum 

25import re 

26from typing import Callable 

27 

28 

class State(Enum):
    """States of the character-by-character attribute parser.

    Each state except ``DONE`` has a handler function that consumes one
    character and returns the next state.
    """

    START = 0  # nothing consumed yet; the opening '{' is expected
    SCANNING = 1  # between attributes, inside the braces
    SCANNING_ID = 2  # inside a '#name' identifier
    SCANNING_CLASS = 3  # inside a '.name' class
    SCANNING_KEY = 4  # inside the key of a 'key=value' pair
    SCANNING_VALUE = 5  # just after '=', deciding between bare and quoted value
    SCANNING_BARE_VALUE = 6  # inside an unquoted value
    SCANNING_QUOTED_VALUE = 7  # inside a double-quoted value
    SCANNING_COMMENT = 8  # inside a '%...%' comment
    SCANNING_ESCAPED = 9  # just after a backslash inside a quoted value
    DONE = 10  # the closing '}' has been consumed

41 

42 

# A single whitespace character.
REGEX_SPACE = re.compile(r"\s")
# A single whitespace character or one of the listed punctuation characters.
# ':', '_', '-' and the backslash are NOT in the set, so they are accepted
# inside id and class names.
REGEX_SPACE_PUNCTUATION = re.compile(r"[\s!\"#$%&'()*+,./;<=>?@[\]^`{|}~]")
# A single character allowed in keys and bare values: a-z, A-Z, any digit
# matched by ``\d``, '_', ':' or '-'.
REGEX_KEY_CHARACTERS = re.compile(r"[a-zA-Z\d_:-]")

46 

47 

class TokenState:
    """Collects the tokens emitted while an attribute string is scanned.

    A token is a ``(start, end, type)`` tuple, where ``start:end`` is a slice
    of the scanned string and ``type`` is one of ``"id"``, ``"class"``,
    ``"key"`` or ``"value"``.
    """

    def __init__(self) -> None:
        self._tokens: list[tuple[int, int, str]] = []
        # Position recorded by the most recent ``set_start`` call.
        self.start: int = 0

    def set_start(self, start: int) -> None:
        """Record the position at which the current token begins."""
        self.start = start

    def append(self, start: int, end: int, ttype: str) -> None:
        """Store a token covering ``string[start:end]`` with type ``ttype``."""
        self._tokens.append((start, end, ttype))

    def compile(self, string: str) -> dict[str, str]:
        """Compile the tokens into a dictionary.

        :param string: the string the token positions refer to
        :returns: mapping of attribute name to value; all classes (from
            ``class`` tokens and from ``class=...`` pairs) are joined with a
            single space under the ``"class"`` key
        """
        attributes: dict[str, str] = {}
        classes: list[str] = []
        idx = 0
        while idx < len(self._tokens):
            start, end, ttype = self._tokens[idx]
            if ttype == "id":
                # A later id overwrites an earlier one.
                attributes["id"] = string[start:end]
            elif ttype == "class":
                classes.append(string[start:end])
            elif ttype == "key":
                key = string[start:end]
                # A key is only stored when the very next token is its value.
                if idx + 1 < len(self._tokens):
                    start, end, ttype = self._tokens[idx + 1]
                    if ttype == "value":
                        if key == "class":
                            classes.append(string[start:end])
                        else:
                            attributes[key] = string[start:end]
                        # Skip the value token that was just consumed.
                        idx += 1
            idx += 1
        if classes:
            attributes["class"] = " ".join(classes)
        return attributes

    def __str__(self) -> str:
        return str(self._tokens)

    def __repr__(self) -> str:
        return repr(self._tokens)

90 

91 

class ParseError(Exception):
    """Raised when an attribute string cannot be parsed.

    :param msg: description of the problem
    :param pos: index of the offending character; it is stored on ``pos``
        and appended to the exception message
    """

    def __init__(self, msg: str, pos: int) -> None:
        self.pos = pos
        super().__init__(msg + f" at position {pos}")

96 

97 

def parse(string: str) -> tuple[int, dict[str, str]]:
    """Parse attributes from start of string.

    The string is fed one character at a time to the handler registered in
    ``HANDLERS`` for the current state, starting from ``State.START``.

    :param string: text expected to begin with an attribute block
    :returns: ``(pos, attributes)``, where ``pos`` is the index of the
        character that moved the parser to ``State.DONE`` (the closing
        ``}``), or ``len(string)`` if the input ran out first
    :raises ParseError: propagated from a handler that meets an unexpected
        character
    """
    state: State = State.START
    tokens = TokenState()
    for pos, char in enumerate(string):
        state = HANDLERS[state](char, pos, tokens)
        if state is State.DONE:
            return pos, tokens.compile(string)

    # Input exhausted without reaching DONE: compile whatever was collected.
    return len(string), tokens.compile(string)

113 

114 

def handle_start(char: str, pos: int, tokens: TokenState) -> State:
    """Handle the first character, which must be the opening ``{``.

    :raises ParseError: if ``char`` is anything other than ``{``
    """
    if char == "{":
        return State.SCANNING
    raise ParseError("Attributes must start with '{'", pos)

119 

120 

def handle_scanning(char: str, pos: int, tokens: TokenState) -> State:
    """Handle a character between attributes and choose what to scan next.

    Space, tab, newline and carriage return are skipped; ``}`` finishes the
    block; ``#``, ``%`` and ``.`` start an id, a comment and a class; a key
    character starts a key. Every new token's start position is recorded on
    ``tokens``.

    :raises ParseError: if ``char`` is none of the above
    """
    if char in (" ", "\t", "\n", "\r"):
        return State.SCANNING
    if char == "}":
        return State.DONE
    if char == "#":
        tokens.set_start(pos)
        return State.SCANNING_ID
    if char == "%":
        tokens.set_start(pos)
        return State.SCANNING_COMMENT
    if char == ".":
        tokens.set_start(pos)
        return State.SCANNING_CLASS
    if REGEX_KEY_CHARACTERS.fullmatch(char):
        tokens.set_start(pos)
        return State.SCANNING_KEY

    raise ParseError(f"Unexpected character whilst scanning: {char}", pos)

140 

141 

def handle_scanning_comment(char: str, pos: int, tokens: TokenState) -> State:
    """Skip comment characters; a second ``%`` ends the comment."""
    if char == "%":
        return State.SCANNING

    return State.SCANNING_COMMENT

147 

148 

def handle_scanning_id(char: str, pos: int, tokens: TokenState) -> State:
    """Consume one character of a ``#name`` identifier.

    The id ends at ``}`` (parsing is then done) or at whitespace. An ``"id"``
    token is emitted only if at least one character follows the ``#``.

    :raises ParseError: on punctuation other than ``}``
    """
    if not REGEX_SPACE_PUNCTUATION.fullmatch(char):
        return State.SCANNING_ID

    if char == "}":
        # tokens.start is the position of '#'; the name starts one after it.
        if (pos - 1) > tokens.start:
            tokens.append(tokens.start + 1, pos, "id")
        return State.DONE

    if REGEX_SPACE.fullmatch(char):
        if (pos - 1) > tokens.start:
            tokens.append(tokens.start + 1, pos, "id")
        return State.SCANNING

    raise ParseError(f"Unexpected character whilst scanning id: {char}", pos)

164 

165 

def handle_scanning_class(char: str, pos: int, tokens: TokenState) -> State:
    """Consume one character of a ``.name`` class.

    The class ends at ``}`` (parsing is then done) or at whitespace. A
    ``"class"`` token is emitted only if at least one character follows the
    ``.``.

    :raises ParseError: on punctuation other than ``}``
    """
    if not REGEX_SPACE_PUNCTUATION.fullmatch(char):
        return State.SCANNING_CLASS

    if char == "}":
        # tokens.start is the position of '.'; the name starts one after it.
        if (pos - 1) > tokens.start:
            tokens.append(tokens.start + 1, pos, "class")
        return State.DONE

    if REGEX_SPACE.fullmatch(char):
        if (pos - 1) > tokens.start:
            tokens.append(tokens.start + 1, pos, "class")
        return State.SCANNING

    raise ParseError(f"Unexpected character whilst scanning class: {char}", pos)

181 

182 

def handle_scanning_key(char: str, pos: int, tokens: TokenState) -> State:
    """Consume one character of a key; ``=`` ends it and emits a ``"key"`` token.

    :raises ParseError: if ``char`` is neither ``=`` nor a key character
    """
    if char == "=":
        tokens.append(tokens.start, pos, "key")
        return State.SCANNING_VALUE

    if REGEX_KEY_CHARACTERS.fullmatch(char):
        return State.SCANNING_KEY

    raise ParseError(f"Unexpected character whilst scanning key: {char}", pos)

192 

193 

def handle_scanning_value(char: str, pos: int, tokens: TokenState) -> State:
    """Handle the first character after ``=``.

    A double quote starts a quoted value and a key character starts a bare
    value; in both cases the start position is recorded on ``tokens``.

    :raises ParseError: if ``char`` can start neither kind of value
    """
    if char == '"':
        tokens.set_start(pos)
        return State.SCANNING_QUOTED_VALUE

    if REGEX_KEY_CHARACTERS.fullmatch(char):
        tokens.set_start(pos)
        return State.SCANNING_BARE_VALUE

    raise ParseError(f"Unexpected character whilst scanning value: {char}", pos)

204 

205 

def handle_scanning_bare_value(char: str, pos: int, tokens: TokenState) -> State:
    """Consume one character of an unquoted value.

    The value ends at ``}`` (parsing is then done) or at whitespace; either
    way a ``"value"`` token is emitted.

    :raises ParseError: on any other character that is not a key character
    """
    if REGEX_KEY_CHARACTERS.fullmatch(char):
        return State.SCANNING_BARE_VALUE

    if char == "}":
        tokens.append(tokens.start, pos, "value")
        return State.DONE

    if REGEX_SPACE.fullmatch(char):
        tokens.append(tokens.start, pos, "value")
        return State.SCANNING

    raise ParseError(f"Unexpected character whilst scanning bare value: {char}", pos)

219 

220 

def handle_scanning_escaped(char: str, pos: int, tokens: TokenState) -> State:
    """Accept the character after a backslash and resume the quoted value.

    The character is not inspected, so an escaped ``"`` does not end the value.
    """
    return State.SCANNING_QUOTED_VALUE

223 

224 

def handle_scanning_quoted_value(char: str, pos: int, tokens: TokenState) -> State:
    """Consume one character of a double-quoted value.

    A closing ``"`` emits a ``"value"`` token without the surrounding quotes;
    a backslash switches to the escaped state; any other character except
    ``{`` and ``}`` is accepted.

    :raises ParseError: if ``char`` is ``{`` or ``}``
    """
    if char == '"':
        # tokens.start is the position of the opening quote.
        tokens.append(tokens.start + 1, pos, "value")
        return State.SCANNING

    if char == "\\":
        return State.SCANNING_ESCAPED

    if char == "{" or char == "}":
        raise ParseError(
            f"Unexpected character whilst scanning quoted value: {char}", pos
        )

    if char == "\n":
        # Emits a token for the text before the newline and keeps scanning.
        # tokens.start is not moved, so the token emitted at the closing quote
        # spans the whole value.
        # NOTE(review): TokenState.compile pairs a key only with the token
        # right after it, so the compiled value appears to stop at the first
        # newline - confirm whether that is intended.
        tokens.append(tokens.start + 1, pos, "value")
        return State.SCANNING_QUOTED_VALUE

    return State.SCANNING_QUOTED_VALUE

243 

244 

# Dispatch table: the handler that consumes the next character in each state.
# State.DONE has no entry because parsing stops as soon as it is reached.
HANDLERS: dict[State, Callable[[str, int, TokenState], State]] = {
    State.START: handle_start,
    State.SCANNING: handle_scanning,
    State.SCANNING_COMMENT: handle_scanning_comment,
    State.SCANNING_ID: handle_scanning_id,
    State.SCANNING_CLASS: handle_scanning_class,
    State.SCANNING_KEY: handle_scanning_key,
    State.SCANNING_VALUE: handle_scanning_value,
    State.SCANNING_BARE_VALUE: handle_scanning_bare_value,
    State.SCANNING_QUOTED_VALUE: handle_scanning_quoted_value,
    State.SCANNING_ESCAPED: handle_scanning_escaped,
}