Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/mdit_py_plugins/attrs/parse.py: 99%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

158 statements  

1"""Parser for attributes:: 

2 

3 attributes { id = "foo", class = "bar baz", 

4 key1 = "val1", key2 = "val2" } 

5 

6Adapted from: 

7https://github.com/jgm/djot/blob/fae7364b86bfce69bc6d5b5eede1f5196d845fd6/djot/attributes.lua#L1 

8 

9syntax: 

10 

11attributes <- '{' whitespace* attribute (whitespace attribute)* whitespace* '}' 

12attribute <- identifier | class | keyval 

13identifier <- '#' name 

14class <- '.' name 

15name <- (nonspace, nonpunctuation other than ':', '_', '-')+ 

16keyval <- key '=' val 

17key <- (ASCII_ALPHANUM | ':' | '_' | '-')+ 

18val <- bareval | quotedval 

19bareval <- (ASCII_ALPHANUM | ':' | '_' | '-')+ 

20quotedval <- '"' ([^"] | '\"')* '"' 

21""" 

22 

23from __future__ import annotations 

24 

25from enum import Enum 

26import re 

27from typing import Callable 

28 

29 

class State(Enum):
    """States of the character-by-character attribute scanner.

    Each member (except ``DONE``) is a key of the ``HANDLERS`` table; the
    handler consumes one character and returns the next state.
    """

    # Nothing consumed yet; the opening "{" is expected.
    START = 0
    # Inside the braces, between attributes.
    SCANNING = 1
    # Inside a "#identifier".
    SCANNING_ID = 2
    # Inside a ".class".
    SCANNING_CLASS = 3
    # Inside the key of a "key=value" pair.
    SCANNING_KEY = 4
    # Just after "=", waiting for the first character of the value.
    SCANNING_VALUE = 5
    # Inside an unquoted value.
    SCANNING_BARE_VALUE = 6
    # Inside a double-quoted value.
    SCANNING_QUOTED_VALUE = 7
    # Inside a "%...%" comment.
    SCANNING_COMMENT = 8
    # Just after a backslash inside a quoted value.
    SCANNING_ESCAPED = 9
    # The closing "}" has been consumed.
    DONE = 10

42 

43 

# A single whitespace character.
REGEX_SPACE = re.compile(r"\s")
# A single whitespace or ASCII punctuation character; ':', '_' and '-' are
# deliberately absent from the set, so they are allowed in names.
REGEX_SPACE_PUNCTUATION = re.compile(r"[\s!\"#$%&'()*+,./;<=>?@[\]^`{|}~]")
# A single character allowed in a key or a bare (unquoted) value.
REGEX_KEY_CHARACTERS = re.compile(r"[a-zA-Z\d_:-]")

47 

48 

class TokenState:
    """Accumulates ``(start, end, type)`` spans found while scanning.

    ``start`` holds the offset most recently recorded with :meth:`set_start`;
    the recorded spans are turned into an attribute dictionary by
    :meth:`compile`.
    """

    def __init__(self) -> None:
        # Recorded spans, in the order they were appended.
        self._tokens: list[tuple[int, int, str]] = []
        # Offset remembered by the most recent set_start() call.
        self.start: int = 0

    def set_start(self, start: int) -> None:
        """Remember ``start`` as the offset where the current item begins."""
        self.start = start

    def append(self, start: int, end: int, ttype: str) -> None:
        """Record the half-open span ``[start, end)`` with type ``ttype``."""
        self._tokens.append((start, end, ttype))

    def compile(self, string: str) -> dict[str, str]:
        """Build the attribute dictionary from the recorded spans.

        :param string: the text the recorded offsets index into
        :returns: mapping of attribute name to value; every class (from a
            "class" token or a ``class`` key) is collected and stored
            space-joined under ``"class"``
        """
        result: dict[str, str] = {}
        class_names: list[str] = []
        total = len(self._tokens)
        cursor = 0
        while cursor < total:
            begin, finish, kind = self._tokens[cursor]
            cursor += 1
            text = string[begin:finish]
            if kind == "id":
                # A later id overwrites an earlier one.
                result["id"] = text
            elif kind == "class":
                class_names.append(text)
            elif kind == "key" and cursor < total:
                v_begin, v_finish, v_kind = self._tokens[cursor]
                # A key only counts when the very next token is its value;
                # otherwise the key is dropped and that token is handled
                # on the next iteration.
                if v_kind == "value":
                    cursor += 1
                    value = string[v_begin:v_finish]
                    if text == "class":
                        class_names.append(value)
                    else:
                        result[text] = value
        if class_names:
            result["class"] = " ".join(class_names)
        return result

    def __str__(self) -> str:
        return str(self._tokens)

    def __repr__(self) -> str:
        return repr(self._tokens)

91 

92 

class ParseError(Exception):
    """Error raised when the attribute text cannot be parsed.

    The offending offset is kept on ``pos`` and also appended to the message.
    """

    def __init__(self, msg: str, pos: int) -> None:
        """
        :param msg: description of the problem (without the position)
        :param pos: offset in the parsed string where the problem was found
        """
        super().__init__(f"{msg} at position {pos}")
        self.pos = pos

97 

98 

def parse(string: str) -> tuple[int, dict[str, str]]:
    """Parse an attribute block from the start of ``string``.

    The text is fed one character at a time to the handler registered in
    ``HANDLERS`` for the current state, until a handler reports
    ``State.DONE`` (the closing ``}`` was consumed).

    :param string: text that must begin with ``{``
    :returns: ``(index of the closing "}", dict of attributes)``; the index
        is one less than the number of characters consumed
    :raises ParseError: if a handler rejects a character, or if the input
        ends (or is empty) before the closing ``}`` is reached
    """
    state: State = State.START
    tokens = TokenState()
    for pos, char in enumerate(string):
        state = HANDLERS[state](char, pos, tokens)
        if state == State.DONE:
            return pos, tokens.compile(string)

    # Running out of input means the block was never closed. Reporting that
    # as success would return a position with a different meaning (the
    # string length instead of the index of "}") together with attributes
    # collected from an incomplete block.
    raise ParseError("Attributes must end with '}'", len(string))

114 

115 

def handle_start(char: str, pos: int, tokens: TokenState) -> State:
    """Consume the first character, which has to be the opening brace.

    :raises ParseError: if ``char`` is anything other than ``{``
    """
    if char != "{":
        raise ParseError("Attributes must start with '{'", pos)
    return State.SCANNING

120 

121 

def handle_scanning(char: str, pos: int, tokens: TokenState) -> State:
    """Handle a character found between attributes.

    Space, tab, newline and carriage return are skipped, ``}`` finishes the
    block, and any other accepted character marks the start of an id
    (``#``), comment (``%``), class (``.``) or key; its offset is recorded
    with ``tokens.set_start``.

    :raises ParseError: if ``char`` cannot start any of those items
    """
    if char in (" ", "\t", "\n", "\r"):
        return State.SCANNING
    if char == "}":
        return State.DONE

    if char == "#":
        next_state = State.SCANNING_ID
    elif char == "%":
        next_state = State.SCANNING_COMMENT
    elif char == ".":
        next_state = State.SCANNING_CLASS
    elif REGEX_KEY_CHARACTERS.fullmatch(char):
        next_state = State.SCANNING_KEY
    else:
        raise ParseError(f"Unexpected character whilst scanning: {char}", pos)

    # Every item start remembers where the item begins.
    tokens.set_start(pos)
    return next_state

141 

142 

def handle_scanning_comment(char: str, pos: int, tokens: TokenState) -> State:
    """Skip comment text; a ``%`` closes the comment and resumes scanning."""
    return State.SCANNING if char == "%" else State.SCANNING_COMMENT

148 

149 

def handle_scanning_id(char: str, pos: int, tokens: TokenState) -> State:
    """Handle a character while inside a ``#identifier``.

    Characters that are neither whitespace nor listed punctuation extend the
    identifier. ``}`` or whitespace ends it; a non-empty identifier is then
    recorded as an "id" token that excludes the leading ``#``.

    :raises ParseError: on any other punctuation character
    """
    if REGEX_SPACE_PUNCTUATION.fullmatch(char) is None:
        return State.SCANNING_ID

    if char == "}":
        next_state = State.DONE
    elif REGEX_SPACE.fullmatch(char):
        next_state = State.SCANNING
    else:
        raise ParseError(f"Unexpected character whilst scanning id: {char}", pos)

    # tokens.start points at the "#"; the name begins one character later.
    name_start = tokens.start + 1
    if pos > name_start:
        tokens.append(name_start, pos, "id")
    return next_state

165 

166 

def handle_scanning_class(char: str, pos: int, tokens: TokenState) -> State:
    """Handle a character while inside a ``.class``.

    Characters that are neither whitespace nor listed punctuation extend the
    class name. ``}`` or whitespace ends it; a non-empty name is then
    recorded as a "class" token that excludes the leading ``.``.

    :raises ParseError: on any other punctuation character
    """
    if REGEX_SPACE_PUNCTUATION.fullmatch(char) is None:
        return State.SCANNING_CLASS

    if char == "}":
        next_state = State.DONE
    elif REGEX_SPACE.fullmatch(char):
        next_state = State.SCANNING
    else:
        raise ParseError(f"Unexpected character whilst scanning class: {char}", pos)

    # tokens.start points at the "."; the name begins one character later.
    name_start = tokens.start + 1
    if pos > name_start:
        tokens.append(name_start, pos, "class")
    return next_state

182 

183 

def handle_scanning_key(char: str, pos: int, tokens: TokenState) -> State:
    """Handle a character while inside a key.

    ``=`` ends the key, which is recorded as a "key" token spanning from the
    remembered start up to (not including) the ``=``.

    :raises ParseError: if ``char`` is neither ``=`` nor a key character
    """
    if char == "=":
        tokens.append(tokens.start, pos, "key")
        return State.SCANNING_VALUE

    if not REGEX_KEY_CHARACTERS.fullmatch(char):
        raise ParseError(f"Unexpected character whilst scanning key: {char}", pos)

    return State.SCANNING_KEY

193 

194 

def handle_scanning_value(char: str, pos: int, tokens: TokenState) -> State:
    """Handle the first character after ``=``.

    A double quote opens a quoted value and a key character opens a bare
    value; in both cases the offset is recorded with ``tokens.set_start``.

    :raises ParseError: if ``char`` can start neither kind of value
    """
    if char == '"':
        next_state = State.SCANNING_QUOTED_VALUE
    elif REGEX_KEY_CHARACTERS.fullmatch(char):
        next_state = State.SCANNING_BARE_VALUE
    else:
        raise ParseError(f"Unexpected character whilst scanning value: {char}", pos)

    tokens.set_start(pos)
    return next_state

205 

206 

def handle_scanning_bare_value(char: str, pos: int, tokens: TokenState) -> State:
    """Handle a character while inside an unquoted value.

    Key characters extend the value. ``}`` or whitespace ends it, and the
    span from the remembered start up to ``pos`` is recorded as a "value"
    token.

    :raises ParseError: on any other character
    """
    if REGEX_KEY_CHARACTERS.fullmatch(char):
        return State.SCANNING_BARE_VALUE

    if char == "}":
        next_state = State.DONE
    elif REGEX_SPACE.fullmatch(char):
        next_state = State.SCANNING
    else:
        raise ParseError(
            f"Unexpected character whilst scanning bare value: {char}", pos
        )

    tokens.append(tokens.start, pos, "value")
    return next_state

220 

221 

def handle_scanning_escaped(char: str, pos: int, tokens: TokenState) -> State:
    """Consume the character following a backslash in a quoted value.

    Whatever the character is, it is accepted unchanged and scanning of the
    quoted value resumes.
    """
    return State.SCANNING_QUOTED_VALUE

224 

225 

def handle_scanning_quoted_value(char: str, pos: int, tokens: TokenState) -> State:
    """Handle a character while inside a double-quoted value.

    An unescaped ``"`` ends the value, which is recorded as a single "value"
    token that excludes both quotes. A backslash switches to the escaped
    state so that the next character is accepted verbatim. Any other
    character, including a newline, extends the value.

    :raises ParseError: on an unescaped ``{`` or ``}`` inside the quotes
    """
    if char == '"':
        # tokens.start points at the opening quote.
        tokens.append(tokens.start + 1, pos, "value")
        return State.SCANNING

    if char == "\\":
        return State.SCANNING_ESCAPED

    if char == "{" or char == "}":
        raise ParseError(
            f"Unexpected character whilst scanning quoted value: {char}", pos
        )

    # A newline is part of the value like any other character. Recording a
    # "value" token here (without moving the start) would make the value
    # end at its first line break, because only the first value token that
    # follows a key is used when the attributes are compiled.
    return State.SCANNING_QUOTED_VALUE

244 

245 

# Dispatch table: the handler to call for each scanner state. Every handler
# receives (character, position, token accumulator) and returns the next
# state. State.DONE has no entry here.
HANDLERS: dict[State, Callable[[str, int, TokenState], State]] = {
    # opening brace
    State.START: handle_start,
    # between attributes
    State.SCANNING: handle_scanning,
    State.SCANNING_COMMENT: handle_scanning_comment,
    # "#id" and ".class"
    State.SCANNING_ID: handle_scanning_id,
    State.SCANNING_CLASS: handle_scanning_class,
    # key=value pairs
    State.SCANNING_KEY: handle_scanning_key,
    State.SCANNING_VALUE: handle_scanning_value,
    State.SCANNING_BARE_VALUE: handle_scanning_bare_value,
    State.SCANNING_QUOTED_VALUE: handle_scanning_quoted_value,
    State.SCANNING_ESCAPED: handle_scanning_escaped,
}