Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/markdown/htmlparser.py: 44%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

140 statements  

1# Python Markdown 

2 

3# A Python implementation of John Gruber's Markdown. 

4 

5# Documentation: https://python-markdown.github.io/ 

6# GitHub: https://github.com/Python-Markdown/markdown/ 

7# PyPI: https://pypi.org/project/Markdown/ 

8 

9# Started by Manfred Stienstra (http://www.dwerg.net/). 

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). 

11# Currently maintained by Waylan Limberg (https://github.com/waylan), 

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). 

13 

14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later) 

15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 

16# Copyright 2004 Manfred Stienstra (the original version) 

17 

18# License: BSD (see LICENSE.md for details). 

19 

20""" 

21This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches. 

22A copy is imported rather than the module being directly imported as this ensures that the user can import 

23and use the unmodified library for their own needs. 

24""" 

25 

26from __future__ import annotations 

27 

28import re 

29import importlib.util 

30import sys 

31from typing import TYPE_CHECKING, Sequence 

32 

33if TYPE_CHECKING: # pragma: no cover 

34 from markdown import Markdown 

35 

36 

# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
# `module_from_spec` builds a new module object from the spec and `exec_module`
# executes the `html.parser` source inside it, so every attribute assigned on
# `htmlparser` below is set on this copy.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
# Register the copy in `sys.modules` under the top-level name `htmlparser`.
sys.modules['htmlparser'] = htmlparser

# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value.
# The pattern is compiled with `re.VERBOSE`, so the layout whitespace and the `#` comments
# inside the literal are ignored by the regex engine (whitespace inside `[...]` still counts).
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^`>\s]*          # bare value <= added backtick here
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)

# Match a blank line at the start of a block of text: two consecutive newlines,
# each of which may be preceded by any number of space characters (spaces only,
# not tabs or other whitespace).
blank_line_re = re.compile(r'^([ ]*\n){2}')

72 

73 

class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Create an extractor bound to `md`.

        `convert_charrefs` is set to `False` unless the caller supplies it explicitly.
        All other positional and keyword arguments are forwarded to the base parser.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """Returns char index in `self.rawdata` for the start of the current line. """
        # `lineno_start_cache[n]` holds the start index of line `n + 1`. Extend the
        # cache from its last known entry up to the current line; entries already
        # computed are never scanned again.
        for ii in range(len(self.lineno_start_cache) - 1, self.lineno - 1):
            last_line_start_pos = self.lineno_start_cache[ii]
            lf_pos = self.rawdata.find('\n', last_line_start_pos)
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            self.lineno_start_cache.append(lf_pos + 1)

        return self.lineno_start_cache[self.lineno - 1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        line_start = self.line_offset
        return not self.rawdata[line_start:line_start + self.offset].strip()

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return f'</{tag}>'

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """
        Handle an opening tag.

        Tags listed in `empty_tags` are passed to `handle_startendtag`. A block-level tag
        opens a raw block when the parser is in the tail of a previous raw block, or when
        the tag is at the start of a line and no raw block is open. Inside a raw block the
        tag is pushed on `stack` and its source text is cached; otherwise the source text
        is appended to `cleandoc`.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """
        Handle a closing tag.

        Inside a raw block the tag text is cached and `stack` is unwound down to the
        matching opening tag. When `stack` becomes empty the cached block is stored in
        the `htmlStash` and its placeholder is appended to `cleandoc`. Outside a raw
        block the tag text is appended to `cleandoc` directly.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after `endtag`.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data: str):
        """Append `data` to the raw-block cache or to `cleandoc`; a newline ends tail mode."""
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """Handle a self-closing tag using the source text of the start tag."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Pass a numeric character reference through as `&#name;` (never a block)."""
        self.handle_empty_tag(f'&#{name};', is_block=False)

    def handle_entityref(self, name: str):
        """Pass a named entity reference through as `&name;` (never a block)."""
        self.handle_empty_tag(f'&{name};', is_block=False)

    def handle_comment(self, data: str):
        """Rebuild the comment as `<!--data-->` and handle it as a block-level empty tag."""
        self.handle_empty_tag(f'<!--{data}-->', is_block=True)

    def handle_decl(self, data: str):
        """Rebuild the declaration as `<!data>` and handle it as a block-level empty tag."""
        self.handle_empty_tag(f'<!{data}>', is_block=True)

    def handle_pi(self, data: str):
        """Rebuild the processing instruction as `<?data?>` and handle it as a block-level empty tag."""
        self.handle_empty_tag(f'<?{data}?>', is_block=True)

    def unknown_decl(self, data: str):
        """Rebuild a `<![...` declaration, closing with `]]>` for `CDATA[` and `]>` otherwise."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag(f'<![{data}{end}', is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction only at a line start or in a raw-block tail."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i: int) -> int:
        """Parse an HTML declaration only at a line start or in a raw-block tail."""
        if self.at_line_start() or self.intail:
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        """Pass the source text of a bogus comment through unaltered as a non-block empty tag."""
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__startag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str | None:
        """Return full source of start tag: `<...>`, or `None` if no start tag has been parsed."""
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag beginning at index `i` of `rawdata`.

        Returns the index just past the tag, or the negative value reported by
        `check_for_whole_start_tag` when the tag is not complete.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between `i+1` and `j` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk before the closing bracket: emit the whole tag source as plain data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos