Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/htmlparser.py: 94%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

160 statements  

1# Python Markdown 

2 

3# A Python implementation of John Gruber's Markdown. 

4 

5# Documentation: https://python-markdown.github.io/ 

6# GitHub: https://github.com/Python-Markdown/markdown/ 

7# PyPI: https://pypi.org/project/Markdown/ 

8 

9# Started by Manfred Stienstra (http://www.dwerg.net/). 

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). 

11# Currently maintained by Waylan Limberg (https://github.com/waylan), 

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). 

13 

14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later) 

15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 

16# Copyright 2004 Manfred Stienstra (the original version) 

17 

18# License: BSD (see LICENSE.md for details). 

19 

20""" 

21This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches. 

22A copy is imported rather than the module being directly imported as this ensures that the user can import 

23and use the unmodified library for their own needs. 

24""" 

25 

26from __future__ import annotations 

27 

28import re 

29import importlib.util 

30import sys 

31from typing import TYPE_CHECKING, Sequence 

32 

33if TYPE_CHECKING: # pragma: no cover 

34 from markdown import Markdown 

35 

36 

# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
# Look up the import spec of the standard library's `html.parser` module.
spec = importlib.util.find_spec('html.parser')
# Build a new module object from that spec and execute the module's code in it.
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
# Register the copy in `sys.modules` under the top-level name `htmlparser`.
sys.modules['htmlparser'] = htmlparser

43 

# All assignments below replace module-level regular expressions on the `htmlparser`
# module object created above.

# This is a hack. We are sneaking in `</>` so we can capture it without the HTML parser
# throwing it away. When we see it, we will process it as data.
htmlparser.starttagopen = re.compile('<[a-zA-Z]|</>')

# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value.
# The pattern is compiled with `re.VERBOSE`, so the `#` comments inside it are part of the
# string literal and are ignored by the regex engine.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                           # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                     # value indicator
        (?:'[^']*'                    # LITA-enclosed value
          |"[^"]*"                    # LIT-enclosed value
          |(?!['"])[^`>\s]*           # bare value <= added backtick here
         )
         (?:\s*,)*                    # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                 # trailing whitespace
""", re.VERBOSE)

72 

# Match a blank line at the start of a block of text (two newlines).
# The newlines may be preceded by additional whitespace.
# Concretely: anchored at the start of the string, two consecutive runs of
# "zero or more spaces followed by a newline".
blank_line_re = re.compile(r'^([ ]*\n){2}')

76 

77 

class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Create a new extractor.

        Arguments:
            md: The `Markdown` instance. Its `is_block_level` method and `htmlStash.store`
                method are used while parsing.
            *args: Passed through to the base parser.
            **kwargs: Passed through to the base parser. `convert_charrefs` defaults to
                `False` when the caller does not supply it.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        self.override_comment_update = False

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """Returns char index in `self.rawdata` for the start of the current line. """
        starts = self.lineno_start_cache
        # Extend the cache of line start positions up to the current line number.
        for ii in range(len(starts) - 1, self.lineno - 1):
            lf_pos = self.rawdata.find('\n', starts[ii])
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            starts.append(lf_pos + 1)

        return starts[self.lineno - 1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        line_start = self.line_offset
        return not self.rawdata[line_start:line_start + self.offset].strip()

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return f'</{tag}>'

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """
        Handle a start tag.

        Tags listed in `empty_tags` are routed to `handle_startendtag`. A block-level tag
        found in the tail of a previous raw block, or at the start of a line while not
        already in a raw block, opens a new raw block. While in a raw block the tag text
        is cached; otherwise it is appended to `cleandoc`.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """
        Handle an end tag.

        Outside a raw block the tag text is appended to `cleandoc`. Inside a raw block
        it is cached and the tag stack is unwound; once the stack is empty the cached
        block is stored in the HTML stash and its placeholder is appended to `cleandoc`.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after `endtag`.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data: str):
        """Cache `data` while in a raw block, otherwise append it to `cleandoc`."""
        # A newline in the data ends the tail of the previous raw block.
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """Handle a self-closing tag as an empty tag, using its original source text."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Pass a character reference (`&#...;`) through as inline text."""
        self.handle_empty_tag(f'&#{name};', is_block=False)

    def handle_entityref(self, name: str):
        """Pass an entity reference (`&...;`) through as inline text."""
        self.handle_empty_tag(f'&{name};', is_block=False)

    def handle_comment(self, data: str):
        """
        Handle a comment.

        A closed comment is handled as a block-level empty tag. If the text following
        the comment body is not `-->`, only a `<` is emitted as data and
        `override_comment_update` is set so that the next `updatepos` call is overridden.
        """
        # Check if the comment is unclosed, if so, we need to override position
        i = self.line_offset + self.offset + len(data) + 4
        if self.rawdata[i:i + 3] != '-->':
            self.handle_data('<')
            self.override_comment_update = True
            return
        self.handle_empty_tag(f'<!--{data}-->', is_block=True)

    def updatepos(self, i: int, j: int) -> int:
        """
        Update the parser position.

        When `override_comment_update` is set (see `handle_comment`), the flag is cleared
        and the base implementation is called with `i=0` and `j=1` instead of the
        supplied values.
        """
        if self.override_comment_update:
            self.override_comment_update = False
            i = 0
            j = 1
        return super().updatepos(i, j)

    def handle_decl(self, data: str):
        """Handle a declaration (`<!...>`) as a block-level empty tag."""
        self.handle_empty_tag(f'<!{data}>', is_block=True)

    def handle_pi(self, data: str):
        """Handle a processing instruction (`<?...?>`) as a block-level empty tag."""
        self.handle_empty_tag(f'<?{data}?>', is_block=True)

    def unknown_decl(self, data: str):
        """Handle a marked section (`<![...`) as a block-level empty tag."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag(f'<![{data}{end}', is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction only at the start of a line or in a raw block's tail."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i: int) -> int:
        """Parse a declaration only at the start of a line or in a raw block's tail."""
        if self.at_line_start() or self.intail:
            if self.rawdata[i:i+3] == '<![' and self.rawdata[i:i+9] != '<![CDATA[':
                # We have encountered the bug in #1534 (Python bug `gh-77057`).
                # Provide an override until we drop support for Python < 3.13.
                result = self.parse_bogus_comment(i)
                if result == -1:
                    self.handle_data(self.rawdata[i:i + 1])
                    return i + 1
                return result
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        """Parse a bogus comment and pass its original source text through as inline text."""
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__starttag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str | None:
        """Return full source of start tag: `<...>` (`None` if no start tag has been parsed)."""
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag beginning at index `i` of `self.rawdata`.

        Returns the index at which parsing should continue.
        """
        # Treat `</>` as normal data as it is not a real tag.
        if self.rawdata[i:i + 3] == '</>':
            self.handle_data(self.rawdata[i:i + 3])
            return i + 3

        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            # Not a complete start tag. Emit the `<` as data and move on.
            self.handle_data(self.rawdata[i:i + 1])
            return i + 1
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between `i+1` and `j` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif (
                attrvalue[:1] == '\'' == attrvalue[-1:]
                or attrvalue[:1] == '"' == attrvalue[-1:]
            ):
                # Strip the matching pair of quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Unexpected characters before the end of the tag. Pass it through as data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos

372 if end.endswith('/>'): 

373 # XHTML-style empty tag: `<span attr="value" />` 

374 self.handle_startendtag(tag, attrs) 

375 else: 

376 # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) *** 

377 if tag in self.CDATA_CONTENT_ELEMENTS: 

378 self.set_cdata_mode(tag) 

379 self.handle_starttag(tag, attrs) 

380 return endpos