Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/htmlparser.py: 99%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

171 statements  

1# Python Markdown 

2 

3# A Python implementation of John Gruber's Markdown. 

4 

5# Documentation: https://python-markdown.github.io/ 

6# GitHub: https://github.com/Python-Markdown/markdown/ 

7# PyPI: https://pypi.org/project/Markdown/ 

8 

9# Started by Manfred Stienstra (http://www.dwerg.net/). 

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). 

11# Currently maintained by Waylan Limberg (https://github.com/waylan), 

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). 

13 

14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later) 

15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 

16# Copyright 2004 Manfred Stienstra (the original version) 

17 

18# License: BSD (see LICENSE.md for details). 

19 

20""" 

21This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches. 

22A copy is imported rather than the module being directly imported as this ensures that the user can import 

23and use the unmodified library for their own needs. 

24""" 

25 

26from __future__ import annotations 

27 

28import re 

29import importlib.util 

30import sys 

31from typing import TYPE_CHECKING, Sequence 

32 

33if TYPE_CHECKING: # pragma: no cover 

34 from markdown import Markdown 

35 

# Pattern closing an HTML comment (`-->` or `--!>`). Kept here for Python versions
# which do not have the current comment fix.
commentclose = re.compile(r'--!?>')

# Load a separate copy of the `html.parser` module, bound to the name `htmlparser`,
# so that it can be monkeypatched. Users can still do `from html import parser`
# and get the default behavior.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
sys.modules['htmlparser'] = htmlparser

# Entity references are only recognized when they end with a semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Partial entities are no longer supported. A complete block is always fed to the
# parser, so the 'incomplete' handling is unnecessary. `entityref` is tried right
# before `incomplete`; with both being the same regex, `incomplete` can never match
# and the logic behind it is skipped.
htmlparser.incomplete = htmlparser.entityref

# Processing Instructions are only closed by `?>`.
htmlparser.piclose = re.compile(r'\?>')

# This is a hack. `</>` is sneaked in as a start tag opener so it can be captured
# rather than thrown away by the HTML parser. Once seen, it is processed as data.
htmlparser.starttagopen = re.compile('<[a-zA-Z]|</>')

htmlparser.endtagopen = re.compile('</[a-zA-Z]?')

# A backtick is not accepted in a tag name, attribute name, or bare value.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                           # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                     # value indicator
        (?:'[^']*'                    # LITA-enclosed value
          |"[^"]*"                    # LIT-enclosed value
          |(?!['"])[^`>\s]*           # bare value <= added backtick here
         )
        (?:\s*,)*                     # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                 # trailing whitespace
""", re.VERBOSE)
htmlparser.locatetagend = re.compile(r"""
  [a-zA-Z][^`\t\n\r\f />]*                                # tag name
  [\t\n\r\f /]*                                           # optional whitespace before attribute name
  (?:(?<=['"\t\n\r\f /])[^`\t\n\r\f />][^\t\n\r\f /=>]*   # attribute name
    (?:=                                                  # value indicator
      (?:'[^']*'                                          # LITA-enclosed value
        |"[^"]*"                                          # LIT-enclosed value
        |(?!['"])[^>\t\n\r\f ]*                           # bare value
       )
     )?
    [\t\n\r\f /]*                                         # possibly followed by a space
   )*
   >?
""", re.VERBOSE)

# Matches a blank line at the very start of a block of text, i.e. two newlines,
# each of which may be preceded by spaces.
blank_line_re = re.compile(r'^([ ]*\n){2}')

94 

95 

96class _HTMLParser(htmlparser.HTMLParser): 

97 """Handle special start and end tags.""" 

98 

99 def parse_endtag(self, i): 

100 start = self.rawdata[i:i+3] 

101 c = ord(start[-1]) 

102 if len(start) < 3 or not (65 <= c <= 90 or 97 <= c <= 122): 

103 self.handle_data(self.rawdata[i:i + 2]) 

104 return i + 2 

105 return super().parse_endtag(i) 

106 

107 def parse_starttag(self, i: int) -> int: # pragma: no cover 

108 # Treat `</>` as normal data as it is not a real tag. 

109 if self.rawdata[i:i + 3] == '</>': 

110 self.handle_data(self.rawdata[i:i + 3]) 

111 return i + 3 

112 

113 return super().parse_starttag(i) 

114 

115 

116# Replace the copy's `HTMLParser` with our custom subclass for projects like MkDocs that pull it in 

117htmlparser.HTMLParser = _HTMLParser 

118 

119 

class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Create an extractor bound to `md`.

        `convert_charrefs` defaults to `False` unless passed explicitly. All other
        arguments are handed on to the base class.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """Returns char index in `self.rawdata` for the start of the current line. """
        # Extend the cache of line start positions up to the current line.
        for ii in range(len(self.lineno_start_cache) - 1, self.lineno - 1):
            last_line_start_pos = self.lineno_start_cache[ii]
            lf_pos = self.rawdata.find('\n', last_line_start_pos)
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            self.lineno_start_cache.append(lf_pos + 1)

        return self.lineno_start_cache[self.lineno - 1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return '</{}>'.format(tag)

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str | None]]):
        """
        Handle a start tag.

        Tags listed in `empty_tags` are handled as empty tags. A block-level tag
        starts a raw block when in tail mode, or when at the start of a line and not
        already in a raw block. Inside a raw block the tag text is cached, otherwise
        it is appended to `cleandoc`.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """
        Handle an end tag.

        Inside a raw block the tag text is cached and `tag` is popped off the stack
        together with everything above it. Once the stack is empty the cached block
        is stored in the `htmlStash`. Outside a raw block the tag text is appended to
        `cleandoc`.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after `endtag`.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data: str):
        """
        Handle text data.

        Data containing a newline ends tail mode. The data is cached while inside a
        raw block and appended to `cleandoc` otherwise.
        """
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs: Sequence[tuple[str, str | None]]):
        """Handle a self-closing tag as an empty tag; it is a block if `tag` is block-level."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Handle a character reference by passing `&#name;` through as an inline empty tag."""
        self.handle_empty_tag('&#{};'.format(name), is_block=False)

    def handle_entityref(self, name: str):
        """Handle an entity reference by passing `&name;` through as an inline empty tag."""
        self.handle_empty_tag('&{};'.format(name), is_block=False)

    def handle_comment(self, data: str):
        """Handle a comment by rebuilding it as `<!--data-->` and treating it as a block-level empty tag."""
        self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)

    def handle_decl(self, data: str):
        """Handle a declaration by rebuilding it as `<!data>` and treating it as a block-level empty tag."""
        self.handle_empty_tag('<!{}>'.format(data), is_block=True)

    def handle_pi(self, data: str):
        """Handle a processing instruction by rebuilding it as `<?data?>` and treating it as a block-level empty tag."""
        self.handle_empty_tag('<?{}?>'.format(data), is_block=True)

    def unknown_decl(self, data: str):
        """Handle an unknown `<![...` declaration, closing it with `]]>` for `CDATA[` and `]>` otherwise."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction at `i`, but only at the start of a line or in tail mode."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    # Internal -- parse comment, return the index after it. An unterminated comment
    # has its leading `<` handled as data and `i + 1` is returned.
    # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
    def parse_comment(self, i, report=True):
        rawdata = self.rawdata
        assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i + 4)
        if not match:
            self.handle_data('<')
            return i + 1
        if report:
            j = match.start()
            self.handle_comment(rawdata[i + 4: j])
        return match.end()

    def parse_html_declaration(self, i: int) -> int:
        """Parse an HTML declaration at `i`, but only at the start of a line or in tail mode."""
        if self.at_line_start() or self.intail:
            if self.rawdata[i:i + 3] == '<![' and not self.rawdata[i:i + 9] == '<![CDATA[':
                # We have encountered the bug in #1534 (Python bug `gh-77057`).
                # Provide an override until we drop support for Python < 3.13.
                result = self.parse_bogus_comment(i)
                if result == -1:
                    self.handle_data(self.rawdata[i:i + 1])
                    return i + 1
                return result
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__starttag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str | None:
        """Return full source of start tag: `<...>`, or `None` if no start tag text is stored."""
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag at index `i` of `self.rawdata` and return the index to continue from.

        `</>` and incomplete or malformed start tags are handled as data.
        """
        # Treat `</>` as normal data as it is not a real tag.
        if self.rawdata[i:i + 3] == '</>':
            self.handle_data(self.rawdata[i:i + 3])
            return i + 3

        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            self.handle_data(self.rawdata[i:i + 1])
            return i + 1
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between `i+1` and `j` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i + 1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Unexpected text before the end of the tag: pass the whole span through as data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos