Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/htmlparser.py: 96%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

176 statements  

1# Python Markdown 

2 

3# A Python implementation of John Gruber's Markdown. 

4 

5# Documentation: https://python-markdown.github.io/ 

6# GitHub: https://github.com/Python-Markdown/markdown/ 

7# PyPI: https://pypi.org/project/Markdown/ 

8 

9# Started by Manfred Stienstra (http://www.dwerg.net/). 

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). 

11# Currently maintained by Waylan Limberg (https://github.com/waylan), 

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). 

13 

14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later) 

15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 

16# Copyright 2004 Manfred Stienstra (the original version) 

17 

18# License: BSD (see LICENSE.md for details). 

19 

20""" 

21This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches. 

22A copy is imported rather than the module being directly imported as this ensures that the user can import 

23and use the unmodified library for their own needs. 

24""" 

25 

26from __future__ import annotations 

27 

28import re 

29import importlib.util 

30import sys 

31from typing import TYPE_CHECKING, Sequence 

32 

33if TYPE_CHECKING: # pragma: no cover 

34 from markdown import Markdown 

35 

# Included for versions which do not have current comment fix
commentclose = re.compile(r'--!?>')
commentabruptclose = re.compile(r'-?>')

# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
sys.modules['htmlparser'] = htmlparser

# This is a hack. We are sneaking in `</>` so we can capture it without the HTML parser
# throwing it away. When we see it, we will process it as data.
htmlparser.starttagopen = re.compile(r'<[a-zA-Z]|</>')

# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                           # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                     # value indicator
        (?:'[^']*'                    # LITA-enclosed value
          |"[^"]*"                    # LIT-enclosed value
          |(?!['"])[^`>\s]*           # bare value <= added backtick here
         )
         (?:\s*,)*                    # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                 # trailing whitespace
""", re.VERBOSE)
htmlparser.locatetagend = re.compile(r"""
  [a-zA-Z][^`\t\n\r\f />]*           # tag name
  [\t\n\r\f /]*                      # optional whitespace before attribute name
  (?:(?<=['"\t\n\r\f /])[^`\t\n\r\f />][^\t\n\r\f /=>]*  # attribute name
    (?:=                             # value indicator
      (?:'[^']*'                     # LITA-enclosed value
        |"[^"]*"                     # LIT-enclosed value
        |(?!['"])[^>\t\n\r\f ]*      # bare value
      )
    )?
    [\t\n\r\f /]*                    # possibly followed by a space
  )*
  >?
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# The newlines may be preceded by additional whitespace.
blank_line_re = re.compile(r'^([ ]*\n){2}')



class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Create an extractor bound to `md`.

        `convert_charrefs` defaults to `False` unless the caller passes it explicitly.
        All positional and keyword arguments are forwarded to the parent parser.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        # Set by `handle_comment` and consumed by the next `updatepos` call.
        self.override_comment_update = False

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """Returns char index in `self.rawdata` for the start of the current line. """
        # Extend the cache of line start positions up to the current line number.
        for ii in range(len(self.lineno_start_cache)-1, self.lineno-1):
            last_line_start_pos = self.lineno_start_cache[ii]
            lf_pos = self.rawdata.find('\n', last_line_start_pos)
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            self.lineno_start_cache.append(lf_pos+1)

        return self.lineno_start_cache[self.lineno-1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return f'</{tag}>'

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """
        Handle a start tag.

        Tags listed in `empty_tags` are routed to `handle_startendtag`. A block-level tag
        seen in the tail of a previous raw block, or at the start of a line while not
        already in a raw block, opens a new raw block. While in a raw block the tag text
        is cached and the tag is pushed on the stack; otherwise the text goes to `cleandoc`.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """
        Handle an end tag.

        Inside a raw block the tag text is cached and the stack is unwound down to the
        matching start tag. When the stack becomes empty the cached block is stored in
        the stash and its placeholder is appended to `cleandoc`. Outside a raw block the
        tag text is appended to `cleandoc` unchanged.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after `endtag`.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data: str):
        """Cache `data` while in a raw block, otherwise append it to `cleandoc`."""
        # A newline ends the "tail" which follows a raw block on the same line.
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs):
        """Handle a self-closing tag by passing its source text to `handle_empty_tag`."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Pass a character reference through as `&#name;`, never as a block."""
        self.handle_empty_tag(f'&#{name};', is_block=False)

    def handle_entityref(self, name: str):
        """Pass an entity reference through as `&name;`, never as a block."""
        self.handle_empty_tag(f'&{name};', is_block=False)

    def handle_comment(self, data: str):
        """
        Handle a comment.

        A comment whose text is followed by `-->` is passed to `handle_empty_tag` as a
        block. Otherwise only `<` is emitted as data and `override_comment_update` is
        set so that the next `updatepos` call overrides the parser position.
        """
        # Check if the comment is unclosed, if so, we need to override position
        i = self.line_offset + self.offset + len(data) + 4
        if self.rawdata[i:i + 3] != '-->':
            self.handle_data('<')
            self.override_comment_update = True
            return
        self.handle_empty_tag(f'<!--{data}-->', is_block=True)

    def updatepos(self, i: int, j: int) -> int:
        """
        Update the line and offset bookkeeping and return the new position.

        When `handle_comment` has set `override_comment_update`, the flag is cleared
        and the supplied range is replaced with `0..1`.
        """
        if self.override_comment_update:
            self.override_comment_update = False
            # NOTE(review): the fixed range presumably relies on the unclosed comment
            # starting at index 0 of `rawdata` -- confirm this holds for every caller.
            i = 0
            j = 1
        return super().updatepos(i, j)

    def handle_decl(self, data: str):
        """Pass a declaration through as `<!data>`, treated as a block."""
        self.handle_empty_tag(f'<!{data}>', is_block=True)

    def handle_pi(self, data: str):
        """Pass a processing instruction through as `<?data?>`, treated as a block."""
        self.handle_empty_tag(f'<?{data}?>', is_block=True)

    def unknown_decl(self, data: str):
        """Pass an unknown declaration through, closing with `]]>` for CDATA and `]>` otherwise."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag(f'<![{data}{end}', is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction at `i`, but only at the start of a line or in a tail."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    if not hasattr(htmlparser, 'commentabruptclose'):
        # Internal -- parse comment, return length or -1 if not terminated
        # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
        def parse_comment(self, i, report=True):
            rawdata = self.rawdata
            assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
            match = commentclose.search(rawdata, i+4)
            if not match:
                match = commentabruptclose.match(rawdata, i+4)
                if not match:
                    return -1
            if report:
                j = match.start()
                self.handle_comment(rawdata[i+4: j])
            return match.end()

    def parse_html_declaration(self, i: int) -> int:
        """Parse a `<!...` declaration at `i`, but only at the start of a line or in a tail."""
        if self.at_line_start() or self.intail:
            if self.rawdata[i:i+3] == '<![' and self.rawdata[i:i+9] != '<![CDATA[':
                # We have encountered the bug in #1534 (Python bug `gh-77057`).
                # Provide an override until we drop support for Python < 3.13.
                result = self.parse_bogus_comment(i)
                if result == -1:
                    self.handle_data(self.rawdata[i:i + 1])
                    return i + 1
                return result
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        """Parse a bogus comment at `i` and pass its source text through as a non-block tag."""
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__startag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str | None:
        """Return full source of start tag: `<...>`."""
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag at `i` and return the index just past it.

        `</>` and incomplete or malformed tags are emitted as plain data.
        """
        # Treat `</>` as normal data as it is not a real tag.
        if self.rawdata[i:i + 3] == '</>':
            self.handle_data(self.rawdata[i:i + 3])
            return i + 3

        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            self.handle_data(self.rawdata[i:i + 1])
            return i + 1
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between `i+1` and `j` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Malformed tag ending: pass the whole span through as data. The line and
            # offset values previously computed here were never used, so they are gone.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos