Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/htmlparser.py: 96%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

161 statements  

1# Python Markdown 

2 

3# A Python implementation of John Gruber's Markdown. 

4 

5# Documentation: https://python-markdown.github.io/ 

6# GitHub: https://github.com/Python-Markdown/markdown/ 

7# PyPI: https://pypi.org/project/Markdown/ 

8 

9# Started by Manfred Stienstra (http://www.dwerg.net/). 

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). 

11# Currently maintained by Waylan Limberg (https://github.com/waylan), 

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). 

13 

14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later) 

15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 

16# Copyright 2004 Manfred Stienstra (the original version) 

17 

18# License: BSD (see LICENSE.md for details). 

19 

20""" 

21This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches. 

22A copy is imported rather than the module being directly imported as this ensures that the user can import 

23and use the unmodified library for their own needs. 

24""" 

25 

26from __future__ import annotations 

27 

28import re 

29import importlib.util 

30import sys 

31from typing import TYPE_CHECKING, Sequence 

32 

33if TYPE_CHECKING: # pragma: no cover 

34 from markdown import Markdown 

35 

36 

# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
# The copy is built from the module spec and executed separately, so the
# attribute assignments below only affect this module object.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
# Register the private copy in the module cache under the name `htmlparser`.
sys.modules['htmlparser'] = htmlparser

# This is a hack. We are sneaking in `</>` so we can capture it without the HTML parser
# throwing it away. When we see it, we will process it as data.
htmlparser.starttagopen = re.compile('<[a-zA-Z]|</>')

# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value.
# Both patterns are compiled with `re.VERBOSE`, so the layout whitespace and the
# `#` remarks inside the literals are not part of what is matched.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                           # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                     # value indicator
        (?:'[^']*'                    # LITA-enclosed value
          |"[^"]*"                    # LIT-enclosed value
          |(?!['"])[^`>\s]*           # bare value <= added backtick here
         )
         (?:\s*,)*                    # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                 # trailing whitespace
""", re.VERBOSE)
# NOTE(review): unlike the pattern above, the bare-value class here does not
# exclude a backtick - confirm whether that difference is intentional.
htmlparser.locatetagend = re.compile(r"""
  [a-zA-Z][^`\t\n\r\f />]*           # tag name
  [\t\n\r\f /]*                     # optional whitespace before attribute name
  (?:(?<=['"\t\n\r\f /])[^`\t\n\r\f />][^\t\n\r\f /=>]*  # attribute name
    (?:=                            # value indicator
      (?:'[^']*'                    # LITA-enclosed value
        |"[^"]*"                    # LIT-enclosed value
        |(?!['"])[^>\t\n\r\f ]*     # bare value
      )
    )?
    [\t\n\r\f /]*                   # possibly followed by a space
  )*
  >?
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# The newlines may be preceded by additional whitespace.
# Only space characters are accepted before each newline (`[ ]*`).
blank_line_re = re.compile(r'^([ ]*\n){2}')

90 

91 

class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Create the extractor.

        Arguments:
            md: The `Markdown` instance whose `htmlStash` and `is_block_level` are used.
            *args: Passed through to the base parser.
            **kwargs: Passed through to the base parser. `convert_charrefs` defaults
                to `False` unless the caller supplies it.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        self.override_comment_update = False

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """Returns char index in `self.rawdata` for the start of the current line. """
        # Extend the cache of line start positions up to the current line. Each new
        # entry is found by searching for the next newline after the previous start.
        for ii in range(len(self.lineno_start_cache)-1, self.lineno-1):
            last_line_start_pos = self.lineno_start_cache[ii]
            lf_pos = self.rawdata.find('\n', last_line_start_pos)
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            self.lineno_start_cache.append(lf_pos+1)

        return self.lineno_start_cache[self.lineno-1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return f'</{tag}>'

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str | None]]):
        """
        Route a start tag either into the current raw block or into `cleandoc`.

        A block-level tag opens a new raw block when it follows the tail of a
        previous block or sits at the start of a line outside any raw block.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """
        Route an end tag, closing the current raw block when its stack empties.

        When the raw block ends, the cached text is stored in the `htmlStash`
        and the returned value is appended to `cleandoc`.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if len(self.stack) == 0:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after `endtag`.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data: str):
        """Append text to the raw block cache when in a raw block, else to `cleandoc`."""
        # A newline in the data ends the "tail" state that follows a raw block.
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs: Sequence[tuple[str, str | None]]):
        """Handle a self-closing tag using its original source text."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Pass a numeric character reference through as inline text."""
        self.handle_empty_tag(f'&#{name};', is_block=False)

    def handle_entityref(self, name: str):
        """Pass a named entity reference through as inline text."""
        self.handle_empty_tag(f'&{name};', is_block=False)

    def handle_comment(self, data: str):
        """Handle a comment, emitting only `<` as data when it is not closed by `-->`."""
        # Check if the comment is unclosed, if so, we need to override position
        i = self.line_offset + self.offset + len(data) + 4
        if self.rawdata[i:i + 3] != '-->':
            self.handle_data('<')
            self.override_comment_update = True
            return
        self.handle_empty_tag(f'<!--{data}-->', is_block=True)

    def updatepos(self, i: int, j: int) -> int:
        """Update the position, forcing `(0, 1)` once after an unclosed comment."""
        if self.override_comment_update:
            self.override_comment_update = False
            i = 0
            j = 1
        return super().updatepos(i, j)

    def handle_decl(self, data: str):
        """Handle a declaration (`<!...>`) as a block-level empty tag."""
        self.handle_empty_tag(f'<!{data}>', is_block=True)

    def handle_pi(self, data: str):
        """Handle a processing instruction (`<?...?>`) as a block-level empty tag."""
        self.handle_empty_tag(f'<?{data}?>', is_block=True)

    def unknown_decl(self, data: str):
        """Handle a `<![...` declaration, closing with `]]>` for CDATA and `]>` otherwise."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag(f'<![{data}{end}', is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction only at a line start or in a block tail."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i: int) -> int:
        """Parse a declaration only at a line start or in a block tail."""
        if self.at_line_start() or self.intail:
            if self.rawdata[i:i+3] == '<![' and self.rawdata[i:i+9] != '<![CDATA[':
                # We have encountered the bug in #1534 (Python bug `gh-77057`).
                # Provide an override until we drop support for Python < 3.13.
                result = self.parse_bogus_comment(i)
                if result == -1:
                    self.handle_data(self.rawdata[i:i + 1])
                    return i + 1
                return result
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        """Parse a bogus comment and pass its source text through unaltered."""
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__startag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str | None:
        """Return full source of start tag: `<...>`.

        Returns `None` when no start tag text is currently recorded.
        """
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag at index `i` of `self.rawdata`.

        Returns the index at which parsing should resume.
        """
        # Treat `</>` as normal data as it is not a real tag.
        if self.rawdata[i:i + 3] == '</>':
            self.handle_data(self.rawdata[i:i + 3])
            return i + 3

        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            # No complete start tag here; emit one character as data and move on.
            self.handle_data(self.rawdata[i:i + 1])
            return i + 1
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between `i+1` and `j` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without a value.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                # Strip the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Unparsed text remains before the closing bracket; pass the whole
            # tag through as data. (The position bookkeeping that used to sit
            # here computed values that were never read.)
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos