Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/htmlparser.py: 96%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Python Markdown
3# A Python implementation of John Gruber's Markdown.
5# Documentation: https://python-markdown.github.io/
6# GitHub: https://github.com/Python-Markdown/markdown/
7# PyPI: https://pypi.org/project/Markdown/
9# Started by Manfred Stienstra (http://www.dwerg.net/).
10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
11# Currently maintained by Waylan Limberg (https://github.com/waylan),
12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
16# Copyright 2004 Manfred Stienstra (the original version)
18# License: BSD (see LICENSE.md for details).
20"""
21This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches.
22A copy is imported rather than the module being directly imported as this ensures that the user can import
23and use the unmodified library for their own needs.
24"""
26from __future__ import annotations
28import re
29import importlib.util
30import sys
31from typing import TYPE_CHECKING, Sequence
33if TYPE_CHECKING: # pragma: no cover
34 from markdown import Markdown
# Fallback comment-terminator patterns. They are only used by
# `HTMLExtractor.parse_comment`, which is defined when the standard library's
# `html.parser` does not provide its own `commentabruptclose`.
commentclose = re.compile(r'--!?>')
commentabruptclose = re.compile(r'-?>')

# Import a private copy of the `html.parser` lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default, unpatched behavior.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
# Register the patched copy under its own name; the `html.parser` entry is not replaced.
sys.modules['htmlparser'] = htmlparser

# This is a hack. We are sneaking in `</>` so we can capture it without the HTML parser
# throwing it away. When we see it, we will process it as data (see `parse_starttag`).
htmlparser.starttagopen = re.compile('<[a-zA-Z]|</>')

# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before `incomplete`,
# and the two regexes are the same object, `incomplete` will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
 <[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here
 (?:[\s/]* # optional whitespace before attribute name
 (?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
 (?:\s*=+\s* # value indicator
 (?:'[^']*' # LITA-enclosed value
 |"[^"]*" # LIT-enclosed value
 |(?!['"])[^`>\s]* # bare value <= added backtick here
 )
 (?:\s*,)* # possibly followed by a comma
 )?(?:\s|/(?!>))*
 )*
 )?
 \s* # trailing whitespace
""", re.VERBOSE)
# Companion patch for `locatetagend`: the tag name and the first character of an
# attribute name exclude the backtick here as well.
# NOTE(review): presumably only consulted by standard library versions that define
# `locatetagend` — confirm against the supported Python versions.
htmlparser.locatetagend = re.compile(r"""
 [a-zA-Z][^`\t\n\r\f />]* # tag name
 [\t\n\r\f /]* # optional whitespace before attribute name
 (?:(?<=['"\t\n\r\f /])[^`\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
 (?:= # value indicator
 (?:'[^']*' # LITA-enclosed value
 |"[^"]*" # LIT-enclosed value
 |(?!['"])[^>\t\n\r\f ]* # bare value
 )
 )?
 [\t\n\r\f /]* # possibly followed by a space
 )*
 >?
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# Each newline may be preceded by additional spaces.
blank_line_re = re.compile(r'^([ ]*\n){2}')
class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Create an extractor bound to `md`.

        `convert_charrefs` defaults to `False` unless the caller passes it explicitly.
        All other positional and keyword arguments are forwarded to the parent parser.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        # Set by `handle_comment` for an unclosed comment; consumed by `updatepos`.
        self.override_comment_update = False

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data and stash whatever is left of an unclosed raw block."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """Returns char index in `self.rawdata` for the start of the current line. """
        # Lazily extend the cache of line start positions up to the current line.
        for ii in range(len(self.lineno_start_cache) - 1, self.lineno - 1):
            last_line_start_pos = self.lineno_start_cache[ii]
            lf_pos = self.rawdata.find('\n', last_line_start_pos)
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            self.lineno_start_cache.append(lf_pos + 1)

        return self.lineno_start_cache[self.lineno - 1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return f'</{tag}>'

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """
        Handle an opening tag.

        Tags listed in `empty_tags` are routed to `handle_startendtag`. A block-level tag
        at the start of a line (or in the tail of a previous raw block) opens a new raw
        block. While in a raw block the tag text is cached; otherwise it is passed through
        to `cleandoc`.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """
        Handle a closing tag.

        Inside a raw block the tag text is cached and the tag stack is unwound down to
        the matching opening tag. When the stack becomes empty the cached block is
        stashed and its placeholder appended to `cleandoc`. Outside a raw block the tag
        text is passed through unchanged.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after `endtag`.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data: str):
        """Cache `data` when inside a raw block; otherwise append it to `cleandoc`."""
        if self.intail and '\n' in data:
            # A newline ends the tail of the previous raw block.
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs):
        """Handle a self-closing tag by passing its source text to `handle_empty_tag`."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Pass a character reference (`&#name;`) through as a non-block empty tag."""
        self.handle_empty_tag(f'&#{name};', is_block=False)

    def handle_entityref(self, name: str):
        """Pass an entity reference (`&name;`) through as a non-block empty tag."""
        self.handle_empty_tag(f'&{name};', is_block=False)

    def handle_comment(self, data: str):
        """
        Handle a comment.

        A comment whose source text is not terminated by `-->` is treated as unclosed:
        only the leading `<` is emitted as data and `updatepos` is told to override
        the next position update. Otherwise the comment is handled as a block-level
        empty tag.
        """
        # Check if the comment is unclosed, if so, we need to override position
        i = self.line_offset + self.offset + len(data) + 4
        if self.rawdata[i:i + 3] != '-->':
            self.handle_data('<')
            self.override_comment_update = True
            return
        self.handle_empty_tag(f'<!--{data}-->', is_block=True)

    def updatepos(self, i: int, j: int) -> int:
        """
        Update the parser position.

        When `handle_comment` flagged an unclosed comment, the requested range is
        replaced with `(0, 1)` for this one call and the flag is cleared.
        """
        if self.override_comment_update:
            self.override_comment_update = False
            i = 0
            j = 1
        return super().updatepos(i, j)

    def handle_decl(self, data: str):
        """Handle a declaration (`<!data>`) as a block-level empty tag."""
        self.handle_empty_tag(f'<!{data}>', is_block=True)

    def handle_pi(self, data: str):
        """Handle a processing instruction (`<?data?>`) as a block-level empty tag."""
        self.handle_empty_tag(f'<?{data}?>', is_block=True)

    def unknown_decl(self, data: str):
        """Handle a marked section (`<![data...`), closing `CDATA[` sections with `]]>` and others with `]>`."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag(f'<![{data}{end}', is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction only where a raw block may start; otherwise emit `<?` as data."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    if not hasattr(htmlparser, 'commentabruptclose'):
        # Internal -- parse comment, return length or -1 if not terminated
        # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
        def parse_comment(self, i, report=True):
            rawdata = self.rawdata
            assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
            match = commentclose.search(rawdata, i + 4)
            if not match:
                match = commentabruptclose.match(rawdata, i + 4)
                if not match:
                    return -1
            if report:
                j = match.start()
                self.handle_comment(rawdata[i + 4: j])
            return match.end()

    def parse_html_declaration(self, i: int) -> int:
        """Parse a `<!...` declaration only where a raw block may start; otherwise emit `<!` as data."""
        if self.at_line_start() or self.intail:
            if self.rawdata[i:i + 3] == '<![' and self.rawdata[i:i + 9] != '<![CDATA[':
                # We have encountered the bug in #1534 (Python bug `gh-77057`).
                # Provide an override until we drop support for Python < 3.13.
                result = self.parse_bogus_comment(i)
                if result == -1:
                    self.handle_data(self.rawdata[i:i + 1])
                    return i + 1
                return result
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        """Parse a bogus comment and pass its source text through unaltered as a non-block empty tag."""
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__startag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str | None:
        """Return full source of start tag: `<...>` (`None` if no start tag has been parsed yet)."""
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag beginning at index `i` of `self.rawdata`.

        Returns the index just past the consumed text. `</>` and tags which cannot be
        parsed as a whole start tag are emitted as plain data.
        """
        # Treat `</>` as normal data as it is not a real tag.
        if self.rawdata[i:i + 3] == '</>':
            self.handle_data(self.rawdata[i:i + 3])
            return i + 3

        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            self.handle_data(self.rawdata[i:i + 1])
            return i + 1
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between `i+1` and `j` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i + 1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk at the end of the tag: pass the whole thing through as data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos