Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/htmlparser.py: 99%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# Python Markdown

# A Python implementation of John Gruber's Markdown.

# Documentation: https://python-markdown.github.io/
# GitHub: https://github.com/Python-Markdown/markdown/
# PyPI: https://pypi.org/project/Markdown/

# Started by Manfred Stienstra (http://www.dwerg.net/).
# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
# Currently maintained by Waylan Limberg (https://github.com/waylan),
# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
# Copyright 2004 Manfred Stienstra (the original version)

# License: BSD (see LICENSE.md for details).

"""
This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches.
A copy is imported rather than the module being directly imported as this ensures that the user can import
and use the unmodified library for their own needs.
"""
26from __future__ import annotations
28import re
29import importlib.util
30import sys
31from typing import TYPE_CHECKING, Sequence
33if TYPE_CHECKING: # pragma: no cover
34 from markdown import Markdown
# Pattern matching the end of an HTML comment: `-->` or `--!>`.
# Included for Python versions whose `html.parser` does not have the current comment fix.
commentclose = re.compile(r'--!?>')

# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
# Register the private copy in `sys.modules` under the top-level name `htmlparser`.
sys.modules['htmlparser'] = htmlparser

# This is a hack. We are sneaking in `</>` so we can capture it without the HTML parser
# throwing it away. When we see it, we will process it as data.
htmlparser.starttagopen = re.compile('<[a-zA-Z]|</>')

# Matches `</`, optionally followed by a single ASCII letter.
htmlparser.endtagopen = re.compile('</[a-zA-Z]?')

# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value.
# NOTE: both patterns below are compiled with `re.VERBOSE`, so whitespace outside of character
# classes and everything after a `#` on a line is ignored by the regex engine.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here
  (?:[\s/]* # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
      (?:\s*=+\s* # value indicator
        (?:'[^']*' # LITA-enclosed value
          |"[^"]*" # LIT-enclosed value
          |(?!['"])[^`>\s]* # bare value <= added backtick here
         )
         (?:\s*,)* # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s* # trailing whitespace
""", re.VERBOSE)
# Same backtick restriction for the tag name and attribute name in the `locatetagend` pattern.
htmlparser.locatetagend = re.compile(r"""
  [a-zA-Z][^`\t\n\r\f />]* # tag name
  [\t\n\r\f /]* # optional whitespace before attribute name
  (?:(?<=['"\t\n\r\f /])[^`\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
    (?:= # value indicator
      (?:'[^']*' # LITA-enclosed value
        |"[^"]*" # LIT-enclosed value
        |(?!['"])[^>\t\n\r\f ]* # bare value
      )
    )?
    [\t\n\r\f /]* # possibly followed by a space
  )*
  >?
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# The newlines may be preceded by additional whitespace.
blank_line_re = re.compile(r'^([ ]*\n){2}')
class _HTMLParser(htmlparser.HTMLParser):
    """
    Handle special start and end tags.

    `parse_endtag` passes `</` through as plain data unless it is immediately followed
    by an ASCII letter, and `parse_starttag` passes a literal `</>` through as plain data.
    Everything else is delegated to the base `HTMLParser` implementation.
    """

    def parse_endtag(self, i: int) -> int:
        """
        Parse the end tag starting at index `i` of `self.rawdata`.

        If `</` is not followed by an ASCII letter (or the data ends first), the two
        characters `</` are reported through `handle_data` and `i + 2` is returned.
        Otherwise the result of the base class `parse_endtag` is returned.
        """
        start = self.rawdata[i:i + 3]
        # Test the length before indexing so that a short (even empty) slice is treated
        # as data instead of raising an `IndexError`.
        if len(start) < 3 or not (start[2].isascii() and start[2].isalpha()):
            self.handle_data(self.rawdata[i:i + 2])
            return i + 2
        return super().parse_endtag(i)

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag at index `i`, returning the index at which parsing resumes.

        A literal `</>` is reported through `handle_data` and skipped; anything else is
        handled by the base class.
        """
        # Treat `</>` as normal data as it is not a real tag.
        if self.rawdata[i:i + 3] == '</>':
            self.handle_data(self.rawdata[i:i + 3])
            return i + 3

        return super().parse_starttag(i)


# Replace `HTMLParser` on our private `htmlparser` module copy with the customized class, so
# that code which pulls `HTMLParser` from that module (MkDocs, for example) gets the custom one.
htmlparser.HTMLParser = _HTMLParser
class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Arguments:
            md: The `Markdown` instance. Its `is_block_level` method and `htmlStash`
                attribute are used while parsing.

        Remaining arguments are passed to the base parser. `convert_charrefs` defaults
        to `False` unless given explicitly.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """Returns char index in `self.rawdata` for the start of the current line. """
        # `lineno_start_cache[n]` holds the start index of line `n + 1`. Extend the cache,
        # one newline search per missing line, until it covers the current `self.lineno`.
        for ii in range(len(self.lineno_start_cache)-1, self.lineno-1):
            last_line_start_pos = self.lineno_start_cache[ii]
            lf_pos = self.rawdata.find('\n', last_line_start_pos)
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            self.lineno_start_cache.append(lf_pos+1)

        return self.lineno_start_cache[self.lineno-1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return f'</{tag}>'

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """Route a start tag to the current raw block or to `cleandoc`."""
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """Route an end tag; stash the cached raw block once its outermost tag is closed."""
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after `endtag`.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data: str):
        """Append `data` to the current raw block if inside one, otherwise to `cleandoc`."""
        # A newline in the data ends the "tail" that follows a closed raw block.
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs):
        """Handle a self-closing tag using the original source text of the tag."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Pass a character reference through unchanged as `&#name;`."""
        self.handle_empty_tag(f'&#{name};', is_block=False)

    def handle_entityref(self, name: str):
        """Pass an entity reference through unchanged as `&name;`."""
        self.handle_empty_tag(f'&{name};', is_block=False)

    def handle_comment(self, data: str):
        """Rebuild the comment as `<!--data-->` and handle it as a block-level empty tag."""
        self.handle_empty_tag(f'<!--{data}-->', is_block=True)

    def handle_decl(self, data: str):
        """Rebuild the declaration as `<!data>` and handle it as a block-level empty tag."""
        self.handle_empty_tag(f'<!{data}>', is_block=True)

    def handle_pi(self, data: str):
        """Rebuild the processing instruction as `<?data?>` and handle it as a block-level empty tag."""
        self.handle_empty_tag(f'<?{data}?>', is_block=True)

    def unknown_decl(self, data: str):
        """Rebuild a `<![...` declaration, closing it with `]]>` for `CDATA[` and `]>` otherwise."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag(f'<![{data}{end}', is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction only at the start of a line or in the tail of a raw block."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    # Internal -- parse comment, return length or -1 if not terminated
    # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
    def parse_comment(self, i, report=True):
        """
        Parse the comment starting at index `i`.

        An unterminated comment is not consumed: only the leading `<` is reported as data
        and `i + 1` is returned. Otherwise the index just past the closing marker is
        returned, and the comment body is passed to `handle_comment` when `report` is true.
        """
        rawdata = self.rawdata
        assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i+4)
        if not match:
            self.handle_data('<')
            return i + 1
        if report:
            j = match.start()
            self.handle_comment(rawdata[i+4: j])
        return match.end()

    def parse_html_declaration(self, i: int) -> int:
        """Parse a `<!` declaration only at the start of a line or in the tail of a raw block."""
        if self.at_line_start() or self.intail:
            if self.rawdata[i:i+3] == '<![' and self.rawdata[i:i+9] != '<![CDATA[':
                # We have encountered the bug in #1534 (Python bug `gh-77057`).
                # Provide an override until we drop support for Python < 3.13.
                result = self.parse_bogus_comment(i)
                if result == -1:
                    self.handle_data(self.rawdata[i:i + 1])
                    return i + 1
                return result
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        """Parse a bogus comment and pass its original source text through unaltered."""
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__startag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str | None:
        """Return full source of start tag: `<...>` (`None` if no start tag has been parsed)."""
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag at index `i` and return the index at which parsing resumes.

        A literal `</>`, an incomplete tag, or a tag with junk before its closing `>` is
        reported as data. Otherwise `handle_startendtag` or `handle_starttag` is called
        with the lowercased tag name and its attributes.
        """
        # Treat `</>` as normal data as it is not a real tag.
        if self.rawdata[i:i + 3] == '</>':
            self.handle_data(self.rawdata[i:i + 3])
            return i + 3

        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            self.handle_data(self.rawdata[i:i + 1])
            return i + 1
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between `i+1` and `j` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif (attrvalue[:1] == '\'' == attrvalue[-1:]
                  or attrvalue[:1] == '"' == attrvalue[-1:]):
                # Strip the matching pair of quotes around the value.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk characters before the end of the tag: pass the whole tag through as data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos