Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/htmlparser.py: 96%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Python Markdown
3# A Python implementation of John Gruber's Markdown.
5# Documentation: https://python-markdown.github.io/
6# GitHub: https://github.com/Python-Markdown/markdown/
7# PyPI: https://pypi.org/project/Markdown/
9# Started by Manfred Stienstra (http://www.dwerg.net/).
10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
11# Currently maintained by Waylan Limberg (https://github.com/waylan),
12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
16# Copyright 2004 Manfred Stienstra (the original version)
18# License: BSD (see LICENSE.md for details).
20"""
21This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches.
22A copy is imported rather than the module being directly imported as this ensures that the user can import
23and use the unmodified library for their own needs.
24"""
26from __future__ import annotations
28import re
29import importlib.util
30import sys
31from typing import TYPE_CHECKING, Sequence
33if TYPE_CHECKING: # pragma: no cover
34 from markdown import Markdown
# Load a private copy of the standard library `html.parser` module and bind it to the
# name `htmlparser`. Every patch below is applied to this copy only, so code elsewhere
# that does `from html import parser` still receives the unmodified module.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
sys.modules['htmlparser'] = htmlparser

# Hack: also treat `</>` as the opening of a start tag so that the parser hands it to
# us instead of discarding it. It is later emitted as plain data.
htmlparser.starttagopen = re.compile(r'<[a-zA-Z]|</>')

# Processing Instructions are closed by `?>` only.
htmlparser.piclose = re.compile(r'\?>')

# Entity references are recognized only when terminated by a semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')

# Partial entities are not supported: a complete block is always fed to the parser, so
# the 'incomplete' handling is unnecessary. `entityref` is tried immediately before
# `incomplete`; making both the same pattern means `incomplete` can never match on its
# own and the logic behind it is skipped.
htmlparser.incomplete = htmlparser.entityref

# Start tags may not contain a backtick in the tag name, an attribute name, or a bare
# (unquoted) attribute value.
htmlparser.locatestarttagend_tolerant = re.compile(
    r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^`>\s]*          # bare value <= added backtick here
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""",
    flags=re.VERBOSE,
)
htmlparser.locatetagend = re.compile(
    r"""
  [a-zA-Z][^`\t\n\r\f />]*           # tag name
  [\t\n\r\f /]*                      # optional whitespace before attribute name
  (?:(?<=['"\t\n\r\f /])[^`\t\n\r\f />][^\t\n\r\f /=>]*  # attribute name
    (?:=                             # value indicator
      (?:'[^']*'                     # LITA-enclosed value
        |"[^"]*"                     # LIT-enclosed value
        |(?!['"])[^>\t\n\r\f ]*      # bare value
      )
    )?
    [\t\n\r\f /]*                    # possibly followed by a space
  )*
  >?
""",
    flags=re.VERBOSE,
)

# Matches a blank line at the very start of a block of text, i.e. two consecutive
# newlines, each of which may be preceded by spaces.
blank_line_re = re.compile(r'^([ ]*\n){2}')
class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Create the extractor.

        Arguments:
            md: The `Markdown` instance whose `htmlStash` and `is_block_level` are used.
            *args: Passed through to the base `HTMLParser`.
            **kwargs: Passed through to the base `HTMLParser`. `convert_charrefs`
                defaults to `False` when the caller does not supply it.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        self.override_comment_update = False

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """Returns char index in `self.rawdata` for the start of the current line. """
        # Extend the cache of line start positions until it covers the current line.
        for ii in range(len(self.lineno_start_cache)-1, self.lineno-1):
            last_line_start_pos = self.lineno_start_cache[ii]
            lf_pos = self.rawdata.find('\n', last_line_start_pos)
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            self.lineno_start_cache.append(lf_pos+1)

        return self.lineno_start_cache[self.lineno-1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return f'</{tag}>'

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """
        Handle a start tag.

        Tags listed in `empty_tags` are redirected to `handle_startendtag`. A block-level
        tag found in the tail of a previous block, or at the start of a line while not
        already in a raw block, starts a new raw block. While in a raw block the tag
        text is cached; otherwise it is appended to `cleandoc`.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """
        Handle an end tag.

        Outside a raw block the tag text is appended to `cleandoc`. Inside a raw block
        it is cached and the stack of open tags is unwound down to `tag`; once the stack
        is empty the cached block is stored in the `htmlStash` and its placeholder is
        appended to `cleandoc`.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after `endtag`.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data: str):
        """Cache `data` when in a raw block, else append it to `cleandoc`. A newline in `data` ends the tail."""
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs):
        """Handle a self-closing tag by passing its source text to `handle_empty_tag`."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Re-emit a numeric character reference (`&#name;`) unchanged as a non-block item."""
        self.handle_empty_tag(f'&#{name};', is_block=False)

    def handle_entityref(self, name: str):
        """Re-emit a named entity reference (`&name;`) unchanged as a non-block item."""
        self.handle_empty_tag(f'&{name};', is_block=False)

    def handle_comment(self, data: str):
        """
        Handle a comment.

        If the raw data following the comment body is not `-->`, only `<` is emitted as
        data and `override_comment_update` is set so that the next `updatepos` call
        uses overridden positions. Otherwise the whole comment is handled as a block.
        """
        # Check if the comment is unclosed, if so, we need to override position
        i = self.line_offset + self.offset + len(data) + 4
        if self.rawdata[i:i + 3] != '-->':
            self.handle_data('<')
            self.override_comment_update = True
            return
        self.handle_empty_tag(f'<!--{data}-->', is_block=True)

    def updatepos(self, i: int, j: int) -> int:
        """Update the position, substituting `i=0, j=1` once after an unclosed comment was seen."""
        if self.override_comment_update:
            self.override_comment_update = False
            i = 0
            j = 1
        return super().updatepos(i, j)

    def handle_decl(self, data: str):
        """Re-emit a declaration (`<!data>`) as a block item."""
        self.handle_empty_tag(f'<!{data}>', is_block=True)

    def handle_pi(self, data: str):
        """Re-emit a processing instruction (`<?data?>`) as a block item."""
        self.handle_empty_tag(f'<?{data}?>', is_block=True)

    def unknown_decl(self, data: str):
        """Re-emit a marked section (`<![data`), closed with `]]>` for `CDATA[` and `]>` otherwise."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag(f'<![{data}{end}', is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction only at the start of a line or in a tail; else emit `<?` as data."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i: int) -> int:
        """Parse a declaration only at the start of a line or in a tail; else emit `<!` as data."""
        if self.at_line_start() or self.intail:
            if self.rawdata[i:i+3] == '<![' and self.rawdata[i:i+9] != '<![CDATA[':
                # We have encountered the bug in #1534 (Python bug `gh-77057`).
                # Provide an override until we drop support for Python < 3.13.
                result = self.parse_bogus_comment(i)
                if result == -1:
                    self.handle_data(self.rawdata[i:i + 1])
                    return i + 1
                return result
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        """Parse a bogus comment and pass its raw text through unaltered as a non-block item."""
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__starttag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str | None:
        """Return full source of start tag: `<...>`, or `None` if no start tag has been parsed."""
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag beginning at index `i` of `self.rawdata`.

        Returns the index at which parsing should continue.
        """
        # Treat `</>` as normal data as it is not a real tag.
        if self.rawdata[i:i + 3] == '</>':
            self.handle_data(self.rawdata[i:i + 3])
            return i + 3

        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            self.handle_data(self.rawdata[i:i + 1])
            return i + 1
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between `i+1` and `j` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Malformed end of tag: pass the whole span through as plain data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos