Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/markdown/htmlparser.py: 44%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Python Markdown
3# A Python implementation of John Gruber's Markdown.
5# Documentation: https://python-markdown.github.io/
6# GitHub: https://github.com/Python-Markdown/markdown/
7# PyPI: https://pypi.org/project/Markdown/
9# Started by Manfred Stienstra (http://www.dwerg.net/).
10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
11# Currently maintained by Waylan Limberg (https://github.com/waylan),
12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
16# Copyright 2004 Manfred Stienstra (the original version)
18# License: BSD (see LICENSE.md for details).
20"""
21This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches.
22A copy is imported rather than the module being directly imported as this ensures that the user can import
23and use the unmodified library for their own needs.
24"""
26from __future__ import annotations
28import re
29import importlib.util
30import sys
31from typing import TYPE_CHECKING, Sequence
33if TYPE_CHECKING: # pragma: no cover
34 from markdown import Markdown
# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
# The module object is built from its spec and executed here, so the patches
# below are applied to this module object only.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
# Register the patched copy under the top-level name `htmlparser`.
sys.modules['htmlparser'] = htmlparser

# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
# NOTE: this line must stay after the `entityref` assignment above, as it aliases the replaced pattern.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value.
# The pattern is compiled with `re.VERBOSE`, so the layout whitespace and the `#` comments inside it
# are not part of the match; the literal space inside the first character class still is.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^`>\s]*          # bare value <= added backtick here
         )
        (?:\s*,)*                    # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# The newlines may be preceded by additional whitespace.
# Only space characters are allowed before each newline (the class is `[ ]`, not `\s`).
blank_line_re = re.compile(r'^([ ]*\n){2}')
class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.

    Parser state kept on the instance:

    * `inraw`: `True` while the pieces of a raw HTML block are being collected.
    * `intail`: `True` after a raw block ended with more content following it;
      cleared by the next chunk of data that contains a newline.
    * `stack`: the tags opened while `inraw` is `True`.
    * `_cache`: the pieces of the raw block collected so far.
    * `lineno_start_cache`: char index in `rawdata` of the start of each line seen so far.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Initialize the extractor.

        Arguments:
            md: The `Markdown` instance whose `htmlStash` and `is_block_level` are used.
            *args: Passed through to the base parser.
            **kwargs: Passed through to the base parser. `convert_charrefs` defaults
                to `False` when the caller does not supply it.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data and stash whatever is left of an unclosed raw block."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """Returns char index in `self.rawdata` for the start of the current line. """
        # Extend the cache one line at a time until it covers the current line.
        for ii in range(len(self.lineno_start_cache) - 1, self.lineno - 1):
            last_line_start_pos = self.lineno_start_cache[ii]
            lf_pos = self.rawdata.find('\n', last_line_start_pos)
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            self.lineno_start_cache.append(lf_pos + 1)

        return self.lineno_start_cache[self.lineno - 1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        line_start = self.line_offset
        return self.rawdata[line_start:line_start + self.offset].strip() == ''

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return f'</{tag}>'

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """
        Handle an opening tag.

        Tags listed in `empty_tags` are redirected to `handle_startendtag`. A block-level
        tag found in the tail of a raw block, or at the start of a line while no raw block
        is open, starts a new raw block. The tag text is then added to the raw block cache
        when a raw block is open and to `cleandoc` otherwise.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """
        Handle a closing tag.

        Outside a raw block the tag text is added to `cleandoc` unchanged. Inside a raw
        block it is cached and the stack is unwound down to the matching opening tag; once
        the stack is empty the cached block is stored in the `htmlStash` and its placeholder,
        followed by a blank line, is added to `cleandoc`.
        """
        text = self.get_endtag_text(tag)

        if not self.inraw:
            self.cleandoc.append(text)
            return

        self._cache.append(text)
        if tag in self.stack:
            # Remove tag from stack
            while self.stack:
                if self.stack.pop() == tag:
                    break
        if not self.stack:
            # End of raw block.
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                # Preserve blank line and end of raw block.
                self._cache.append('\n')
            else:
                # More content exists after `endtag`.
                self.intail = True
            # Reset stack.
            self.inraw = False
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
            self._cache = []

    def handle_data(self, data: str):
        """
        Handle text: cache it while in a raw block, otherwise add it to `cleandoc`.

        Data containing a newline ends the tail of the previous raw block.
        """
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """
        Handle empty tags (`<data>`).

        Arguments:
            data: The full source text of the item.
            is_block: Whether the item may form a standalone raw block when it
                is found at the start of a line.
        """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """Handle a self-closing tag as an empty tag; it is a block if `md` says `tag` is block-level."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Pass a character reference through as `&#name;`, never as a block."""
        self.handle_empty_tag(f'&#{name};', is_block=False)

    def handle_entityref(self, name: str):
        """Pass an entity reference through as `&name;`, never as a block."""
        self.handle_empty_tag(f'&{name};', is_block=False)

    def handle_comment(self, data: str):
        """Rebuild a comment as `<!--data-->` and handle it as a possible block."""
        self.handle_empty_tag(f'<!--{data}-->', is_block=True)

    def handle_decl(self, data: str):
        """Rebuild a declaration as `<!data>` and handle it as a possible block."""
        self.handle_empty_tag(f'<!{data}>', is_block=True)

    def handle_pi(self, data: str):
        """Rebuild a processing instruction as `<?data?>` and handle it as a possible block."""
        self.handle_empty_tag(f'<?{data}?>', is_block=True)

    def unknown_decl(self, data: str):
        """Rebuild a `<![...` declaration, closing it with `]]>` for `CDATA[` and `]>` otherwise."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag(f'<![{data}{end}', is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction only at the start of a line or in the tail of a raw block."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i: int) -> int:
        """Parse a declaration only at the start of a line or in the tail of a raw block."""
        if self.at_line_start() or self.intail:
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        """Pass the source text of a bogus comment through unaltered; returns the new position or `-1`."""
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__startag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str:
        """
        Return full source of start tag: `<...>`.

        The value is `None` until `parse_starttag` has found a complete start tag.
        """
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag beginning at index `i` of `self.rawdata`.

        Returns the index just past the tag, or the negative value returned by
        `check_for_whole_start_tag` when the tag is not complete.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between `i+1` and `j` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Something other than the closing bracket follows the attributes:
            # pass the whole tag through as plain data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos