1# Python Markdown
2
3# A Python implementation of John Gruber's Markdown.
4
5# Documentation: https://python-markdown.github.io/
6# GitHub: https://github.com/Python-Markdown/markdown/
7# PyPI: https://pypi.org/project/Markdown/
8
9# Started by Manfred Stienstra (http://www.dwerg.net/).
10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
11# Currently maintained by Waylan Limberg (https://github.com/waylan),
12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
13
14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
16# Copyright 2004 Manfred Stienstra (the original version)
17
18# License: BSD (see LICENSE.md for details).
19
20"""
21This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches.
22A copy is imported rather than the module being directly imported as this ensures that the user can import
23and use the unmodified library for their own needs.
24"""
25
26from __future__ import annotations
27
28import re
29import importlib.util
30import sys
31from typing import TYPE_CHECKING, Sequence
32
33if TYPE_CHECKING: # pragma: no cover
34 from markdown import Markdown
35
36
# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
#
# The three steps below locate the standard library module, build a new and
# separate module object from its spec, and then execute the module's code
# inside that new object. The patches applied later in this file are made on
# this copy only.
# NOTE(review): assumes `find_spec` succeeds and that the spec has a loader;
# neither is checked here.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
# Register the copy in `sys.modules` under the name `htmlparser`
# (a different key than `html.parser`).
sys.modules['htmlparser'] = htmlparser
43
# Everything below replaces module-level compiled regular expressions on the
# private `htmlparser` copy created above.

# This is a hack. We are sneaking in `</>` so we can capture it without the HTML parser
# throwing it away. When we see it, we will process it as data.
htmlparser.starttagopen = re.compile('<[a-zA-Z]|</>')

# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value.
# The pattern is compiled with `re.VERBOSE`, so the layout whitespace and the `#` comments
# inside the string are ignored by the regex engine; the three places that differ from a
# backtick-accepting pattern are marked `<= added backtick here`.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
 <[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here
 (?:[\s/]* # optional whitespace before attribute name
 (?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
 (?:\s*=+\s* # value indicator
 (?:'[^']*' # LITA-enclosed value
 |"[^"]*" # LIT-enclosed value
 |(?!['"])[^`>\s]* # bare value <= added backtick here
 )
 (?:\s*,)* # possibly followed by a comma
 )?(?:\s|/(?!>))*
 )*
 )?
 \s* # trailing whitespace
""", re.VERBOSE)
72
# Match a blank line at the start of a block of text (two newlines).
# Each of the two newlines may be preceded by any number of space characters;
# only spaces are accepted (the character class is `[ ]`), so a tab or other
# whitespace before a newline prevents a match. The pattern is anchored with
# `^`, so it only matches at the very beginning of the string it is given.
blank_line_re = re.compile(r'^([ ]*\n){2}')
76
77
class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Create the extractor.

        Arguments:
            md: The object whose `is_block_level()` and `htmlStash.store()` are
                called by the handlers of this class.
            *args: Passed through unchanged to the base parser.
            **kwargs: Passed through to the base parser. `convert_charrefs` is set
                to `False` unless the caller provides a value for it.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        # One-shot flag set by `handle_comment` and consumed by `updatepos`.
        self.override_comment_update = False

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """Returns char index in `self.rawdata` for the start of the current line. """
        # `lineno_start_cache[n]` holds the start index of line `n + 1`. Extend the
        # cache, one line at a time, until it covers the current line number.
        for ii in range(len(self.lineno_start_cache)-1, self.lineno-1):
            last_line_start_pos = self.lineno_start_cache[ii]
            lf_pos = self.rawdata.find('\n', last_line_start_pos)
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            self.lineno_start_cache.append(lf_pos+1)

        return self.lineno_start_cache[self.lineno-1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return f'</{tag}>'

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """
        Handle an opening tag.

        Tags listed in `empty_tags` are forwarded to `handle_startendtag`. A block-level
        tag found in the tail of a previous raw block, or at the start of a line while
        not already inside a raw block, begins a new raw block. While in a raw block the
        tag text is cached; otherwise it is appended to `cleandoc` unchanged.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """
        Handle a closing tag.

        Outside a raw block the tag text is appended to `cleandoc` unchanged. Inside a
        raw block the text is cached and, if `tag` is on the stack, the stack is popped
        up to and including it. When the stack becomes empty the cached block is stored
        in the stash and its placeholder, followed by a blank line, is appended to
        `cleandoc`.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after `endtag`.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data: str):
        """Route text to the raw block cache or to `cleandoc`; a newline ends the tail state."""
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """
        Handle empty tags (`<data>`).

        Arguments:
            data: The complete source text of the item.
            is_block: Whether the item may form a standalone raw block when it is
                found at the start of a line.
        """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs):
        """Handle a self-closing tag using the source text of the most recent start tag."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Re-emit a character reference as `&#name;`; never treated as a block."""
        self.handle_empty_tag(f'&#{name};', is_block=False)

    def handle_entityref(self, name: str):
        """Re-emit an entity reference as `&name;`; never treated as a block."""
        self.handle_empty_tag(f'&{name};', is_block=False)

    def handle_comment(self, data: str):
        """
        Handle a comment.

        A closed comment is re-emitted as `<!--data-->` and may form a raw block. If the
        source text does not contain `-->` right after `data`, only a literal `<` is
        emitted as data and `updatepos` is told to override its next position update.
        """
        # Check if the comment is unclosed, if so, we need to override position
        # (the `4` skips over the opening `<!--`).
        i = self.line_offset + self.offset + len(data) + 4
        if self.rawdata[i:i + 3] != '-->':
            self.handle_data('<')
            self.override_comment_update = True
            return
        self.handle_empty_tag(f'<!--{data}-->', is_block=True)

    def updatepos(self, i: int, j: int) -> int:
        """
        Update the position, honoring a pending override from `handle_comment`.

        When the override flag is set it is cleared and the base implementation is
        called with `i=0` and `j=1` instead of the values passed in.
        NOTE(review): presumably this makes parsing resume right after the `<` that
        `handle_comment` emitted as data; confirm against the base parser's main loop.
        """
        if self.override_comment_update:
            self.override_comment_update = False
            i = 0
            j = 1
        return super().updatepos(i, j)

    def handle_decl(self, data: str):
        """Re-emit a declaration as `<!data>`; it may form a raw block."""
        self.handle_empty_tag(f'<!{data}>', is_block=True)

    def handle_pi(self, data: str):
        """Re-emit a processing instruction as `<?data?>`; it may form a raw block."""
        self.handle_empty_tag(f'<?{data}?>', is_block=True)

    def unknown_decl(self, data: str):
        """Re-emit a `<![...` declaration, closing `CDATA[` sections with `]]>` and others with `]>`."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag(f'<![{data}{end}', is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction only at the start of a line or in a tail; else emit `<?` as data."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i: int) -> int:
        """Parse a declaration only at the start of a line or in a tail; else emit `<!` as data."""
        if self.at_line_start() or self.intail:
            if self.rawdata[i:i+3] == '<![' and self.rawdata[i:i+9] != '<![CDATA[':
                # We have encountered the bug in #1534 (Python bug `gh-77057`).
                # Provide an override until we drop support for Python < 3.13.
                result = self.parse_bogus_comment(i)
                if result == -1:
                    self.handle_data(self.rawdata[i:i + 1])
                    return i + 1
                return result
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        """Parse a bogus comment and pass its source text through unaltered as a non-block item."""
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__starttag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str:
        """
        Return full source of start tag: `<...>`.

        The value is `None` until `parse_starttag` has recorded a start tag. The tag
        handlers are invoked by `parse_starttag` only after the text has been set.
        """
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag beginning at index `i` of `self.rawdata`.

        Returns the index at which parsing should continue. `</>`, a lone `<` that does
        not begin a complete start tag, and a start tag with junk before its closing
        `>` are all passed to `handle_data` as plain text.
        """
        # Treat `</>` as normal data as it is not a real tag.
        if self.rawdata[i:i + 3] == '</>':
            self.handle_data(self.rawdata[i:i + 3])
            return i + 3

        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            self.handle_data(self.rawdata[i:i + 1])
            return i + 1
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between `i+1` and `endpos` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk between the last attribute and the end of the tag: pass the whole
            # thing through as text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos