Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/htmlparser.py: 99%

1# Python Markdown

3# A Python implementation of John Gruber's Markdown.

5# Documentation: https://python-markdown.github.io/

6# GitHub: https://github.com/Python-Markdown/markdown/

7# PyPI: https://pypi.org/project/Markdown/

9# Started by Manfred Stienstra (http://www.dwerg.net/).

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).

11# Currently maintained by Waylan Limberg (https://github.com/waylan),

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

18# License: BSD (see LICENSE.md for details).

20"""

21This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches.

22A copy is imported rather than the module being directly imported as this ensures that the user can import

23and use the unmodified library for their own needs.

24"""

26from __future__ import annotations

28import re

29import importlib.util

30import sys

31from typing import TYPE_CHECKING, Sequence

33if TYPE_CHECKING: # pragma: no cover

34 from markdown import Markdown

# Included for versions which do not have current comment fix
commentclose = re.compile(r'--!?>')

# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
sys.modules['htmlparser'] = htmlparser

# This is a hack. We are sneaking in `</>` so we can capture it without the HTML parser
# throwing it away. When we see it, we will process it as data.
htmlparser.starttagopen = re.compile('<[a-zA-Z]|</>')

# Match `</` with or without a following letter, so `</` that does not open a
# real end tag is still matched by this pattern.
htmlparser.endtagopen = re.compile('</[a-zA-Z]?')

# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value.
# NOTE: whitespace inside the character classes below is significant even under `re.VERBOSE`.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                           # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                     # value indicator
        (?:'[^']*'                    # LITA-enclosed value
          |"[^"]*"                    # LIT-enclosed value
          |(?!['"])[^`>\s]*           # bare value <= added backtick here
         )
         (?:\s*,)*                    # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                 # trailing whitespace
""", re.VERBOSE)
# Locator for the remainder of a tag (from the tag name up to an optional `>`).
# As above, a backtick is rejected in the tag name and at the start of an attribute name.
htmlparser.locatetagend = re.compile(r"""
  [a-zA-Z][^`\t\n\r\f />]*          # tag name
  [\t\n\r\f /]*                     # optional whitespace before attribute name
  (?:(?<=['"\t\n\r\f /])[^`\t\n\r\f />][^\t\n\r\f /=>]*  # attribute name
    (?:=                            # value indicator
      (?:'[^']*'                    # LITA-enclosed value
        |"[^"]*"                    # LIT-enclosed value
        |(?!['"])[^>\t\n\r\f ]*     # bare value
      )
    )?
    [\t\n\r\f /]*                   # possibly followed by a space
  )*
  >?
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# The newlines may be preceded by additional whitespace.
blank_line_re = re.compile(r'^([ ]*\n){2}')

class _HTMLParser(htmlparser.HTMLParser):
    """Handle special start and end tags."""

    def parse_endtag(self, i):
        """
        Parse an end tag starting at index `i` of `self.rawdata`.

        If `</` is not immediately followed by an ASCII letter, the two
        characters `</` are passed to `handle_data` and `i + 2` is returned.
        Otherwise parsing is delegated to the base class.
        """
        # The character right after `</` (empty string when the data ends there).
        following = self.rawdata[i + 2:i + 3]
        if not (following.isascii() and following.isalpha()):
            self.handle_data(self.rawdata[i:i + 2])
            return i + 2
        return super().parse_endtag(i)

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse a start tag starting at index `i` of `self.rawdata`.

        The literal `</>` is passed to `handle_data` unchanged and `i + 3` is
        returned; everything else is delegated to the base class.
        """
        # Treat `</>` as normal data as it is not a real tag.
        if self.rawdata[i:i + 3] == '</>':
            self.handle_data(self.rawdata[i:i + 3])
            return i + 3

        return super().parse_starttag(i)


# Overwrite our custom one for people like MkDocs that pull it in
htmlparser.HTMLParser = _HTMLParser

118

119

class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Create the extractor.

        `convert_charrefs` defaults to `False` unless the caller passes it
        explicitly; remaining arguments are forwarded to the base parser.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """Returns char index in `self.rawdata` for the start of the current line. """
        # Extend the cache of line start positions up to the current line.
        for ii in range(len(self.lineno_start_cache) - 1, self.lineno - 1):
            last_line_start_pos = self.lineno_start_cache[ii]
            lf_pos = self.rawdata.find('\n', last_line_start_pos)
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            self.lineno_start_cache.append(lf_pos + 1)

        return self.lineno_start_cache[self.lineno - 1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return f'</{tag}>'

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """
        Handle a start tag.

        The tag source goes to the raw-block cache while in a raw block and to
        `cleandoc` otherwise. A block-level tag at the start of a line (or in
        the tail of a previous raw block) opens a new raw block.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """
        Handle an end tag.

        Inside a raw block the tag is cached and popped from the stack; when
        the stack becomes empty the cached block is stored in the stash and a
        placeholder is added to `cleandoc`. Outside a raw block the tag text
        is appended to `cleandoc` as is.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after `endtag`.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data: str):
        """Route text to the raw-block cache when in a raw block, else to `cleandoc`."""
        # A newline ends the "tail" of a previous raw block.
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs):
        """Handle a self-closing tag using the source text of the tag."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Re-emit a numeric character reference as `&#name;`."""
        self.handle_empty_tag(f'&#{name};', is_block=False)

    def handle_entityref(self, name: str):
        """Re-emit a named entity reference as `&name;`."""
        self.handle_empty_tag(f'&{name};', is_block=False)

    def handle_comment(self, data: str):
        """Re-emit a comment as `<!--data-->`, treated as a block-level empty tag."""
        # Unclosed comments never reach this handler: `parse_comment` emits
        # their leading `<` as data instead of reporting a comment.
        self.handle_empty_tag(f'<!--{data}-->', is_block=True)

    def handle_decl(self, data: str):
        """Re-emit a declaration as `<!data>`, treated as a block-level empty tag."""
        self.handle_empty_tag(f'<!{data}>', is_block=True)

    def handle_pi(self, data: str):
        """Re-emit a processing instruction as `<?data?>`, treated as a block-level empty tag."""
        self.handle_empty_tag(f'<?{data}?>', is_block=True)

    def unknown_decl(self, data: str):
        """Re-emit a `<![...` declaration, closing CDATA with `]]>` and anything else with `]>`."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag(f'<![{data}{end}', is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction only at the start of a line or in a raw-block tail."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    # Internal -- parse comment starting at `i` and return the index just past it.
    # If the comment is not terminated, emit `<` as data and return `i + 1`.
    # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
    def parse_comment(self, i, report=True):
        rawdata = self.rawdata
        assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i + 4)
        if not match:
            self.handle_data('<')
            return i + 1
        if report:
            j = match.start()
            self.handle_comment(rawdata[i + 4: j])
        return match.end()

    def parse_html_declaration(self, i: int) -> int:
        """Parse a `<!...` declaration only at the start of a line or in a raw-block tail."""
        if self.at_line_start() or self.intail:
            if self.rawdata[i:i + 3] == '<![' and not self.rawdata[i:i + 9] == '<![CDATA[':
                # We have encountered the bug in #1534 (Python bug `gh-77057`).
                # Provide an override until we drop support for Python < 3.13.
                result = self.parse_bogus_comment(i)
                if result == -1:
                    self.handle_data(self.rawdata[i:i + 1])
                    return i + 1
                return result
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        """Parse a bogus comment and pass its source text through unaltered."""
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__startag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str:
        """Return full source of start tag: `<...>`."""
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse a start tag starting at index `i` and return the index just past it.

        `</>` and incomplete or malformed tags are passed to `handle_data`
        instead of being reported as tags.
        """
        # Treat `</>` as normal data as it is not a real tag.
        if self.rawdata[i:i + 3] == '</>':
            self.handle_data(self.rawdata[i:i + 3])
            return i + 3

        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            self.handle_data(self.rawdata[i:i + 1])
            return i + 1
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between `i+1` and `j` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i + 1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif (attrvalue[:1] == '\'' == attrvalue[-1:]
                  or attrvalue[:1] == '"' == attrvalue[-1:]):
                # Strip matching quotes around the value.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk before the closing bracket: pass the whole tag through as data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos