Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/htmlparser.py: 96%

# Python Markdown

# A Python implementation of John Gruber's Markdown.

# Documentation: https://python-markdown.github.io/
# GitHub: https://github.com/Python-Markdown/markdown/
# PyPI: https://pypi.org/project/Markdown/

# Started by Manfred Stienstra (http://www.dwerg.net/).
# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
# Currently maintained by Waylan Limberg (https://github.com/waylan),
# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

# License: BSD (see LICENSE.md for details).

"""
This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches.
A copy is imported rather than the module being directly imported as this ensures that the user can import
and use the unmodified library for their own needs.
"""

26from __future__ import annotations

28import re

29import importlib.util

30import sys

31from typing import TYPE_CHECKING, Sequence

33if TYPE_CHECKING: # pragma: no cover

34 from markdown import Markdown

# Fallback comment-terminator patterns, used by `HTMLExtractor.parse_comment`
# on Python versions whose `html.parser` lacks the current comment fix.
commentclose = re.compile(r'--!?>')
commentabruptclose = re.compile(r'-?>')

# Import a private copy of the `html.parser` lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
sys.modules['htmlparser'] = htmlparser

# This is a hack. We are sneaking in `</>` so we can capture it without the HTML parser
# throwing it away. When we see it, we will process it as data.
htmlparser.starttagopen = re.compile('<[a-zA-Z]|</>')

# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')

# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')

# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref

# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                           # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                     # value indicator
        (?:'[^']*'                    # LITA-enclosed value
          |"[^"]*"                    # LIT-enclosed value
          |(?!['"])[^`>\s]*           # bare value <= added backtick here
         )
         (?:\s*,)*                    # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                 # trailing whitespace
""", re.VERBOSE)

# Companion pattern for the remainder of a tag (no leading `<`); the tag name
# and attribute name also reject a backtick.
htmlparser.locatetagend = re.compile(r"""
  [a-zA-Z][^`\t\n\r\f />]*            # tag name
  [\t\n\r\f /]*                       # optional whitespace before attribute name
  (?:(?<=['"\t\n\r\f /])[^`\t\n\r\f />][^\t\n\r\f /=>]*  # attribute name
    (?:=                              # value indicator
      (?:'[^']*'                      # LITA-enclosed value
        |"[^"]*"                      # LIT-enclosed value
        |(?!['"])[^>\t\n\r\f ]*       # bare value
      )
    )?
    [\t\n\r\f /]*                     # possibly followed by a space
  )*
  >?
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# The newlines may be preceded by additional whitespace.
blank_line_re = re.compile(r'^([ ]*\n){2}')

95class HTMLExtractor(htmlparser.HTMLParser):

96 """

97 Extract raw HTML from text.

99 The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the

100 [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text

101 is stored in `cleandoc` as a list of strings.

102 """

103

104 def __init__(self, md: Markdown, *args, **kwargs):

105 if 'convert_charrefs' not in kwargs:

106 kwargs['convert_charrefs'] = False

107

108 # Block tags that should contain no content (self closing)

109 self.empty_tags = set(['hr'])

110

111 self.lineno_start_cache = [0]

112

113 self.override_comment_update = False

114

115 # This calls self.reset

116 super().__init__(*args, **kwargs)

117 self.md = md

118

119 def reset(self):

120 """Reset this instance. Loses all unprocessed data."""

121 self.inraw = False

122 self.intail = False

123 self.stack: list[str] = [] # When `inraw==True`, stack contains a list of tags

124 self._cache: list[str] = []

125 self.cleandoc: list[str] = []

126 self.lineno_start_cache = [0]

127

128 super().reset()

129

130 def close(self):

131 """Handle any buffered data."""

132 super().close()

133 if len(self.rawdata):

134 # Temp fix for https://bugs.python.org/issue41989

135 # TODO: remove this when the bug is fixed in all supported Python versions.

136 if self.convert_charrefs and not self.cdata_elem: # pragma: no cover

137 self.handle_data(htmlparser.unescape(self.rawdata))

138 else:

139 self.handle_data(self.rawdata)

140 # Handle any unclosed tags.

141 if len(self._cache):

142 self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))

143 self._cache = []

144

145 @property

146 def line_offset(self) -> int:

147 """Returns char index in `self.rawdata` for the start of the current line. """

148 for ii in range(len(self.lineno_start_cache)-1, self.lineno-1):

149 last_line_start_pos = self.lineno_start_cache[ii]

150 lf_pos = self.rawdata.find('\n', last_line_start_pos)

151 if lf_pos == -1:

152 # No more newlines found. Use end of raw data as start of line beyond end.

153 lf_pos = len(self.rawdata)

154 self.lineno_start_cache.append(lf_pos+1)

155

156 return self.lineno_start_cache[self.lineno-1]

157

158 def at_line_start(self) -> bool:

159 """

160 Returns True if current position is at start of line.

161

162 Allows for up to three blank spaces at start of line.

163 """

164 if self.offset == 0:

165 return True

166 if self.offset > 3:

167 return False

168 # Confirm up to first 3 chars are whitespace

169 return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

170

171 def get_endtag_text(self, tag: str) -> str:

172 """

173 Returns the text of the end tag.

174

175 If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.

176 """

177 # Attempt to extract actual tag from raw source text

178 start = self.line_offset + self.offset

179 m = htmlparser.endendtag.search(self.rawdata, start)

180 if m:

181 return self.rawdata[start:m.end()]

182 else: # pragma: no cover

183 # Failed to extract from raw data. Assume well formed and lowercase.

184 return '</{}>'.format(tag)

185

186 def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):

187 # Handle tags that should always be empty and do not specify a closing tag

188 if tag in self.empty_tags:

189 self.handle_startendtag(tag, attrs)

190 return

191

192 if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):

193 # Started a new raw block. Prepare stack.

194 self.inraw = True

195 self.cleandoc.append('\n')

196

197 text = self.get_starttag_text()

198 if self.inraw:

199 self.stack.append(tag)

200 self._cache.append(text)

201 else:

202 self.cleandoc.append(text)

203 if tag in self.CDATA_CONTENT_ELEMENTS:

204 # This is presumably a standalone tag in a code span (see #1036).

205 self.clear_cdata_mode()

206

207 def handle_endtag(self, tag: str):

208 text = self.get_endtag_text(tag)

209

210 if self.inraw:

211 self._cache.append(text)

212 if tag in self.stack:

213 # Remove tag from stack

214 while self.stack:

215 if self.stack.pop() == tag:

216 break

217 if len(self.stack) == 0:

218 # End of raw block.

219 if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):

220 # Preserve blank line and end of raw block.

221 self._cache.append('\n')

222 else:

223 # More content exists after `endtag`.

224 self.intail = True

225 # Reset stack.

226 self.inraw = False

227 self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))

228 # Insert blank line between this and next line.

229 self.cleandoc.append('\n\n')

230 self._cache = []

231 else:

232 self.cleandoc.append(text)

233

234 def handle_data(self, data: str):

235 if self.intail and '\n' in data:

236 self.intail = False

237 if self.inraw:

238 self._cache.append(data)

239 else:

240 self.cleandoc.append(data)

241

242 def handle_empty_tag(self, data: str, is_block: bool):

243 """ Handle empty tags (`<data>`). """

244 if self.inraw or self.intail:

245 # Append this to the existing raw block

246 self._cache.append(data)

247 elif self.at_line_start() and is_block:

248 # Handle this as a standalone raw block

249 if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):

250 # Preserve blank line after tag in raw block.

251 data += '\n'

252 else:

253 # More content exists after tag.

254 self.intail = True

255 item = self.cleandoc[-1] if self.cleandoc else ''

256 # If we only have one newline before block element, add another

257 if not item.endswith('\n\n') and item.endswith('\n'):

258 self.cleandoc.append('\n')

259 self.cleandoc.append(self.md.htmlStash.store(data))

260 # Insert blank line between this and next line.

261 self.cleandoc.append('\n\n')

262 else:

263 self.cleandoc.append(data)

264

265 def handle_startendtag(self, tag: str, attrs):

266 self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

267

268 def handle_charref(self, name: str):

269 self.handle_empty_tag('&#{};'.format(name), is_block=False)

270

271 def handle_entityref(self, name: str):

272 self.handle_empty_tag('&{};'.format(name), is_block=False)

273

274 def handle_comment(self, data: str):

275 # Check if the comment is unclosed, if so, we need to override position

276 i = self.line_offset + self.offset + len(data) + 4

277 if self.rawdata[i:i + 3] != '-->':

278 self.handle_data('<')

279 self.override_comment_update = True

280 return

281 self.handle_empty_tag(''.format(data), is_block=True)

282

283 def updatepos(self, i: int, j: int) -> int:

284 if self.override_comment_update:

285 self.override_comment_update = False

286 i = 0

287 j = 1

288 return super().updatepos(i, j)

289

290 def handle_decl(self, data: str):

291 self.handle_empty_tag('<!{}>'.format(data), is_block=True)

292

293 def handle_pi(self, data: str):

294 self.handle_empty_tag('<?{}?>'.format(data), is_block=True)

295

296 def unknown_decl(self, data: str):

297 end = ']]>' if data.startswith('CDATA[') else ']>'

298 self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)

299

300 def parse_pi(self, i: int) -> int:

301 if self.at_line_start() or self.intail:

302 return super().parse_pi(i)

303 # This is not the beginning of a raw block so treat as plain data

304 # and avoid consuming any tags which may follow (see #1066).

305 self.handle_data('<?')

306 return i + 2

307

308 if not hasattr(htmlparser, 'commentabruptclose'):

309 # Internal -- parse comment, return length or -1 if not terminated

310 # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state

311 def parse_comment(self, i, report=True):

312 rawdata = self.rawdata

313 assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'

314 match = commentclose.search(rawdata, i+4)

315 if not match:

316 match = commentabruptclose.match(rawdata, i+4)

317 if not match:

318 return -1

319 if report:

320 j = match.start()

321 self.handle_comment(rawdata[i+4: j])

322 return match.end()

323

324 def parse_html_declaration(self, i: int) -> int:

325 if self.at_line_start() or self.intail:

326 if self.rawdata[i:i+3] == '<![' and not self.rawdata[i:i+9] == '<![CDATA[':

327 # We have encountered the bug in #1534 (Python bug `gh-77057`).

328 # Provide an override until we drop support for Python < 3.13.

329 result = self.parse_bogus_comment(i)

330 if result == -1:

331 self.handle_data(self.rawdata[i:i + 1])

332 return i + 1

333 return result

334 return super().parse_html_declaration(i)

335 # This is not the beginning of a raw block so treat as plain data

336 # and avoid consuming any tags which may follow (see #1066).

337 self.handle_data('<!')

338 return i + 2

339

340 def parse_bogus_comment(self, i: int, report: int = 0) -> int:

341 # Override the default behavior so that bogus comments get passed

342 # through unaltered by setting `report` to `0` (see #1425).

343 pos = super().parse_bogus_comment(i, report)

344 if pos == -1: # pragma: no cover

345 return -1

346 self.handle_empty_tag(self.rawdata[i:pos], is_block=False)

347 return pos

348

349 # The rest has been copied from base class in standard lib to address #1036.

350 # As `__startag_text` is private, all references to it must be in this subclass.

351 # The last few lines of `parse_starttag` are reversed so that `handle_starttag`

352 # can override `cdata_mode` in certain situations (in a code span).

353 __starttag_text: str | None = None

354

355 def get_starttag_text(self) -> str:

356 """Return full source of start tag: `<...>`."""

357 return self.__starttag_text

358

359 def parse_starttag(self, i: int) -> int: # pragma: no cover

360 # Treat `</>` as normal data as it is not a real tag.

361 if self.rawdata[i:i + 3] == '</>':

362 self.handle_data(self.rawdata[i:i + 3])

363 return i + 3

364

365 self.__starttag_text = None

366 endpos = self.check_for_whole_start_tag(i)

367 if endpos < 0:

368 self.handle_data(self.rawdata[i:i + 1])

369 return i + 1

370 rawdata = self.rawdata

371 self.__starttag_text = rawdata[i:endpos]

372

373 # Now parse the data between `i+1` and `j` into a tag and `attrs`

374 attrs = []

375 match = htmlparser.tagfind_tolerant.match(rawdata, i+1)

376 assert match, 'unexpected call to parse_starttag()'

377 k = match.end()

378 self.lasttag = tag = match.group(1).lower()

379 while k < endpos:

380 m = htmlparser.attrfind_tolerant.match(rawdata, k)

381 if not m:

382 break

383 attrname, rest, attrvalue = m.group(1, 2, 3)

384 if not rest:

385 attrvalue = None

386 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \

387 attrvalue[:1] == '"' == attrvalue[-1:]: # noqa: E127

388 attrvalue = attrvalue[1:-1]

389 if attrvalue:

390 attrvalue = htmlparser.unescape(attrvalue)

391 attrs.append((attrname.lower(), attrvalue))

392 k = m.end()

393

394 end = rawdata[k:endpos].strip()

395 if end not in (">", "/>"):

396 lineno, offset = self.getpos()

397 if "\n" in self.__starttag_text:

398 lineno = lineno + self.__starttag_text.count("\n")

399 offset = len(self.__starttag_text) \

400 - self.__starttag_text.rfind("\n") # noqa: E127

401 else:

402 offset = offset + len(self.__starttag_text)

403 self.handle_data(rawdata[i:endpos])

404 return endpos

405 if end.endswith('/>'):

406 # XHTML-style empty tag: `<span attr="value" />`

407 self.handle_startendtag(tag, attrs)

408 else:

409 # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***

410 if tag in self.CDATA_CONTENT_ELEMENTS:

411 self.set_cdata_mode(tag)

412 self.handle_starttag(tag, attrs)

413 return endpos