Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/htmlparser.py: 94%

# Python Markdown

# A Python implementation of John Gruber's Markdown.

# Documentation: https://python-markdown.github.io/
# GitHub: https://github.com/Python-Markdown/markdown/
# PyPI: https://pypi.org/project/Markdown/

# Started by Manfred Stienstra (http://www.dwerg.net/).
# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
# Currently maintained by Waylan Limberg (https://github.com/waylan),
# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

# License: BSD (see LICENSE.md for details).

"""
This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches.
A copy is imported rather than the module being directly imported as this ensures that the user can import
and use the unmodified library for their own needs.
"""

26from __future__ import annotations

28import re

29import importlib.util

30import sys

31from typing import TYPE_CHECKING, Sequence

33if TYPE_CHECKING: # pragma: no cover

34 from markdown import Markdown

# Load a private copy of the standard library's `html.parser` module under the name
# `htmlparser` so that it can be monkeypatched freely. The copy is built from the module
# spec instead of being imported normally, so `from html import parser` elsewhere still
# yields the original module with its default behavior.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
sys.modules['htmlparser'] = htmlparser

# Hack: also let `</>` match as the opening of a start tag. That way the parser hands
# it to `parse_starttag` (where it is emitted as plain data) instead of discarding it.
htmlparser.starttagopen = re.compile('<[a-zA-Z]|</>')

# Processing Instructions are only closed by `?>`.
htmlparser.piclose = re.compile(r'\?>')

# Entity references are only recognized when terminated by a semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')

# Disable support for partial entities. A complete block is always fed to the parser,
# so the "incomplete" handling is not needed. The `entityref` regex is tried right
# before `incomplete`; making the two identical means `incomplete` can never match
# and the logic behind it is skipped.
htmlparser.incomplete = htmlparser.entityref

# Same as the stock pattern except that a backtick is rejected in a tag name, at the
# start of an attribute name, and in a bare (unquoted) attribute value.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                           # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                     # value indicator
        (?:'[^']*'                    # LITA-enclosed value
          |"[^"]*"                    # LIT-enclosed value
          |(?!['"])[^`>\s]*           # bare value <= added backtick here
         )
        (?:\s*,)*                     # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                 # trailing whitespace
""", re.VERBOSE)

# Matches a blank line at the very start of a string: two consecutive newlines, each of
# which may be preceded by any number of spaces.
blank_line_re = re.compile(r'^([ ]*\n){2}')

78class HTMLExtractor(htmlparser.HTMLParser):

79 """

80 Extract raw HTML from text.

82 The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the

83 [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text

84 is stored in `cleandoc` as a list of strings.

85 """

87 def __init__(self, md: Markdown, *args, **kwargs):

88 if 'convert_charrefs' not in kwargs:

89 kwargs['convert_charrefs'] = False

91 # Block tags that should contain no content (self closing)

92 self.empty_tags = set(['hr'])

94 self.lineno_start_cache = [0]

96 self.override_comment_update = False

98 # This calls self.reset

99 super().__init__(*args, **kwargs)

100 self.md = md

101

102 def reset(self):

103 """Reset this instance. Loses all unprocessed data."""

104 self.inraw = False

105 self.intail = False

106 self.stack: list[str] = [] # When `inraw==True`, stack contains a list of tags

107 self._cache: list[str] = []

108 self.cleandoc: list[str] = []

109 self.lineno_start_cache = [0]

110

111 super().reset()

112

113 def close(self):

114 """Handle any buffered data."""

115 super().close()

116 if len(self.rawdata):

117 # Temp fix for https://bugs.python.org/issue41989

118 # TODO: remove this when the bug is fixed in all supported Python versions.

119 if self.convert_charrefs and not self.cdata_elem: # pragma: no cover

120 self.handle_data(htmlparser.unescape(self.rawdata))

121 else:

122 self.handle_data(self.rawdata)

123 # Handle any unclosed tags.

124 if len(self._cache):

125 self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))

126 self._cache = []

127

128 @property

129 def line_offset(self) -> int:

130 """Returns char index in `self.rawdata` for the start of the current line. """

131 for ii in range(len(self.lineno_start_cache)-1, self.lineno-1):

132 last_line_start_pos = self.lineno_start_cache[ii]

133 lf_pos = self.rawdata.find('\n', last_line_start_pos)

134 if lf_pos == -1:

135 # No more newlines found. Use end of raw data as start of line beyond end.

136 lf_pos = len(self.rawdata)

137 self.lineno_start_cache.append(lf_pos+1)

138

139 return self.lineno_start_cache[self.lineno-1]

140

141 def at_line_start(self) -> bool:

142 """

143 Returns True if current position is at start of line.

144

145 Allows for up to three blank spaces at start of line.

146 """

147 if self.offset == 0:

148 return True

149 if self.offset > 3:

150 return False

151 # Confirm up to first 3 chars are whitespace

152 return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

153

154 def get_endtag_text(self, tag: str) -> str:

155 """

156 Returns the text of the end tag.

157

158 If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.

159 """

160 # Attempt to extract actual tag from raw source text

161 start = self.line_offset + self.offset

162 m = htmlparser.endendtag.search(self.rawdata, start)

163 if m:

164 return self.rawdata[start:m.end()]

165 else: # pragma: no cover

166 # Failed to extract from raw data. Assume well formed and lowercase.

167 return '</{}>'.format(tag)

168

169 def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):

170 # Handle tags that should always be empty and do not specify a closing tag

171 if tag in self.empty_tags:

172 self.handle_startendtag(tag, attrs)

173 return

174

175 if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):

176 # Started a new raw block. Prepare stack.

177 self.inraw = True

178 self.cleandoc.append('\n')

179

180 text = self.get_starttag_text()

181 if self.inraw:

182 self.stack.append(tag)

183 self._cache.append(text)

184 else:

185 self.cleandoc.append(text)

186 if tag in self.CDATA_CONTENT_ELEMENTS:

187 # This is presumably a standalone tag in a code span (see #1036).

188 self.clear_cdata_mode()

189

190 def handle_endtag(self, tag: str):

191 text = self.get_endtag_text(tag)

192

193 if self.inraw:

194 self._cache.append(text)

195 if tag in self.stack:

196 # Remove tag from stack

197 while self.stack:

198 if self.stack.pop() == tag:

199 break

200 if len(self.stack) == 0:

201 # End of raw block.

202 if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):

203 # Preserve blank line and end of raw block.

204 self._cache.append('\n')

205 else:

206 # More content exists after `endtag`.

207 self.intail = True

208 # Reset stack.

209 self.inraw = False

210 self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))

211 # Insert blank line between this and next line.

212 self.cleandoc.append('\n\n')

213 self._cache = []

214 else:

215 self.cleandoc.append(text)

216

217 def handle_data(self, data: str):

218 if self.intail and '\n' in data:

219 self.intail = False

220 if self.inraw:

221 self._cache.append(data)

222 else:

223 self.cleandoc.append(data)

224

225 def handle_empty_tag(self, data: str, is_block: bool):

226 """ Handle empty tags (`<data>`). """

227 if self.inraw or self.intail:

228 # Append this to the existing raw block

229 self._cache.append(data)

230 elif self.at_line_start() and is_block:

231 # Handle this as a standalone raw block

232 if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):

233 # Preserve blank line after tag in raw block.

234 data += '\n'

235 else:

236 # More content exists after tag.

237 self.intail = True

238 item = self.cleandoc[-1] if self.cleandoc else ''

239 # If we only have one newline before block element, add another

240 if not item.endswith('\n\n') and item.endswith('\n'):

241 self.cleandoc.append('\n')

242 self.cleandoc.append(self.md.htmlStash.store(data))

243 # Insert blank line between this and next line.

244 self.cleandoc.append('\n\n')

245 else:

246 self.cleandoc.append(data)

247

248 def handle_startendtag(self, tag: str, attrs):

249 self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

250

251 def handle_charref(self, name: str):

252 self.handle_empty_tag('&#{};'.format(name), is_block=False)

253

254 def handle_entityref(self, name: str):

255 self.handle_empty_tag('&{};'.format(name), is_block=False)

256

257 def handle_comment(self, data: str):

258 # Check if the comment is unclosed, if so, we need to override position

259 i = self.line_offset + self.offset + len(data) + 4

260 if self.rawdata[i:i + 3] != '-->':

261 self.handle_data('<')

262 self.override_comment_update = True

263 return

264 self.handle_empty_tag(''.format(data), is_block=True)

265

266 def updatepos(self, i: int, j: int) -> int:

267 if self.override_comment_update:

268 self.override_comment_update = False

269 i = 0

270 j = 1

271 return super().updatepos(i, j)

272

273 def handle_decl(self, data: str):

274 self.handle_empty_tag('<!{}>'.format(data), is_block=True)

275

276 def handle_pi(self, data: str):

277 self.handle_empty_tag('<?{}?>'.format(data), is_block=True)

278

279 def unknown_decl(self, data: str):

280 end = ']]>' if data.startswith('CDATA[') else ']>'

281 self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)

282

283 def parse_pi(self, i: int) -> int:

284 if self.at_line_start() or self.intail:

285 return super().parse_pi(i)

286 # This is not the beginning of a raw block so treat as plain data

287 # and avoid consuming any tags which may follow (see #1066).

288 self.handle_data('<?')

289 return i + 2

290

291 def parse_html_declaration(self, i: int) -> int:

292 if self.at_line_start() or self.intail:

293 if self.rawdata[i:i+3] == '<![' and not self.rawdata[i:i+9] == '<![CDATA[':

294 # We have encountered the bug in #1534 (Python bug `gh-77057`).

295 # Provide an override until we drop support for Python < 3.13.

296 result = self.parse_bogus_comment(i)

297 if result == -1:

298 self.handle_data(self.rawdata[i:i + 1])

299 return i + 1

300 return result

301 return super().parse_html_declaration(i)

302 # This is not the beginning of a raw block so treat as plain data

303 # and avoid consuming any tags which may follow (see #1066).

304 self.handle_data('<!')

305 return i + 2

306

307 def parse_bogus_comment(self, i: int, report: int = 0) -> int:

308 # Override the default behavior so that bogus comments get passed

309 # through unaltered by setting `report` to `0` (see #1425).

310 pos = super().parse_bogus_comment(i, report)

311 if pos == -1: # pragma: no cover

312 return -1

313 self.handle_empty_tag(self.rawdata[i:pos], is_block=False)

314 return pos

315

316 # The rest has been copied from base class in standard lib to address #1036.

317 # As `__startag_text` is private, all references to it must be in this subclass.

318 # The last few lines of `parse_starttag` are reversed so that `handle_starttag`

319 # can override `cdata_mode` in certain situations (in a code span).

320 __starttag_text: str | None = None

321

322 def get_starttag_text(self) -> str:

323 """Return full source of start tag: `<...>`."""

324 return self.__starttag_text

325

326 def parse_starttag(self, i: int) -> int: # pragma: no cover

327 # Treat `</>` as normal data as it is not a real tag.

328 if self.rawdata[i:i + 3] == '</>':

329 self.handle_data(self.rawdata[i:i + 3])

330 return i + 3

331

332 self.__starttag_text = None

333 endpos = self.check_for_whole_start_tag(i)

334 if endpos < 0:

335 self.handle_data(self.rawdata[i:i + 1])

336 return i + 1

337 rawdata = self.rawdata

338 self.__starttag_text = rawdata[i:endpos]

339

340 # Now parse the data between `i+1` and `j` into a tag and `attrs`

341 attrs = []

342 match = htmlparser.tagfind_tolerant.match(rawdata, i+1)

343 assert match, 'unexpected call to parse_starttag()'

344 k = match.end()

345 self.lasttag = tag = match.group(1).lower()

346 while k < endpos:

347 m = htmlparser.attrfind_tolerant.match(rawdata, k)

348 if not m:

349 break

350 attrname, rest, attrvalue = m.group(1, 2, 3)

351 if not rest:

352 attrvalue = None

353 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \

354 attrvalue[:1] == '"' == attrvalue[-1:]: # noqa: E127

355 attrvalue = attrvalue[1:-1]

356 if attrvalue:

357 attrvalue = htmlparser.unescape(attrvalue)

358 attrs.append((attrname.lower(), attrvalue))

359 k = m.end()

360

361 end = rawdata[k:endpos].strip()

362 if end not in (">", "/>"):

363 lineno, offset = self.getpos()

364 if "\n" in self.__starttag_text:

365 lineno = lineno + self.__starttag_text.count("\n")

366 offset = len(self.__starttag_text) \

367 - self.__starttag_text.rfind("\n") # noqa: E127

368 else:

369 offset = offset + len(self.__starttag_text)

370 self.handle_data(rawdata[i:endpos])

371 return endpos

372 if end.endswith('/>'):

373 # XHTML-style empty tag: `<span attr="value" />`

374 self.handle_startendtag(tag, attrs)

375 else:

376 # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***

377 if tag in self.CDATA_CONTENT_ELEMENTS:

378 self.set_cdata_mode(tag)

379 self.handle_starttag(tag, attrs)

380 return endpos