Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/htmlparser.py: 96%

1# Python Markdown

3# A Python implementation of John Gruber's Markdown.

5# Documentation: https://python-markdown.github.io/

6# GitHub: https://github.com/Python-Markdown/markdown/

7# PyPI: https://pypi.org/project/Markdown/

9# Started by Manfred Stienstra (http://www.dwerg.net/).

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).

11# Currently maintained by Waylan Limberg (https://github.com/waylan),

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

18# License: BSD (see LICENSE.md for details).

20"""

21This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches.

22A copy is imported rather than the module being directly imported as this ensures that the user can import

23and use the unmodified library for their own needs.

24"""

26from __future__ import annotations

28import re

29import importlib.util

30import sys

31from typing import TYPE_CHECKING, Sequence

33if TYPE_CHECKING: # pragma: no cover

34 from markdown import Markdown

# Import a private copy of the `html.parser` lib as `htmlparser` so we can monkeypatch it.
# The copy is executed from the module spec, so the patches below are applied to this
# module object only. Users can still do `from html import parser` and get the default behavior.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
# Register the copy under the top-level name `htmlparser`.
sys.modules['htmlparser'] = htmlparser

# This is a hack. We are sneaking in `</>` so we can capture it without the HTML parser
# throwing it away. When we see it, we will process it as data.
htmlparser.starttagopen = re.compile('<[a-zA-Z]|</>')

# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')

# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')

# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before `incomplete`,
# and the two regexes are the same object, `incomplete` will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref

# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value.
# NOTE: the pattern is compiled with `re.VERBOSE`, so only whitespace inside character classes is significant.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                           # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                     # value indicator
        (?:'[^']*'                    # LITA-enclosed value
          |"[^"]*"                    # LIT-enclosed value
          |(?!['"])[^`>\s]*           # bare value <= added backtick here
         )
         (?:\s*,)*                    # possibly followed by a comma
      )?(?:\s|/(?!>))*
    )*
  )?
  \s*                                 # trailing whitespace
""", re.VERBOSE)

# Same restriction for the tag name and attribute name in `locatetagend`
# (note that this pattern starts at the tag name, not at the `<`).
htmlparser.locatetagend = re.compile(r"""
  [a-zA-Z][^`\t\n\r\f />]*             # tag name
  [\t\n\r\f /]*                        # optional whitespace before attribute name
  (?:(?<=['"\t\n\r\f /])[^`\t\n\r\f />][^\t\n\r\f /=>]*  # attribute name
    (?:=                               # value indicator
      (?:'[^']*'                       # LITA-enclosed value
        |"[^"]*"                       # LIT-enclosed value
        |(?!['"])[^>\t\n\r\f ]*        # bare value
      )
    )?
    [\t\n\r\f /]*                      # possibly followed by a space
  )*
  >?
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# Each newline may be preceded by additional spaces.
blank_line_re = re.compile(r'^([ ]*\n){2}')

class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Initialize the extractor.

        Arguments:
            md: The instance whose `is_block_level()` and `htmlStash.store()` are used while parsing.
            *args: Passed through to the base `HTMLParser`.
            **kwargs: Passed through to the base `HTMLParser`. `convert_charrefs` is set
                to `False` unless the caller provides it explicitly.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        # When `True`, the next call to `updatepos` is made with `(0, 1)`; see `handle_comment`.
        self.override_comment_update = False

        # This calls `self.reset`
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        # Index `n` holds the char index in `rawdata` at which line `n + 1` starts.
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data and stash any raw block which was left unclosed."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """
        Returns char index in `self.rawdata` for the start of the current line.

        Line start positions are computed lazily and memoized in `lineno_start_cache`,
        so only lines not yet seen are searched for.
        """
        for ii in range(len(self.lineno_start_cache) - 1, self.lineno - 1):
            last_line_start_pos = self.lineno_start_cache[ii]
            lf_pos = self.rawdata.find('\n', last_line_start_pos)
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            self.lineno_start_cache.append(lf_pos + 1)

        return self.lineno_start_cache[self.lineno - 1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        line_start = self.line_offset
        return self.rawdata[line_start:line_start + self.offset].strip() == ''

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return f'</{tag}>'

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """
        Handle a start tag.

        Tags listed in `empty_tags` are redirected to `handle_startendtag`. A block-level tag
        seen in the tail of a previous block, or at the start of a line while not already in
        a raw block, begins a new raw block. While in a raw block the tag text is cached;
        otherwise it is appended to `cleandoc` unaltered.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """
        Handle an end tag.

        Inside a raw block the tag text is cached and the stack is unwound down to (and
        including) the matching start tag. Once the stack is empty the cached block is
        stored in the `htmlStash` and its placeholder appended to `cleandoc`. Outside of
        a raw block the tag text is appended to `cleandoc` unaltered.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after `endtag`.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data: str):
        """Cache `data` when in a raw block, else append it to `cleandoc`. A newline ends the tail."""
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """
        Handle empty tags (`<data>`).

        Arguments:
            data: The complete source text of the item.
            is_block: Whether the item may form a standalone raw block when at the start of a line.
        """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs):
        """Handle a self-closing tag by passing its source text to `handle_empty_tag`."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Pass a character reference (`&#name;`) through as inline text."""
        self.handle_empty_tag(f'&#{name};', is_block=False)

    def handle_entityref(self, name: str):
        """Pass an entity reference (`&name;`) through as inline text."""
        self.handle_empty_tag(f'&{name};', is_block=False)

    def handle_comment(self, data: str):
        """
        Handle a comment.

        A closed comment is rebuilt as `<!--data-->` and handled as a block-level empty tag.
        For an unclosed comment only the leading `<` is emitted as data.
        """
        # Check if the comment is unclosed, if so, we need to override position.
        # `i` is the index just past the comment body: the current position plus
        # the four characters of `<!--` plus the length of the body.
        i = self.line_offset + self.offset + len(data) + 4
        if self.rawdata[i:i + 3] != '-->':
            self.handle_data('<')
            # NOTE(review): makes the next `updatepos` call use `(0, 1)` rather than the
            # span passed by the base parser -- presumably so only the `<` is consumed.
            self.override_comment_update = True
            return
        # Rebuild the full comment text; the delimiters are not part of `data`.
        self.handle_empty_tag(f'<!--{data}-->', is_block=True)

    def updatepos(self, i: int, j: int) -> int:
        """Update the position, substituting `(0, 1)` once after an unclosed comment was seen."""
        if self.override_comment_update:
            self.override_comment_update = False
            i = 0
            j = 1
        return super().updatepos(i, j)

    def handle_decl(self, data: str):
        """Pass a declaration (`<!data>`) through as a block-level empty tag."""
        self.handle_empty_tag(f'<!{data}>', is_block=True)

    def handle_pi(self, data: str):
        """Pass a processing instruction (`<?data?>`) through as a block-level empty tag."""
        self.handle_empty_tag(f'<?{data}?>', is_block=True)

    def unknown_decl(self, data: str):
        """Pass a marked section through; closed with `]]>` when `data` starts with `CDATA[`, else `]>`."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag(f'<![{data}{end}', is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction only at the start of a line or in a tail; else emit `<?` as data."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i: int) -> int:
        """Parse a declaration only at the start of a line or in a tail; else emit `<!` as data."""
        if self.at_line_start() or self.intail:
            if self.rawdata[i:i + 3] == '<![' and self.rawdata[i:i + 9] != '<![CDATA[':
                # We have encountered the bug in #1534 (Python bug `gh-77057`).
                # Provide an override until we drop support for Python < 3.13.
                result = self.parse_bogus_comment(i)
                if result == -1:
                    self.handle_data(self.rawdata[i:i + 1])
                    return i + 1
                return result
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        """Parse a bogus comment and pass its source text through unaltered as inline text."""
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__starttag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str:
        """Return full source of start tag: `<...>`."""
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag beginning at index `i` of `rawdata`.

        Returns the index at which parsing should resume.
        """
        # Treat `</>` as normal data as it is not a real tag.
        if self.rawdata[i:i + 3] == '</>':
            self.handle_data(self.rawdata[i:i + 3])
            return i + 3

        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            # No complete start tag found; emit the single character as data.
            self.handle_data(self.rawdata[i:i + 1])
            return i + 1
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between `i+1` and `endpos` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i + 1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                # Strip the matching pair of quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk between the last attribute and the end of the tag; treat it all as data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos