Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/markdown/htmlparser.py: 44%

1# Python Markdown

3# A Python implementation of John Gruber's Markdown.

5# Documentation: https://python-markdown.github.io/

6# GitHub: https://github.com/Python-Markdown/markdown/

7# PyPI: https://pypi.org/project/Markdown/

9# Started by Manfred Stienstra (http://www.dwerg.net/).

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).

11# Currently maintained by Waylan Limberg (https://github.com/waylan),

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

18# License: BSD (see LICENSE.md for details).

20"""

21This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches.

22A copy is imported rather than the module being directly imported as this ensures that the user can import

23and use the unmodified library for their own needs.

24"""

26from __future__ import annotations

28import re

29import importlib.util

30import sys

31from typing import TYPE_CHECKING, Sequence

33if TYPE_CHECKING: # pragma: no cover

34 from markdown import Markdown

# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
#
# The statements below are order dependent: a fresh module object is built from
# the spec of `html.parser`, its code is executed inside that new object, and only
# then is the populated copy registered in `sys.modules` under the name `htmlparser`.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
sys.modules['htmlparser'] = htmlparser

# `HTMLParser` patch: a Processing Instruction is closed by `?>` and by nothing else.
htmlparser.piclose = re.compile(r'\?>')

# `HTMLParser` patch: an entity reference is only recognized when it carries its closing
# semicolon. The very same compiled pattern is also installed as `incomplete`: we always
# feed a complete block, so support for partial entities is unnecessary. As the `entityref`
# regex is run right before `incomplete` and both are the same pattern, `incomplete` will
# simply never match and the logic guarded by it is avoided.
htmlparser.entityref = htmlparser.incomplete = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')

# `HTMLParser` patch: a backtick is not accepted in a tag name, at the start of an
# attribute name, or in a bare (unquoted) attribute value.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^`>\s]*          # bare value <= added backtick here
         )
        (?:\s*,)*                    # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# Each of the two newlines may be preceded by any number of space characters
# (spaces only; tabs and other whitespace do not match).
blank_line_re = re.compile(r'^([ ]*\n){2}')

class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Create the extractor.

        `md` is kept on the instance; all remaining arguments are forwarded to the
        parent parser. `convert_charrefs` defaults to `False` unless given explicitly.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """Returns char index in `self.rawdata` for the start of the current line. """
        # Lazily extend the cache of line start positions up to the current line.
        for ii in range(len(self.lineno_start_cache)-1, self.lineno-1):
            last_line_start_pos = self.lineno_start_cache[ii]
            lf_pos = self.rawdata.find('\n', last_line_start_pos)
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            self.lineno_start_cache.append(lf_pos+1)

        return self.lineno_start_cache[self.lineno-1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return '</{}>'.format(tag)

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """
        Handle an opening tag.

        Tags listed in `empty_tags` are treated as self-closing. A block-level tag
        (according to `md.is_block_level`) opens a new raw block when it appears in the
        tail of a previous raw block, or at the start of a line while no raw block is open.
        Inside a raw block the tag text is cached; otherwise it is passed through to `cleandoc`.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """
        Handle a closing tag.

        Inside a raw block the tag text is cached and the tag stack is unwound down to the
        matching opening tag. Once the stack is empty the cached block is stored in the
        `htmlStash` and its placeholder appended to `cleandoc`. Outside a raw block the
        tag text is passed through to `cleandoc`.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after `endtag`.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data: str):
        """Cache `data` while inside a raw block; otherwise pass it through to `cleandoc`."""
        if self.intail and '\n' in data:
            # A newline ends the tail of the previous raw block.
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """Handle a self-closing tag using the exact source text of the tag."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Pass a numeric character reference through unaltered (`&#name;`)."""
        self.handle_empty_tag('&#{};'.format(name), is_block=False)

    def handle_entityref(self, name: str):
        """Pass a named entity reference through unaltered (`&name;`)."""
        self.handle_empty_tag('&{};'.format(name), is_block=False)

    def handle_comment(self, data: str):
        """Handle a comment, restoring the `<!--` and `-->` delimiters around `data`."""
        self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)

    def handle_decl(self, data: str):
        """Handle a declaration, restoring the `<!` and `>` delimiters around `data`."""
        self.handle_empty_tag('<!{}>'.format(data), is_block=True)

    def handle_pi(self, data: str):
        """Handle a processing instruction, restoring the `<?` and `?>` delimiters around `data`."""
        self.handle_empty_tag('<?{}?>'.format(data), is_block=True)

    def unknown_decl(self, data: str):
        """Handle a marked section (`<![...`), closing it with `]]>` for `CDATA[` and `]>` otherwise."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction only where a raw block may start or continue."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i: int) -> int:
        """Parse a declaration only where a raw block may start or continue."""
        if self.at_line_start() or self.intail:
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        """Pass a bogus comment through using its exact source text."""
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__startag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str:
        """Return full source of start tag: `<...>`."""
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag beginning at index `i` of `self.rawdata`.

        Returns the index just past the tag, or the negative value returned by
        `check_for_whole_start_tag` when that check does not yield an end position.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between `i+1` and `j` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Not a well formed tag ending: pass the source text through as plain data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos

198 self._cache.append('\n')

199 else:

200 # More content exists after `endtag`.

201 self.intail = True

202 # Reset stack.

203 self.inraw = False

204 self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))

205 # Insert blank line between this and next line.

206 self.cleandoc.append('\n\n')

207 self._cache = []

208 else:

209 self.cleandoc.append(text)

210

211 def handle_data(self, data: str):

212 if self.intail and '\n' in data:

213 self.intail = False

214 if self.inraw:

215 self._cache.append(data)

216 else:

217 self.cleandoc.append(data)

218

219 def handle_empty_tag(self, data: str, is_block: bool):

220 """ Handle empty tags (`<data>`). """

221 if self.inraw or self.intail:

222 # Append this to the existing raw block

223 self._cache.append(data)

224 elif self.at_line_start() and is_block:

225 # Handle this as a standalone raw block

226 if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):

227 # Preserve blank line after tag in raw block.

228 data += '\n'

229 else:

230 # More content exists after tag.

231 self.intail = True

232 item = self.cleandoc[-1] if self.cleandoc else ''

233 # If we only have one newline before block element, add another

234 if not item.endswith('\n\n') and item.endswith('\n'):

235 self.cleandoc.append('\n')

236 self.cleandoc.append(self.md.htmlStash.store(data))

237 # Insert blank line between this and next line.

238 self.cleandoc.append('\n\n')

239 else:

240 self.cleandoc.append(data)

241

242 def handle_startendtag(self, tag: str, attrs):

243 self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

244

245 def handle_charref(self, name: str):

246 self.handle_empty_tag('&#{};'.format(name), is_block=False)

247

248 def handle_entityref(self, name: str):

249 self.handle_empty_tag('&{};'.format(name), is_block=False)

250

251 def handle_comment(self, data: str):

252 self.handle_empty_tag(''.format(data), is_block=True)

253

254 def handle_decl(self, data: str):

255 self.handle_empty_tag('<!{}>'.format(data), is_block=True)

256

257 def handle_pi(self, data: str):

258 self.handle_empty_tag('<?{}?>'.format(data), is_block=True)

259

260 def unknown_decl(self, data: str):

261 end = ']]>' if data.startswith('CDATA[') else ']>'

262 self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)

263

264 def parse_pi(self, i: int) -> int:

265 if self.at_line_start() or self.intail:

266 return super().parse_pi(i)

267 # This is not the beginning of a raw block so treat as plain data

268 # and avoid consuming any tags which may follow (see #1066).

269 self.handle_data('<?')

270 return i + 2

271

272 def parse_html_declaration(self, i: int) -> int:

273 if self.at_line_start() or self.intail:

274 return super().parse_html_declaration(i)

275 # This is not the beginning of a raw block so treat as plain data

276 # and avoid consuming any tags which may follow (see #1066).

277 self.handle_data('<!')

278 return i + 2

279

280 def parse_bogus_comment(self, i: int, report: int = 0) -> int:

281 # Override the default behavior so that bogus comments get passed

282 # through unaltered by setting `report` to `0` (see #1425).

283 pos = super().parse_bogus_comment(i, report)

284 if pos == -1: # pragma: no cover

285 return -1

286 self.handle_empty_tag(self.rawdata[i:pos], is_block=False)

287 return pos

288

289 # The rest has been copied from base class in standard lib to address #1036.

290 # As `__startag_text` is private, all references to it must be in this subclass.

291 # The last few lines of `parse_starttag` are reversed so that `handle_starttag`

292 # can override `cdata_mode` in certain situations (in a code span).

293 __starttag_text: str | None = None

294

295 def get_starttag_text(self) -> str:

296 """Return full source of start tag: `<...>`."""

297 return self.__starttag_text

298

299 def parse_starttag(self, i: int) -> int: # pragma: no cover

300 self.__starttag_text = None

301 endpos = self.check_for_whole_start_tag(i)

302 if endpos < 0:

303 return endpos

304 rawdata = self.rawdata

305 self.__starttag_text = rawdata[i:endpos]

306

307 # Now parse the data between `i+1` and `j` into a tag and `attrs`

308 attrs = []

309 match = htmlparser.tagfind_tolerant.match(rawdata, i+1)

310 assert match, 'unexpected call to parse_starttag()'

311 k = match.end()

312 self.lasttag = tag = match.group(1).lower()

313 while k < endpos:

314 m = htmlparser.attrfind_tolerant.match(rawdata, k)

315 if not m:

316 break

317 attrname, rest, attrvalue = m.group(1, 2, 3)

318 if not rest:

319 attrvalue = None

320 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \

321 attrvalue[:1] == '"' == attrvalue[-1:]: # noqa: E127

322 attrvalue = attrvalue[1:-1]

323 if attrvalue:

324 attrvalue = htmlparser.unescape(attrvalue)

325 attrs.append((attrname.lower(), attrvalue))

326 k = m.end()

327

328 end = rawdata[k:endpos].strip()

329 if end not in (">", "/>"):

330 lineno, offset = self.getpos()

331 if "\n" in self.__starttag_text:

332 lineno = lineno + self.__starttag_text.count("\n")

333 offset = len(self.__starttag_text) \

334 - self.__starttag_text.rfind("\n") # noqa: E127

335 else:

336 offset = offset + len(self.__starttag_text)

337 self.handle_data(rawdata[i:endpos])

338 return endpos

339 if end.endswith('/>'):

340 # XHTML-style empty tag: `<span attr="value" />`

341 self.handle_startendtag(tag, attrs)

342 else:

343 # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***

344 if tag in self.CDATA_CONTENT_ELEMENTS:

345 self.set_cdata_mode(tag)

346 self.handle_starttag(tag, attrs)

347 return endpos