Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/extensions/toc.py: 78%

# Table of Contents Extension for Python-Markdown
# ===============================================

# See https://Python-Markdown.github.io/extensions/toc
# for documentation.

# License: [BSD](https://opensource.org/licenses/bsd-license.php)

"""
Add table of contents support to Python-Markdown.

See the [documentation](https://Python-Markdown.github.io/extensions/toc)
for details.
"""

20from __future__ import annotations

22from . import Extension

23from ..treeprocessors import Treeprocessor

24from ..util import parseBoolValue, AMP_SUBSTITUTE, deprecated, HTML_PLACEHOLDER_RE, AtomicString

25from ..treeprocessors import UnescapeTreeprocessor

26from ..serializers import RE_AMP

27import re

28import html

29import unicodedata

30from copy import deepcopy

31import xml.etree.ElementTree as etree

32from typing import TYPE_CHECKING, Any, Iterator, MutableSet

34if TYPE_CHECKING: # pragma: no cover

35 from markdown import Markdown

def slugify(value: str, separator: str, unicode: bool = False) -> str:
    """Slugify a string, to make it URL friendly.

    Arguments:
        value: The text to convert.
        separator: Text that replaces every run of whitespace and/or separator
            characters. It is always treated literally.
        unicode: When false (the default) the text is NFKD-normalized and all
            characters that cannot be encoded as ASCII are dropped. When true,
            non-ASCII word characters are preserved.

    Returns:
        The lower-cased slug.
    """
    if not unicode:
        # Replace Extended Latin characters with ASCII, i.e. `žlutý` => `zluty`
        value = unicodedata.normalize('NFKD', value)
        value = value.encode('ascii', 'ignore').decode('ascii')
    # Drop everything that is not a word character, whitespace or a hyphen.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    # The separator is escaped because it is placed inside a character class:
    # an unescaped `^`, `]`, `\` or multi-character separator would otherwise
    # negate the class or produce an invalid pattern. A callable replacement
    # keeps backslashes in the separator from being read as escapes.
    pattern = r'[{}\s]+'.format(re.escape(separator))
    return re.sub(pattern, lambda _match: separator, value)

def slugify_unicode(value: str, separator: str) -> str:
    """Slugify a string, to make it URL friendly while preserving Unicode characters.

    This is `slugify` called with `unicode=True`.
    """
    return slugify(value, separator, unicode=True)

# Splits an id of the form `<stem>_<number>` into its stem and numeric suffix.
IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')


def unique(id: str, ids: MutableSet[str]) -> str:
    """Ensure id is unique in set of ids. Append '_1', '_2'... if not.

    Arguments:
        id: The candidate id. An empty id is never accepted and is numbered too.
        ids: The ids already in use. The id that is returned is added to this set.

    Returns:
        `id` itself when it is non-empty and unused, otherwise the first
        numbered variant that is not yet in `ids`.
    """
    while id in ids or not id:
        m = IDCOUNT_RE.match(id)
        if m:
            # Already numbered: bump the existing counter.
            id = f'{m.group(1)}_{int(m.group(2)) + 1}'
        else:
            id = f'{id}_1'
    ids.add(id)
    return id

@deprecated('Use `render_inner_html` and `striptags` instead.')
def get_name(el: etree.Element) -> str:
    """Get title name.

    Joins every text fragment found in `el` and strips surrounding whitespace.
    Fragments that are `AtomicString` instances are HTML-unescaped first; all
    other fragments are used unchanged.
    """
    fragments = (
        html.unescape(c) if isinstance(c, AtomicString) else c
        for c in el.itertext()
    )
    return ''.join(fragments).strip()

@deprecated('Use `run_postprocessors`, `render_inner_html` and/or `striptags` instead.')
def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> str:
    """Extract raw HTML from stash, reduce to plain text and swap with placeholder.

    Arguments:
        text: Text that may contain raw-HTML placeholders.
        md: The Markdown instance whose `htmlStash` holds the raw HTML blocks.
        strip_entities: When true (the default), HTML entities are removed from
            the stashed HTML in addition to tags.

    Returns:
        `text` with each placeholder replaced by the plain text of its stashed HTML.
    """
    def _html_sub(m: re.Match[str]) -> str:
        """ Substitute raw html with plain text. """
        try:
            raw = md.htmlStash.rawHtmlBlocks[int(m.group(1))]
        except (IndexError, TypeError):  # pragma: no cover
            # Unknown placeholder: leave it in the text untouched.
            return m.group(0)
        # Strip out tags and/or entities - leaving text
        res = re.sub(r'(<[^>]+>)', '', raw)
        if strip_entities:
            res = re.sub(r'(&[\#a-zA-Z0-9]+;)', '', res)
        return res

    return HTML_PLACEHOLDER_RE.sub(_html_sub, text)

def unescape(text: str) -> str:
    """Unescape Markdown backslash escaped text.

    Delegates to the `unescape` method of a fresh `UnescapeTreeprocessor`.
    """
    c = UnescapeTreeprocessor()
    return c.unescape(text)

103

104

def strip_tags(text: str) -> str:
    """Strip HTML tags and return plain text. Note: HTML entities are unaffected.

    HTML comments are removed first, then everything from a `<` up to the next
    `>`, and finally runs of whitespace are collapsed to single spaces. An
    unterminated comment or a `<` without a following `>` is left in place.
    """
    # A comment could contain a tag, so strip comments first
    while (start := text.find('<!--')) != -1 and (end := text.find('-->', start)) != -1:
        # Skip the three characters of the closing `-->`.
        text = f'{text[:start]}{text[end + 3:]}'

    while (start := text.find('<')) != -1 and (end := text.find('>', start)) != -1:
        text = f'{text[:start]}{text[end + 1:]}'

    # Collapse whitespace
    text = ' '.join(text.split())
    return text

117

118

def escape_cdata(text: str) -> str:
    """Escape character data.

    Replaces `&` (through `RE_AMP`), `<` and `>` with the entities `&amp;`,
    `&lt;` and `&gt;`. The ampersand is handled first so that the ampersands
    introduced by the other two replacements are not escaped again.
    """
    if "&" in text:
        # Only replace & when not part of an entity
        text = RE_AMP.sub('&amp;', text)
    if "<" in text:
        text = text.replace("<", "&lt;")
    if ">" in text:
        text = text.replace(">", "&gt;")
    return text

129

130

def run_postprocessors(text: str, md: Markdown) -> str:
    """Run postprocessors from Markdown instance on text.

    Each item of `md.postprocessors` is applied in iteration order, feeding the
    output of one into the next. The final result is returned with surrounding
    whitespace stripped.
    """
    for pp in md.postprocessors:
        text = pp.run(text)
    return text.strip()

136

137

def render_inner_html(el: etree.Element, md: Markdown) -> str:
    """Fully render inner html of an `etree` element as a string.

    The element is serialized with `md.serializer`, backslash escapes are
    resolved, the element's own opening and closing tags are cut off and the
    remainder is passed through the postprocessors of `md`.
    """
    # The `UnescapeTreeprocessor` runs after `toc` extension so run here.
    text = unescape(md.serializer(el))

    # strip parent tag
    start = text.index('>') + 1  # just past the end of the opening tag
    end = text.rindex('<')  # start of the closing tag
    text = text[start:end].strip()

    return run_postprocessors(text, md)

149

150

def remove_fnrefs(root: etree.Element) -> etree.Element:
    """Remove footnote references from a copy of the element, if any are present.

    Returns `root` itself, unmodified, when it contains no `sup` element at
    all. Otherwise a deep copy is returned in which every `sup` whose `id`
    starts with `fnref` has been removed; `root` is never modified.
    """
    # Remove footnote references, which look like this: `<sup id="fnref:1">...</sup>`.
    # If there are no `sup` elements, then nothing to do.
    if next(root.iter('sup'), None) is None:
        return root
    root = deepcopy(root)
    # Find parent elements that contain `sup` elements.
    for parent in root.findall('.//sup/..'):
        carry_text = ""
        for child in reversed(parent):  # Reversed for the ability to mutate during iteration.
            # Remove matching footnote references but carry any `tail` text to preceding elements.
            if child.tag == 'sup' and child.get('id', '').startswith('fnref'):
                carry_text = f'{child.tail or ""}{carry_text}'
                parent.remove(child)
            elif carry_text:
                child.tail = f'{child.tail or ""}{carry_text}'
                carry_text = ""
        # No preceding sibling took the text, so it belongs to the parent itself.
        if carry_text:
            parent.text = f'{parent.text or ""}{carry_text}'
    return root

172

173

def nest_toc_tokens(toc_list):
    """Given an unsorted list with errors and skips, return a nested one.

        [{'level': 1}, {'level': 2}]
        =>
        [{'level': 1, 'children': [{'level': 2, 'children': []}]}]

    A wrong list is also converted:

        [{'level': 2}, {'level': 1}]
        =>
        [{'level': 2, 'children': []}, {'level': 1, 'children': []}]

    Every entry gets a `children` list assigned in place. The list passed in
    is only read; it is not emptied.
    """

    ordered_list = []
    if not toc_list:
        return ordered_list

    entries = iter(toc_list)

    # Initialize everything by processing the first entry
    last = next(entries)
    last['children'] = []
    levels = [last['level']]
    ordered_list.append(last)
    parents = []

    # Walk the rest nesting the entries properly
    for t in entries:
        current_level = t['level']
        t['children'] = []

        # Reduce depth if current level < last item's level
        if current_level < levels[-1]:
            # Pop last level since we know we are less than it
            levels.pop()

            # Pop parents and levels we are less than or equal to
            to_pop = 0
            for p in reversed(parents):
                if current_level <= p['level']:
                    to_pop += 1
                else:  # pragma: no cover
                    break
            if to_pop:
                levels = levels[:-to_pop]
                parents = parents[:-to_pop]

            # Note current level as last
            levels.append(current_level)

        # Level is the same, so append to
        # the current parent (if available)
        if current_level == levels[-1]:
            (parents[-1]['children'] if parents
             else ordered_list).append(t)

        # Current level is > last item's level,
        # So make last item a parent and append current as child
        else:
            last['children'].append(t)
            parents.append(last)
            levels.append(current_level)
        last = t

    return ordered_list

237

238

class TocTreeprocessor(Treeprocessor):
    """ Step through document and build TOC. """

    def __init__(self, md: Markdown, config: dict[str, Any]):
        """Read the extension settings from `config` (a mapping of option name to value)."""
        super().__init__(md)

        self.marker: str = config["marker"]
        self.title: str = config["title"]
        # Stored as an offset that `set_level` adds to each header level.
        self.base_level = int(config["baselevel"]) - 1
        self.slugify = config["slugify"]
        self.sep = config["separator"]
        self.toc_class = config["toc_class"]
        self.title_class: str = config["title_class"]
        self.use_anchors: bool = parseBoolValue(config["anchorlink"])
        self.anchorlink_class: str = config["anchorlink_class"]
        self.use_permalinks = parseBoolValue(config["permalink"], False)
        if self.use_permalinks is None:
            # Not a boolean value: keep the raw setting, which `add_permalink`
            # then uses as the link text.
            self.use_permalinks = config["permalink"]
        self.permalink_class: str = config["permalink_class"]
        self.permalink_title: str = config["permalink_title"]
        self.permalink_leading: bool | None = parseBoolValue(config["permalink_leading"], False)
        self.header_rgx = re.compile("[Hh][123456]")
        if isinstance(config["toc_depth"], str) and '-' in config["toc_depth"]:
            # A `top-bottom` range such as `2-5`.
            self.toc_top, self.toc_bottom = [int(x) for x in config["toc_depth"].split('-')]
        else:
            self.toc_top = 1
            self.toc_bottom = int(config["toc_depth"])

    def iterparent(self, node: etree.Element) -> Iterator[tuple[etree.Element, etree.Element]]:
        """ Iterator wrapper to get allowed parent and child all at once. """

        # We do not allow the marker inside a header as that
        # would cause an endless loop of placing a new TOC
        # inside previously generated TOC.
        for child in node:
            tag = child.tag
            # Only string tags can be headers, `pre` or `code`; anything else
            # must not be handed to the regular expression.
            if isinstance(tag, str) and (self.header_rgx.match(tag) or tag in ['pre', 'code']):
                continue
            yield node, child
            yield from self.iterparent(child)

    def replace_marker(self, root: etree.Element, elem: etree.Element) -> None:
        """ Replace marker with elem. """
        for (p, c) in self.iterparent(root):
            text = ''.join(c.itertext()).strip()
            if not text:
                continue

            # To keep the output from screwing up the
            # validation by putting a `<div>` inside of a `<p>`
            # we actually replace the `<p>` in its entirety.

            # The `<p>` element may contain more than a single text content
            # (`nl2br` can introduce a `<br>`). In this situation, `c.text` returns
            # the very first content, ignore children contents or tail content.
            # `len(c) == 0` is here to ensure there is only text in the `<p>`.
            if c.text and c.text.strip() == self.marker and len(c) == 0:
                for i, sibling in enumerate(p):
                    if sibling == c:
                        p[i] = elem
                        break

    def set_level(self, elem: etree.Element) -> None:
        """ Adjust header level according to base level. """
        level = int(elem.tag[-1]) + self.base_level
        if level > 6:
            level = 6
        elem.tag = 'h%d' % level

    def add_anchor(self, c: etree.Element, elem_id: str) -> None:
        """ Wrap the text and children of `c` in a link to `#elem_id`. """
        anchor = etree.Element("a")
        anchor.text = c.text
        anchor.attrib["href"] = "#" + elem_id
        anchor.attrib["class"] = self.anchorlink_class
        c.text = ""
        for elem in c:
            anchor.append(elem)
        while len(c):
            c.remove(c[0])
        c.append(anchor)

    def add_permalink(self, c: etree.Element, elem_id: str) -> None:
        """ Add a link to `#elem_id` at the start or the end of `c`. """
        permalink = etree.Element("a")
        permalink.text = ("%spara;" % AMP_SUBSTITUTE
                          if self.use_permalinks is True
                          else self.use_permalinks)
        permalink.attrib["href"] = "#" + elem_id
        permalink.attrib["class"] = self.permalink_class
        if self.permalink_title:
            permalink.attrib["title"] = self.permalink_title
        if self.permalink_leading:
            # The header text moves to the tail of the link so that it still
            # follows the link once the link is the first child.
            permalink.tail = c.text
            c.text = ""
            c.insert(0, permalink)
        else:
            c.append(permalink)

    def build_toc_div(self, toc_list: list) -> etree.Element:
        """ Return a `div` element given a (nested) toc list. """
        div = etree.Element("div")
        div.attrib["class"] = self.toc_class

        # Add title to the div
        if self.title:
            header = etree.SubElement(div, "span")
            if self.title_class:
                header.attrib["class"] = self.title_class
            header.text = self.title

        def build_etree_ul(toc_list: list, parent: etree.Element) -> etree.Element:
            ul = etree.SubElement(parent, "ul")
            for item in toc_list:
                # List item link, to be inserted into the toc div
                li = etree.SubElement(ul, "li")
                link = etree.SubElement(li, "a")
                link.text = item.get('name', '')
                link.attrib["href"] = '#' + item.get('id', '')
                if item['children']:
                    build_etree_ul(item['children'], li)
            return ul

        build_etree_ul(toc_list, div)

        if 'prettify' in self.md.treeprocessors:
            self.md.treeprocessors['prettify'].run(div)

        return div

    def run(self, doc: etree.Element) -> None:
        """ Give every header an id, build the TOC and store it on the Markdown instance. """
        # Get a list of id attributes
        used_ids = set()
        for el in doc.iter():
            if "id" in el.attrib:
                used_ids.add(el.attrib["id"])

        toc_tokens = []
        for el in doc.iter():
            if isinstance(el.tag, str) and self.header_rgx.match(el.tag):
                self.set_level(el)
                # Read the level only after `set_level` has rewritten the tag.
                level = int(el.tag[-1])
                innerhtml = render_inner_html(remove_fnrefs(el), self.md)
                name = strip_tags(innerhtml)

                # Do not override pre-existing ids
                if "id" not in el.attrib:
                    el.attrib["id"] = unique(self.slugify(html.unescape(name), self.sep), used_ids)

                data_toc_label = ''
                if 'data-toc-label' in el.attrib:
                    data_toc_label = run_postprocessors(unescape(el.attrib['data-toc-label']), self.md)
                    # Overwrite name with sanitized value of `data-toc-label`.
                    name = escape_cdata(strip_tags(data_toc_label))
                    # Remove the data-toc-label attribute as it is no longer needed
                    del el.attrib['data-toc-label']

                if self.toc_top <= level <= self.toc_bottom:
                    toc_tokens.append({
                        'level': level,
                        'id': unescape(el.attrib["id"]),
                        'name': name,
                        'html': innerhtml,
                        'data-toc-label': data_toc_label
                    })

                if self.use_anchors:
                    self.add_anchor(el, el.attrib["id"])
                if self.use_permalinks not in [False, None]:
                    self.add_permalink(el, el.attrib["id"])

        toc_tokens = nest_toc_tokens(toc_tokens)
        div = self.build_toc_div(toc_tokens)
        if self.marker:
            self.replace_marker(doc, div)

        # serialize and attach to markdown instance.
        toc = self.md.serializer(div)
        for pp in self.md.postprocessors:
            toc = pp.run(toc)
        self.md.toc_tokens = toc_tokens
        self.md.toc = toc

416

417

class TocExtension(Extension):
    """ Extension that registers the TOC tree processor on a Markdown instance. """

    # The tree processor class instantiated by `extendMarkdown`.
    TreeProcessorClass = TocTreeprocessor

    def __init__(self, **kwargs):
        self.config = {
            'marker': [
                '[TOC]',
                'Text to find and replace with Table of Contents. Set to an empty string to disable. '
                'Default: `[TOC]`.'
            ],
            'title': [
                '', 'Title to insert into TOC `<div>`. Default: an empty string.'
            ],
            'title_class': [
                'toctitle', 'CSS class used for the title. Default: `toctitle`.'
            ],
            'toc_class': [
                'toc', 'CSS class(es) used for the TOC `<div>`. Default: `toc`.'
            ],
            'anchorlink': [
                False, 'True if header should be a self link. Default: `False`.'
            ],
            'anchorlink_class': [
                'toclink', 'CSS class(es) used for the link. Defaults: `toclink`.'
            ],
            'permalink': [
                0, 'True or link text if a Sphinx-style permalink should be added. Default: `False`.'
            ],
            'permalink_class': [
                'headerlink', 'CSS class(es) used for the link. Default: `headerlink`.'
            ],
            'permalink_title': [
                'Permanent link', 'Title attribute of the permalink. Default: `Permanent link`.'
            ],
            'permalink_leading': [
                False,
                'True if permalinks should be placed at start of the header, rather than end. Default: False.'
            ],
            'baselevel': ['1', 'Base level for headers. Default: `1`.'],
            'slugify': [
                slugify, 'Function to generate anchors based on header text. Default: `slugify`.'
            ],
            'separator': ['-', 'Word separator. Default: `-`.'],
            'toc_depth': [
                6,
                'Define the range of section levels to include in the Table of Contents. A single integer '
                '(b) defines the bottom section level (<h1>..<hb>) only. A string consisting of two digits '
                'separated by a hyphen in between (`2-5`) defines the top (t) and the bottom (b) (<ht>..<hb>). '
                'Default: `6` (bottom).'
            ],
        }
        """ Default configuration options. """

        super().__init__(**kwargs)

    def extendMarkdown(self, md):
        """ Add TOC tree processor to Markdown. """
        md.registerExtension(self)
        # `reset` writes to `self.md`, so it must be set first.
        self.md = md
        self.reset()
        tocext = self.TreeProcessorClass(md, self.getConfigs())
        md.treeprocessors.register(tocext, 'toc', 5)

    def reset(self) -> None:
        """ Clear the `toc` and `toc_tokens` attributes of the Markdown instance. """
        self.md.toc = ''
        self.md.toc_tokens = []

485

486

def makeExtension(**kwargs):  # pragma: no cover
    """ Return a `TocExtension` created with the given keyword arguments. """
    return TocExtension(**kwargs)