1# Table of Contents Extension for Python-Markdown
2# ===============================================
3
4# See https://Python-Markdown.github.io/extensions/toc
5# for documentation.
6
7# Original code Copyright 2008 [Jack Miller](https://codezen.org/)
8
9# All changes Copyright 2008-2024 The Python Markdown Project
10
11# License: [BSD](https://opensource.org/licenses/bsd-license.php)
12
13"""
14Add table of contents support to Python-Markdown.
15
16See the [documentation](https://Python-Markdown.github.io/extensions/toc)
17for details.
18"""
19
20from __future__ import annotations
21
22from . import Extension
23from ..treeprocessors import Treeprocessor
24from ..util import parseBoolValue, AMP_SUBSTITUTE, deprecated, HTML_PLACEHOLDER_RE, AtomicString
25from ..treeprocessors import UnescapeTreeprocessor
26from ..serializers import RE_AMP
27import re
28import html
29import unicodedata
30from copy import deepcopy
31import xml.etree.ElementTree as etree
32from typing import TYPE_CHECKING, Any, Iterator, MutableSet
33
34if TYPE_CHECKING: # pragma: no cover
35 from markdown import Markdown
36
37
def slugify(value: str, separator: str, unicode: bool = False) -> str:
    """
    Slugify a string, to make it URL friendly.

    Arguments:
        value: The text to turn into a slug.
        separator: The string substituted for every run of whitespace and/or
            separator characters.
        unicode: When `False` (the default), the text is decomposed with `NFKD`
            and every non-ASCII character is dropped. When `True`, that step is
            skipped and Unicode word characters are preserved.

    Returns:
        The stripped, lowercased slug.
    """
    if not unicode:
        # Replace Extended Latin characters with ASCII, i.e. `žlutý` => `zluty`
        value = unicodedata.normalize('NFKD', value)
        value = value.encode('ascii', 'ignore').decode('ascii')
    # Drop everything which is not a word character, whitespace or a hyphen.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    # The separator is escaped so that characters which are special inside a
    # character class (`^`, `\`, `]`, or a leading `--`) are matched literally
    # rather than negating the class, forming a range, or raising `re.error`.
    pattern = r'[{}\s]+'.format(re.escape(separator))
    # A callable replacement inserts the separator verbatim; a plain string would
    # be interpreted as a replacement template (backslash escapes, group refs).
    return re.sub(pattern, lambda _match: separator, value)
46
47
def slugify_unicode(value: str, separator: str) -> str:
    """
    Slugify a string, to make it URL friendly while preserving Unicode characters.

    This is `slugify` called with its `unicode` flag switched on.
    """
    keep_unicode = True
    return slugify(value, separator, keep_unicode)
51
52
# Splits an id of the form `<stem>_<number>` into its stem and numeric suffix.
IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')


def unique(id: str, ids: MutableSet[str]) -> str:
    """
    Ensure id is unique in set of ids. Append '_1', '_2'... if not.

    An empty id is never accepted as-is; it is given a numeric suffix as well.
    The id finally chosen is added to `ids` before being returned.
    """
    while id in ids or not id:
        counted = IDCOUNT_RE.match(id)
        if counted is None:
            # No numeric suffix yet: start counting at one.
            id = f'{id}_1'
        else:
            # Already suffixed: bump the existing counter.
            stem, count = counted.groups()
            id = f'{stem}_{int(count) + 1}'
    ids.add(id)
    return id
66
67
@deprecated('Use `render_inner_html` and `striptags` instead.')
def get_name(el: etree.Element) -> str:
    """
    Get title name.

    Joins every text fragment found in `el`. Fragments which are `AtomicString`
    instances are passed through `html.unescape` first; all other fragments are
    used unchanged. Surrounding whitespace is stripped from the joined result.
    """
    fragments = (
        html.unescape(fragment) if isinstance(fragment, AtomicString) else fragment
        for fragment in el.itertext()
    )
    return ''.join(fragments).strip()
79
80
@deprecated('Use `run_postprocessors`, `render_inner_html` and/or `striptags` instead.')
def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> str:
    """
    Extract raw HTML from stash, reduce to plain text and swap with placeholder.

    Arguments:
        text: Text which may contain placeholders matched by `HTML_PLACEHOLDER_RE`.
        md: The Markdown instance whose `htmlStash.rawHtmlBlocks` holds the raw HTML.
        strip_entities: When true, HTML entities are removed in addition to tags.

    Returns:
        `text` with each resolvable placeholder replaced by its plain text.
    """
    def _plain_text_for(found: re.Match[str]) -> str:
        """ Return the plain text of the stashed block a placeholder refers to. """
        try:
            stashed = md.htmlStash.rawHtmlBlocks[int(found.group(1))]
        except (IndexError, TypeError):  # pragma: no cover
            # Unknown placeholder: leave it exactly as it was found.
            return found.group(0)
        # Strip out tags and/or entities - leaving text
        plain = re.sub(r'(<[^>]+>)', '', stashed)
        if not strip_entities:
            return plain
        return re.sub(r'(&[\#a-zA-Z0-9]+;)', '', plain)

    return HTML_PLACEHOLDER_RE.sub(_plain_text_for, text)
97
98
def unescape(text: str) -> str:
    """
    Unescape Markdown backslash escaped text.

    The work is delegated to the `unescape` method of a freshly created
    `UnescapeTreeprocessor`.
    """
    return UnescapeTreeprocessor().unescape(text)
103
104
def strip_tags(text: str) -> str:
    """
    Strip HTML tags and return plain text. Note: HTML entities are unaffected.

    Comments are removed before tags, and runs of whitespace in the result are
    collapsed to single spaces. An opener without a matching closer is left alone.
    """
    def _cut_spans(source: str, opener: str, closer: str) -> str:
        """ Repeatedly remove the first `opener`...`closer` span until none is left. """
        while True:
            begin = source.find(opener)
            if begin == -1:
                return source
            finish = source.find(closer, begin)
            if finish == -1:
                return source
            source = source[:begin] + source[finish + len(closer):]

    # A comment could contain a tag, so strip comments first
    without_comments = _cut_spans(text, '<!--', '-->')
    without_tags = _cut_spans(without_comments, '<', '>')

    # Collapse whitespace
    return ' '.join(without_tags.split())
117
118
def escape_cdata(text: str) -> str:
    """
    Escape character data.

    Each substring matched by `RE_AMP` is replaced with `&amp;`, every `<` with
    `&lt;` and every `>` with `&gt;`. Ampersands are handled first so that the
    entities inserted for `<` and `>` are not themselves touched.

    Arguments:
        text: The character data to escape.

    Returns:
        The escaped text.
    """
    if "&" in text:
        # Only replace & when not part of an entity
        text = RE_AMP.sub('&amp;', text)
    if "<" in text:
        text = text.replace("<", "&lt;")
    if ">" in text:
        text = text.replace(">", "&gt;")
    return text
129
130
def run_postprocessors(text: str, md: Markdown) -> str:
    """
    Run postprocessors from Markdown instance on text.

    Each item of `md.postprocessors` is applied in iteration order, feeding the
    output of one into the next. The final result is stripped of surrounding
    whitespace.
    """
    processed = text
    for postprocessor in md.postprocessors:
        processed = postprocessor.run(processed)
    processed = processed.strip()
    return processed
136
137
def render_inner_html(el: etree.Element, md: Markdown) -> str:
    """
    Fully render inner html of an `etree` element as a string.

    The element is serialized with `md.serializer`, backslash escapes are
    resolved, the element's own opening and closing tags are cut away and the
    remainder is run through the Markdown instance's postprocessors.
    """
    # The `UnescapeTreeprocessor` runs after `toc` extension so run here.
    serialized = unescape(md.serializer(el))

    # strip parent tag: keep what lies between the first `>` and the last `<`
    inner = serialized[serialized.index('>') + 1:serialized.rindex('<')]

    return run_postprocessors(inner.strip(), md)
149
150
def remove_fnrefs(root: etree.Element) -> etree.Element:
    """
    Remove footnote references from a copy of the element, if any are present.

    Footnote references look like this: `<sup id="fnref:1">...</sup>`. When the
    element contains no `sup` element at all, the element itself is returned
    (no copy is made). Otherwise a deep copy is edited and returned, leaving
    the element passed in untouched.
    """
    for _ in root.iter('sup'):
        break
    else:
        # Not a single `sup` element, so there is nothing to do.
        return root

    clone = deepcopy(root)
    # Visit every element which is the parent of at least one `sup` element.
    for holder in clone.findall('.//sup/..'):
        pending = ""
        # Going backwards lets children be removed while walking over them.
        for node in reversed(holder):
            is_fnref = node.tag == 'sup' and node.get('id', '').startswith('fnref')
            if is_fnref:
                # The text trailing the reference must survive its removal.
                pending = (node.tail or "") + pending
                holder.remove(node)
            elif pending:
                # Hand the collected text to the nearest preceding sibling.
                node.tail = (node.tail or "") + pending
                pending = ""
        if pending:
            # No preceding sibling was left, so the text goes to the parent.
            holder.text = (holder.text or "") + pending
    return clone
172
173
def nest_toc_tokens(toc_list):
    """Given an unsorted list with errors and skips, return a nested one.

        [{'level': 1}, {'level': 2}]
        =>
        [{'level': 1, 'children': [{'level': 2, 'children': []}]}]

    A wrong list is also converted:

        [{'level': 2}, {'level': 1}]
        =>
        [{'level': 2, 'children': []}, {'level': 1, 'children': []}]

    Note that the entries are popped off `toc_list` as they are processed (the
    list passed in ends up empty) and each entry gains a `children` key.
    """
    nested = []
    if len(toc_list) == 0:
        return nested

    # Seed the bookkeeping with the very first entry
    previous = toc_list.pop(0)
    previous['children'] = []
    depth_stack = [previous['level']]
    nested.append(previous)
    ancestors = []

    # Place each remaining entry relative to the one before it
    while toc_list:
        token = toc_list.pop(0)
        level = token['level']
        token['children'] = []

        # Climb back up when this entry is shallower than the previous one
        if level < depth_stack[-1]:
            # The previous entry's level is known to be deeper, so discard it
            depth_stack.pop()

            # Count the innermost ancestors which are not shallower than this entry
            surplus = 0
            for ancestor in reversed(ancestors):
                if level <= ancestor['level']:
                    surplus += 1
                else:  # pragma: no cover
                    break
            if surplus:
                del depth_stack[-surplus:]
                del ancestors[-surplus:]

            # This entry's level becomes the current one
            depth_stack.append(level)

        if level == depth_stack[-1]:
            # Same depth: a sibling under the current ancestor, or a top
            # level entry when there is no ancestor
            siblings = ancestors[-1]['children'] if ancestors else nested
            siblings.append(token)
        else:
            # Deeper than the previous entry: the previous entry becomes
            # an ancestor and this entry its child
            previous['children'].append(token)
            ancestors.append(previous)
            depth_stack.append(level)
        previous = token

    return nested
237
238
class TocTreeprocessor(Treeprocessor):
    """
    Step through document and build TOC.

    Every header element is given an `id` (unless it already has one), is
    optionally turned into a self link and/or given a permalink, and is recorded
    as a TOC token. The rendered TOC and the nested tokens are stored on the
    Markdown instance as `md.toc` and `md.toc_tokens`.
    """

    def __init__(self, md: Markdown, config: dict[str, Any]):
        """
        Initialize the processor from the extension's configuration.

        Arguments:
            md: The Markdown instance this processor is attached to.
            config: The configuration options of the extension, keyed by name.
        """
        super().__init__(md)

        self.marker: str = config["marker"]
        self.title: str = config["title"]
        # Stored as an offset which `set_level` adds to each header's own level.
        self.base_level = int(config["baselevel"]) - 1
        self.slugify = config["slugify"]
        self.sep = config["separator"]
        self.toc_class = config["toc_class"]
        self.title_class: str = config["title_class"]
        self.use_anchors: bool = parseBoolValue(config["anchorlink"])
        self.anchorlink_class: str = config["anchorlink_class"]
        self.use_permalinks = parseBoolValue(config["permalink"], False)
        if self.use_permalinks is None:
            # Not a boolean-like value, so keep the raw value as the link text.
            self.use_permalinks = config["permalink"]
        self.permalink_class: str = config["permalink_class"]
        self.permalink_title: str = config["permalink_title"]
        self.permalink_leading: bool | None = parseBoolValue(config["permalink_leading"], False)
        self.header_rgx = re.compile("[Hh][123456]")
        if isinstance(config["toc_depth"], str) and '-' in config["toc_depth"]:
            # A range such as `2-5`: top and bottom levels, both inclusive.
            self.toc_top, self.toc_bottom = [int(x) for x in config["toc_depth"].split('-')]
        else:
            # A single value is the bottom level; the top level is then `1`.
            self.toc_top = 1
            self.toc_bottom = int(config["toc_depth"])

    def iterparent(self, node: etree.Element) -> Iterator[tuple[etree.Element, etree.Element]]:
        """
        Iterator wrapper to get allowed parent and child all at once.

        Yields `(parent, child)` pairs, depth first. Headers, `pre` and `code`
        elements are neither yielded nor descended into, and neither are nodes
        whose tag is not a string.
        """

        # We do not allow the marker inside a header as that
        # would cause an endless loop of placing a new TOC
        # inside previously generated TOC.
        for child in node:
            # Nodes such as those created by `etree.Comment` do not have a string
            # tag. They cannot hold the marker and would make the regular
            # expression match below raise a `TypeError`, so skip them (`run`
            # guards its own header check in the same way).
            if not isinstance(child.tag, str):
                continue
            if not self.header_rgx.match(child.tag) and child.tag not in ['pre', 'code']:
                yield node, child
                yield from self.iterparent(child)

    def replace_marker(self, root: etree.Element, elem: etree.Element) -> None:
        """ Replace marker with elem. """
        for (p, c) in self.iterparent(root):
            text = ''.join(c.itertext()).strip()
            if not text:
                continue

            # To keep the output from screwing up the
            # validation by putting a `<div>` inside of a `<p>`
            # we actually replace the `<p>` in its entirety.

            # The `<p>` element may contain more than a single text content
            # (`nl2br` can introduce a `<br>`). In this situation, `c.text` returns
            # the very first content, ignore children contents or tail content.
            # `len(c) == 0` is here to ensure there is only text in the `<p>`.
            if c.text and c.text.strip() == self.marker and len(c) == 0:
                for i, sibling in enumerate(p):
                    if sibling == c:
                        p[i] = elem
                        break

    def set_level(self, elem: etree.Element) -> None:
        """ Adjust header level according to base level (never beyond `h6`). """
        level = int(elem.tag[-1]) + self.base_level
        if level > 6:
            level = 6
        elem.tag = 'h%d' % level

    def add_anchor(self, c: etree.Element, elem_id: str) -> None:
        """
        Wrap the content of header `c` in a link to the header itself.

        Arguments:
            c: The header element to alter.
            elem_id: The id of the header, used as the link's fragment.
        """
        anchor = etree.Element("a")
        anchor.text = c.text
        anchor.attrib["href"] = "#" + elem_id
        anchor.attrib["class"] = self.anchorlink_class
        c.text = ""
        # Hand all children over to the anchor, then detach them from the header
        # so that the anchor ends up as the header's only child.
        for elem in c:
            anchor.append(elem)
        while len(c):
            c.remove(c[0])
        c.append(anchor)

    def add_permalink(self, c: etree.Element, elem_id: str) -> None:
        """
        Add a permalink to header `c`.

        Arguments:
            c: The header element to alter.
            elem_id: The id of the header, used as the link's fragment.
        """
        permalink = etree.Element("a")
        # `True` selects the default link text, which is `AMP_SUBSTITUTE`
        # followed by `para;`. Any other value is used as the link text itself.
        permalink.text = ("%spara;" % AMP_SUBSTITUTE
                          if self.use_permalinks is True
                          else self.use_permalinks)
        permalink.attrib["href"] = "#" + elem_id
        permalink.attrib["class"] = self.permalink_class
        if self.permalink_title:
            permalink.attrib["title"] = self.permalink_title
        if self.permalink_leading:
            # The link comes first, so the header's text has to follow the link.
            permalink.tail = c.text
            c.text = ""
            c.insert(0, permalink)
        else:
            c.append(permalink)

    def build_toc_div(self, toc_list: list) -> etree.Element:
        """
        Return a `div` element holding the TOC for the given nested toc list.

        Arguments:
            toc_list: Nested TOC tokens. The `name`, `id` and `children` keys
                of each token are read.

        Returns:
            The `div` element, with the optional title followed by nested lists.
        """
        div = etree.Element("div")
        div.attrib["class"] = self.toc_class

        # Add title to the div
        if self.title:
            header = etree.SubElement(div, "span")
            if self.title_class:
                header.attrib["class"] = self.title_class
            header.text = self.title

        def build_etree_ul(toc_list: list, parent: etree.Element) -> etree.Element:
            """ Append a `ul` for `toc_list` to `parent`, recursing into children. """
            ul = etree.SubElement(parent, "ul")
            for item in toc_list:
                # List item link, to be inserted into the toc div
                li = etree.SubElement(ul, "li")
                link = etree.SubElement(li, "a")
                link.text = item.get('name', '')
                link.attrib["href"] = '#' + item.get('id', '')
                if item['children']:
                    build_etree_ul(item['children'], li)
            return ul

        build_etree_ul(toc_list, div)

        if 'prettify' in self.md.treeprocessors:
            self.md.treeprocessors['prettify'].run(div)

        return div

    def run(self, doc: etree.Element) -> None:
        """
        Process all headers of `doc` and attach the TOC to the Markdown instance.

        Sets `self.md.toc_tokens` to the nested tokens and `self.md.toc` to the
        serialized and postprocessed TOC. When a marker is configured, it is
        replaced in `doc` by the TOC element.
        """
        # Get a list of id attributes
        used_ids = set()
        for el in doc.iter():
            if "id" in el.attrib:
                used_ids.add(el.attrib["id"])

        toc_tokens = []
        for el in doc.iter():
            if isinstance(el.tag, str) and self.header_rgx.match(el.tag):
                self.set_level(el)
                # The level as adjusted by `set_level`; the tag does not change again.
                level = int(el.tag[-1])
                innerhtml = render_inner_html(remove_fnrefs(el), self.md)
                name = strip_tags(innerhtml)

                # Do not override pre-existing ids
                if "id" not in el.attrib:
                    el.attrib["id"] = unique(self.slugify(html.unescape(name), self.sep), used_ids)

                data_toc_label = ''
                if 'data-toc-label' in el.attrib:
                    data_toc_label = run_postprocessors(unescape(el.attrib['data-toc-label']), self.md)
                    # Overwrite name with sanitized value of `data-toc-label`.
                    name = escape_cdata(strip_tags(data_toc_label))
                    # Remove the data-toc-label attribute as it is no longer needed
                    del el.attrib['data-toc-label']

                # Only headers within the configured depth range are listed.
                if level >= self.toc_top and level <= self.toc_bottom:
                    toc_tokens.append({
                        'level': level,
                        'id': unescape(el.attrib["id"]),
                        'name': name,
                        'html': innerhtml,
                        'data-toc-label': data_toc_label
                    })

                if self.use_anchors:
                    self.add_anchor(el, el.attrib["id"])
                if self.use_permalinks not in [False, None]:
                    self.add_permalink(el, el.attrib["id"])

        toc_tokens = nest_toc_tokens(toc_tokens)
        div = self.build_toc_div(toc_tokens)
        if self.marker:
            self.replace_marker(doc, div)

        # serialize and attach to markdown instance.
        toc = self.md.serializer(div)
        for pp in self.md.postprocessors:
            toc = pp.run(toc)
        self.md.toc_tokens = toc_tokens
        self.md.toc = toc
416
417
class TocExtension(Extension):
    """ Extension which adds a Table of Contents tree processor to Markdown. """

    # The tree processor class instantiated by `extendMarkdown`.
    TreeProcessorClass = TocTreeprocessor

    def __init__(self, **kwargs):
        """ Define the default configuration, then apply any keyword overrides. """
        self.config = {
            'marker': [
                '[TOC]',
                'Text to find and replace with Table of Contents. Set to an empty string to disable. '
                'Default: `[TOC]`.'
            ],
            'title': [
                '', 'Title to insert into TOC `<div>`. Default: an empty string.'
            ],
            'title_class': [
                'toctitle', 'CSS class used for the title. Default: `toctitle`.'
            ],
            # This class is set on the `<div>` wrapping the TOC, not on a link.
            'toc_class': [
                'toc', 'CSS class(es) used for the TOC `<div>`. Default: `toc`.'
            ],
            'anchorlink': [
                False, 'True if header should be a self link. Default: `False`.'
            ],
            'anchorlink_class': [
                'toclink', 'CSS class(es) used for the link. Defaults: `toclink`.'
            ],
            'permalink': [
                0, 'True or link text if a Sphinx-style permalink should be added. Default: `False`.'
            ],
            'permalink_class': [
                'headerlink', 'CSS class(es) used for the link. Default: `headerlink`.'
            ],
            'permalink_title': [
                'Permanent link', 'Title attribute of the permalink. Default: `Permanent link`.'
            ],
            'permalink_leading': [
                False,
                'True if permalinks should be placed at start of the header, rather than end. Default: False.'
            ],
            'baselevel': ['1', 'Base level for headers. Default: `1`.'],
            'slugify': [
                slugify, 'Function to generate anchors based on header text. Default: `slugify`.'
            ],
            'separator': ['-', 'Word separator. Default: `-`.'],
            'toc_depth': [
                6,
                'Define the range of section levels to include in the Table of Contents. A single integer '
                '(b) defines the bottom section level (<h1>..<hb>) only. A string consisting of two digits '
                'separated by a hyphen in between (`2-5`) defines the top (t) and the bottom (b) (<ht>..<hb>). '
                'Default: `6` (bottom).'
            ],
        }
        """ Default configuration options. """

        super().__init__(**kwargs)

    def extendMarkdown(self, md):
        """ Add TOC tree processor to Markdown. """
        md.registerExtension(self)
        # `reset` reads `self.md`, so it must be stored before `reset` is called.
        self.md = md
        self.reset()
        tocext = self.TreeProcessorClass(md, self.getConfigs())
        md.treeprocessors.register(tocext, 'toc', 5)

    def reset(self) -> None:
        """ Clear the TOC and the TOC tokens stored on the Markdown instance. """
        self.md.toc = ''
        self.md.toc_tokens = []
486
def makeExtension(**kwargs):  # pragma: no cover
    """ Return a `TocExtension` instance created with the given keyword arguments. """
    extension = TocExtension(**kwargs)
    return extension