Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/extensions/toc.py: 78%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

258 statements  

1# Table of Contents Extension for Python-Markdown 

2# =============================================== 

3 

4# See https://Python-Markdown.github.io/extensions/toc 

5# for documentation. 

6 

7# Original code Copyright 2008 [Jack Miller](https://codezen.org/) 

8 

9# All changes Copyright 2008-2024 The Python Markdown Project 

10 

11# License: [BSD](https://opensource.org/licenses/bsd-license.php) 

12 

13""" 

14Add table of contents support to Python-Markdown. 

15 

16See the [documentation](https://Python-Markdown.github.io/extensions/toc) 

17for details. 

18""" 

19 

20from __future__ import annotations 

21 

22from . import Extension 

23from ..treeprocessors import Treeprocessor 

24from ..util import parseBoolValue, AMP_SUBSTITUTE, deprecated, HTML_PLACEHOLDER_RE, AtomicString 

25from ..treeprocessors import UnescapeTreeprocessor 

26from ..serializers import RE_AMP 

27import re 

28import html 

29import unicodedata 

30from copy import deepcopy 

31import xml.etree.ElementTree as etree 

32from typing import TYPE_CHECKING, Any, Iterator, MutableSet 

33 

34if TYPE_CHECKING: # pragma: no cover 

35 from markdown import Markdown 

36 

37 

def slugify(value: str, separator: str, unicode: bool = False) -> str:
    """
    Slugify a string, to make it URL friendly.

    Arguments:
        value: The text to convert into a slug.
        separator: The string which replaces each run of whitespace and/or
            separator characters. It is always treated as literal text.
        unicode: When `False` (the default) the text is NFKD normalized and every
            character which cannot be encoded as ASCII is dropped. When `True`
            the text is left as is, so Unicode word characters are preserved.

    Returns:
        The lowercased slug.
    """
    if not unicode:
        # Replace Extended Latin characters with ASCII, i.e. `žlutý` => `zluty`
        value = unicodedata.normalize('NFKD', value)
        value = value.encode('ascii', 'ignore').decode('ascii')
    # Drop everything which is not a word character, whitespace or a hyphen.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    # The separator ends up inside a character class, so it must be escaped. Otherwise
    # a separator such as `^` negates the class and a backslash breaks the pattern.
    # A callable replacement is used so that the separator is inserted literally
    # rather than being parsed as a replacement template.
    pattern = r'[{}\s]+'.format(re.escape(separator))
    return re.sub(pattern, lambda _match: separator, value)

46 

47 

def slugify_unicode(value: str, separator: str) -> str:
    """
    Slugify a string while keeping its Unicode characters.

    This is `slugify` called with its `unicode` flag switched on, which makes it
    usable wherever a two argument slugify callable is expected.
    """
    keep_unicode = True
    return slugify(value, separator, keep_unicode)

51 

52 

# Splits an id of the form `<stem>_<number>` into its stem and its numeric suffix.
IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')


def unique(id: str, ids: MutableSet[str]) -> str:
    """
    Return a variant of `id` which is not yet in `ids` and record it there.

    While the candidate is empty or already taken, a numeric suffix is appended
    (`_1`) or, if one is already present, incremented (`_1` => `_2`). The id which
    is finally returned is added to `ids` before returning.
    """
    while not id or id in ids:
        numbered = IDCOUNT_RE.match(id)
        if numbered is None:
            # No counter yet: start counting at one.
            id = f'{id}_1'
        else:
            stem, count = numbered.groups()
            id = f'{stem}_{int(count) + 1}'
    ids.add(id)
    return id

66 

67 

@deprecated('Use `render_inner_html` and `striptags` instead.')
def get_name(el: etree.Element) -> str:
    """
    Get title name.

    Joins all text found in `el` and its descendants and strips the surrounding
    whitespace. Only text chunks which are `AtomicString` instances have their HTML
    entities unescaped; all other chunks are used unchanged.
    """
    fragments = [
        html.unescape(chunk) if isinstance(chunk, AtomicString) else chunk
        for chunk in el.itertext()
    ]
    return ''.join(fragments).strip()

79 

80 

@deprecated('Use `run_postprocessors`, `render_inner_html` and/or `striptags` instead.')
def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> str:
    """
    Swap each raw HTML placeholder in `text` for the plain text of the stashed HTML.

    The stashed HTML is looked up in `md.htmlStash.rawHtmlBlocks`, its tags are
    removed and, unless `strip_entities` is false, its entities are removed too.
    """
    def _to_plain_text(match: re.Match[str]) -> str:
        """ Return the plain text of the stashed HTML referenced by a placeholder. """
        stash_index = int(match.group(1))
        try:
            stashed = md.htmlStash.rawHtmlBlocks[stash_index]
        except (IndexError, TypeError):  # pragma: no cover
            # Nothing usable is stashed under that index: keep the placeholder.
            return match.group(0)
        # Strip out tags and/or entities - leaving text
        plain = re.sub(r'(<[^>]+>)', '', stashed)
        if not strip_entities:
            return plain
        return re.sub(r'(&[\#a-zA-Z0-9]+;)', '', plain)

    return HTML_PLACEHOLDER_RE.sub(_to_plain_text, text)

97 

98 

def unescape(text: str) -> str:
    """
    Unescape Markdown backslash escaped text.

    Delegates to the `unescape` method of a freshly created `UnescapeTreeprocessor`.
    """
    return UnescapeTreeprocessor().unescape(text)

103 

104 

def strip_tags(text: str) -> str:
    """
    Strip HTML tags and return plain text. Note: HTML entities are unaffected.

    Comments are removed before tags, and all runs of whitespace in the result are
    collapsed to a single space. An opening delimiter with no matching closing
    delimiter after it is left in place.
    """
    def _cut_spans(source: str, opener: str, closer: str) -> str:
        """ Repeatedly cut the first `opener`...`closer` span out of `source`. """
        while True:
            opened = source.find(opener)
            if opened == -1:
                return source
            closed = source.find(closer, opened)
            if closed == -1:
                return source
            # The search restarts from the beginning after every cut.
            source = source[:opened] + source[closed + len(closer):]

    # A comment could contain a tag, so strip comments first
    without_comments = _cut_spans(text, '<!--', '-->')
    without_tags = _cut_spans(without_comments, '<', '>')

    # Collapse whitespace
    return ' '.join(without_tags.split())

117 

118 

def escape_cdata(text: str) -> str:
    """
    Escape character data.

    Ampersands are replaced through `RE_AMP`, then `<` and `>` are replaced with
    their entities.
    """
    if "&" in text:
        # Only replace & when not part of an entity
        text = RE_AMP.sub('&amp;', text)
    for character, entity in (("<", "&lt;"), (">", "&gt;")):
        if character in text:
            text = text.replace(character, entity)
    return text

129 

130 

def run_postprocessors(text: str, md: Markdown) -> str:
    """
    Run postprocessors from Markdown instance on text.

    Each item of `md.postprocessors` is applied in iteration order, receiving the
    output of the previous one. The final result is returned with surrounding
    whitespace stripped.
    """
    processed = text
    for postprocessor in md.postprocessors:
        processed = postprocessor.run(processed)
    return processed.strip()

136 

137 

def render_inner_html(el: etree.Element, md: Markdown) -> str:
    """
    Fully render inner html of an `etree` element as a string.

    The element is serialized with `md.serializer`, the outermost tag pair is cut
    away and the remainder is passed through the postprocessors of `md`.
    """
    # The `UnescapeTreeprocessor` runs after `toc` extension so run here.
    serialized = unescape(md.serializer(el))

    # strip parent tag: keep what lies between the first `>` and the last `<`.
    inner_start = serialized.index('>') + 1
    inner_end = serialized.rindex('<')
    inner = serialized[inner_start:inner_end].strip()

    return run_postprocessors(inner, md)

149 

150 

def remove_fnrefs(root: etree.Element) -> etree.Element:
    """
    Remove footnote references from a copy of the element, if any are present.

    Footnote references look like this: `<sup id="fnref:1">...</sup>`. When `root`
    contains no `sup` element at all, `root` itself is returned untouched. Otherwise
    a deep copy is edited and returned, and the `tail` text of each removed reference
    is preserved by moving it onto the preceding sibling or onto the parent's text.
    """
    # If there are no `sup` elements, then nothing to do.
    if not any(True for _ in root.iter('sup')):
        return root

    clone = deepcopy(root)
    # Visit every element which is the parent of a `sup` element.
    for holder in clone.findall('.//sup/..'):
        pending_tail = ""
        # Walk a snapshot of the children backwards so they can be removed safely.
        for node in list(holder)[::-1]:
            is_fnref = node.tag == 'sup' and node.get('id', '').startswith('fnref')
            if is_fnref:
                # Drop the reference but remember the text which followed it.
                pending_tail = (node.tail or "") + pending_tail
                holder.remove(node)
            elif pending_tail:
                # First surviving sibling before the removed reference(s).
                node.tail = (node.tail or "") + pending_tail
                pending_tail = ""
        if pending_tail:
            # No sibling was left to carry the text, so it belongs to the parent.
            holder.text = (holder.text or "") + pending_tail
    return clone

172 

173 

def nest_toc_tokens(toc_list):
    """Turn a flat list of tokens into a nested one, tolerating errors and skips.

    Every token is a dict with a `level` key. Each token receives a `children`
    list and is placed either at the top level or in the `children` of an
    earlier token:

        [{'level': 1}, {'level': 2}]
        =>
        [{'level': 1, 'children': [{'level': 2, 'children': []}]}]

    A wrong list is also converted:

        [{'level': 2}, {'level': 1}]
        =>
        [{'level': 2, 'children': []}, {'level': 1, 'children': []}]

    Note that `toc_list` is consumed: tokens are popped off it one by one, so it
    is empty afterwards, and the token dicts themselves are modified in place.
    """
    nested = []
    if not len(toc_list):
        return nested

    # Initialize everything by processing the first entry
    previous = toc_list.pop(0)
    previous['children'] = []
    level_stack = [previous['level']]
    nested.append(previous)
    ancestors = []

    # Walk the rest nesting the entries properly
    while toc_list:
        token = toc_list.pop(0)
        level = token['level']
        token['children'] = []

        # Reduce depth if current level < last item's level
        if level < level_stack[-1]:
            # Drop the last level since we know we are less than it
            del level_stack[-1]

            # Count the trailing parents whose level we are less than or equal to
            surplus = 0
            for ancestor in ancestors[::-1]:
                if level <= ancestor['level']:
                    surplus += 1
                    continue
                break  # pragma: no cover
            if surplus:
                level_stack = level_stack[:-surplus]
                ancestors = ancestors[:-surplus]

            # Note current level as last
            level_stack.append(level)

        if level == level_stack[-1]:
            # Level is the same, so append to the current parent (if available)
            siblings = ancestors[-1]['children'] if ancestors else nested
            siblings.append(token)
        else:
            # Current level is > last item's level,
            # so make last item a parent and append current as child
            previous['children'].append(token)
            ancestors.append(previous)
            level_stack.append(level)
        previous = token

    return nested

237 

238 

class TocTreeprocessor(Treeprocessor):
    """ Step through document and build TOC. """

    def __init__(self, md: Markdown, config: dict[str, Any]):
        """ Copy the extension's configuration values onto the processor. """
        super().__init__(md)

        self.marker: str = config["marker"]
        self.title: str = config["title"]
        # Kept as an offset which `set_level` adds to the level of every header.
        self.base_level = int(config["baselevel"]) - 1
        self.slugify = config["slugify"]
        self.sep = config["separator"]
        self.toc_class = config["toc_class"]
        self.title_class: str = config["title_class"]
        self.use_anchors: bool = parseBoolValue(config["anchorlink"])
        self.anchorlink_class: str = config["anchorlink_class"]
        # `permalink` holds either a boolean-like value or the link text itself.
        # NOTE(review): `parseBoolValue` presumably returns `None` for a value it
        # cannot read as a boolean; in that case the raw value is kept as link text.
        self.use_permalinks = parseBoolValue(config["permalink"], False)
        if self.use_permalinks is None:
            self.use_permalinks = config["permalink"]
        self.permalink_class: str = config["permalink_class"]
        self.permalink_title: str = config["permalink_title"]
        self.permalink_leading: bool | None = parseBoolValue(config["permalink_leading"], False)
        self.header_rgx = re.compile("[Hh][123456]")
        # `toc_depth` is either a `top-bottom` range string or a single bottom level.
        if isinstance(config["toc_depth"], str) and '-' in config["toc_depth"]:
            self.toc_top, self.toc_bottom = [int(x) for x in config["toc_depth"].split('-')]
        else:
            self.toc_top = 1
            self.toc_bottom = int(config["toc_depth"])

    def iterparent(self, node: etree.Element) -> Iterator[tuple[etree.Element, etree.Element]]:
        """
        Iterator wrapper to get allowed parent and child all at once.

        Yields `(parent, child)` pairs for every descendant of `node`, without
        descending into headers, `pre` elements or `code` elements.
        """

        # We do not allow the marker inside a header as that
        # would cause an endless loop of placing a new TOC
        # inside previously generated TOC.
        for child in node:
            # Comments and processing instructions do not have a string tag and a
            # regular expression cannot be matched against them, so test the type
            # first, exactly as `run` does.
            is_header = isinstance(child.tag, str) and self.header_rgx.match(child.tag)
            if not is_header and child.tag not in ['pre', 'code']:
                yield node, child
                yield from self.iterparent(child)

    def replace_marker(self, root: etree.Element, elem: etree.Element) -> None:
        """ Replace each element whose only content is the marker text with `elem`. """
        for (p, c) in self.iterparent(root):
            text = ''.join(c.itertext()).strip()
            if not text:
                continue

            # To keep the output from screwing up the
            # validation by putting a `<div>` inside of a `<p>`
            # we actually replace the `<p>` in its entirety.

            # The `<p>` element may contain more than a single text content
            # (`nl2br` can introduce a `<br>`). In this situation, `c.text` returns
            # the very first content, ignore children contents or tail content.
            # `len(c) == 0` is here to ensure there is only text in the `<p>`.
            if c.text and c.text.strip() == self.marker and len(c) == 0:
                for index, sibling in enumerate(p):
                    if sibling == c:
                        p[index] = elem
                        break

    def set_level(self, elem: etree.Element) -> None:
        """ Adjust header level according to base level, never going beyond `h6`. """
        level = min(int(elem.tag[-1]) + self.base_level, 6)
        elem.tag = 'h%d' % level

    def add_anchor(self, c: etree.Element, elem_id: str) -> None:
        """ Wrap the whole content of header `c` in a link to `#elem_id`. """
        anchor = etree.Element("a")
        anchor.text = c.text
        anchor.attrib["href"] = "#" + elem_id
        anchor.attrib["class"] = self.anchorlink_class
        c.text = ""
        # Hand every child over to the anchor, then empty the header so that the
        # anchor becomes its only child.
        for elem in c:
            anchor.append(elem)
        while len(c):
            c.remove(c[0])
        c.append(anchor)

    def add_permalink(self, c: etree.Element, elem_id: str) -> None:
        """ Add a permalink to `#elem_id` at the start or the end of header `c`. """
        permalink = etree.Element("a")
        # `True` selects the default paragraph sign; any other value is the link text.
        permalink.text = ("%spara;" % AMP_SUBSTITUTE
                          if self.use_permalinks is True
                          else self.use_permalinks)
        permalink.attrib["href"] = "#" + elem_id
        permalink.attrib["class"] = self.permalink_class
        if self.permalink_title:
            permalink.attrib["title"] = self.permalink_title
        if self.permalink_leading:
            # The header's leading text has to follow the link, so it becomes its tail.
            permalink.tail = c.text
            c.text = ""
            c.insert(0, permalink)
        else:
            c.append(permalink)

    def build_toc_div(self, toc_list: list) -> etree.Element:
        """
        Return a `div` element holding the nested list for the given toc list.

        `toc_list` is expected to be nested already: every item must have a
        `children` key. The `prettify` treeprocessor is run on the `div` when it is
        registered on the Markdown instance.
        """
        div = etree.Element("div")
        div.attrib["class"] = self.toc_class

        # Add title to the div
        if self.title:
            header = etree.SubElement(div, "span")
            if self.title_class:
                header.attrib["class"] = self.title_class
            header.text = self.title

        def build_etree_ul(toc_list: list, parent: etree.Element) -> etree.Element:
            """ Append a `ul` for `toc_list` to `parent`, recursing into children. """
            ul = etree.SubElement(parent, "ul")
            for item in toc_list:
                # List item link, to be inserted into the toc div
                li = etree.SubElement(ul, "li")
                link = etree.SubElement(li, "a")
                link.text = item.get('name', '')
                link.attrib["href"] = '#' + item.get('id', '')
                if item['children']:
                    build_etree_ul(item['children'], li)
            return ul

        build_etree_ul(toc_list, div)

        if 'prettify' in self.md.treeprocessors:
            self.md.treeprocessors['prettify'].run(div)

        return div

    def run(self, doc: etree.Element) -> None:
        """
        Give every header an id, build the TOC and store it on the Markdown instance.

        The rendered TOC is stored as `md.toc` and the nested tokens as
        `md.toc_tokens`. When a marker is configured, elements consisting of the
        marker are replaced with the TOC `div`.
        """
        # Get a list of id attributes
        used_ids = {el.attrib["id"] for el in doc.iter() if "id" in el.attrib}

        toc_tokens = []
        for el in doc.iter():
            if isinstance(el.tag, str) and self.header_rgx.match(el.tag):
                self.set_level(el)
                level = int(el.tag[-1])
                innerhtml = render_inner_html(remove_fnrefs(el), self.md)
                name = strip_tags(innerhtml)

                # Do not override pre-existing ids
                if "id" not in el.attrib:
                    el.attrib["id"] = unique(self.slugify(html.unescape(name), self.sep), used_ids)

                data_toc_label = ''
                if 'data-toc-label' in el.attrib:
                    data_toc_label = run_postprocessors(unescape(el.attrib['data-toc-label']), self.md)
                    # Overwrite name with sanitized value of `data-toc-label`.
                    name = escape_cdata(strip_tags(data_toc_label))
                    # Remove the data-toc-label attribute as it is no longer needed
                    del el.attrib['data-toc-label']

                # Only headers within the configured depth range are listed.
                if self.toc_top <= level <= self.toc_bottom:
                    toc_tokens.append({
                        'level': level,
                        'id': unescape(el.attrib["id"]),
                        'name': name,
                        'html': innerhtml,
                        'data-toc-label': data_toc_label
                    })

                if self.use_anchors:
                    self.add_anchor(el, el.attrib["id"])
                if self.use_permalinks not in [False, None]:
                    self.add_permalink(el, el.attrib["id"])

        toc_tokens = nest_toc_tokens(toc_tokens)
        div = self.build_toc_div(toc_tokens)
        if self.marker:
            self.replace_marker(doc, div)

        # serialize and attach to markdown instance.
        toc = self.md.serializer(div)
        for pp in self.md.postprocessors:
            toc = pp.run(toc)
        self.md.toc_tokens = toc_tokens
        self.md.toc = toc

416 

417 

class TocExtension(Extension):
    """ Extension which registers the Table of Contents tree processor. """

    # The tree processor which `extendMarkdown` instantiates and registers.
    TreeProcessorClass = TocTreeprocessor

    def __init__(self, **kwargs):
        """ Define the default configuration, then hand `kwargs` to the base class. """
        self.config = {
            'marker': [
                '[TOC]',
                'Text to find and replace with Table of Contents. Set to an empty string to disable. '
                'Default: `[TOC]`.'
            ],
            'title': [
                '', 'Title to insert into TOC `<div>`. Default: an empty string.'
            ],
            'title_class': [
                'toctitle', 'CSS class used for the title. Default: `toctitle`.'
            ],
            # The description used to be a copy of the one for `anchorlink_class`,
            # naming the wrong element and the wrong default.
            'toc_class': [
                'toc', 'CSS class(es) used for the `<div>` containing the Table of Contents. Default: `toc`.'
            ],
            'anchorlink': [
                False, 'True if header should be a self link. Default: `False`.'
            ],
            'anchorlink_class': [
                'toclink', 'CSS class(es) used for the link. Defaults: `toclink`.'
            ],
            'permalink': [
                0, 'True or link text if a Sphinx-style permalink should be added. Default: `False`.'
            ],
            'permalink_class': [
                'headerlink', 'CSS class(es) used for the link. Default: `headerlink`.'
            ],
            'permalink_title': [
                'Permanent link', 'Title attribute of the permalink. Default: `Permanent link`.'
            ],
            'permalink_leading': [
                False,
                'True if permalinks should be placed at start of the header, rather than end. Default: False.'
            ],
            'baselevel': ['1', 'Base level for headers. Default: `1`.'],
            'slugify': [
                slugify, 'Function to generate anchors based on header text. Default: `slugify`.'
            ],
            'separator': ['-', 'Word separator. Default: `-`.'],
            'toc_depth': [
                6,
                'Define the range of section levels to include in the Table of Contents. A single integer '
                '(b) defines the bottom section level (<h1>..<hb>) only. A string consisting of two digits '
                'separated by a hyphen in between (`2-5`) defines the top (t) and the bottom (b) (<ht>..<hb>). '
                'Default: `6` (bottom).'
            ],
        }
        """ Default configuration options. """

        super().__init__(**kwargs)

    def extendMarkdown(self, md):
        """ Add TOC tree processor to Markdown. """
        md.registerExtension(self)
        # `reset` reads `self.md`, so it has to be set before `reset` is called.
        self.md = md
        self.reset()
        tocext = self.TreeProcessorClass(md, self.getConfigs())
        md.treeprocessors.register(tocext, 'toc', 5)

    def reset(self) -> None:
        """ Clear the `toc` and `toc_tokens` attributes of the Markdown instance. """
        self.md.toc = ''
        self.md.toc_tokens = []

485 

486 

def makeExtension(**kwargs):  # pragma: no cover
    """ Return a `TocExtension` instance created from the given keyword arguments. """
    extension = TocExtension(**kwargs)
    return extension