Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/treeprocessors.py: 97%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

229 statements  

1# Python Markdown 

2 

3# A Python implementation of John Gruber's Markdown. 

4 

5# Documentation: https://python-markdown.github.io/ 

6# GitHub: https://github.com/Python-Markdown/markdown/ 

7# PyPI: https://pypi.org/project/Markdown/ 

8 

9# Started by Manfred Stienstra (http://www.dwerg.net/). 

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). 

11# Currently maintained by Waylan Limberg (https://github.com/waylan), 

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). 

13 

14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later) 

15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 

16# Copyright 2004 Manfred Stienstra (the original version) 

17 

18# License: BSD (see LICENSE.md for details). 

19 

20""" 

21Tree processors manipulate the tree created by block processors. They can even create an entirely 

22new `ElementTree` object. This is an excellent place for creating summaries, adding collected 

23references, or last-minute adjustments. 

24 

25""" 

26 

27from __future__ import annotations 

28 

29import re 

30import xml.etree.ElementTree as etree 

31from typing import TYPE_CHECKING, Any 

32from . import util 

33from . import inlinepatterns 

34 

35if TYPE_CHECKING: # pragma: no cover 

36 from markdown import Markdown 

37 

38 

def build_treeprocessors(md: Markdown, **kwargs: Any) -> util.Registry[Treeprocessor]:
    """
    Create the registry holding the built-in tree processors.

    Three processors are instantiated with `md` and registered, in this
    order, under the names `inline`, `prettify` and `unescape`, with the
    numeric values 20, 10 and 0 passed as the third argument to `register`.

    Arguments:
        md: The `Markdown` instance handed to each processor's constructor.
        **kwargs: Accepted for signature compatibility; not used here.

    Returns:
        The populated registry.

    """
    defaults = (
        (InlineProcessor, 'inline', 20),
        (PrettifyTreeprocessor, 'prettify', 10),
        (UnescapeTreeprocessor, 'unescape', 0),
    )
    registry = util.Registry()
    # Each processor is built right before it is registered, one at a time.
    for processor_cls, name, priority in defaults:
        registry.register(processor_cls(md), name, priority)
    return registry

46 

47 

def isString(s: object) -> bool:
    """ Tell whether `s` is a `str` that is not an [`AtomicString`][markdown.util.AtomicString]. """
    # Atomic strings are rejected first, whatever else they may be.
    if isinstance(s, util.AtomicString):
        return False
    return isinstance(s, str)

53 

54 

class Treeprocessor(util.Processor):
    """
    Base class for processors that work on the parsed `ElementTree`.

    Tree processors are run on the `ElementTree` object before it is
    serialized. Each one exposes a single `run` method that receives an
    `Element` and changes it as required.

    Custom tree processors must derive from this class.

    """
    def run(self, root: etree.Element) -> etree.Element | None:
        """
        Process the tree rooted at `root`.

        Subclasses are expected to override this method. An implementation
        may either return a new `Element`, in which case it replaces the
        existing root, or alter the given tree in place and return `None`.

        This base implementation does nothing and returns `None`.
        """
        return None  # pragma: no cover

73 

74 

class InlineProcessor(Treeprocessor):
    """
    A `Treeprocessor` that traverses a tree, applying inline patterns.

    Text is run through the registered inline patterns. Every node a pattern
    produces is put into `self.stashed_nodes` and replaced in the text by a
    placeholder string; the placeholders are afterwards resolved back into
    `ElementTree` elements that are inserted into the tree.

    `self.stashed_nodes` and `self.parent_map` are created by `run`, so the
    private helpers below are only usable once `run` has started.
    """

    def __init__(self, md: Markdown):
        """
        Cache the placeholder settings from `util` and the inline patterns of `md`.

        Arguments:
            md: The `Markdown` instance whose `inlinePatterns` are applied.

        """
        self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
        self.__placeholder_suffix = util.ETX
        # The constant 4 equals the width of the zero-padded id produced by
        # `__makePlaceholder`. This value is not read anywhere in this class.
        self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
            + len(self.__placeholder_suffix)
        self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
        self.md = md
        self.inlinePatterns = md.inlinePatterns
        # Lower-cased tag names of the elements enclosing the text that is
        # currently being processed.
        self.ancestors: list[str] = []

    def __makePlaceholder(self, type: str) -> tuple[str, str]:
        """
        Generate a placeholder for the next node to be stashed.

        Arguments:
            type: Not used.

        Returns:
            The placeholder string and the id embedded in it. The id is the
            current number of stashed nodes, zero-padded to four digits.

        """
        node_id = "%04d" % len(self.stashed_nodes)
        placeholder = util.INLINE_PLACEHOLDER % node_id
        return placeholder, node_id

    def __findPlaceholder(self, data: str, index: int) -> tuple[str | None, int]:
        """
        Extract id from data string, start from index.

        Arguments:
            data: String.
            index: Index, from which we start search.

        Returns:
            Placeholder id and string index, after the found placeholder.
            When no placeholder is found the result is `(None, index + 1)`.

        """
        # NOTE(review): `search` is not anchored at `index`, so this can report
        # a placeholder that starts later in `data` than `index` - confirm
        # whether an anchored `match(data, index)` was intended.
        m = self.__placeholder_re.search(data, index)
        if m:
            return m.group(1), m.end()
        return None, index + 1

    def __stashNode(self, node: etree.Element | str, type: str) -> str:
        """
        Add node to stash.

        Arguments:
            node: The element or string to store.
            type: Forwarded to `__makePlaceholder`.

        Returns:
            The placeholder string that now stands for `node`.

        """
        placeholder, node_id = self.__makePlaceholder(type)
        self.stashed_nodes[node_id] = node
        return placeholder

    def __handleInline(self, data: str, patternIndex: int = 0) -> str:
        """
        Process string with inline patterns and replace it with placeholders.

        An `AtomicString` is returned untouched. Otherwise the patterns are
        tried in order beginning at `patternIndex`; a pattern that reports a
        match is applied again, and the next pattern is only tried once the
        current one no longer matches.

        Arguments:
            data: A line of Markdown text.
            patternIndex: The index of the `inlinePattern` to start with.

        Returns:
            String with placeholders.

        """
        if not isinstance(data, util.AtomicString):
            startIndex = 0
            count = len(self.inlinePatterns)
            while patternIndex < count:
                data, matched, startIndex = self.__applyPattern(
                    self.inlinePatterns[patternIndex], data, patternIndex, startIndex
                )
                if not matched:
                    patternIndex += 1
        return data

    def __processElementText(self, node: etree.Element, subnode: etree.Element, isText: bool = True) -> None:
        """
        Process placeholders in `Element.text` or `Element.tail`
        of Elements popped from `self.stashed_nodes`.

        The text (or tail) is removed from `subnode`, resolved through
        `__processPlaceholders`, and the resulting elements are inserted
        into `node`.

        Arguments:
            node: Parent node.
            subnode: Processing node.
            isText: Boolean variable, True - it's text, False - it's a tail.

        """
        if isText:
            text = subnode.text
            subnode.text = None
        else:
            text = subnode.tail
            subnode.tail = None

        childResult = self.__processPlaceholders(text, subnode, isText)

        # Elements coming from a tail go right after `subnode`; everything
        # else goes to the front of `node`.
        if not isText and node is not subnode:
            pos = list(node).index(subnode) + 1
        else:
            pos = 0

        # Inserting in reverse order at a fixed position keeps the original order.
        childResult.reverse()
        for newChild in childResult:
            node.insert(pos, newChild[0])

    def __processPlaceholders(
        self,
        data: str | None,
        parent: etree.Element,
        isText: bool = True
    ) -> list[tuple[etree.Element, list[str]]]:
        """
        Process string with placeholders and generate `ElementTree` tree.

        Plain text found between placeholders is attached to the tail of the
        most recently produced element, or, while no element has been produced
        yet, to the tail (`isText` false) or the text (`isText` true) of
        `parent`.

        Arguments:
            data: String with placeholders instead of `ElementTree` elements.
            parent: Element, which contains processing inline data.
            isText: Boolean variable, True - it's text, False - it's a tail.

        Returns:
            List of `(element, ancestors)` tuples, one per resolved element
            placeholder, where `ancestors` is a copy of `self.ancestors`.

        """
        def linkText(text: str | None) -> None:
            # The attributes are assigned directly when empty rather than
            # always concatenated, so a lone string is stored as the very
            # object that was passed in.
            if text:
                if result:
                    if result[-1][0].tail:
                        result[-1][0].tail += text
                    else:
                        result[-1][0].tail = text
                elif not isText:
                    if parent.tail:
                        parent.tail += text
                    else:
                        parent.tail = text
                else:
                    if parent.text:
                        parent.text += text
                    else:
                        parent.text = text
        result = []
        startIndex = 0
        while data:
            index = data.find(self.__placeholder_prefix, startIndex)
            if index != -1:
                node_id, phEndIndex = self.__findPlaceholder(data, index)

                if node_id in self.stashed_nodes:
                    node = self.stashed_nodes[node_id]

                    if index > 0:
                        text = data[startIndex:index]
                        linkText(text)

                    if not isinstance(node, str):  # it's Element
                        # Resolve placeholders nested in the stashed element
                        # itself and in its direct children.
                        for child in [node] + list(node):
                            if child.tail:
                                if child.tail.strip():
                                    self.__processElementText(
                                        node, child, False
                                    )
                            if child.text:
                                if child.text.strip():
                                    self.__processElementText(child, child)
                    else:  # it's just a string
                        linkText(node)
                        startIndex = phEndIndex
                        continue

                    startIndex = phEndIndex
                    result.append((node, self.ancestors[:]))

                else:  # wrong placeholder
                    # Keep the prefix as literal text and go on after it.
                    end = index + len(self.__placeholder_prefix)
                    linkText(data[startIndex:end])
                    startIndex = end
            else:
                text = data[startIndex:]
                if isinstance(data, util.AtomicString):
                    # We don't want to lose the `AtomicString`
                    text = util.AtomicString(text)
                linkText(text)
                data = ""

        return result

    def __applyPattern(
        self,
        pattern: inlinepatterns.Pattern,
        data: str,
        patternIndex: int,
        startIndex: int = 0
    ) -> tuple[str, bool, int]:
        """
        Check if the line fits the pattern, create the necessary
        elements, add it to `stashed_nodes`.

        Arguments:
            data: The text to be processed.
            pattern: The pattern to be checked.
            patternIndex: Index of current pattern.
            startIndex: String index, from which we start searching.

        Returns:
            A tuple `(data, matched, startIndex)`: the text (with a
            placeholder in place of the matched part when a node was
            stashed), whether the pattern matched, and the index to pass as
            `startIndex` on the next call.

        """
        new_style = isinstance(pattern, inlinepatterns.InlineProcessor)

        # A pattern is skipped if any tag it excludes is among the ancestors.
        for exclude in pattern.ANCESTOR_EXCLUDES:
            if exclude.lower() in self.ancestors:
                return data, False, 0

        if new_style:
            match = None
            # Since `handleMatch` may reject our first match,
            # we iterate over the buffer looking for matches
            # until we can't find any more.
            for match in pattern.getCompiledRegExp().finditer(data, startIndex):
                node, start, end = pattern.handleMatch(match, data)
                if start is None or end is None:
                    # Rejected; forget it so that an exhausted iterator
                    # leaves `match` as `None`.
                    match = None
                    continue
                break
        else:  # pragma: no cover
            match = pattern.getCompiledRegExp().match(data[startIndex:])
            leftData = data[:startIndex]

        if not match:
            return data, False, 0

        if not new_style:  # pragma: no cover
            node = pattern.handleMatch(match)
            start = match.start(0)
            end = match.end(0)

        if node is None:
            # Matched, but nothing to stash: resume after this match.
            return data, True, end

        if not isinstance(node, str):
            if not isinstance(node.text, util.AtomicString):
                # We need to process current node too
                for child in [node] + list(node):
                    if child.text:
                        # Text is handled by the patterns after this one,
                        # with the child's tag counted as an ancestor.
                        self.ancestors.append(child.tag.lower())
                        child.text = self.__handleInline(
                            child.text, patternIndex + 1
                        )
                        self.ancestors.pop()
                    if child.tail:
                        child.tail = self.__handleInline(
                            child.tail, patternIndex
                        )

        placeholder = self.__stashNode(node, pattern.type())

        if new_style:
            return "{}{}{}".format(data[:start],
                                   placeholder, data[end:]), True, 0
        else:  # pragma: no cover
            return "{}{}{}{}".format(leftData,
                                     match.group(1),
                                     placeholder, match.groups()[-1]), True, 0

    def __build_ancestors(self, parent: etree.Element | None, parents: list[str]) -> None:
        """
        Build the ancestor list.

        Walks up `self.parent_map` starting at `parent` (inclusive) and
        appends the lower-cased tag names, outermost first, to `parents`.

        Arguments:
            parent: Element to start from; nothing is added when it is `None`.
            parents: List that is extended in place.

        """
        ancestors = []
        while parent is not None:
            ancestors.append(parent.tag.lower())
            parent = self.parent_map.get(parent)
        ancestors.reverse()
        parents.extend(ancestors)

    def run(self, tree: etree.Element, ancestors: list[str] | None = None) -> etree.Element:
        """Apply inline patterns to a parsed Markdown tree.

        Iterate over `Element`, find elements with inline tag, apply inline
        patterns and append newly created Elements to tree. To avoid further
        processing of string with inline patterns, instead of normal string,
        use subclass [`AtomicString`][markdown.util.AtomicString]:

            node.text = markdown.util.AtomicString("This will not be processed.")

        Arguments:
            tree: `Element` object, representing Markdown tree.
            ancestors: List of parent tag names that precede the tree node (if needed).

        Returns:
            An element tree object with applied inline patterns.

        """
        self.stashed_nodes: dict[str, etree.Element | str] = {}

        # Ensure a valid parent list, but copy passed in lists
        # to ensure we don't have the user accidentally change it on us.
        tree_parents = [] if ancestors is None else ancestors[:]

        self.parent_map = {c: p for p in tree.iter() for c in p}
        stack = [(tree, tree_parents)]

        while stack:
            currElement, parents = stack.pop()

            self.ancestors = parents
            self.__build_ancestors(currElement, self.ancestors)

            # Elements created from a child's text are inserted only after
            # the loop over the children has finished.
            insertQueue = []
            for child in currElement:
                if child.text and not isinstance(
                    child.text, util.AtomicString
                ):
                    self.ancestors.append(child.tag.lower())
                    text = child.text
                    child.text = None
                    lst = self.__processPlaceholders(
                        self.__handleInline(text), child
                    )
                    for item in lst:
                        self.parent_map[item[0]] = child
                    stack += lst
                    insertQueue.append((child, lst))
                    self.ancestors.pop()
                if child.tail:
                    tail = self.__handleInline(child.tail)
                    # Throw-away element that only collects the leading text
                    # of the tail.
                    dummy = etree.Element('d')
                    child.tail = None
                    tailResult = self.__processPlaceholders(tail, dummy, False)
                    if dummy.tail:
                        child.tail = dummy.tail
                    pos = list(currElement).index(child) + 1
                    tailResult.reverse()
                    for newChild in tailResult:
                        self.parent_map[newChild[0]] = currElement
                        currElement.insert(pos, newChild[0])
                if len(child):
                    self.parent_map[child] = currElement
                    stack.append((child, self.ancestors[:]))

            for element, lst in insertQueue:
                for i, obj in enumerate(lst):
                    newChild = obj[0]
                    element.insert(i, newChild)
        return tree

413 

414 

class PrettifyTreeprocessor(Treeprocessor):
    """ Insert newlines into the tree so the resulting html document has line breaks. """

    def _prettifyETree(self, elem: etree.Element) -> None:
        """
        Add line breaks around `elem` and, recursively, its block-level children.

        Elements tagged `code` or `pre` are not descended into. An empty or
        whitespace-only text or tail is replaced by a single newline.
        """

        newline = "\n"
        is_block = self.md.is_block_level
        if is_block(elem.tag) and elem.tag not in ('code', 'pre'):
            text_is_blank = not elem.text or not elem.text.strip()
            # Only start a new line when the first child is itself block level.
            if text_is_blank and len(elem) and is_block(elem[0].tag):
                elem.text = newline
            for child in elem:
                if is_block(child.tag):
                    self._prettifyETree(child)
        if not (elem.tail and elem.tail.strip()):
            elem.tail = newline

    def run(self, root: etree.Element) -> None:
        """ Prettify the tree below `root` in place: block elements, `br` tails and code blocks. """

        self._prettifyETree(root)
        # `<br />` elements usually sit inside inline content, where
        # `_prettifyETree` does not reach them, so handle them here.
        for br in root.iter('br'):
            tail = br.tail
            if tail and tail.strip():
                br.tail = '\n%s' % tail
            else:
                br.tail = '\n'
        # Strip surplus trailing whitespace from code blocks.
        for pre in root.iter('pre'):
            if not len(pre) or pre[0].tag != 'code':
                continue
            code = pre[0]
            # Leave code alone unless it consists of text only.
            if len(code) or code.text is None:
                continue
            code.text = util.AtomicString(code.text.rstrip() + '\n')

452 

453 

class UnescapeTreeprocessor(Treeprocessor):
    """ Turn escaped character markers back into the characters they stand for. """

    # A run of digits enclosed by `util.STX` and `util.ETX`.
    RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))

    def _unescape(self, m: re.Match[str]) -> str:
        """ Return the character whose code point is the number captured by `m`. """
        code_point = int(m.group(1))
        return chr(code_point)

    def unescape(self, text: str) -> str:
        """ Return `text` with every match of `RE` replaced by its character. """
        return self.RE.sub(self._unescape, text)

    def run(self, root: etree.Element) -> None:
        """ Unescape text, tail and attribute values of every element under `root`, in place. """
        for node in root.iter():
            # The text of `code` elements is left as it is.
            if node.text and node.tag != 'code':
                node.text = self.unescape(node.text)
            if node.tail:
                node.tail = self.unescape(node.tail)
            for attr_name, attr_value in node.items():
                node.set(attr_name, self.unescape(attr_value))