Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/treeprocessors.py: 97%

1# Python Markdown

3# A Python implementation of John Gruber's Markdown.

5# Documentation: https://python-markdown.github.io/

6# GitHub: https://github.com/Python-Markdown/markdown/

7# PyPI: https://pypi.org/project/Markdown/

9# Started by Manfred Stienstra (http://www.dwerg.net/).

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).

11# Currently maintained by Waylan Limberg (https://github.com/waylan),

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

18# License: BSD (see LICENSE.md for details).

20"""

21Tree processors manipulate the tree created by block processors. They can even create an entirely

22new `ElementTree` object. This is an excellent place for creating summaries, adding collected

23references, or last minute adjustments.

25"""

27from __future__ import annotations

29import re

30import xml.etree.ElementTree as etree

31from typing import TYPE_CHECKING, Any

32from . import util

33from . import inlinepatterns

35if TYPE_CHECKING: # pragma: no cover

36 from markdown import Markdown

def build_treeprocessors(md: Markdown, **kwargs: Any) -> util.Registry[Treeprocessor]:
    """
    Create the registry holding the default `treeprocessors` for Markdown.

    Arguments:
        md: The `Markdown` instance passed to each processor's constructor.
        **kwargs: Accepted for call compatibility; not used by this function.

    Returns:
        A registry containing `inline` (priority 20), `prettify` (priority 10)
        and `unescape` (priority 0).

    """
    registry = util.Registry()
    # (processor class, registered name, priority), in registration order.
    defaults = (
        (InlineProcessor, 'inline', 20),
        (PrettifyTreeprocessor, 'prettify', 10),
        (UnescapeTreeprocessor, 'unescape', 0),
    )
    for processor_cls, name, priority in defaults:
        registry.register(processor_cls(md), name, priority)
    return registry

def isString(s: object) -> bool:
    """
    Tell whether `s` is a plain string.

    Returns `True` for any `str` instance that is not an
    [`AtomicString`][markdown.util.AtomicString]; `False` for everything else.
    """
    return isinstance(s, str) and not isinstance(s, util.AtomicString)

class Treeprocessor(util.Processor):
    """
    Base class for processors that operate on the `ElementTree` object
    before it is serialized.

    A concrete `Treeprocessor` provides a `run` method which receives an
    `Element` and adjusts it as required.

    Every tree processor is expected to subclass this class.

    """
    def run(self, root: etree.Element) -> etree.Element | None:
        """
        Process the tree rooted at `root`.

        Subclasses override this method. An implementation may either return
        a new `Element`, in which case it replaces the existing root, or
        change the given tree in place and return `None`.

        This base implementation does nothing and returns `None`.
        """
        return None  # pragma: no cover

class InlineProcessor(Treeprocessor):
    """
    A `Treeprocessor` that traverses a tree, applying inline patterns.

    Text is first run through the inline patterns, which swap every match for
    a placeholder string and stash the produced node. The placeholders are
    then resolved back into `Element` children of the tree.
    """

    def __init__(self, md: Markdown):
        self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
        self.__placeholder_suffix = util.ETX
        self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
            + len(self.__placeholder_suffix)
        self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
        self.md = md
        self.inlinePatterns = md.inlinePatterns
        # Lower-cased tag names of the elements enclosing the text being processed.
        self.ancestors: list[str] = []

    def __makePlaceholder(self, type: str) -> tuple[str, str]:
        """
        Generate a placeholder.

        Arguments:
            type: Not used; kept so existing call sites keep working.

        Returns:
            The placeholder string and the zero-padded id it was built from.
            The id is the current number of stashed nodes.

        """
        node_id = "%04d" % len(self.stashed_nodes)
        placeholder = util.INLINE_PLACEHOLDER % node_id
        return placeholder, node_id

    def __findPlaceholder(self, data: str, index: int) -> tuple[str | None, int]:
        """
        Extract id from data string, start from index.

        Arguments:
            data: String.
            index: Index, from which we start search.

        Returns:
            Placeholder id and string index, after the found placeholder.
            When nothing matches, `None` and `index + 1`.

        """
        m = self.__placeholder_re.search(data, index)
        if m:
            return m.group(1), m.end()
        return None, index + 1

    def __stashNode(self, node: etree.Element | str, type: str) -> str:
        """ Add node to stash and return the placeholder that stands for it. """
        placeholder, node_id = self.__makePlaceholder(type)
        self.stashed_nodes[node_id] = node
        return placeholder

    def __handleInline(self, data: str, patternIndex: int = 0) -> str:
        """
        Process string with inline patterns and replace it with placeholders.

        An `AtomicString` is returned untouched.

        Arguments:
            data: A line of Markdown text.
            patternIndex: The index of the `inlinePattern` to start with.

        Returns:
            String with placeholders.

        """
        if not isinstance(data, util.AtomicString):
            startIndex = 0
            count = len(self.inlinePatterns)
            while patternIndex < count:
                data, matched, startIndex = self.__applyPattern(
                    self.inlinePatterns[patternIndex], data, patternIndex, startIndex
                )
                # Stay on the same pattern for as long as it keeps matching.
                if not matched:
                    patternIndex += 1
        return data

    def __processElementText(self, node: etree.Element, subnode: etree.Element, isText: bool = True) -> None:
        """
        Process placeholders in `Element.text` or `Element.tail`
        of Elements popped from `self.stashed_nodes`.

        Arguments:
            node: Parent node.
            subnode: Processing node.
            isText: Boolean variable, True - it's text, False - it's a tail.

        """
        if isText:
            text = subnode.text
            subnode.text = None
        else:
            text = subnode.tail
            subnode.tail = None

        childResult = self.__processPlaceholders(text, subnode, isText)

        # Elements found in a tail go right after `subnode` within `node`;
        # everything else is inserted at the front.
        if not isText and node is not subnode:
            pos = list(node).index(subnode) + 1
        else:
            pos = 0

        # Inserting in reverse at a fixed position keeps the original order.
        childResult.reverse()
        for newChild in childResult:
            node.insert(pos, newChild[0])

    def __processPlaceholders(
        self,
        data: str | None,
        parent: etree.Element,
        isText: bool = True
    ) -> list[tuple[etree.Element, list[str]]]:
        """
        Process string with placeholders and generate `ElementTree` tree.

        Arguments:
            data: String with placeholders instead of `ElementTree` elements.
            parent: Element, which contains processing inline data.
            isText: Boolean variable, True - it's text, False - it's a tail.

        Returns:
            List with `ElementTree` elements with applied inline patterns.
            Each entry pairs an element with a copy of `self.ancestors`.

        """
        def linkText(text: str | None) -> None:
            # Attach `text` to the tail of the last collected element or,
            # when nothing has been collected yet, to the parent's tail/text.
            # The explicit `if`/`else` pairs assign `text` itself when the
            # target is empty, so an `AtomicString` keeps its type.
            if text:
                if result:
                    if result[-1][0].tail:
                        result[-1][0].tail += text
                    else:
                        result[-1][0].tail = text
                elif not isText:
                    if parent.tail:
                        parent.tail += text
                    else:
                        parent.tail = text
                else:
                    if parent.text:
                        parent.text += text
                    else:
                        parent.text = text

        result = []
        startIndex = 0
        while data:
            index = data.find(self.__placeholder_prefix, startIndex)
            if index != -1:
                node_id, phEndIndex = self.__findPlaceholder(data, index)

                if node_id in self.stashed_nodes:
                    node = self.stashed_nodes[node_id]

                    if index > 0:
                        linkText(data[startIndex:index])

                    if isinstance(node, str):  # it's just a string
                        linkText(node)
                    else:  # it's Element
                        for child in [node] + list(node):
                            if child.tail and child.tail.strip():
                                self.__processElementText(node, child, False)
                            if child.text and child.text.strip():
                                self.__processElementText(child, child)
                        result.append((node, self.ancestors[:]))

                    startIndex = phEndIndex

                else:  # wrong placeholder
                    # Emit the prefix as literal text and resume right after it.
                    end = index + len(self.__placeholder_prefix)
                    linkText(data[startIndex:end])
                    startIndex = end
            else:
                text = data[startIndex:]
                if isinstance(data, util.AtomicString):
                    # We don't want to lose the `AtomicString`
                    text = util.AtomicString(text)
                linkText(text)
                data = ""

        return result

    def __applyPattern(
        self,
        pattern: inlinepatterns.Pattern,
        data: str,
        patternIndex: int,
        startIndex: int = 0
    ) -> tuple[str, bool, int]:
        """
        Check if the line fits the pattern, create the necessary
        elements, add it to `stashed_nodes`.

        Arguments:
            data: The text to be processed.
            pattern: The pattern to be checked.
            patternIndex: Index of current pattern.
            startIndex: String index, from which we start searching.

        Returns:
            A tuple of the (possibly updated) text, whether the pattern
            matched, and the index the next search should start from.
            On a successful substitution the text has a placeholder instead
            of the matched span and the index is `0`.

        """
        new_style = isinstance(pattern, inlinepatterns.InlineProcessor)

        if any(exclude.lower() in self.ancestors for exclude in pattern.ANCESTOR_EXCLUDES):
            return data, False, 0

        if new_style:
            match = None
            # Since `handleMatch` may reject our first match,
            # we iterate over the buffer looking for matches
            # until we can't find any more.
            for candidate in pattern.getCompiledRegExp().finditer(data, startIndex):
                node, start, end = pattern.handleMatch(candidate, data)
                if start is not None and end is not None:
                    match = candidate
                    break
        else:  # pragma: no cover
            match = pattern.getCompiledRegExp().match(data[startIndex:])
            leftData = data[:startIndex]

        if not match:
            return data, False, 0

        if not new_style:  # pragma: no cover
            node = pattern.handleMatch(match)
            start = match.start(0)
            end = match.end(0)

        if node is None:
            # Matched but produced nothing: keep the text, search on from `end`.
            return data, True, end

        if not isinstance(node, str):
            if not isinstance(node.text, util.AtomicString):
                # We need to process current node too
                for child in [node] + list(node):
                    if child.text:
                        self.ancestors.append(child.tag.lower())
                        child.text = self.__handleInline(
                            child.text, patternIndex + 1
                        )
                        self.ancestors.pop()
                    if child.tail:
                        child.tail = self.__handleInline(
                            child.tail, patternIndex
                        )

        placeholder = self.__stashNode(node, pattern.type())

        if new_style:
            return "{}{}{}".format(data[:start],
                                   placeholder, data[end:]), True, 0
        else:  # pragma: no cover
            return "{}{}{}{}".format(leftData,
                                     match.group(1),
                                     placeholder, match.groups()[-1]), True, 0

    def __build_ancestors(self, parent: etree.Element | None, parents: list[str]) -> None:
        """
        Build the ancestor list.

        Walks up `self.parent_map` starting at `parent` and extends `parents`
        in place with the lower-cased tag names, outermost first.
        """
        ancestors = []
        while parent is not None:
            ancestors.append(parent.tag.lower())
            parent = self.parent_map.get(parent)
        ancestors.reverse()
        parents.extend(ancestors)

    def run(self, tree: etree.Element, ancestors: list[str] | None = None) -> etree.Element:
        """Apply inline patterns to a parsed Markdown tree.

        Iterate over `Element`, find elements with inline tag, apply inline
        patterns and append newly created Elements to tree. To avoid further
        processing of string with inline patterns, instead of normal string,
        use subclass [`AtomicString`][markdown.util.AtomicString]:

            node.text = markdown.util.AtomicString("This will not be processed.")

        Arguments:
            tree: `Element` object, representing Markdown tree.
            ancestors: List of parent tag names that precede the tree node (if needed).

        Returns:
            An element tree object with applied inline patterns.

        """
        self.stashed_nodes: dict[str, etree.Element | str] = {}

        # Ensure a valid parent list, but copy passed in lists
        # to ensure we don't have the user accidentally change it on us.
        tree_parents = [] if ancestors is None else ancestors[:]

        self.parent_map = {c: p for p in tree.iter() for c in p}
        stack = [(tree, tree_parents)]

        while stack:
            currElement, parents = stack.pop()

            self.ancestors = parents
            self.__build_ancestors(currElement, self.ancestors)

            insertQueue = []
            # NOTE: `currElement` gains children while it is being iterated
            # (tail results are inserted below); the order of operations here
            # is deliberate and must be preserved.
            for child in currElement:
                if child.text and not isinstance(
                    child.text, util.AtomicString
                ):
                    self.ancestors.append(child.tag.lower())
                    text = child.text
                    child.text = None
                    lst = self.__processPlaceholders(
                        self.__handleInline(text), child
                    )
                    for item in lst:
                        self.parent_map[item[0]] = child
                    stack += lst
                    # Insertion into `child` is deferred until the loop is done.
                    insertQueue.append((child, lst))
                    self.ancestors.pop()
                if child.tail:
                    tail = self.__handleInline(child.tail)
                    # Throwaway element that only collects the leading tail text.
                    dummy = etree.Element('d')
                    child.tail = None
                    tailResult = self.__processPlaceholders(tail, dummy, False)
                    if dummy.tail:
                        child.tail = dummy.tail
                    pos = list(currElement).index(child) + 1
                    tailResult.reverse()
                    for newChild in tailResult:
                        self.parent_map[newChild[0]] = currElement
                        currElement.insert(pos, newChild[0])
                if len(child):
                    self.parent_map[child] = currElement
                    stack.append((child, self.ancestors[:]))

            for element, lst in insertQueue:
                for i, obj in enumerate(lst):
                    newChild = obj[0]
                    element.insert(i, newChild)
        return tree

413

414

class PrettifyTreeprocessor(Treeprocessor):
    """ Insert newlines into the tree so the serialized HTML is line-broken. """

    def _prettifyETree(self, elem: etree.Element) -> None:
        """
        Add line breaks around `elem` and, recursively, its block-level children.

        Block-level elements other than `code` and `pre` get a newline as
        text when their text is blank and their first child is block-level.
        Any element with a blank tail gets a newline tail.
        """
        newline = "\n"
        is_block = self.md.is_block_level

        if is_block(elem.tag) and elem.tag not in ['code', 'pre']:
            text_is_blank = not (elem.text and elem.text.strip())
            if text_is_blank and len(elem) and is_block(elem[0].tag):
                elem.text = newline
            for child in elem:
                if is_block(child.tag):
                    self._prettifyETree(child)

        if not (elem.tail and elem.tail.strip()):
            elem.tail = newline

    def run(self, root: etree.Element) -> None:
        """ Add line breaks to `root` and its descendants, modifying them in place. """
        self._prettifyETree(root)

        # `<br />` elements usually sit inside inline content, which
        # `_prettifyETree` does not descend into, so handle them here.
        for br in root.iter('br'):
            has_tail = br.tail and br.tail.strip()
            br.tail = '\n%s' % br.tail if has_tail else '\n'

        # Collapse trailing whitespace of code blocks to a single newline.
        for pre in root.iter('pre'):
            if not len(pre) or pre[0].tag != 'code':
                continue
            code = pre[0]
            # Only touch code elements that hold text and no child elements.
            if len(code) or code.text is None:
                continue
            code.text = util.AtomicString(code.text.rstrip() + '\n')

452

453

class UnescapeTreeprocessor(Treeprocessor):
    """ Turn escape markers back into the characters they stand for. """

    # A run of digits enclosed by `util.STX` and `util.ETX`.
    RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))

    def _unescape(self, m: re.Match[str]) -> str:
        """ Return the character whose code point is the captured number. """
        code_point = int(m.group(1))
        return chr(code_point)

    def unescape(self, text: str) -> str:
        """ Replace every escape marker in `text` with its character. """
        return self.RE.sub(self._unescape, text)

    def run(self, root: etree.Element) -> None:
        """ Visit every element under `root` and unescape its text, tail and attributes. """
        restore = self.unescape
        for elem in root.iter():
            # The text of `code` elements is left as it is.
            if elem.tag != 'code' and elem.text:
                elem.text = restore(elem.text)
            if elem.tail:
                elem.tail = restore(elem.tail)
            for key, value in elem.items():
                elem.set(key, restore(value))