Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/core.py: 68%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

152 statements  

1# Python Markdown 

2 

3# A Python implementation of John Gruber's Markdown. 

4 

5# Documentation: https://python-markdown.github.io/ 

6# GitHub: https://github.com/Python-Markdown/markdown/ 

7# PyPI: https://pypi.org/project/Markdown/ 

8 

9# Started by Manfred Stienstra (http://www.dwerg.net/). 

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). 

11# Currently maintained by Waylan Limberg (https://github.com/waylan), 

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). 

13 

14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later) 

15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 

16# Copyright 2004 Manfred Stienstra (the original version) 

17 

18# License: BSD (see LICENSE.md for details). 

19 

20from __future__ import annotations 

21 

22import codecs 

23import sys 

24import logging 

25import importlib 

26from typing import TYPE_CHECKING, Any, BinaryIO, Callable, ClassVar, Mapping, Sequence 

27from . import util 

28from .preprocessors import build_preprocessors 

29from .blockprocessors import build_block_parser 

30from .treeprocessors import build_treeprocessors 

31from .inlinepatterns import build_inlinepatterns 

32from .postprocessors import build_postprocessors 

33from .extensions import Extension 

34from .serializers import to_html_string, to_xhtml_string 

35from .util import BLOCK_LEVEL_ELEMENTS 

36 

37if TYPE_CHECKING: # pragma: no cover 

38 from xml.etree.ElementTree import Element 

39 

# Names exported by a star-import of this module: the parser class and the two convenience functions.
__all__ = ['Markdown', 'markdown', 'markdownFromFile']


# Module-level logger; used below for debug messages about extension loading.
logger = logging.getLogger('MARKDOWN')

44 

45 

class Markdown:
    """
    A parser which converts Markdown to HTML.

    Attributes:
        Markdown.tab_length (int): The number of spaces which correspond to a single tab. Default: `4`.
        Markdown.ESCAPED_CHARS (list[str]): List of characters which get the backslash escape treatment.
        Markdown.block_level_elements (list[str]): List of HTML tags which get treated as block-level elements.
            See [`markdown.util.BLOCK_LEVEL_ELEMENTS`][] for the full list of elements.
        Markdown.registeredExtensions (list[Extension]): List of extensions which have called
            [`registerExtension`][markdown.Markdown.registerExtension] during setup.
        Markdown.doc_tag (str): Element used to wrap document. Default: `div`.
        Markdown.stripTopLevelTags (bool): Indicates whether the `doc_tag` should be removed. Default: `True`.
        Markdown.references (dict[str, tuple[str, str]]): A mapping of link references found in a parsed document
             where the key is the reference name and the value is a tuple of the URL and title.
        Markdown.htmlStash (util.HtmlStash): The instance of the `HtmlStash` used by an instance of this class.
        Markdown.output_formats (dict[str, Callable[xml.etree.ElementTree.Element]]): A mapping of known output
             formats by name and their respective serializers. Each serializer must be a callable which accepts an
            [`Element`][xml.etree.ElementTree.Element] and returns a `str`.
        Markdown.output_format (str): The output format set by
            [`set_output_format`][markdown.Markdown.set_output_format].
        Markdown.serializer (Callable[xml.etree.ElementTree.Element]): The serializer set by
            [`set_output_format`][markdown.Markdown.set_output_format].
        Markdown.preprocessors (util.Registry): A collection of [`preprocessors`][markdown.preprocessors].
        Markdown.parser (blockparser.BlockParser): A collection of [`blockprocessors`][markdown.blockprocessors].
        Markdown.inlinePatterns (util.Registry): A collection of [`inlinepatterns`][markdown.inlinepatterns].
        Markdown.treeprocessors (util.Registry): A collection of [`treeprocessors`][markdown.treeprocessors].
        Markdown.postprocessors (util.Registry): A collection of [`postprocessors`][markdown.postprocessors].

    """

    doc_tag = "div"     # Element used to wrap document - later removed

    output_formats: ClassVar[dict[str, Callable[[Element], str]]] = {
        'html':   to_html_string,
        'xhtml':  to_xhtml_string,
    }
    """
    A mapping of known output formats by name and their respective serializers. Each serializer must be a
    callable which accepts an [`Element`][xml.etree.ElementTree.Element] and returns a `str`.
    """

    def __init__(self, **kwargs: Any) -> None:
        """
        Creates a new Markdown instance.

        Keyword Arguments:
            extensions (list[Extension | str]): A list of extensions.

                If an item is an instance of a subclass of [`markdown.extensions.Extension`][],
                the instance will be used as-is. If an item is of type `str`, it is passed
                to [`build_extension`][markdown.Markdown.build_extension] with its corresponding
                `extension_configs` and the returned instance of [`markdown.extensions.Extension`][]
                is used.
            extension_configs (dict[str, dict[str, Any]]): Configuration settings for extensions.
            output_format (str): Format of output. Supported formats are:

                * `xhtml`: Outputs XHTML style tags. Default.
                * `html`: Outputs HTML style tags.
            tab_length (int): Length of tabs in the source. Default: `4`

        Unrecognized keyword arguments are ignored.

        """

        self.tab_length: int = kwargs.get('tab_length', 4)

        self.ESCAPED_CHARS: list[str] = [
            '\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '>', '#', '+', '-', '.', '!'
        ]
        """ List of characters which get the backslash escape treatment. """

        # Copy so that per-instance changes never leak into the shared module-level list.
        self.block_level_elements: list[str] = BLOCK_LEVEL_ELEMENTS.copy()

        self.registeredExtensions: list[Extension] = []
        self.docType = ""  # TODO: Maybe delete this. It does not appear to be used anymore.
        self.stripTopLevelTags: bool = True

        # The processor collections are built before any extension is loaded, so extensions
        # find them already in place when `extendMarkdown` is called on them.
        self.build_parser()

        self.references: dict[str, tuple[str, str]] = {}
        self.htmlStash: util.HtmlStash = util.HtmlStash()
        self.registerExtensions(extensions=kwargs.get('extensions', []),
                                configs=kwargs.get('extension_configs', {}))
        self.set_output_format(kwargs.get('output_format', 'xhtml'))
        self.reset()

    def build_parser(self) -> Markdown:
        """
        Build the parser from the various parts.

        Assigns a value to each of the following attributes on the class instance:

        * **`Markdown.preprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`preprocessors`][markdown.preprocessors].
        * **`Markdown.parser`** ([`BlockParser`][markdown.blockparser.BlockParser]) -- A collection of
          [`blockprocessors`][markdown.blockprocessors].
        * **`Markdown.inlinePatterns`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`inlinepatterns`][markdown.inlinepatterns].
        * **`Markdown.treeprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`treeprocessors`][markdown.treeprocessors].
        * **`Markdown.postprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`postprocessors`][markdown.postprocessors].

        This method could be redefined in a subclass to build a custom parser which is made up of a different
        combination of processors and patterns.

        Returns:
            The `Markdown` instance, to allow call chaining.

        """
        self.preprocessors = build_preprocessors(self)
        self.parser = build_block_parser(self)
        self.inlinePatterns = build_inlinepatterns(self)
        self.treeprocessors = build_treeprocessors(self)
        self.postprocessors = build_postprocessors(self)
        return self

    def registerExtensions(
        self,
        extensions: Sequence[Extension | str],
        configs: Mapping[str, dict[str, Any]]
    ) -> Markdown:
        """
        Load a list of extensions into an instance of the `Markdown` class.

        Arguments:
            extensions (list[Extension | str]): A list of extensions.

                If an item is an instance of a subclass of [`markdown.extensions.Extension`][],
                the instance will be used as-is. If an item is of type `str`, it is passed
                to [`build_extension`][markdown.Markdown.build_extension] with its corresponding `configs` and the
                returned instance of [`markdown.extensions.Extension`][] is used.
            configs (dict[str, dict[str, Any]]): Configuration settings for extensions.

        Returns:
            The `Markdown` instance, to allow call chaining.

        Raises:
            TypeError: If an item is neither a `str`, an `Extension` instance, nor `None`.

        """
        for ext in extensions:
            if isinstance(ext, str):
                ext = self.build_extension(ext, configs.get(ext, {}))
            if isinstance(ext, Extension):
                ext.extendMarkdown(self)
                # Lazy logger arguments: the message is only rendered when debug logging is enabled.
                logger.debug(
                    'Successfully loaded extension "%s.%s".',
                    ext.__class__.__module__, ext.__class__.__name__
                )
            elif ext is not None:
                # `None` entries are skipped silently; anything else is a caller error.
                raise TypeError(
                    'Extension "{}.{}" must be of type: "{}.{}"'.format(
                        ext.__class__.__module__, ext.__class__.__name__,
                        Extension.__module__, Extension.__name__
                    )
                )
        return self

    def build_extension(self, ext_name: str, configs: Mapping[str, Any]) -> Extension:
        """
        Build extension from a string name, then return an instance using the given `configs`.

        Arguments:
            ext_name: Name of extension as a string.
            configs: Configuration settings for extension.

        Returns:
            An instance of the extension with the given configuration settings.

        Raises:
            ImportError: If the named module cannot be imported.
            AttributeError: If no class name is given and calling the module's `makeExtension` raises it
                (for example because the function is missing).

        First attempt to load an entry point. The string name must be registered as an entry point in the
        `markdown.extensions` group which points to a subclass of the [`markdown.extensions.Extension`][] class.
        If multiple distributions have registered the same name, the first one found is returned.

        If no entry point is found, assume dot notation (`path.to.module:ClassName`). Load the specified class and
        return an instance. If no class is specified, import the module and call a `makeExtension` function and return
        the [`markdown.extensions.Extension`][] instance returned by that function.
        """
        configs = dict(configs)

        entry_points = [ep for ep in util.get_installed_extensions() if ep.name == ext_name]
        if entry_points:
            ext = entry_points[0].load()
            return ext(**configs)

        # Get class name (if provided): `path.to.module:ClassName`
        ext_name, _, class_name = ext_name.partition(':')

        try:
            module = importlib.import_module(ext_name)
        except ImportError as e:
            # Replace only the message so the original exception type and traceback survive.
            message = 'Failed loading extension "%s".' % ext_name
            e.args = (message,) + e.args[1:]
            raise
        else:
            logger.debug('Successfully imported extension module "%s".', ext_name)

        if class_name:
            # Load given class name from module.
            return getattr(module, class_name)(**configs)

        # Expect `makeExtension()` function to return a class.
        try:
            return module.makeExtension(**configs)
        except AttributeError as e:
            # An `AttributeError` raised without arguments must not turn into an `IndexError` here.
            detail = e.args[0] if e.args else ''
            message = "Failed to initiate extension '%s': %s" % (ext_name, detail)
            e.args = (message,) + e.args[1:]
            raise

    def registerExtension(self, extension: Extension) -> Markdown:
        """
        Register an extension as having a resettable state.

        Arguments:
            extension: An instance of the extension to register.

        Returns:
            The `Markdown` instance, to allow call chaining.

        This should get called once by an extension during setup. A "registered" extension's
        `reset` method is called by [`Markdown.reset()`][markdown.Markdown.reset]. Not all extensions have or need a
        resettable state, and so it should not be assumed that all extensions are "registered."

        """
        self.registeredExtensions.append(extension)
        return self

    def reset(self) -> Markdown:
        """
        Resets all state variables to prepare the parser instance for new input.

        Called once upon creation of a class instance. Should be called manually between calls
        to [`Markdown.convert`][markdown.Markdown.convert].

        Returns:
            The `Markdown` instance, to allow call chaining.
        """
        self.htmlStash.reset()
        self.references.clear()

        for extension in self.registeredExtensions:
            # Registered extensions are not required to define `reset`.
            if hasattr(extension, 'reset'):
                extension.reset()

        return self

    def set_output_format(self, format: str) -> Markdown:
        """
        Set the output format for the class instance.

        Arguments:
            format: Must be a known value in `Markdown.output_formats`. The name is lower-cased and any
                trailing `1`, `4` or `5` characters are dropped before the lookup (so `HTML5` selects `html`).

        Returns:
            The `Markdown` instance, to allow call chaining.

        Raises:
            KeyError: If the normalized name is not a key of `Markdown.output_formats`.

        """
        self.output_format = format.lower().rstrip('145')  # ignore number
        try:
            self.serializer = self.output_formats[self.output_format]
        except KeyError as e:
            valid_formats = sorted(self.output_formats)
            message = 'Invalid Output Format: "%s". Use one of %s.' % (
                self.output_format,
                '"' + '", "'.join(valid_formats) + '"'
            )
            e.args = (message,) + e.args[1:]
            raise
        return self

    # Note: the `tag` argument is type annotated `Any` as ElementTree uses many various objects as tags.
    # As there is no standardization in ElementTree, the type of a given tag is unpredictable.
    def is_block_level(self, tag: Any) -> bool:
        """
        Check if the given `tag` is a block level HTML tag.

        Returns `True` for any string listed in `Markdown.block_level_elements`. A `tag` which is
        not a string always returns `False`.

        """
        if isinstance(tag, str):
            # Compare case-insensitively and ignore any trailing slashes on the tag name.
            return tag.lower().rstrip('/') in self.block_level_elements
        # Some ElementTree tags are not strings, so return False.
        return False

    def convert(self, source: str) -> str:
        """
        Convert a Markdown string to a string in the specified output format.

        Arguments:
            source: Markdown formatted text as Unicode or ASCII string.

        Returns:
            A string in the specified output format.

        Raises:
            ValueError: If the wrapping `doc_tag` cannot be located in the serialized output.

        Markdown parsing takes place in five steps:

        1. A bunch of [`preprocessors`][markdown.preprocessors] munge the input text.
        2. A [`BlockParser`][markdown.blockparser.BlockParser] parses the high-level structural elements of the
           pre-processed text into an [`ElementTree`][xml.etree.ElementTree.ElementTree] object.
        3. A bunch of [`treeprocessors`][markdown.treeprocessors] are run against the
           [`ElementTree`][xml.etree.ElementTree.ElementTree] object. One such `treeprocessor`
           ([`markdown.treeprocessors.InlineProcessor`][]) runs [`inlinepatterns`][markdown.inlinepatterns]
           against the [`ElementTree`][xml.etree.ElementTree.ElementTree] object, parsing inline markup.
        4. Some [`postprocessors`][markdown.postprocessors] are run against the text after the
           [`ElementTree`][xml.etree.ElementTree.ElementTree] object has been serialized into text.
        5. The output is returned as a string.

        """

        # Fix up the source text
        if not source.strip():
            return ''  # a blank Unicode string

        try:
            source = str(source)
        except UnicodeDecodeError as e:  # pragma: no cover
            # Customize error message while maintaining original traceback
            e.reason += '. -- Note: Markdown only accepts Unicode input!'
            raise

        # Split into lines and run the line preprocessors.
        self.lines = source.split("\n")
        for prep in self.preprocessors:
            self.lines = prep.run(self.lines)

        # Parse the high-level elements.
        root = self.parser.parseDocument(self.lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.treeprocessors:
            # A tree-processor may return a replacement root or `None` to keep the current one.
            newRoot = treeprocessor.run(root)
            if newRoot is not None:
                root = newRoot

        # Serialize _properly_.  Strip top-level tags.
        output = self.serializer(root)
        if self.stripTopLevelTags:
            try:
                # `+ 2` accounts for the `<` and `>` surrounding the tag name.
                start = output.index(
                    '<%s>' % self.doc_tag) + len(self.doc_tag) + 2
                end = output.rindex('</%s>' % self.doc_tag)
                output = output[start:end].strip()
            except ValueError as e:  # pragma: no cover
                if output.strip().endswith('<%s />' % self.doc_tag):
                    # We have an empty document
                    output = ''
                else:
                    # We have a serious problem
                    raise ValueError('Markdown failed to strip top-level '
                                     'tags. Document=%r' % output.strip()) from e

        # Run the text post-processors
        for pp in self.postprocessors:
            output = pp.run(output)

        return output.strip()

    def convertFile(
        self,
        input: str | BinaryIO | None = None,
        output: str | BinaryIO | None = None,
        encoding: str | None = None,
    ) -> Markdown:
        """
        Converts a Markdown file and writes the resulting HTML to a file or stream.

        Decodes the file using the provided encoding (defaults to `utf-8`),
        passes the file content to markdown, and outputs the HTML to either
        the provided stream or the file with provided name, using the same
        encoding as the source file. The
        [`xmlcharrefreplace`](https://docs.python.org/3/library/codecs.html#error-handlers)
        error handler is used when encoding the output.

        **Note:** This is the only place that decoding and encoding of Unicode
        takes place in Python-Markdown.  (All other code is Unicode-in /
        Unicode-out.)

        Arguments:
            input: File object or path. Reads from `stdin` if `None`.
            output: File object or path. Writes to `stdout` if `None`.
            encoding: Encoding of input and output files. Defaults to `utf-8`.

        Returns:
            The `Markdown` instance, to allow call chaining. The HTML itself is not returned.

        """

        encoding = encoding or "utf-8"

        # Read the source
        if input:
            if isinstance(input, str):
                input_file = open(input, mode="r", encoding=encoding)
            else:
                input_file = codecs.getreader(encoding)(input)
            # Close the reader even when reading or decoding fails. For a stream argument this
            # closes the caller's stream as well, exactly as the explicit `close()` call did.
            with input_file:
                text = input_file.read()
        else:
            text = sys.stdin.read()

        text = text.lstrip('\ufeff')  # remove the byte-order mark

        # Convert
        html = self.convert(text)

        # Write to file or stdout
        if output:
            if isinstance(output, str):
                # The context manager guarantees the file is closed even if the write fails.
                with codecs.open(output, "w",
                                 encoding=encoding,
                                 errors="xmlcharrefreplace") as output_file:
                    output_file.write(html)
            else:
                writer = codecs.getwriter(encoding)
                output_file = writer(output, errors="xmlcharrefreplace")
                output_file.write(html)
                # Don't close here. User may want to write more.
        else:
            # Encode manually and write bytes to stdout.
            encoded = html.encode(encoding, "xmlcharrefreplace")
            sys.stdout.buffer.write(encoded)

        return self

452 

453 

454""" 

455EXPORTED FUNCTIONS 

456============================================================================= 

457 

458Those are the two functions we really mean to export: `markdown()` and 

459`markdownFromFile()`. 

460""" 

461 

462 

def markdown(text: str, **kwargs: Any) -> str:
    """
    Render a Markdown string and hand back the result as a Unicode string.

    Convenience wrapper for the simplest use case: a throw-away
    [`Markdown`][markdown.Markdown] instance is created from `kwargs` and its
    [`convert`][markdown.Markdown.convert] method is run once on `text`.

    Arguments:
        text: Markdown formatted text as Unicode or ASCII string.

    Keyword arguments:
        **kwargs: Any arguments accepted by the Markdown class.

    Returns:
        A string in the specified output format.

    """
    return Markdown(**kwargs).convert(text)

483 

484 

def markdownFromFile(**kwargs: Any) -> None:
    """
    Convert Markdown read from a file and write the result to a file or a stream.

    Convenience wrapper which creates a [`Markdown`][markdown.Markdown] instance from
    `kwargs` and then calls [`convertFile`][markdown.Markdown.convertFile] on it instead of
    [`convert`][markdown.Markdown.convert].

    Keyword arguments:
        input (str | BinaryIO): A file name or readable object.
        output (str | BinaryIO): A file name or writable object.
        encoding (str): Encoding of input and output.
        **kwargs: Any arguments accepted by the `Markdown` class.

    """
    # The complete keyword set (including `input`, `output` and `encoding`) goes to the constructor.
    converter = Markdown(**kwargs)

    source = kwargs.get('input')
    target = kwargs.get('output')
    charset = kwargs.get('encoding')
    converter.convertFile(source, target, charset)