Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/core.py: 68%

1# Python Markdown

3# A Python implementation of John Gruber's Markdown.

5# Documentation: https://python-markdown.github.io/

6# GitHub: https://github.com/Python-Markdown/markdown/

7# PyPI: https://pypi.org/project/Markdown/

9# Started by Manfred Stienstra (http://www.dwerg.net/).

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).

11# Currently maintained by Waylan Limberg (https://github.com/waylan),

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

18# License: BSD (see LICENSE.md for details).

20from __future__ import annotations

22import codecs

23import sys

24import logging

25import importlib

26from typing import TYPE_CHECKING, Any, BinaryIO, Callable, ClassVar, Mapping, Sequence

27from . import util

28from .preprocessors import build_preprocessors

29from .blockprocessors import build_block_parser

30from .treeprocessors import build_treeprocessors

31from .inlinepatterns import build_inlinepatterns

32from .postprocessors import build_postprocessors

33from .extensions import Extension

34from .serializers import to_html_string, to_xhtml_string

35from .util import BLOCK_LEVEL_ELEMENTS

37if TYPE_CHECKING: # pragma: no cover

38 from xml.etree.ElementTree import Element

# Names exported by a star-import of this module.
__all__ = ['Markdown', 'markdown', 'markdownFromFile']


# Module-level logger, registered under the fixed name 'MARKDOWN'.
logger = logging.getLogger('MARKDOWN')

class Markdown:
    """
    A parser which converts Markdown to HTML.

    Attributes:
        Markdown.tab_length (int): The number of spaces which correspond to a single tab. Default: `4`.
        Markdown.ESCAPED_CHARS (list[str]): List of characters which get the backslash escape treatment.
        Markdown.block_level_elements (list[str]): List of HTML tags which get treated as block-level elements.
            See [`markdown.util.BLOCK_LEVEL_ELEMENTS`][] for the full list of elements.
        Markdown.registeredExtensions (list[Extension]): List of extensions which have called
            [`registerExtension`][markdown.Markdown.registerExtension] during setup.
        Markdown.doc_tag (str): Element used to wrap document. Default: `div`.
        Markdown.stripTopLevelTags (bool): Indicates whether the `doc_tag` should be removed. Default: `True`.
        Markdown.references (dict[str, tuple[str, str]]): A mapping of link references found in a parsed document
            where the key is the reference name and the value is a tuple of the URL and title.
        Markdown.htmlStash (util.HtmlStash): The instance of the `HtmlStash` used by an instance of this class.
        Markdown.output_formats (dict[str, Callable[xml.etree.ElementTree.Element]]): A mapping of known output
            formats by name and their respective serializers. Each serializer must be a callable which accepts an
            [`Element`][xml.etree.ElementTree.Element] and returns a `str`.
        Markdown.output_format (str): The output format set by
            [`set_output_format`][markdown.Markdown.set_output_format].
        Markdown.serializer (Callable[xml.etree.ElementTree.Element]): The serializer set by
            [`set_output_format`][markdown.Markdown.set_output_format].
        Markdown.preprocessors (util.Registry): A collection of [`preprocessors`][markdown.preprocessors].
        Markdown.parser (blockparser.BlockParser): A collection of [`blockprocessors`][markdown.blockprocessors].
        Markdown.inlinePatterns (util.Registry): A collection of [`inlinepatterns`][markdown.inlinepatterns].
        Markdown.treeprocessors (util.Registry): A collection of [`treeprocessors`][markdown.treeprocessors].
        Markdown.postprocessors (util.Registry): A collection of [`postprocessors`][markdown.postprocessors].

    """

    doc_tag = "div"     # Element used to wrap document - later removed

    output_formats: ClassVar[dict[str, Callable[[Element], str]]] = {
        'html':   to_html_string,
        'xhtml':  to_xhtml_string,
    }
    """
    A mapping of known output formats by name and their respective serializers. Each serializer must be a
    callable which accepts an [`Element`][xml.etree.ElementTree.Element] and returns a `str`.
    """

    def __init__(self, **kwargs):
        """
        Creates a new Markdown instance.

        Keyword Arguments:
            extensions (list[Extension | str]): A list of extensions.

                If an item is an instance of a subclass of [`markdown.extensions.Extension`][],
                the instance will be used as-is. If an item is of type `str`, it is passed
                to [`build_extension`][markdown.Markdown.build_extension] with its corresponding
                `extension_configs` and the returned instance of [`markdown.extensions.Extension`][]
                is used.
            extension_configs (dict[str, dict[str, Any]]): Configuration settings for extensions.
            output_format (str): Format of output. Supported formats are:

                * `xhtml`: Outputs XHTML style tags. Default.
                * `html`: Outputs HTML style tags.
            tab_length (int): Length of tabs in the source. Default: `4`

        """

        self.tab_length: int = kwargs.get('tab_length', 4)

        self.ESCAPED_CHARS: list[str] = [
            '\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '>', '#', '+', '-', '.', '!'
        ]
        """ List of characters which get the backslash escape treatment. """

        # Copy so that per-instance changes never leak into the shared module-level list.
        self.block_level_elements: list[str] = BLOCK_LEVEL_ELEMENTS.copy()

        self.registeredExtensions: list[Extension] = []
        self.docType = ""  # TODO: Maybe delete this. It does not appear to be used anymore.
        self.stripTopLevelTags: bool = True

        # The processor registries must exist before any extension gets a chance to modify them.
        self.build_parser()

        self.references: dict[str, tuple[str, str]] = {}
        self.htmlStash: util.HtmlStash = util.HtmlStash()
        self.registerExtensions(extensions=kwargs.get('extensions', []),
                                configs=kwargs.get('extension_configs', {}))
        self.set_output_format(kwargs.get('output_format', 'xhtml'))
        self.reset()

    def build_parser(self) -> Markdown:
        """
        Build the parser from the various parts.

        Assigns a value to each of the following attributes on the class instance:

        * **`Markdown.preprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`preprocessors`][markdown.preprocessors].
        * **`Markdown.parser`** ([`BlockParser`][markdown.blockparser.BlockParser]) -- A collection of
          [`blockprocessors`][markdown.blockprocessors].
        * **`Markdown.inlinePatterns`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`inlinepatterns`][markdown.inlinepatterns].
        * **`Markdown.treeprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`treeprocessors`][markdown.treeprocessors].
        * **`Markdown.postprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`postprocessors`][markdown.postprocessors].

        This method could be redefined in a subclass to build a custom parser which is made up of a different
        combination of processors and patterns.

        Returns:
            The instance itself, so that calls can be chained.

        """
        self.preprocessors = build_preprocessors(self)
        self.parser = build_block_parser(self)
        self.inlinePatterns = build_inlinepatterns(self)
        self.treeprocessors = build_treeprocessors(self)
        self.postprocessors = build_postprocessors(self)
        return self

    def registerExtensions(
        self,
        extensions: Sequence[Extension | str],
        configs: Mapping[str, dict[str, Any]]
    ) -> Markdown:
        """
        Load a list of extensions into an instance of the `Markdown` class.

        Arguments:
            extensions (list[Extension | str]): A list of extensions.

                If an item is an instance of a subclass of [`markdown.extensions.Extension`][],
                the instance will be used as-is. If an item is of type `str`, it is passed
                to [`build_extension`][markdown.Markdown.build_extension] with its corresponding `configs` and the
                returned instance of [`markdown.extensions.Extension`][] is used.
            configs (dict[str, dict[str, Any]]): Configuration settings for extensions.

        Returns:
            The instance itself, so that calls can be chained.

        Raises:
            TypeError: If an item is neither `None` nor (after building) an `Extension` instance.

        """
        for ext in extensions:
            if isinstance(ext, str):
                ext = self.build_extension(ext, configs.get(ext, {}))
            if isinstance(ext, Extension):
                ext.extendMarkdown(self)
                logger.debug(
                    'Successfully loaded extension "%s.%s".',
                    ext.__class__.__module__, ext.__class__.__name__
                )
            elif ext is not None:
                # `None` entries are silently skipped; anything else is a caller error.
                raise TypeError(
                    'Extension "{}.{}" must be of type: "{}.{}"'.format(
                        ext.__class__.__module__, ext.__class__.__name__,
                        Extension.__module__, Extension.__name__
                    )
                )
        return self

    def build_extension(self, ext_name: str, configs: Mapping[str, Any]) -> Extension:
        """
        Build extension from a string name, then return an instance using the given `configs`.

        Arguments:
            ext_name: Name of extension as a string.
            configs: Configuration settings for extension.

        Returns:
            An instance of the extension with the given configuration settings.

        Raises:
            ImportError: If the module named by `ext_name` cannot be imported.
            AttributeError: If the module has no `makeExtension` (when no class name was given) or lacks
                the requested class.

        First attempt to load an entry point. The string name must be registered as an entry point in the
        `markdown.extensions` group which points to a subclass of the [`markdown.extensions.Extension`][] class.
        If multiple distributions have registered the same name, the first one found is returned.

        If no entry point is found, assume dot notation (`path.to.module:ClassName`). Load the specified class and
        return an instance. If no class is specified, import the module and call a `makeExtension` function and return
        the [`markdown.extensions.Extension`][] instance returned by that function.
        """
        configs = dict(configs)

        # The first entry point registered under this name wins.
        for entry_point in util.get_installed_extensions():
            if entry_point.name == ext_name:
                return entry_point.load()(**configs)

        # Get class name (if provided): `path.to.module:ClassName`
        ext_name, class_name = ext_name.split(':', 1) if ':' in ext_name else (ext_name, '')

        try:
            module = importlib.import_module(ext_name)
            logger.debug('Successfully imported extension module "%s".', ext_name)
        except ImportError as e:
            # Replace the message but keep the exception type and traceback.
            message = 'Failed loading extension "%s".' % ext_name
            e.args = (message,) + e.args[1:]
            raise

        if class_name:
            # Load given class name from module.
            return getattr(module, class_name)(**configs)

        # Expect `makeExtension()` function to return a class.
        try:
            return module.makeExtension(**configs)
        except AttributeError as e:
            message = "Failed to initiate extension '%s': %s" % (ext_name, e.args[0])
            e.args = (message,) + e.args[1:]
            raise

    def registerExtension(self, extension: Extension) -> Markdown:
        """
        Register an extension as having a resettable state.

        Arguments:
            extension: An instance of the extension to register.

        Returns:
            The instance itself, so that calls can be chained.

        This should get called once by an extension during setup. A "registered" extension's
        `reset` method is called by [`Markdown.reset()`][markdown.Markdown.reset]. Not all extensions have or need a
        resettable state, and so it should not be assumed that all extensions are "registered."

        """
        self.registeredExtensions.append(extension)
        return self

    def reset(self) -> Markdown:
        """
        Resets all state variables to prepare the parser instance for new input.

        Called once upon creation of a class instance. Should be called manually between calls
        to [`Markdown.convert`][markdown.Markdown.convert].

        Returns:
            The instance itself, so that calls can be chained.
        """
        self.htmlStash.reset()
        self.references.clear()

        for extension in self.registeredExtensions:
            # Registered extensions are not required to implement `reset`.
            if hasattr(extension, 'reset'):
                extension.reset()

        return self

    def set_output_format(self, format: str) -> Markdown:
        """
        Set the output format for the class instance.

        Arguments:
            format: Must be a known value in `Markdown.output_formats`.

        Returns:
            The instance itself, so that calls can be chained.

        Raises:
            KeyError: If the normalized `format` is not a key of `Markdown.output_formats`.

        """
        # Lowercase and drop any trailing '1', '4' or '5' so e.g. `html5` and `xhtml1` still resolve.
        self.output_format = format.lower().rstrip('145')  # ignore number
        try:
            self.serializer = self.output_formats[self.output_format]
        except KeyError as e:
            valid_formats = sorted(self.output_formats)
            message = 'Invalid Output Format: "%s". Use one of %s.' \
                % (self.output_format,
                   '"' + '", "'.join(valid_formats) + '"')
            e.args = (message,) + e.args[1:]
            raise
        return self

    # Note: the `tag` argument is type annotated `Any` as ElementTree uses many various objects as tags.
    # As there is no standardization in ElementTree, the type of a given tag is unpredictable.
    def is_block_level(self, tag: Any) -> bool:
        """
        Check if the given `tag` is a block level HTML tag.

        Returns `True` for any string listed in `Markdown.block_level_elements`. A `tag` which is
        not a string always returns `False`.

        """
        if isinstance(tag, str):
            # Compare case-insensitively and ignore the trailing slash of a self-closing tag.
            return tag.lower().rstrip('/') in self.block_level_elements
        # Some ElementTree tags are not strings, so return False.
        return False

    def convert(self, source: str) -> str:
        """
        Convert a Markdown string to a string in the specified output format.

        Arguments:
            source: Markdown formatted text as Unicode or ASCII string.

        Returns:
            A string in the specified output format.

        Raises:
            ValueError: If the wrapping `doc_tag` cannot be located in the serialized output.

        Markdown parsing takes place in five steps:

        1. A bunch of [`preprocessors`][markdown.preprocessors] munge the input text.
        2. A [`BlockParser`][markdown.blockparser.BlockParser] parses the high-level structural elements of the
           pre-processed text into an [`ElementTree`][xml.etree.ElementTree.ElementTree] object.
        3. A bunch of [`treeprocessors`][markdown.treeprocessors] are run against the
           [`ElementTree`][xml.etree.ElementTree.ElementTree] object. One such `treeprocessor`
           ([`markdown.treeprocessors.InlineProcessor`][]) runs [`inlinepatterns`][markdown.inlinepatterns]
           against the [`ElementTree`][xml.etree.ElementTree.ElementTree] object, parsing inline markup.
        4. Some [`postprocessors`][markdown.postprocessors] are run against the text after the
           [`ElementTree`][xml.etree.ElementTree.ElementTree] object has been serialized into text.
        5. The output is returned as a string.

        """

        # Fix up the source text
        if not source.strip():
            return ''  # a blank Unicode string

        try:
            source = str(source)
        except UnicodeDecodeError as e:  # pragma: no cover
            # Customize error message while maintaining original traceback
            e.reason += '. -- Note: Markdown only accepts Unicode input!'
            raise

        # Split into lines and run the line preprocessors.
        self.lines = source.split("\n")
        for prep in self.preprocessors:
            self.lines = prep.run(self.lines)

        # Parse the high-level elements.
        root = self.parser.parseDocument(self.lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.treeprocessors:
            # A tree-processor may return a replacement root, or `None` to keep the current one.
            newRoot = treeprocessor.run(root)
            if newRoot is not None:
                root = newRoot

        # Serialize _properly_.  Strip top-level tags.
        output = self.serializer(root)
        if self.stripTopLevelTags:
            try:
                # Keep what lies between the first opening and the last closing `doc_tag`.
                start = output.index(
                    '<%s>' % self.doc_tag) + len(self.doc_tag) + 2
                end = output.rindex('</%s>' % self.doc_tag)
                output = output[start:end].strip()
            except ValueError as e:  # pragma: no cover
                if output.strip().endswith('<%s />' % self.doc_tag):
                    # We have an empty document
                    output = ''
                else:
                    # We have a serious problem
                    raise ValueError('Markdown failed to strip top-level '
                                     'tags. Document=%r' % output.strip()) from e

        # Run the text post-processors
        for pp in self.postprocessors:
            output = pp.run(output)

        return output.strip()

    def convertFile(
        self,
        input: str | BinaryIO | None = None,
        output: str | BinaryIO | None = None,
        encoding: str | None = None,
    ) -> Markdown:
        """
        Converts a Markdown file and writes the resulting HTML to a file or stream.

        Decodes the file using the provided encoding (defaults to `utf-8`),
        passes the file content to markdown, and outputs the HTML to either
        the provided stream or the file with provided name, using the same
        encoding as the source file. The
        [`xmlcharrefreplace`](https://docs.python.org/3/library/codecs.html#error-handlers)
        error handler is used when encoding the output.

        **Note:** This is the only place that decoding and encoding of Unicode
        takes place in Python-Markdown.  (All other code is Unicode-in /
        Unicode-out.)

        Arguments:
            input: File object or path. Reads from `stdin` if `None`.
            output: File object or path. Writes to `stdout` if `None`.
            encoding: Encoding of input and output files. Defaults to `utf-8`.

        Returns:
            The instance itself, so that calls can be chained. The HTML is written to `output`,
            not returned.

        """

        encoding = encoding or "utf-8"

        # Read the source
        if input:
            if isinstance(input, str):
                input_file = open(input, mode="r", encoding=encoding)
            else:
                input_file = codecs.getreader(encoding)(input)
            # Close the source even if reading or decoding fails.
            with input_file:
                text = input_file.read()
        else:
            text = sys.stdin.read()

        text = text.lstrip('\ufeff')  # remove the byte-order mark

        # Convert
        html = self.convert(text)

        # Write to file or stdout
        if output:
            if isinstance(output, str):
                # The file is opened here, so it is also closed here - even if writing fails.
                with codecs.open(output, "w",
                                 encoding=encoding,
                                 errors="xmlcharrefreplace") as output_file:
                    output_file.write(html)
            else:
                writer = codecs.getwriter(encoding)
                output_file = writer(output, errors="xmlcharrefreplace")
                output_file.write(html)
                # Don't close here. User may want to write more.
        else:
            # Encode manually and write bytes to stdout.
            sys.stdout.buffer.write(html.encode(encoding, "xmlcharrefreplace"))

        return self

452

453

454"""

455EXPORTED FUNCTIONS

456=============================================================================

457

458Those are the two functions we really mean to export: `markdown()` and

459`markdownFromFile()`.

460"""

461

462

def markdown(text: str, **kwargs: Any) -> str:
    """
    Render a Markdown string and hand back the result as a Unicode string.

    A convenience wrapper around the [`Markdown`][markdown.Markdown] class for the
    simplest use case: a new [`Markdown`][markdown.Markdown] instance is created from
    the given keyword arguments and its `convert` method is run once on `text`.

    Arguments:
        text: Markdown formatted text as Unicode or ASCII string.

    Keyword arguments:
        **kwargs: Any arguments accepted by the Markdown class.

    Returns:
        A string in the specified output format.

    """
    return Markdown(**kwargs).convert(text)

483

484

def markdownFromFile(**kwargs: Any):
    """
    Convert Markdown read from a file and write the result to a file or a stream.

    A convenience wrapper which creates a [`Markdown`][markdown.Markdown] instance from
    the given keyword arguments and then calls its
    [`convertFile`][markdown.Markdown.convertFile] method instead of
    [`convert`][markdown.Markdown.convert].

    Keyword arguments:
        input (str | BinaryIO): A file name or readable object.
        output (str | BinaryIO): A file name or writable object.
        encoding (str): Encoding of input and output.
        **kwargs: Any arguments accepted by the `Markdown` class.

    """
    # The complete keyword set (including the three file options) goes to the constructor.
    converter = Markdown(**kwargs)
    source = kwargs.get('input', None)
    target = kwargs.get('output', None)
    codec_name = kwargs.get('encoding', None)
    converter.convertFile(source, target, codec_name)