1# CodeHilite Extension for Python-Markdown
2# ========================================
3
4# Adds code/syntax highlighting to standard Python-Markdown code blocks.
5
6# See https://Python-Markdown.github.io/extensions/code_hilite
7# for documentation.
8
9# Original code Copyright 2006-2008 [Waylan Limberg](http://achinghead.com/).
10
11# All changes Copyright 2008-2014 The Python Markdown Project
12
13# License: [BSD](https://opensource.org/licenses/bsd-license.php)
14
15"""
16Adds code/syntax highlighting to standard Python-Markdown code blocks.
17
18See the [documentation](https://Python-Markdown.github.io/extensions/code_hilite)
19for details.
20"""
21
22from __future__ import annotations
23
24from . import Extension
25from ..treeprocessors import Treeprocessor
26from ..util import parseBoolValue
27from typing import TYPE_CHECKING, Callable, Any
28
29if TYPE_CHECKING: # pragma: no cover
30 import xml.etree.ElementTree as etree
31
32try: # pragma: no cover
33 from pygments import highlight
34 from pygments.lexers import get_lexer_by_name, guess_lexer
35 from pygments.formatters import get_formatter_by_name
36 from pygments.util import ClassNotFound
37 pygments = True
38except ImportError: # pragma: no cover
39 pygments = False
40
41
def parse_hl_lines(expr: str) -> list[int]:
    """Parse the list of line numbers which should be emphasized.

    `expr` is a whitespace-delimited string of integers such as `'1 2'`, which
    requests that lines 1 and 2 of a code block be emphasized.

    Returns the line numbers as a list of integers. A falsy `expr` (empty
    string or `None`), or one containing a token which is not an integer,
    yields an empty list.
    """
    if expr:
        try:
            return [int(token) for token in expr.split()]
        except ValueError:  # pragma: no cover
            # At least one token was not a number; emphasize nothing.
            pass
    return []
55
56
57# ------------------ The Main CodeHilite Class ----------------------
class CodeHilite:
    """
    Determine language of source code, and pass it on to the Pygments highlighter.

    Usage:

    ```python
    code = CodeHilite(src=some_code, lang='python')
    html = code.hilite()
    ```

    Arguments:
        src: Source code string to highlight.

    Keyword arguments:
        lang (str): String name of Pygments lexer to use for highlighting. Default: `None`.
        guess_lang (bool): Auto-detect which lexer to use.
            Ignored if `lang` is set to a valid value. Default: `True`.
        use_pygments (bool): Pass code to Pygments for code highlighting. If `False`, the code is
            instead wrapped for highlighting by a JavaScript library. Default: `True`.
        pygments_formatter (str): The name of a Pygments formatter or a formatter class used for
            highlighting the code blocks. Default: `html`.
        linenums (bool): An alias to Pygments `linenos` formatter option. Default: `None`.
        css_class (str): An alias to Pygments `cssclass` formatter option. Default: 'codehilite'.
        lang_prefix (str): Prefix prepended to the language. Default: "language-".

    Other Options:

    Any other options are accepted and passed on to the lexer and formatter. Therefore,
    valid options include any options which are accepted by the `html` formatter or
    whichever lexer the code's language uses. Note that most lexers do not have any
    options. However, a few have very useful options, such as PHP's `startinline` option.
    Any invalid options are ignored without error.

    * **Formatter options**: <https://pygments.org/docs/formatters/#HtmlFormatter>
    * **Lexer Options**: <https://pygments.org/docs/lexers/>

    Additionally, when Pygments is enabled and `pygments_formatter` is a formatter class
    (rather than a formatter name), the code's language is passed to that class as an
    extra option `lang_str`, whose value is `{lang_prefix}{lang}`.

    Advanced Usage:

    ```python
    code = CodeHilite(
        src = some_code,
        lang = 'php',
        startinline = True,      # Lexer option. Snippet does not start with `<?php`.
        linenostart = 42,        # Formatter option. Snippet starts on line 42.
        hl_lines = [45, 49, 50], # Formatter option. Highlight lines 45, 49, and 50.
        linenos = 'inline'       # Formatter option. Avoid alignment problems.
    )
    html = code.hilite()
    ```

    """

    def __init__(self, src: str, **options):
        self.src = src
        self.lang: str | None = options.pop('lang', None)
        self.guess_lang: bool = options.pop('guess_lang', True)
        self.use_pygments: bool = options.pop('use_pygments', True)
        self.lang_prefix: str = options.pop('lang_prefix', 'language-')
        self.pygments_formatter: str | Callable = options.pop('pygments_formatter', 'html')

        # Translate our option aliases to the names Pygments uses. An explicit
        # Pygments option always takes precedence over its alias.
        if 'linenos' not in options:
            options['linenos'] = options.pop('linenums', None)
        if 'cssclass' not in options:
            options['cssclass'] = options.pop('css_class', 'codehilite')
        if 'wrapcode' not in options:
            # Override Pygments default
            options['wrapcode'] = True
        # Disallow use of `full` option
        options['full'] = False

        # Everything left over is handed to both the lexer and the formatter.
        self.options = options

    def hilite(self, shebang: bool = True) -> str:
        """
        Pass code to the [Pygments](https://pygments.org/) highlighter with
        optional line numbers. The output should then be styled with CSS to
        your liking. No styles are applied by default - only styling hooks
        (i.e.: `<span class="k">`).

        When Pygments is unavailable or `use_pygments` is false, the code is
        instead HTML-escaped and wrapped in `<pre><code>` markup suitable for
        JavaScript highlighting libraries.

        Arguments:
            shebang: Whether to inspect the first line of the source for a
                (mock) shebang when no language was given.

        returns : A string of html.

        """

        self.src = self.src.strip('\n')

        if self.lang is None and shebang:
            self._parseHeader()

        if self.use_pygments and pygments:
            try:
                lexer = get_lexer_by_name(self.lang, **self.options)
            except ValueError:
                try:
                    if self.guess_lang:
                        lexer = guess_lexer(self.src, **self.options)
                    else:
                        lexer = get_lexer_by_name('text', **self.options)
                except ValueError:  # pragma: no cover
                    lexer = get_lexer_by_name('text', **self.options)
            if not self.lang:
                # Use the guessed lexer's language instead
                self.lang = lexer.aliases[0]
            lang_str = f'{self.lang_prefix}{self.lang}'
            if isinstance(self.pygments_formatter, str):
                try:
                    formatter = get_formatter_by_name(self.pygments_formatter, **self.options)
                except ClassNotFound:
                    # Unknown formatter name; fall back to the `html` formatter.
                    formatter = get_formatter_by_name('html', **self.options)
            else:
                formatter = self.pygments_formatter(lang_str=lang_str, **self.options)
            return highlight(self.src, lexer, formatter)
        else:
            # just escape and build markup usable by JavaScript highlighting libraries
            txt = self.src
            # `&` must be escaped first, otherwise the ampersands of the
            # entities inserted afterwards would be escaped a second time.
            for char, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'), ('"', '&quot;')):
                txt = txt.replace(char, entity)
            classes = []
            if self.lang:
                classes.append('{}{}'.format(self.lang_prefix, self.lang))
            if self.options['linenos']:
                classes.append('linenums')
            class_str = ''
            if classes:
                class_str = ' class="{}"'.format(' '.join(classes))
            return '<pre class="{}"><code{}>{}\n</code></pre>\n'.format(
                self.options['cssclass'],
                class_str,
                txt
            )

    def _parseHeader(self) -> None:
        """
        Determines language of a code block from shebang line and whether the
        said line should be removed or left in place. If the shebang line
        contains a path (even a single /) then it is assumed to be a real
        shebang line and left alone. However, if no path is given
        (i.e.: `#!python` or `:::python`) then it is assumed to be a mock shebang
        for language identification of a code fragment and removed from the
        code block prior to processing for code highlighting. When a mock
        shebang (i.e.: `#!python`) is found, line numbering is turned on. When
        colons are found in place of a shebang (i.e.: `:::python`), line
        numbering is left in the current state - off by default.

        Also parses optional list of highlight lines, like:

            :::python hl_lines="1 3"

        Updates `self.lang`, `self.src` and, on a match, the `linenos` and
        `hl_lines` entries of `self.options` in place.
        """

        import re

        # split text into lines
        lines = self.src.split("\n")
        # pull first line to examine
        fl = lines.pop(0)

        c = re.compile(r'''
            (?:(?:^::+)|(?P<shebang>^[#]!)) # Shebang or 2 or more colons
            (?P<path>(?:/\w+)*[/ ])?        # Zero or 1 path
            (?P<lang>[\w#.+-]*)             # The language
            \s*                             # Arbitrary whitespace
            # Optional highlight lines, single- or double-quote-delimited
            (hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot))?
            ''', re.VERBOSE)
        # search first line for shebang
        m = c.search(fl)
        if m:
            # we have a match
            try:
                self.lang = m.group('lang').lower()
            except IndexError:  # pragma: no cover
                self.lang = None
            if m.group('path'):
                # path exists - restore first line
                lines.insert(0, fl)
            if self.options['linenos'] is None and m.group('shebang'):
                # Overridable and Shebang exists - use line numbers
                self.options['linenos'] = True

            self.options['hl_lines'] = parse_hl_lines(m.group('hl_lines'))
        else:
            # No match
            lines.insert(0, fl)

        self.src = "\n".join(lines).strip("\n")
248
249
250# ------------------ The Markdown Extension -------------------------------
251
252
class HiliteTreeprocessor(Treeprocessor):
    """ Highlight source code in code blocks. """

    # Options passed on to `CodeHilite` for every code block.
    config: dict[str, Any]

    def code_unescape(self, text: str) -> str:
        """Unescape code.

        Converts the `&lt;`, `&gt;` and `&amp;` entities in `text` back to
        the literal characters and returns the resulting string.
        """
        text = text.replace("&lt;", "<")
        text = text.replace("&gt;", ">")
        # Escaped '&' should be replaced at the end to avoid
        # conflicting with `&lt;` and `&gt;`.
        text = text.replace("&amp;", "&")
        return text

    def run(self, root: etree.Element) -> None:
        """ Find code blocks and store in `htmlStash`.

        Every `<pre>` element whose only child is a `<code>` element with
        text is highlighted. The generated HTML is stored in the stash and
        the element is replaced, in place, by a `<p>` holding the placeholder.
        """
        blocks = root.iter('pre')
        for block in blocks:
            if len(block) == 1 and block[0].tag == 'code':
                # Copy, as `pygments_style` is popped from the config below.
                local_config = self.config.copy()
                text = block[0].text
                if text is None:
                    continue
                code = CodeHilite(
                    self.code_unescape(text),
                    tab_length=self.md.tab_length,
                    style=local_config.pop('pygments_style', 'default'),
                    **local_config
                )
                placeholder = self.md.htmlStash.store(code.hilite())
                # Clear code block in `etree` instance
                block.clear()
                # Change to `p` element which will later
                # be removed when inserting raw html
                block.tag = 'p'
                block.text = placeholder
289
290
class CodeHiliteExtension(Extension):
    """ Markdown extension which adds source code highlighting to code blocks. """

    def __init__(self, **kwargs):
        # Known options, each stored as a `[default, description]` pair.
        self.config = {
            'linenums': [
                None,
                "Use lines numbers. True|table|inline=yes, False=no, None=auto. Default: `None`."
            ],
            'guess_lang': [
                True,
                "Automatic language detection - Default: `True`."
            ],
            'css_class': [
                "codehilite",
                "Set class name for wrapper <div> - Default: `codehilite`."
            ],
            'pygments_style': [
                'default',
                'Pygments HTML Formatter Style (Colorscheme). Default: `default`.'
            ],
            'noclasses': [
                False,
                'Use inline styles instead of CSS classes - Default `False`.'
            ],
            'use_pygments': [
                True,
                'Highlight code blocks with pygments. Disable if using a JavaScript library. Default: `True`.'
            ],
            'lang_prefix': [
                'language-',
                'Prefix prepended to the language when `use_pygments` is false. Default: `language-`.'
            ],
            'pygments_formatter': [
                'html',
                'Use a specific formatter for Pygments highlighting. Default: `html`.'
            ],
        }
        """ Default configuration options. """

        for option, setting in kwargs.items():
            if option in self.config:
                # A known option: let `setConfig` store it.
                self.setConfig(option, setting)
                continue

            # An unknown keyword is registered manually, with no description.
            if isinstance(setting, str):
                try:
                    # A string may spell out a boolean (or `None`) value.
                    setting = parseBoolValue(setting, preserve_none=True)
                except ValueError:
                    # Not a boolean after all; keep the string untouched.
                    pass
            self.config[option] = [setting, '']

    def extendMarkdown(self, md):
        """ Register a `HiliteTreeprocessor` and this extension with the Markdown instance. """
        processor = HiliteTreeprocessor(md)
        processor.config = self.getConfigs()
        md.treeprocessors.register(processor, 'hilite', 30)

        md.registerExtension(self)
344
345
def makeExtension(**kwargs):  # pragma: no cover
    """Build a `CodeHiliteExtension` from the given keyword options and return it."""
    extension = CodeHiliteExtension(**kwargs)
    return extension