# :Author: Georg Brandl; Lea Wiemann; Günter Milde
# :Date: $Date$
# :Copyright: This module has been placed in the public domain.

"""Lexical analysis of formal languages (i.e. code) using Pygments."""

from __future__ import annotations

__docformat__ = 'reStructuredText'

try:
    import pygments
    from pygments.lexers import get_lexer_by_name
    from pygments.formatters.html import _get_ttype_class
    with_pygments = True
except ImportError:
    with_pygments = False

from docutils import ApplicationError

# Filter the following token types from the list of class arguments:
unstyled_tokens = ['token',  # Token (base token type)
                   'text',  # Token.Text
                   '']  # short name for Token and Text
# (Add, e.g., Token.Punctuation with ``unstyled_tokens += ['punctuation']``.)
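
# The short names come from ``pygments.token.STANDARD_TYPES``; a quick
# sanity check of the mapping (a sketch, assuming Pygments is installed):
#
#   >>> from pygments.token import Token
#   >>> _get_ttype_class(Token.Text)     # unstyled -> filtered out
#   ''
#   >>> _get_ttype_class(Token.Keyword)
#   'k'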


class LexerError(ApplicationError):
    pass


class Lexer:
    """Parse `code` lines and yield "classified" tokens.

    Arguments

      code       -- string of source code to parse,
      language   -- formal language the code is written in,
      tokennames -- either 'long', 'short', or 'none' (see below).

    Subsequent tokens of the same token-type are merged.

    Iterating over an instance yields the tokens as ``(tokentype, value)``
    tuples. The value of `tokennames` configures the naming of the
    tokentype:

      'long':  downcased full token type name,
      'short': short name defined by pygments.token.STANDARD_TYPES
               (= class argument used in pygments html output),
      'none':  skip lexical analysis.
    """

    def __init__(self, code, language, tokennames='short') -> None:
        """Set up a lexical analyzer for `code` in `language`."""
        self.code = code
        self.language = language
        self.tokennames = tokennames
        self.lexer = None
        # get lexical analyzer for `language`:
        if language in ('', 'text') or tokennames == 'none':
            return
        if not with_pygments:
            raise LexerError('Cannot analyze code. '
                             'Pygments package not found.')
        try:
            self.lexer = get_lexer_by_name(self.language)
        except pygments.util.ClassNotFound:
            raise LexerError('Cannot analyze code. '
                             'No Pygments lexer found for "%s".' % language)
        # Since version 1.2 (released Jan 01, 2010) Pygments has a
        # TokenMergeFilter: ``self.merge(tokens)`` in `__iter__` could be
        # replaced by ``self.lexer.add_filter('tokenmerge')`` here.
        # However, `merge` below also strips a final newline added by
        # Pygments.
        #
        # self.lexer.add_filter('tokenmerge')

    def merge(self, tokens):
        """Merge subsequent tokens of same token-type.

        Also strip the final newline (added by pygments).
        """
        tokens = iter(tokens)
        (lasttype, lastval) = next(tokens)
        for ttype, value in tokens:
            if ttype is lasttype:
                lastval += value
            else:
                yield lasttype, lastval
                (lasttype, lastval) = (ttype, value)
        lastval = lastval.removesuffix('\n')
        if lastval:
            yield lasttype, lastval
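
    # `merge` behaves like this (a sketch; small ints stand in for Pygments
    # token types, and no lexer is set up because `language` is empty):
    #
    #   >>> lx = Lexer('', '', tokennames='none')
    #   >>> list(lx.merge([(1, 'foo'), (1, 'bar'), (2, 'baz\n')]))
    #   [(1, 'foobar'), (2, 'baz')]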

    def __iter__(self):
        """Parse self.code and yield "classified" tokens."""
        if self.lexer is None:
            yield [], self.code
            return
        tokens = pygments.lex(self.code, self.lexer)
        for tokentype, value in self.merge(tokens):
            if self.tokennames == 'long':  # long CSS class args
                classes = str(tokentype).lower().split('.')
            else:  # short CSS class args
                classes = [_get_ttype_class(tokentype)]
            classes = [cls for cls in classes if cls not in unstyled_tokens]
            yield classes, value
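
    # With ``tokennames='long'`` the full token-type path becomes the class
    # list (a sketch, assuming Pygments is installed):
    #
    #   >>> list(Lexer('1', 'python', tokennames='long'))
    #   [(['literal', 'number', 'integer'], '1')]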


class NumberLines:
    """Insert line-number tokens at the start of every code line.

    Arguments

      tokens    -- iterable of ``(classes, value)`` tuples
      startline -- first line number
      endline   -- last line number (used to determine the padding width)

    Iterating over an instance yields the tokens with a
    ``(['ln'], '<the line number>')`` token added for every code line.
    Multi-line tokens are split.
    """

    def __init__(self, tokens, startline, endline) -> None:
        self.tokens = tokens
        self.startline = startline
        # pad line numbers, e.g. endline == 100 -> fmt_str = '%3d '
        self.fmt_str = f'%{len(str(endline))}d '

    def __iter__(self):
        lineno = self.startline
        yield ['ln'], self.fmt_str % lineno
        for ttype, value in self.tokens:
            lines = value.split('\n')
            for line in lines[:-1]:
                yield ttype, line + '\n'
                lineno += 1
                yield ['ln'], self.fmt_str % lineno
            yield ttype, lines[-1]
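
# A combined usage sketch (deterministic because lexical analysis is
# skipped with an empty `language` and ``tokennames='none'``):
#
#   >>> tokens = Lexer('x = 1\ny = 2', '', tokennames='none')
#   >>> list(NumberLines(tokens, 1, 2))
#   [(['ln'], '1 '), ([], 'x = 1\n'), (['ln'], '2 '), ([], 'y = 2')]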