# :Author: Georg Brandl; Lea Wiemann; Günter Milde
# :Date: $Date$
# :Copyright: This module has been placed in the public domain.

"""Lexical analysis of formal languages (i.e. code) using Pygments."""

from docutils import ApplicationError
try:
    import pygments
    from pygments.lexers import get_lexer_by_name
    from pygments.formatters.html import _get_ttype_class
    with_pygments = True
except ImportError:
    with_pygments = False

# Filter the following token types from the list of class arguments:
unstyled_tokens = ['token',  # Token (base token type)
                   'text',   # Token.Text
                   '']       # short name for Token and Text
# (Add, e.g., Token.Punctuation with ``unstyled_tokens += ['punctuation']``.)


class LexerError(ApplicationError):
    pass


class Lexer:
    """Parse `code` lines and yield "classified" tokens.

    Arguments

      code       -- string of source code to parse,
      language   -- formal language the code is written in,
      tokennames -- either 'long', 'short', or 'none' (see below).

    Subsequent tokens of the same token type are merged.

    Iterating over an instance yields the tokens as ``(tokentype, value)``
    tuples. The value of `tokennames` configures the naming of the tokentype:

      'long':  downcased full token type name,
      'short': short name defined by pygments.token.STANDARD_TYPES
               (= class argument used in pygments html output),
      'none':  skip lexical analysis.
    """

    def __init__(self, code, language, tokennames='short'):
        """Set up a lexical analyzer for `code` in `language`."""
        self.code = code
        self.language = language
        self.tokennames = tokennames
        self.lexer = None
        # get lexical analyzer for `language`:
        if language in ('', 'text') or tokennames == 'none':
            return
        if not with_pygments:
            raise LexerError('Cannot analyze code. '
                             'Pygments package not found.')
        try:
            self.lexer = get_lexer_by_name(self.language)
        except pygments.util.ClassNotFound:
            raise LexerError('Cannot analyze code. '
                             'No Pygments lexer found for "%s".' % language)
        # Since version 1.2 (released Jan 01, 2010), Pygments has a
        # TokenMergeFilter.  ``self.merge(tokens)`` in __iter__ could be
        # replaced by ``self.lexer.add_filter('tokenmerge')`` in __init__.
        # However, `merge` below also strips a final newline added by Pygments.
        #
        # self.lexer.add_filter('tokenmerge')

    def merge(self, tokens):
        """Merge subsequent tokens of the same token type.

        Also strip the final newline (added by Pygments).
        """
        tokens = iter(tokens)
        (lasttype, lastval) = next(tokens)
        for ttype, value in tokens:
            if ttype is lasttype:
                lastval += value
            else:
                yield lasttype, lastval
                (lasttype, lastval) = (ttype, value)
        if lastval.endswith('\n'):
            lastval = lastval[:-1]
        if lastval:
            yield lasttype, lastval

    def __iter__(self):
        """Parse self.code and yield "classified" tokens."""
        if self.lexer is None:
            yield [], self.code
            return
        tokens = pygments.lex(self.code, self.lexer)
        for tokentype, value in self.merge(tokens):
            if self.tokennames == 'long':  # long CSS class args
                classes = str(tokentype).lower().split('.')
            else:  # short CSS class args
                classes = [_get_ttype_class(tokentype)]
            classes = [cls for cls in classes if cls not in unstyled_tokens]
            yield classes, value
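
# Usage sketch (illustrative, not part of the docutils API; assumes Pygments
# is installed -- the exact token types may vary with the Pygments version):
#
#     for classes, value in Lexer('1 + 2', 'python', tokennames='long'):
#         print(classes, repr(value))
#
# The first token would print as ``['literal', 'number', 'integer'] '1'``.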


class NumberLines:
    """Insert line-number tokens at the start of every code line.

    Arguments

       tokens    -- iterable of ``(classes, value)`` tuples
       startline -- first line number
       endline   -- last line number

    Iterating over an instance yields the tokens with a
    ``(['ln'], '<the line number>')`` token added for every code line.
    Multi-line tokens are split.
    """

    def __init__(self, tokens, startline, endline):
        self.tokens = tokens
        self.startline = startline
        # Pad line numbers: e.g. endline == 100 -> fmt_str = '%3d '.
        self.fmt_str = '%%%dd ' % len(str(endline))

    def __iter__(self):
        """Yield self.tokens with a line-number token starting every line."""
        lineno = self.startline
        yield ['ln'], self.fmt_str % lineno
        for ttype, value in self.tokens:
            # Split multi-line tokens; emit a line-number token per newline.
            lines = value.split('\n')
            for line in lines[:-1]:
                yield ttype, line + '\n'
                lineno += 1
                yield ['ln'], self.fmt_str % lineno
            yield ttype, lines[-1]
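

if __name__ == '__main__':
    # Minimal, self-contained demo (a sketch, not part of the docutils API;
    # requires Pygments): lex a two-line Python sample with short class
    # names, then prepend padded line numbers.
    sample_code = 'x = 1\nprint(x)\n'
    tokens = Lexer(sample_code, 'python', tokennames='short')
    for classes, value in NumberLines(tokens, startline=1, endline=2):
        print(classes, repr(value))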