1"""
2 pygments.lexers.grammar_notation
3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4
5 Lexers for grammar notations like BNF.
6
7 :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
8 :license: BSD, see LICENSE for details.
9"""
10
11from pygments.lexer import RegexLexer, bygroups, include, this, using, words
12from pygments.token import Comment, Keyword, Literal, Name, Number, \
13 Operator, Punctuation, String, Text, Whitespace
14
15__all__ = ['BnfLexer', 'AbnfLexer', 'JsgfLexer', 'PegLexer']
16
17
class BnfLexer(RegexLexer):
    """
    Minimal lexer for grammar notations written in the style of the
    original BNF.

    The rule table is kept deliberately small so that as many BNF
    dialects as possible can be highlighted:

    * Terminal symbols get no dedicated token type; they come out as
      plain text.

    * A nonterminal symbol is only recognised when it is written between
      angle brackets, e.g. ``<expr>``.

    * Between the brackets, every printable ASCII character from the
      space (0x20) up to ``~`` is accepted, with the exception of ``<``
      and ``>`` themselves.  Accepting spaces accommodates rule names as
      used by `RBNF <http://www.rfc-base.org/txt/rfc-5511.txt>`_.

    * The notation is assumed to have no comment syntax.

    * ``::=`` is the only operator that is highlighted; every other
      operator or punctuation character is emitted as plain text.

    The resulting highlighting is intentionally sparse.
    """

    name = 'BNF'
    url = 'https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form'
    aliases = ['bnf']
    filenames = ['*.bnf']
    mimetypes = ['text/x-bnf']
    version_added = '2.1'

    tokens = {
        'root': [
            # nonterminal: opening bracket, symbol name, closing bracket
            (
                r'(<)([ -;=?-~]+)(>)',
                bygroups(Punctuation, Name.Class, Punctuation),
            ),

            # the definition sign is the sole operator
            (r'::=', Operator),

            # Everything else is plain text.  Long runs are taken in a
            # single match (for performance); a stray "<", ">" or ":" is
            # consumed one character at a time by the last rule.
            (r'[^<>:]+', Text),
            (r'.', Text),
        ],
    }
64
65
class AbnfLexer(RegexLexer):
    """
    Lexer for ABNF grammars as specified by IETF RFC 7405, which updates
    `RFC 5234 <http://www.ietf.org/rfc/rfc5234.txt>`_.
    """

    name = 'ABNF'
    url = 'http://www.ietf.org/rfc/rfc7405.txt'
    aliases = ['abnf']
    filenames = ['*.abnf']
    mimetypes = ['text/x-abnf']
    version_added = '2.1'

    # Names highlighted as keywords (the "Core Rules").
    _core_rules = (
        'ALPHA', 'BIT', 'CHAR', 'CR', 'CRLF', 'CTL', 'DIGIT',
        'DQUOTE', 'HEXDIG', 'HTAB', 'LF', 'LWSP', 'OCTET',
        'SP', 'VCHAR', 'WSP')

    tokens = {
        'root': [
            # comment: from ";" to the end of the line
            (r';.*$', Comment.Single),

            # quoted string, optionally prefixed with %s or %i.
            # A double quote cannot appear inside; it is written as '%x22'.
            (r'(%[si])?"[^"]*"', Literal),

            # binary (but i have never seen...)
            (r'%b[01]+\-[01]+\b', Literal),          # range
            (r'%b[01]+(\.[01]+)*\b', Literal),       # concat

            # decimal
            (r'%d[0-9]+\-[0-9]+\b', Literal),        # range
            (r'%d[0-9]+(\.[0-9]+)*\b', Literal),     # concat

            # hexadecimal
            (r'%x[0-9a-fA-F]+\-[0-9a-fA-F]+\b', Literal),     # range
            (r'%x[0-9a-fA-F]+(\.[0-9a-fA-F]+)*\b', Literal),  # concat

            # repetition (<a>*<b>element) including nRule
            (r'\b[0-9]+\*[0-9]+', Operator),
            (r'\b[0-9]+\*', Operator),
            (r'\b[0-9]+', Operator),
            (r'\*', Operator),

            # Strictly speaking, these are not keywords but
            # are called `Core Rule'.
            # The lookahead rejects a following word character *or* hyphen:
            # a plain \b also matches in front of "-", which would split a
            # user-defined rule name such as "DIGIT-list" into the keyword
            # "DIGIT" followed by unrelated tokens.
            (words(_core_rules, suffix=r'(?![\w-])'), Keyword),

            # nonterminals (ALPHA *(ALPHA / DIGIT / "-"))
            (r'[a-zA-Z][a-zA-Z0-9-]*\b', Name.Class),

            # operators
            (r'(=/|=|/)', Operator),

            # punctuation
            (r'[\[\]()]', Punctuation),

            # fallback
            (r'\s+', Whitespace),
            (r'.', Text),
        ],
    }
130
131
class JsgfLexer(RegexLexer):
    """
    Lexer for grammars written in the JSpeech Grammar Format (JSGF).
    """

    name = 'JSGF'
    url = 'https://www.w3.org/TR/jsgf/'
    aliases = ['jsgf']
    filenames = ['*.jsgf']
    mimetypes = ['application/jsgf', 'application/x-jsgf', 'text/jsgf']
    version_added = '2.2'

    tokens = {
        # Entry state: comments are tried before anything else.
        'root': [
            include('comments'),
            include('non-comments'),
        ],

        # ---- comments ---------------------------------------------------
        'comments': [
            # "/**" not directly followed by "/" opens a documentation
            # comment, which is lexed in its own state
            (r'/\*\*(?!/)', Comment.Multiline, 'documentation comment'),
            # any other block comment is consumed in one match
            (r'/\*[\w\W]*?\*/', Comment.Multiline),
            # line comment
            (r'//.*$', Comment.Single),
        ],
        'documentation comment': [
            # end of the documentation comment
            (r'\*/', Comment.Multiline, '#pop'),
            # "@example" / "@see" tag: the text up to the next tag line or
            # the closing "*/" is re-lexed in the 'example' state
            (r'^(\s*)(\*?)(\s*)(@(?:example|see))(\s+)'
             r'([\w\W]*?(?=(?:^\s*\*?\s*@|\*/)))',
             bygroups(Whitespace, Comment.Multiline, Whitespace, Comment.Special,
                      Whitespace, using(this, state='example'))),
            # any other "@tag" at the beginning of a comment line
            (r'(^\s*\*?\s*)(@\S*)',
             bygroups(Comment.Multiline, Comment.Special)),
            # remaining comment text
            (r'[^*\n@]+|\w|\W', Comment.Multiline),
        ],
        'example': [
            # the "*" that leads a continuation line stays comment-coloured
            (r'(\n\s*)(\*)', bygroups(Whitespace, Comment.Multiline)),
            include('non-comments'),
            # whatever the grammar rules do not accept is comment text
            (r'.', Comment.Multiline),
        ],

        # ---- grammar text -----------------------------------------------
        'non-comments': [
            # "#JSGF ..." header at the very start of the input
            (r'\A#JSGF[^;]*', Comment.Preproc),
            (r'\s+', Whitespace),
            (r';', Punctuation),
            (r'[=|()\[\]*+]', Operator),
            # slash-delimited item, highlighted as a float number
            (r'/[^/]+/', Number.Float),
            # opening delimiters of nested constructs
            (r'"', String.Double, 'string'),
            (r'\{', String.Other, 'tag'),
            # reserved words
            (words(('import', 'public'), suffix=r'\b'), Keyword.Reserved),
            (r'grammar\b', Keyword.Reserved, 'grammar name'),
            # the special rules <NULL> and <VOID>
            (r'(<)(NULL|VOID)(>)',
             bygroups(Punctuation, Name.Builtin, Punctuation)),
            # any other rule reference
            (r'<', Punctuation, 'rulename'),
            # plain tokens
            (r'\w+|[^\s;=|()\[\]*+/"{<\w]+', Text),
        ],

        # ---- nested constructs ------------------------------------------
        'string': [
            (r'"', String.Double, '#pop'),
            (r'\\.', String.Escape),
            (r'[^\\"]+', String.Double),
        ],
        'tag': [
            (r'\}', String.Other, '#pop'),
            (r'\\.', String.Escape),
            (r'[^\\}]+', String.Other),
        ],
        'grammar name': [
            # the terminating ";" ends the grammar declaration
            (r';', Punctuation, '#pop'),
            (r'\s+', Whitespace),
            (r'\.', Punctuation),
            (r'[^;\s.]+', Name.Namespace),
        ],
        'rulename': [
            (r'>', Punctuation, '#pop'),
            (r'\*', Punctuation),
            (r'\s+', Whitespace),
            # a part followed by "." is a namespace qualifier ...
            (r'([^.>]+)(\s*)(\.)', bygroups(Name.Namespace, Text, Punctuation)),
            # ... and the last part is the rule name itself
            (r'[^.>]+', Name.Constant),
        ],
    }
207
208
class PegLexer(RegexLexer):
    """
    This lexer is for Parsing Expression Grammars (PEG).

    Various implementations of PEG have made different decisions
    regarding the syntax, so let's try to be accommodating:

    * `<-`, `←`, `:`, and `=` are all accepted as rule operators.

    * Both `|` and `/` are choice operators.

    * `^`, `↑`, and `~` are cut operators.

    * A single `a-z` character immediately before a string, or
      multiple `a-z` characters following a string, are part of the
      string (e.g., `r"..."` or `"..."ilmsuxa`).

    * Inside a character class, a backslash escapes the following
      character, so an escaped `]` does not close the class.
    """

    name = 'PEG'
    url = 'https://bford.info/pub/lang/peg.pdf'
    aliases = ['peg']
    filenames = ['*.peg']
    mimetypes = ['text/x-peg']
    version_added = '2.6'

    tokens = {
        'root': [
            # Comments
            (r'#.*$', Comment.Single),

            # All operators
            (r'<-|[←:=/|&!?*+^↑~]', Operator),

            # Other punctuation
            (r'[()]', Punctuation),

            # Keywords (only the dot is highlighted this way)
            (r'\.', Keyword),

            # Character classes.
            # The leading run must exclude the backslash, just like the
            # string rules below do.  Otherwise it swallows the backslash of
            # an escape sequence and the class ends at an escaped "]"
            # (e.g. "[\]]" would be cut off after "[\]").
            (r'(\[)([^\]\\]*(?:\\.[^\]\\]*)*)(\])',
             bygroups(Punctuation, String, Punctuation)),

            # Single and double quoted strings (with optional modifiers)
            (r'[a-z]?"[^"\\]*(?:\\.[^"\\]*)*"[a-z]*', String.Double),
            (r"[a-z]?'[^'\\]*(?:\\.[^'\\]*)*'[a-z]*", String.Single),

            # Nonterminals are not whitespace, operators, or punctuation
            (r'[^\s<←:=/|&!?*+\^↑~()\[\]"\'#]+', Name.Class),

            # Fallback
            (r'.', Text),
        ],
    }