1"""
2 pygments.lexers.sql
3 ~~~~~~~~~~~~~~~~~~~
4
5 Lexers for various SQL dialects and related interactive sessions.
6
7 Postgres specific lexers:
8
9 `PostgresLexer`
10 A SQL lexer for the PostgreSQL dialect. Differences w.r.t. the SQL
11 lexer are:
12
13 - keywords and data types list parsed from the PG docs (run the
14 `_postgres_builtins` module to update them);
15 - Content of $-strings parsed using a specific lexer, e.g. the content
16 of a PL/Python function is parsed using the Python lexer;
17 - parse PG specific constructs: E-strings, $-strings, U&-strings,
18 different operators and punctuation.
19
20 `PlPgsqlLexer`
        A lexer for the PL/pgSQL language. Adds a few specific constructs on
        top of the PG SQL lexer (such as <<label>>).
23
24 `PostgresConsoleLexer`
25 A lexer to highlight an interactive psql session:
26
27 - identifies the prompt and does its best to detect the end of command
          in multiline statements where not all the lines are prefixed by a
29 prompt, telling them apart from the output;
30 - highlights errors in the output and notification levels;
31 - handles psql backslash commands.
32
33 `PostgresExplainLexer`
34 A lexer to highlight Postgres execution plan.
35
36 The ``tests/examplefiles`` contains a few test files with data to be
37 parsed by these lexers.
38
39 :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
40 :license: BSD, see LICENSE for details.
41"""
42
43import collections
44import re
45
46from pygments.lexer import Lexer, RegexLexer, do_insertions, bygroups, words
47from pygments.lexers import _googlesql_builtins
48from pygments.lexers import _mysql_builtins
49from pygments.lexers import _postgres_builtins
50from pygments.lexers import _sql_builtins
51from pygments.lexers import _tsql_builtins
52from pygments.lexers import get_lexer_by_name, ClassNotFound
53from pygments.token import Punctuation, Whitespace, Text, Comment, Operator, \
54 Keyword, Name, String, Number, Generic, Literal
55
56
# Public API of this module.
__all__ = ['GoogleSqlLexer', 'PostgresLexer', 'PlPgsqlLexer',
           'PostgresConsoleLexer', 'PostgresExplainLexer', 'SqlLexer',
           'TransactSqlLexer', 'MySqlLexer', 'SqliteConsoleLexer', 'RqlLexer']

# Matches a single line including its trailing newline.
line_re = re.compile('.*?\n')
# Primary and continuation prompts of the sqlite3 shell.
sqlite_prompt_re = re.compile(r'^(?:sqlite| ...)>(?= )')

# LANGUAGE clause of a function definition; group 1 is the language name.
language_re = re.compile(r"\s+LANGUAGE\s+'?(\w+)'?", re.IGNORECASE)

# A DO keyword (anonymous code block; its body defaults to plpgsql).
do_re = re.compile(r'\bDO\b', re.IGNORECASE)

# Regular expressions for analyse_text()
name_between_bracket_re = re.compile(r'\[[a-zA-Z_]\w*\]')
name_between_backtick_re = re.compile(r'`[a-zA-Z_]\w*`')
tsql_go_re = re.compile(r'\bgo\b', re.IGNORECASE)
tsql_declare_re = re.compile(r'\bdeclare\s+@', re.IGNORECASE)
tsql_variable_re = re.compile(r'@[a-zA-Z_]\w*\b')

# Identifiers for analyse_text()
googlesql_identifiers = (
    _googlesql_builtins.functionnames
    + _googlesql_builtins.keywords
    + _googlesql_builtins.types)
80
81
def language_callback(lexer, match):
    """Tokenize the body of a $-string with an appropriate sub-lexer.

    The sub-lexer is looked up from a ``LANGUAGE`` clause shortly after
    the string, then from one shortly before it; failing both, plpgsql
    is assumed when a ``DO`` keyword immediately precedes the string.
    """
    start, end = match.start(), match.end()
    sublexer = None

    ahead = language_re.match(lexer.text[end:end + 100])
    if ahead is not None:
        sublexer = lexer._get_lexer(ahead.group(1))
    else:
        behind = list(language_re.finditer(
            lexer.text[max(0, start - 100):start]))
        if behind:
            sublexer = lexer._get_lexer(behind[-1].group(1))
        elif do_re.search(lexer.text[max(0, start - 25):start]):
            # Inside a DO block with no explicit LANGUAGE: plpgsql.
            sublexer = lexer._get_lexer('plpgsql')

    # Groups 1-3 form the opening "$tag$" delimiter.
    for g in (1, 2, 3):
        yield match.start(g), String.Delimiter if g == 2 else String, match.group(g)
    # Group 4 is the string body.
    if sublexer:
        yield from sublexer.get_tokens_unprocessed(match.group(4))
    else:
        yield match.start(4), String, match.group(4)
    # Groups 5-7 form the closing "$tag$" delimiter.
    for g in (5, 6, 7):
        yield match.start(g), String.Delimiter if g == 6 else String, match.group(g)
116
117
class PostgresBase:
    """Mixin shared by the Postgres-related lexers.

    Implemented as a mixin so the Lexer metaclass does not kick in: if
    these lexers shared a common Lexer ancestor, ``_tokens`` could be
    built on that ancestor and never rebuilt for the subclasses, so
    e.g. PL/pgSQL would be parsed as plain SQL. That shortcoming seems
    to suggest that regexp lexers are not really subclassable.
    """
    def get_tokens_unprocessed(self, text, *args):
        # Keep the whole input around: `language_callback` inspects the
        # neighbourhood of each $-string match through `self.text`.
        self.text = text
        yield from super().get_tokens_unprocessed(text, *args)

    def _get_lexer(self, lang):
        """Return a lexer for *lang*, or None if none can be found."""
        if lang.lower() == 'sql':
            return get_lexer_by_name('postgresql', **self.options)

        # Try the name as-is, then with the "pl" prefix and/or the
        # "u" (untrusted) suffix stripped, e.g. plpythonu -> python.
        candidates = [lang]
        is_pl = lang.startswith('pl')
        is_untrusted = lang.endswith('u')
        if is_pl:
            candidates.append(lang[2:])
        if is_untrusted:
            candidates.append(lang[:-1])
        if is_pl and is_untrusted:
            candidates.append(lang[2:-1])

        for candidate in candidates:
            try:
                return get_lexer_by_name(candidate, **self.options)
            except ClassNotFound:
                continue
        # TODO: better logging
        # print >>sys.stderr, "language not found:", lang
        return None
153
154
class PostgresLexer(PostgresBase, RegexLexer):
    """
    Lexer for the PostgreSQL dialect of SQL.

    On top of generic SQL this knows the PG data types and keywords
    (generated from the PG docs by `_postgres_builtins`) and the
    PG-specific literal forms: E-strings, U&-strings, dollar-quoted
    strings (whose body is delegated to a sub-lexer by
    `language_callback`) and psql-style :variables.
    """

    name = 'PostgreSQL SQL dialect'
    aliases = ['postgresql', 'postgres']
    mimetypes = ['text/x-postgresql']
    url = 'https://www.postgresql.org'
    version_added = '1.5'

    flags = re.IGNORECASE
    tokens = {
        'root': [
            (r'\s+', Whitespace),
            (r'--.*\n?', Comment.Single),
            (r'/\*', Comment.Multiline, 'multiline-comments'),
            # Data types and pseudo types; multi-word names such as
            # "double precision" are matched with flexible whitespace.
            (r'(' + '|'.join(s.replace(" ", r"\s+")
                             for s in _postgres_builtins.DATATYPES +
                             _postgres_builtins.PSEUDO_TYPES) + r')\b',
             Name.Builtin),
            (words(_postgres_builtins.KEYWORDS, suffix=r'\b'), Keyword),
            (r'[+*/<>=~!@#%^&|`?-]+', Operator),
            (r'::', Operator),  # cast
            (r'\$\d+', Name.Variable),  # positional parameter, e.g. $1
            # NOTE(review): the [0-9]+ alternative below means a bare
            # integer like "123" is consumed here as Number.Float, so the
            # Number.Integer rule rarely fires -- confirm this is intended.
            (r'([0-9]*\.[0-9]*|[0-9]+)(e[+-]?[0-9]+)?', Number.Float),
            (r'[0-9]+', Number.Integer),
            # E'...' escape strings and U&'...' Unicode strings
            (r"((?:E|U&)?)(')", bygroups(String.Affix, String.Single), 'string'),
            # quoted identifier
            (r'((?:U&)?)(")', bygroups(String.Affix, String.Name), 'quoted-ident'),
            # dollar-quoted string: $tag$...$tag$; the body (group 4) is
            # handed to a sub-lexer chosen by language_callback
            (r'(?s)(\$)([^$]*)(\$)(.*?)(\$)(\2)(\$)', language_callback),
            (r'[a-z_]\w*', Name),

            # psql variable in SQL
            (r""":(['"]?)[a-z]\w*\b\1""", Name.Variable),

            (r'[;:()\[\]{},.]', Punctuation),
        ],
        'multiline-comments': [
            # PostgreSQL comments nest, hence the recursive push
            (r'/\*', Comment.Multiline, 'multiline-comments'),
            (r'\*/', Comment.Multiline, '#pop'),
            (r'[^/*]+', Comment.Multiline),
            (r'[/*]', Comment.Multiline)
        ],
        'string': [
            (r"[^']+", String.Single),
            (r"''", String.Single),  # doubled quote = escaped quote
            (r"'", String.Single, '#pop'),
        ],
        'quoted-ident': [
            (r'[^"]+', String.Name),
            (r'""', String.Name),  # doubled quote = escaped quote
            (r'"', String.Name, '#pop'),
        ],
    }
210
211
class PlPgsqlLexer(PostgresBase, RegexLexer):
    """
    Handle the extra syntax in Pl/pgSQL language.

    Reuses the PostgresLexer token table, swapping in an extended
    keyword list and prepending a few PL/pgSQL-only rules.
    """
    name = 'PL/pgSQL'
    aliases = ['plpgsql']
    mimetypes = ['text/x-plpgsql']
    url = 'https://www.postgresql.org/docs/current/plpgsql.html'
    version_added = '1.5'

    flags = re.IGNORECASE
    # FIXME: use inheritance
    # Copy each state list so the tweaks below don't leak into
    # PostgresLexer's own token table.
    tokens = {name: state[:] for (name, state) in PostgresLexer.tokens.items()}

    # extend the keywords list: replace the plain SQL keyword rule with
    # one that also knows the PL/pgSQL keywords
    for i, pattern in enumerate(tokens['root']):
        if pattern[1] == Keyword:
            tokens['root'][i] = (
                words(_postgres_builtins.KEYWORDS +
                      _postgres_builtins.PLPGSQL_KEYWORDS, suffix=r'\b'),
                Keyword)
            del i
            break
    else:
        assert 0, "SQL keywords not found"

    # Add specific PL/pgSQL rules (before the SQL ones)
    tokens['root'][:0] = [
        (r'\%[a-z]\w*\b', Name.Builtin),  # actually, a datatype (%TYPE/%ROWTYPE)
        (r':=', Operator),  # assignment
        (r'\<\<[a-z]\w*\>\>', Name.Label),  # block label, e.g. <<outer>>
        (r'\#[a-z]\w*\b', Keyword.Pseudo),  # #variable_conflict
    ]
245
246
class PsqlRegexLexer(PostgresBase, RegexLexer):
    """
    Extend the PostgresLexer adding support specific for psql commands.

    This is not a complete psql lexer yet as it lacks prompt support
    and output rendering.
    """

    name = 'PostgreSQL console - regexp based lexer'
    aliases = []    # not public

    flags = re.IGNORECASE
    # Copy of the PostgresLexer states, extended below with psql rules.
    tokens = {name: state[:] for (name, state) in PostgresLexer.tokens.items()}

    # A backslash command switches to the dedicated state below.
    tokens['root'].append(
        (r'\\[^\s]+', Keyword.Pseudo, 'psql-command'))
    tokens['psql-command'] = [
        # NOTE(review): the newline *pushes* 'root' rather than popping
        # back to it -- confirm this stack growth is intended/harmless.
        (r'\n', Text, 'root'),
        (r'\s+', Whitespace),
        (r'\\[^\s]+', Keyword.Pseudo),  # nested backslash command
        (r""":(['"]?)[a-z]\w*\b\1""", Name.Variable),  # psql variable
        (r"'(''|[^'])*'", String.Single),
        (r"`([^`])*`", String.Backtick),  # shell command substitution
        (r"[^\s]+", String.Symbol),  # any other argument word
    ]
272
273
# Regular expressions used by PostgresConsoleLexer to split a psql
# session into prompts, commands and output.

# A psql prompt, e.g. "quassel=#", "quassel-#", "quassel(#" ...
re_prompt = re.compile(r'^(\S.*?)??[=\-\(\$\'\"][#>]')
# End of an SQL command: a semicolon, optionally followed by a comment.
re_end_command = re.compile(r';\s*(--.*?)?$')
# A backslash (psql) command on a line of its own.
# An earlier, simpler definition (r'\s*\\') was immediately shadowed by
# this one and has been removed as dead code.
re_psql_command = re.compile(r'(\s*)(\\.+?)(\s+)$')
# Server message severity prefixes; ERROR/FATAL mark the output as errors.
re_error = re.compile(r'(ERROR|FATAL):')
re_message = re.compile(
    r'((?:DEBUG|INFO|NOTICE|WARNING|ERROR|'
    r'FATAL|HINT|DETAIL|CONTEXT|LINE [0-9]+):)(.*?\n)')
282
283
class lookahead:
    """Wrap an iterator and allow pushing back an item."""

    # Sentinel marking "no pushed-back item".  A private object is used
    # instead of None so that any value -- including None or other falsy
    # items -- can be pushed back and returned faithfully.
    _MISSING = object()

    def __init__(self, x):
        self.iter = iter(x)
        self._nextitem = self._MISSING

    def __iter__(self):
        return self

    def send(self, i):
        """Push *i* back; the next call to ``next()`` will return it."""
        self._nextitem = i
        return i

    def __next__(self):
        if self._nextitem is not self._MISSING:
            ni = self._nextitem
            self._nextitem = self._MISSING
            return ni
        return next(self.iter)
    # Backwards-compatible alias (old-style iterator protocol).
    next = __next__
304
305
class PostgresConsoleLexer(Lexer):
    """
    Lexer for psql sessions.

    Alternates between two phases: first consume the lines of a command
    (recognized by a psql prompt and terminated by ``;`` or a backslash
    command), then emit the following lines as output until another
    prompt is found.
    """

    name = 'PostgreSQL console (psql)'
    aliases = ['psql', 'postgresql-console', 'postgres-console']
    mimetypes = ['text/x-postgresql-psql']
    url = 'https://www.postgresql.org'
    version_added = '1.5'
    _example = "psql/psql_session.txt"

    def get_tokens_unprocessed(self, data):
        # Delegate highlighting of the SQL itself to the psql-aware lexer.
        sql = PsqlRegexLexer(**self.options)

        lines = lookahead(line_re.findall(data))

        # prompt-output cycle
        while 1:

            # consume the lines of the command: start with an optional prompt
            # and continue until the end of command is detected
            curcode = ''
            insertions = []
            for line in lines:
                # Identify a shell prompt in case of psql commandline example
                if line.startswith('$') and not curcode:
                    lexer = get_lexer_by_name('console', **self.options)
                    yield from lexer.get_tokens_unprocessed(line)
                    break

                # Identify a psql prompt
                mprompt = re_prompt.match(line)
                if mprompt is not None:
                    # The prompt token is spliced in later by do_insertions;
                    # only the command text accumulates in curcode.
                    insertions.append((len(curcode),
                                       [(0, Generic.Prompt, mprompt.group())]))
                    curcode += line[len(mprompt.group()):]
                else:
                    curcode += line

                # Check if this is the end of the command
                # TODO: better handle multiline comments at the end with
                # a lexer with an external state?
                if re_psql_command.match(curcode) \
                   or re_end_command.search(curcode):
                    break

            # Emit the combined stream of command and prompt(s)
            yield from do_insertions(insertions,
                                     sql.get_tokens_unprocessed(curcode))

            # Emit the output lines
            out_token = Generic.Output
            for line in lines:
                mprompt = re_prompt.match(line)
                if mprompt is not None:
                    # push the line back to have it processed by the prompt
                    lines.send(line)
                    break

                mmsg = re_message.match(line)
                if mmsg is not None:
                    # Server message: ERROR/FATAL switch the remainder of
                    # this output block to the error token.
                    if mmsg.group(1).startswith("ERROR") \
                            or mmsg.group(1).startswith("FATAL"):
                        out_token = Generic.Error
                    yield (mmsg.start(1), Generic.Strong, mmsg.group(1))
                    yield (mmsg.start(2), out_token, mmsg.group(2))
                else:
                    yield (0, out_token, line)
            else:
                # Input exhausted without finding a new prompt: done.
                return
377
378
class PostgresExplainLexer(RegexLexer):
    """
    Handle PostgreSQL EXPLAIN output.

    Highlights plan nodes, cost/timing instrumentation, predicates and
    the various ANALYZE/BUFFERS/JIT counters of an execution plan.
    """

    name = 'PostgreSQL EXPLAIN dialect'
    aliases = ['postgres-explain']
    filenames = ['*.explain']
    mimetypes = ['text/x-postgresql-explain']
    url = 'https://www.postgresql.org/docs/current/using-explain.html'
    version_added = '2.15'

    tokens = {
        'root': [
            (r'(:|\(|\)|ms|kB|->|\.\.|\,|\/)', Punctuation),
            (r'(\s+)', Whitespace),

            # This match estimated cost and effectively measured counters with ANALYZE
            # Then, we move to instrumentation state
            (r'(cost)(=?)', bygroups(Name.Class, Punctuation), 'instrumentation'),
            (r'(actual)( )(=?)', bygroups(Name.Class, Whitespace, Punctuation), 'instrumentation'),

            # Misc keywords
            (words(('actual', 'Memory Usage', 'Disk Usage', 'Memory', 'Buckets', 'Batches',
                    'originally', 'row', 'rows', 'Hits', 'Misses',
                    'Evictions', 'Overflows', 'Planned Partitions'), suffix=r'\b'),
             Comment.Single),

            (r'(hit|read|dirtied|written|write|time|calls)(=)', bygroups(Comment.Single, Operator)),
            (r'(shared|temp|local)', Keyword.Pseudo),

            # We move to sort state in order to emphasize specific keywords (especially disk access)
            (r'(Sort Method)(: )', bygroups(Comment.Preproc, Punctuation), 'sort'),

            # These keywords can be followed by an object, like a table
            (r'(Sort Key|Group Key|Presorted Key|Hash Key)(:)( )',
             bygroups(Comment.Preproc, Punctuation, Whitespace), 'object_name'),
            (r'(Cache Key|Cache Mode)(:)( )', bygroups(Comment, Punctuation, Whitespace), 'object_name'),

            # These keywords can be followed by a predicate
            (words(('Join Filter', 'Subplans Removed', 'Filter', 'Merge Cond',
                    'Hash Cond', 'Index Cond', 'Recheck Cond', 'Heap Blocks',
                    'TID Cond', 'Run Condition', 'Order By', 'Function Call',
                    'Table Function Call', 'Inner Unique', 'Params Evaluated',
                    'Single Copy', 'Sampling', 'One-Time Filter', 'Output',
                    'Relations', 'Remote SQL'), suffix=r'\b'),
             Comment.Preproc, 'predicate'),

            # Special keyword to handle ON CONFLICT
            (r'Conflict ', Comment.Preproc, 'conflict'),

            # Special keyword for InitPlan or SubPlan
            (r'(InitPlan|SubPlan)( )(\d+)( )',
             bygroups(Keyword, Whitespace, Number.Integer, Whitespace),
             'init_plan'),

            (words(('Sort Method', 'Join Filter', 'Planning time',
                    'Planning Time', 'Execution time', 'Execution Time',
                    'Workers Planned', 'Workers Launched', 'Buffers',
                    'Planning', 'Worker', 'Query Identifier', 'Time',
                    'Full-sort Groups', 'Pre-sorted Groups'), suffix=r'\b'), Comment.Preproc),

            # Emphasize these keywords

            (words(('Rows Removed by Join Filter', 'Rows Removed by Filter',
                    'Rows Removed by Index Recheck',
                    'Heap Fetches', 'never executed'),
                   suffix=r'\b'), Name.Exception),
            (r'(I/O Timings)(:)( )', bygroups(Name.Exception, Punctuation, Whitespace)),

            (words(_postgres_builtins.EXPLAIN_KEYWORDS, suffix=r'\b'), Keyword),

            # join keywords
            (r'((Right|Left|Full|Semi|Anti) Join)', Keyword.Type),
            (r'(Parallel |Async |Finalize |Partial )', Comment.Preproc),
            (r'Backward', Comment.Preproc),
            (r'(Intersect|Except|Hash)', Comment.Preproc),

            (r'(CTE)( )(\w*)?', bygroups(Comment, Whitespace, Name.Variable)),


            # Treat "on" and "using" as a punctuation
            (r'(on|using)', Punctuation, 'object_name'),


            # strings
            (r"'(''|[^'])*'", String.Single),
            # numbers
            (r'-?\d+\.\d+', Number.Float),
            (r'(-?\d+)', Number.Integer),

            # boolean
            (r'(true|false)', Name.Constant),
            # explain header
            (r'\s*QUERY PLAN\s*\n\s*-+', Comment.Single),
            # Settings
            (r'(Settings)(:)( )', bygroups(Comment.Preproc, Punctuation, Whitespace), 'setting'),

            # Handle JIT counters
            (r'(JIT|Functions|Options|Timing)(:)', bygroups(Comment.Preproc, Punctuation)),
            (r'(Inlining|Optimization|Expressions|Deforming|Generation|Emission|Total)', Keyword.Pseudo),

            # Handle Triggers counters
            (r'(Trigger)( )(\S*)(:)( )',
             bygroups(Comment.Preproc, Whitespace, Name.Variable, Punctuation, Whitespace)),

        ],
        'expression': [
            # matches any kind of parenthesized expression
            # the first opening paren is matched by the 'caller'
            (r'\(', Punctuation, '#push'),
            (r'\)', Punctuation, '#pop'),
            (r'(never executed)', Name.Exception),
            (r'[^)(]+', Comment),
        ],
        'object_name': [

            # This is a cost or analyze measure
            (r'(\(cost)(=?)', bygroups(Name.Class, Punctuation), 'instrumentation'),
            (r'(\(actual)( )(=?)', bygroups(Name.Class, Whitespace, Punctuation), 'instrumentation'),

            # if object_name is parenthesized, mark opening paren as
            # punctuation, call 'expression', and exit state
            (r'\(', Punctuation, 'expression'),
            (r'(on)', Punctuation),
            # matches possibly schema-qualified table and column names
            (r'\w+(\.\w+)*( USING \S+| \w+ USING \S+)', Name.Variable),
            (r'\"?\w+\"?(?:\.\"?\w+\"?)?', Name.Variable),
            (r'\'\S*\'', Name.Variable),

            # if we encounter a comma, another object is listed
            (r',\n', Punctuation, 'object_name'),
            (r',', Punctuation, 'object_name'),

            # special case: "*SELECT*"
            (r'"\*SELECT\*( \d+)?"(.\w+)?', Name.Variable),
            (r'"\*VALUES\*(_\d+)?"(.\w+)?', Name.Variable),
            (r'"ANY_subquery"', Name.Variable),

            # Variable $1 ...
            (r'\$\d+', Name.Variable),
            # cast
            (r'::\w+', Name.Variable),
            (r' +', Whitespace),
            (r'"', Punctuation),
            (r'\[\.\.\.\]', Punctuation),
            (r'\)', Punctuation, '#pop'),
        ],
        'predicate': [
            # if predicate is parenthesized, mark paren as punctuation
            (r'(\()([^\n]*)(\))', bygroups(Punctuation, Name.Variable, Punctuation), '#pop'),
            # otherwise color until newline
            (r'[^\n]*', Name.Variable, '#pop'),
        ],
        'instrumentation': [
            (r'=|\.\.', Punctuation),
            (r' +', Whitespace),
            (r'(rows|width|time|loops)', Name.Class),
            (r'\d+\.\d+', Number.Float),
            (r'(\d+)', Number.Integer),
            (r'\)', Punctuation, '#pop'),
        ],
        'conflict': [
            (r'(Resolution: )(\w+)', bygroups(Comment.Preproc, Name.Variable)),
            (r'(Arbiter \w+:)', Comment.Preproc, 'object_name'),
            (r'(Filter: )', Comment.Preproc, 'predicate'),
        ],
        'setting': [
            (r'([a-z_]*?)(\s*)(=)(\s*)(\'.*?\')', bygroups(Name.Attribute, Whitespace, Operator, Whitespace, String)),
            (r'\, ', Punctuation),
        ],
        'init_plan': [
            (r'\(', Punctuation),
            (r'returns \$\d+(,\$\d+)?', Name.Variable),
            (r'\)', Punctuation, '#pop'),
        ],
        'sort': [
            (r':|kB', Punctuation),
            # FIX: this was Comment.Prepoc (typo); pygments token attribute
            # access silently fabricates new token types, so the misspelling
            # produced an unstyled bogus token instead of an error.
            (r'(quicksort|top-N|heapsort|Average|Memory|Peak)', Comment.Preproc),
            (r'(external|merge|Disk|sort)', Name.Exception),
            (r'(\d+)', Number.Integer),
            (r' +', Whitespace),
        ],
    }
563
564
class SqlLexer(RegexLexer):
    """
    Lexer for Structured Query Language. Currently, this lexer does
    not recognize any special syntax except ANSI SQL.
    """

    name = 'SQL'
    aliases = ['sql']
    filenames = ['*.sql']
    mimetypes = ['text/x-sql']
    url = 'https://en.wikipedia.org/wiki/SQL'
    version_added = ''

    flags = re.IGNORECASE
    tokens = {
        'root': [
            (r'\s+', Whitespace),
            (r'--.*\n?', Comment.Single),
            (r'/\*', Comment.Multiline, 'multiline-comments'),
            (words(_sql_builtins.KEYWORDS, suffix=r'\b'), Keyword),
            (words(_sql_builtins.DATATYPES, suffix=r'\b'), Name.Builtin),
            (r'[+*/<>=~!@#%^&|`?-]', Operator),
            (r'[0-9]+', Number.Integer),
            # TODO: Backslash escapes?
            (r"'(''|[^'])*'", String.Single),  # '' escapes a quote
            (r'"(""|[^"])*"', String.Symbol),  # not a real string literal in ANSI SQL
            (r'[a-z_][\w$]*', Name),  # allow $s in strings for Oracle
            (r'[;:()\[\],.]', Punctuation)
        ],
        'multiline-comments': [
            # nested /* ... */ comments
            (r'/\*', Comment.Multiline, 'multiline-comments'),
            (r'\*/', Comment.Multiline, '#pop'),
            (r'[^/*]+', Comment.Multiline),
            (r'[/*]', Comment.Multiline)
        ]
    }

    def analyse_text(self, text):
        # Deliberately neutral (returns None): plain SQL makes no claim,
        # letting the dialect-specific lexers win with their heuristics.
        return
604
605
class TransactSqlLexer(RegexLexer):
    """
    Transact-SQL (T-SQL) is Microsoft's and Sybase's proprietary extension to
    SQL.

    The list of keywords includes ODBC and keywords reserved for future use.
    """

    name = 'Transact-SQL'
    aliases = ['tsql', 't-sql']
    filenames = ['*.sql']
    mimetypes = ['text/x-tsql']
    url = 'https://www.tsql.info'
    version_added = ''

    flags = re.IGNORECASE

    tokens = {
        'root': [
            (r'\s+', Whitespace),
            # NOTE(review): [$|\n] is a character class, so it also matches
            # literal '$' and '|'; after the greedy .* only a newline can
            # remain, so in practice this behaves like --.*\n? -- confirm
            # before simplifying.
            (r'--.*[$|\n]?', Comment.Single),
            (r'/\*', Comment.Multiline, 'multiline-comments'),
            (words(_tsql_builtins.OPERATORS), Operator),
            (words(_tsql_builtins.OPERATOR_WORDS, suffix=r'\b'), Operator.Word),
            (words(_tsql_builtins.TYPES, suffix=r'\b'), Name.Class),
            (words(_tsql_builtins.FUNCTIONS, suffix=r'\b'), Name.Function),
            (r'(goto)(\s+)(\w+\b)', bygroups(Keyword, Whitespace, Name.Label)),
            (words(_tsql_builtins.KEYWORDS, suffix=r'\b'), Keyword),
            # bracket-quoted identifier, e.g. [My Table]
            (r'(\[)([^]]+)(\])', bygroups(Operator, Name, Operator)),
            (r'0x[0-9a-f]+', Number.Hex),
            # Float variant 1, for example: 1., 1.e2, 1.2e3
            (r'[0-9]+\.[0-9]*(e[+-]?[0-9]+)?', Number.Float),
            # Float variant 2, for example: .1, .1e2
            (r'\.[0-9]+(e[+-]?[0-9]+)?', Number.Float),
            # Float variant 3, for example: 123e45
            (r'[0-9]+e[+-]?[0-9]+', Number.Float),
            (r'[0-9]+', Number.Integer),
            (r"'(''|[^'])*'", String.Single),
            (r'"(""|[^"])*"', String.Symbol),
            (r'[;(),.]', Punctuation),
            # Below we use \w even for the first "real" character because
            # tokens starting with a digit have already been recognized
            # as Number above.
            (r'@@\w+', Name.Builtin),  # system variables/functions
            (r'@\w+', Name.Variable),  # user variables
            (r'(\w+)(:)', bygroups(Name.Label, Punctuation)),
            (r'#?#?\w+', Name),  # names for temp tables and anything else
            (r'\?', Name.Variable.Magic),  # parameter for prepared statements
        ],
        'multiline-comments': [
            # nested /* ... */ comments
            (r'/\*', Comment.Multiline, 'multiline-comments'),
            (r'\*/', Comment.Multiline, '#pop'),
            (r'[^/*]+', Comment.Multiline),
            (r'[/*]', Comment.Multiline)
        ]
    }

    def analyse_text(text):
        # Heuristic score in [0, 1] that *text* is T-SQL rather than
        # another dialect.
        rating = 0
        if tsql_declare_re.search(text):
            # Found T-SQL variable declaration.
            rating = 1.0
        else:
            name_between_backtick_count = len(
                name_between_backtick_re.findall(text))
            name_between_bracket_count = len(
                name_between_bracket_re.findall(text))
            # We need to check if there are any names using
            # backticks or brackets, as otherwise both are 0
            # and 0 >= 2 * 0, so we would always assume it's true
            dialect_name_count = name_between_backtick_count + name_between_bracket_count
            if dialect_name_count >= 1 and \
               name_between_bracket_count >= 2 * name_between_backtick_count:
                # Found at least twice as many [name] as `name`.
                rating += 0.5
            elif name_between_bracket_count > name_between_backtick_count:
                rating += 0.2
            elif name_between_bracket_count > 0:
                rating += 0.1
            if tsql_variable_re.search(text) is not None:
                rating += 0.1
            if tsql_go_re.search(text) is not None:
                rating += 0.1
        return rating
690
691
class MySqlLexer(RegexLexer):
    """The Oracle MySQL lexer.

    This lexer does not attempt to maintain strict compatibility with
    MariaDB syntax or keywords. Although MySQL and MariaDB's common code
    history suggests there may be significant overlap between the two,
    compatibility between the two is not a target for this lexer.
    """

    name = 'MySQL'
    aliases = ['mysql']
    mimetypes = ['text/x-mysql']
    url = 'https://www.mysql.com'
    version_added = ''

    flags = re.IGNORECASE
    tokens = {
        'root': [
            (r'\s+', Whitespace),

            # Comments
            (r'(?:#|--\s+).*', Comment.Single),
            (r'/\*\+', Comment.Special, 'optimizer-hints'),
            (r'/\*', Comment.Multiline, 'multiline-comment'),

            # Hexadecimal literals
            (r"x'([0-9a-f]{2})+'", Number.Hex),  # MySQL requires paired hex characters in this form.
            (r'0x[0-9a-f]+', Number.Hex),

            # Binary literals
            (r"b'[01]+'", Number.Bin),
            (r'0b[01]+', Number.Bin),

            # Numeric literals
            (r'[0-9]+\.[0-9]*(e[+-]?[0-9]+)?', Number.Float),  # Mandatory integer, optional fraction and exponent
            (r'[0-9]*\.[0-9]+(e[+-]?[0-9]+)?', Number.Float),  # Mandatory fraction, optional integer and exponent
            (r'[0-9]+e[+-]?[0-9]+', Number.Float),  # Exponents with integer significands are still floats
            (r'[0-9]+(?=[^0-9a-z$_\u0080-\uffff])', Number.Integer),  # Integers that are not in a schema object name

            # Date literals
            (r"\{\s*d\s*(?P<quote>['\"])\s*\d{2}(\d{2})?.?\d{2}.?\d{2}\s*(?P=quote)\s*\}",
             Literal.Date),

            # Time literals
            (r"\{\s*t\s*(?P<quote>['\"])\s*(?:\d+\s+)?\d{1,2}.?\d{1,2}.?\d{1,2}(\.\d*)?\s*(?P=quote)\s*\}",
             Literal.Date),

            # Timestamp literals
            (
                r"\{\s*ts\s*(?P<quote>['\"])\s*"
                r"\d{2}(?:\d{2})?.?\d{2}.?\d{2}"  # Date part
                r"\s+"  # Whitespace between date and time
                r"\d{1,2}.?\d{1,2}.?\d{1,2}(\.\d*)?"  # Time part
                r"\s*(?P=quote)\s*\}",
                Literal.Date
            ),

            # String literals
            (r"'", String.Single, 'single-quoted-string'),
            (r'"', String.Double, 'double-quoted-string'),

            # Variables
            (r'@@(?:global\.|persist\.|persist_only\.|session\.)?[a-z_]+', Name.Variable),
            (r'@[a-z0-9_$.]+', Name.Variable),
            (r"@'", Name.Variable, 'single-quoted-variable'),
            (r'@"', Name.Variable, 'double-quoted-variable'),
            (r"@`", Name.Variable, 'backtick-quoted-variable'),
            (r'\?', Name.Variable),  # For demonstrating prepared statements

            # Operators
            (r'[!%&*+/:<=>^|~-]+', Operator),

            # Exceptions; these words tokenize differently in different contexts.
            (r'\b(set)(?!\s*\()', Keyword),
            (r'\b(character)(\s+)(set)\b', bygroups(Keyword, Whitespace, Keyword)),
            # In all other known cases, "SET" is tokenized by MYSQL_DATATYPES.

            (words(_mysql_builtins.MYSQL_CONSTANTS, prefix=r'\b', suffix=r'\b'),
             Name.Constant),
            (words(_mysql_builtins.MYSQL_DATATYPES, prefix=r'\b', suffix=r'\b'),
             Keyword.Type),
            (words(_mysql_builtins.MYSQL_KEYWORDS, prefix=r'\b', suffix=r'\b'),
             Keyword),
            (words(_mysql_builtins.MYSQL_FUNCTIONS, prefix=r'\b', suffix=r'\b(\s*)(\()'),
             bygroups(Name.Function, Whitespace, Punctuation)),

            # Schema object names
            #
            # Note: Although the first regex supports unquoted all-numeric
            # identifiers, this will not be a problem in practice because
            # numeric literals have already been handled above.
            #
            ('[0-9a-z$_\u0080-\uffff]+', Name),
            (r'`', Name.Quoted, 'schema-object-name'),

            # Punctuation
            (r'[(),.;]', Punctuation),
        ],

        # Multiline comment substates
        # ---------------------------

        'optimizer-hints': [
            (r'[^*a-z]+', Comment.Special),
            (r'\*/', Comment.Special, '#pop'),
            (words(_mysql_builtins.MYSQL_OPTIMIZER_HINTS, suffix=r'\b'),
             Comment.Preproc),
            ('[a-z]+', Comment.Special),
            (r'\*', Comment.Special),
        ],

        'multiline-comment': [
            (r'[^*]+', Comment.Multiline),
            (r'\*/', Comment.Multiline, '#pop'),
            (r'\*', Comment.Multiline),
        ],

        # String substates
        # ----------------

        'single-quoted-string': [
            (r"[^'\\]+", String.Single),
            (r"''", String.Escape),  # doubled quote = escaped quote
            (r"""\\[0'"bnrtZ\\%_]""", String.Escape),
            (r"'", String.Single, '#pop'),
        ],

        'double-quoted-string': [
            (r'[^"\\]+', String.Double),
            (r'""', String.Escape),  # doubled quote = escaped quote
            (r"""\\[0'"bnrtZ\\%_]""", String.Escape),
            (r'"', String.Double, '#pop'),
        ],

        # Variable substates
        # ------------------

        'single-quoted-variable': [
            (r"[^']+", Name.Variable),
            (r"''", Name.Variable),
            (r"'", Name.Variable, '#pop'),
        ],

        'double-quoted-variable': [
            (r'[^"]+', Name.Variable),
            (r'""', Name.Variable),
            (r'"', Name.Variable, '#pop'),
        ],

        'backtick-quoted-variable': [
            (r'[^`]+', Name.Variable),
            (r'``', Name.Variable),
            (r'`', Name.Variable, '#pop'),
        ],

        # Schema object name substates
        # ----------------------------
        #
        # "Name.Quoted" and "Name.Quoted.Escape" are non-standard but
        # formatters will style them as "Name" by default but add
        # additional styles based on the token name. This gives users
        # flexibility to add custom styles as desired.
        #
        'schema-object-name': [
            (r'[^`]+', Name.Quoted),
            (r'``', Name.Quoted.Escape),
            (r'`', Name.Quoted, '#pop'),
        ],
    }

    def analyse_text(text):
        # Heuristic score that *text* is MySQL: favour `name` (backtick)
        # quoting over [name] (bracket) quoting.
        rating = 0
        name_between_backtick_count = len(
            name_between_backtick_re.findall(text))
        name_between_bracket_count = len(
            name_between_bracket_re.findall(text))
        # Same logic as above in the TSQL analysis
        dialect_name_count = name_between_backtick_count + name_between_bracket_count
        if dialect_name_count >= 1 and \
           name_between_backtick_count >= 2 * name_between_bracket_count:
            # Found at least twice as many `name` as [name].
            rating += 0.5
        elif name_between_backtick_count > name_between_bracket_count:
            rating += 0.2
        elif name_between_backtick_count > 0:
            rating += 0.1
        return rating
879
880
881class GoogleSqlLexer(RegexLexer):
882 """
883 GoogleSQL is Google's standard SQL dialect, formerly known as ZetaSQL.
884
885 The list of keywords includes reserved words for future use.
886 """
887
888 name = 'GoogleSQL'
889 aliases = ['googlesql', 'zetasql']
890 filenames = ['*.googlesql', '*.googlesql.sql']
891 mimetypes = ['text/x-google-sql', 'text/x-google-sql-aux']
892 url = 'https://cloud.google.com/bigquery/googlesql'
893 version_added = '2.19'
894
895 flags = re.IGNORECASE
896 tokens = {
897 'root': [
898 (r'\s+', Whitespace),
899
900 # Comments
901 (r'(?:#|--\s+).*', Comment.Single),
902 (r'/\*', Comment.Multiline, 'multiline-comment'),
903
904 # Hexadecimal literals
905 (r"x'([0-9a-f]{2})+'", Number.Hex),
906 (r'0x[0-9a-f]+', Number.Hex),
907
908 # Binary literals
909 (r"b'[01]+'", Number.Bin),
910 (r'0b[01]+', Number.Bin),
911
912 # Numeric literals
913 (r'[0-9]+\.[0-9]*(e[+-]?[0-9]+)?', Number.Float), # Mandatory integer, optional fraction and exponent
914 (r'[0-9]*\.[0-9]+(e[+-]?[0-9]+)?', Number.Float), # Mandatory fraction, optional integer and exponent
915 (r'[0-9]+e[+-]?[0-9]+', Number.Float), # Exponents with integer significands are still floats
916 (r'[0-9]+(?=[^0-9a-z$_\u0080-\uffff])', Number.Integer), # Integers that are not in a schema object name
917
918 # Date literals
919 (r"\{\s*d\s*(?P<quote>['\"])\s*\d{2}(\d{2})?.?\d{2}.?\d{2}\s*(?P=quote)\s*\}",
920 Literal.Date),
921
922 # Time literals
923 (r"\{\s*t\s*(?P<quote>['\"])\s*(?:\d+\s+)?\d{1,2}.?\d{1,2}.?\d{1,2}(\.\d*)?\s*(?P=quote)\s*\}",
924 Literal.Date),
925
926 # Timestamp literals
927 (
928 r"\{\s*ts\s*(?P<quote>['\"])\s*"
929 r"\d{2}(?:\d{2})?.?\d{2}.?\d{2}" # Date part
930 r"\s+" # Whitespace between date and time
931 r"\d{1,2}.?\d{1,2}.?\d{1,2}(\.\d*)?" # Time part
932 r"\s*(?P=quote)\s*\}",
933 Literal.Date
934 ),
935
936 # String literals
937 (r"'", String.Single, 'single-quoted-string'),
938 (r'"', String.Double, 'double-quoted-string'),
939
940 # Variables
941 (r'@@(?:global\.|persist\.|persist_only\.|session\.)?[a-z_]+', Name.Variable),
942 (r'@[a-z0-9_$.]+', Name.Variable),
943 (r"@'", Name.Variable, 'single-quoted-variable'),
944 (r'@"', Name.Variable, 'double-quoted-variable'),
945 (r"@`", Name.Variable, 'backtick-quoted-variable'),
946 (r'\?', Name.Variable), # For demonstrating prepared statements
947
948 # Exceptions; these words tokenize differently in different contexts.
949 (r'\b(set)(?!\s*\()', Keyword),
950 (r'\b(character)(\s+)(set)\b', bygroups(Keyword, Whitespace, Keyword)),
951
952 # Constants, types, keywords, functions, operators
953 (words(_googlesql_builtins.constants, prefix=r'\b', suffix=r'\b'), Name.Constant),
954 (words(_googlesql_builtins.types, prefix=r'\b', suffix=r'\b'), Keyword.Type),
955 (words(_googlesql_builtins.keywords, prefix=r'\b', suffix=r'\b'), Keyword),
956 (words(_googlesql_builtins.functionnames, prefix=r'\b', suffix=r'\b(\s*)(\()'),
957 bygroups(Name.Function, Whitespace, Punctuation)),
958 (words(_googlesql_builtins.operators, prefix=r'\b', suffix=r'\b'), Operator),
959
960 # Schema object names
961 #
962 # Note: Although the first regex supports unquoted all-numeric
963 # identifiers, this will not be a problem in practice because
964 # numeric literals have already been handled above.
965 #
966 ('[0-9a-z$_\u0080-\uffff]+', Name),
967 (r'`', Name.Quoted, 'schema-object-name'),
968
969 # Punctuation
970 (r'[(),.;]', Punctuation),
971 ],
972
973 # Multiline comment substates
974 # ---------------------------
975
976 'multiline-comment': [
977 (r'[^*]+', Comment.Multiline),
978 (r'\*/', Comment.Multiline, '#pop'),
979 (r'\*', Comment.Multiline),
980 ],
981
982 # String substates
983 # ----------------
984
985 'single-quoted-string': [
986 (r"[^'\\]+", String.Single),
987 (r"''", String.Escape),
988 (r"""\\[0'"bnrtZ\\%_]""", String.Escape),
989 (r"'", String.Single, '#pop'),
990 ],
991
992 'double-quoted-string': [
993 (r'[^"\\]+', String.Double),
994 (r'""', String.Escape),
995 (r"""\\[0'"bnrtZ\\%_]""", String.Escape),
996 (r'"', String.Double, '#pop'),
997 ],
998
999 # Variable substates
1000 # ------------------
1001
1002 'single-quoted-variable': [
1003 (r"[^']+", Name.Variable),
1004 (r"''", Name.Variable),
1005 (r"'", Name.Variable, '#pop'),
1006 ],
1007
1008 'double-quoted-variable': [
1009 (r'[^"]+', Name.Variable),
1010 (r'""', Name.Variable),
1011 (r'"', Name.Variable, '#pop'),
1012 ],
1013
1014 'backtick-quoted-variable': [
1015 (r'[^`]+', Name.Variable),
1016 (r'``', Name.Variable),
1017 (r'`', Name.Variable, '#pop'),
1018 ],
1019
1020 # Schema object name substates
1021 # ----------------------------
1022 #
1023 # "Name.Quoted" and "Name.Quoted.Escape" are non-standard but
1024 # formatters will style them as "Name" by default but add
1025 # additional styles based on the token name. This gives users
1026 # flexibility to add custom styles as desired.
1027 #
1028 'schema-object-name': [
1029 (r'[^`]+', Name.Quoted),
1030 (r'``', Name.Quoted.Escape),
1031 (r'`', Name.Quoted, '#pop'),
1032 ],
1033 }
1034
1035 def analyse_text(text):
1036 tokens = collections.Counter(text.split())
1037 return 0.001 * sum(count for t, count in tokens.items()
1038 if t in googlesql_identifiers)
1039
1040
class SqliteConsoleLexer(Lexer):
    """
    Lexer for example sessions using sqlite3.
    """

    name = 'sqlite3con'
    aliases = ['sqlite3']
    filenames = ['*.sqlite3-console']
    mimetypes = ['text/x-sqlite3-console']
    url = 'https://www.sqlite.org'
    version_added = '0.11'
    _example = "sqlite3/sqlite3.sqlite3-console"

    def get_tokens_unprocessed(self, data):
        sql = SqlLexer(**self.options)

        # Accumulate SQL typed at the prompt; prompt tokens are spliced back
        # in via do_insertions() once a non-prompt (output) line is reached.
        code_buffer = ''
        prompt_insertions = []
        for match in line_re.finditer(data):
            line = match.group()
            if sqlite_prompt_re.match(line) is not None:
                # First 7 chars are the prompt, char 7 is the separating space.
                prompt_insertions.append(
                    (len(code_buffer), [(0, Generic.Prompt, line[:7])]))
                prompt_insertions.append(
                    (len(code_buffer), [(7, Whitespace, ' ')]))
                code_buffer += line[8:]
                continue
            # Output line: flush any pending SQL first.
            if code_buffer:
                yield from do_insertions(
                    prompt_insertions, sql.get_tokens_unprocessed(code_buffer))
                code_buffer = ''
                prompt_insertions = []
            if line.startswith('SQL error: '):
                yield (match.start(), Generic.Traceback, line)
            else:
                yield (match.start(), Generic.Output, line)
        # Flush trailing SQL that was never followed by output.
        if code_buffer:
            yield from do_insertions(
                prompt_insertions, sql.get_tokens_unprocessed(code_buffer))
1081
1082
class RqlLexer(RegexLexer):
    """
    Lexer for Relation Query Language.
    """
    name = 'RQL'
    url = 'http://www.logilab.org/project/rql'
    aliases = ['rql']
    filenames = ['*.rql']
    mimetypes = ['text/x-rql']
    version_added = '2.0'

    # RQL keywords and builtins are matched case-insensitively.
    flags = re.IGNORECASE
    tokens = {
        'root': [
            (r'\s+', Whitespace),
            # Statement, clause and constant keywords; trailing \b prevents
            # matching a keyword that is only a prefix of a longer word.
            (r'(DELETE|SET|INSERT|UNION|DISTINCT|WITH|WHERE|BEING|OR'
             r'|AND|NOT|GROUPBY|HAVING|ORDERBY|ASC|DESC|LIMIT|OFFSET'
             r'|TODAY|NOW|TRUE|FALSE|NULL|EXISTS)\b', Keyword),
            (r'[+*/<>=%-]', Operator),
            # Built-in relations and entity types.
            (r'(Any|is|instance_of|CWEType|CWRelation)\b', Name.Builtin),
            (r'[0-9]+', Number.Integer),
            # Identifiers/variables; an optional trailing '?' is allowed.
            (r'[A-Z_]\w*\??', Name),
            # Quoted strings; a doubled quote is the escape for the delimiter.
            # NOTE(review): double-quoted strings are tokenized as
            # String.Single too — presumably so both styles render alike;
            # confirm before changing to String.Double.
            (r"'(''|[^'])*'", String.Single),
            (r'"(""|[^"])*"', String.Single),
            (r'[;:()\[\],.]', Punctuation)
        ],
    }