1"""
2 pygments.lexers.data
3 ~~~~~~~~~~~~~~~~~~~~
4
5 Lexers for data file format.
6
7 :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
8 :license: BSD, see LICENSE for details.
9"""

from pygments.lexer import Lexer, ExtendedRegexLexer, LexerContext, \
    include, bygroups
from pygments.token import Comment, Error, Keyword, Literal, Name, Number, \
    Punctuation, String, Whitespace

__all__ = ['YamlLexer', 'JsonLexer', 'JsonBareObjectLexer', 'JsonLdLexer']


class YamlLexerContext(LexerContext):
    """Indentation context for the YAML lexer."""

    def __init__(self, *args, **kwds):
        super().__init__(*args, **kwds)
        self.indent_stack = []           # previously seen indentation levels
        self.indent = -1                 # the current indentation level
        self.next_indent = 0             # the indentation level of the next line
        self.block_scalar_indent = None  # explicit indentation of a block scalar


class YamlLexer(ExtendedRegexLexer):
    """
    Lexer for YAML, a human-friendly data serialization language.
    """

    name = 'YAML'
    url = 'http://yaml.org/'
    aliases = ['yaml']
    filenames = ['*.yaml', '*.yml']
    mimetypes = ['text/x-yaml']
    version_added = '0.11'

    def something(token_class):
        """Do not produce empty tokens."""
        def callback(lexer, match, context):
            text = match.group()
            if not text:
                return
            yield match.start(), token_class, text
            context.pos = match.end()
        return callback

    def reset_indent(token_class):
        """Reset the indentation levels."""
        def callback(lexer, match, context):
            text = match.group()
            context.indent_stack = []
            context.indent = -1
            context.next_indent = 0
            context.block_scalar_indent = None
            yield match.start(), token_class, text
            context.pos = match.end()
        return callback

    def save_indent(token_class, start=False):
        """Save a possible indentation level."""
        def callback(lexer, match, context):
            text = match.group()
            extra = ''
            if start:
                context.next_indent = len(text)
                if context.next_indent < context.indent:
                    while context.next_indent < context.indent:
                        context.indent = context.indent_stack.pop()
                    if context.next_indent > context.indent:
                        extra = text[context.indent:]
                        text = text[:context.indent]
            else:
                context.next_indent += len(text)
            if text:
                yield match.start(), token_class, text
            if extra:
                yield match.start()+len(text), token_class.Error, extra
            context.pos = match.end()
        return callback

    def set_indent(token_class, implicit=False):
        """Set the previously saved indentation level."""
        def callback(lexer, match, context):
            text = match.group()
            if context.indent < context.next_indent:
                context.indent_stack.append(context.indent)
                context.indent = context.next_indent
            if not implicit:
                context.next_indent += len(text)
            yield match.start(), token_class, text
            context.pos = match.end()
        return callback
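
    # Rough illustration (an editorial note, not in the original source):
    # when lexing
    #
    #     a:
    #       b: 1
    #
    # the 'a:' key triggers set_indent(), which pushes -1 onto
    # indent_stack and sets indent to 0; the two leading spaces of the
    # next line pass through save_indent() (next_indent becomes 2), and
    # the 'b:' key pushes 0, leaving indent at 2. Dedenting pops the
    # saved levels back off indent_stack inside save_indent().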

    def set_block_scalar_indent(token_class):
        """Set an explicit indentation level for a block scalar."""
        def callback(lexer, match, context):
            text = match.group()
            context.block_scalar_indent = None
            if not text:
                return
            increment = match.group(1)
            if increment:
                current_indent = max(context.indent, 0)
                increment = int(increment)
                context.block_scalar_indent = current_indent + increment
            if text:
                yield match.start(), token_class, text
                context.pos = match.end()
        return callback

    def parse_block_scalar_empty_line(indent_token_class, content_token_class):
        """Process an empty line in a block scalar."""
        def callback(lexer, match, context):
            text = match.group()
            if (context.block_scalar_indent is None or
                    len(text) <= context.block_scalar_indent):
                if text:
                    yield match.start(), indent_token_class, text
            else:
                indentation = text[:context.block_scalar_indent]
                content = text[context.block_scalar_indent:]
                yield match.start(), indent_token_class, indentation
                yield (match.start()+context.block_scalar_indent,
                       content_token_class, content)
            context.pos = match.end()
        return callback

    def parse_block_scalar_indent(token_class):
        """Process indentation spaces in a block scalar."""
        def callback(lexer, match, context):
            text = match.group()
            if context.block_scalar_indent is None:
                if len(text) <= max(context.indent, 0):
                    context.stack.pop()
                    context.stack.pop()
                    return
                context.block_scalar_indent = len(text)
            else:
                if len(text) < context.block_scalar_indent:
                    context.stack.pop()
                    context.stack.pop()
                    return
            if text:
                yield match.start(), token_class, text
                context.pos = match.end()
        return callback

    def parse_plain_scalar_indent(token_class):
        """Process indentation spaces in a plain scalar."""
        def callback(lexer, match, context):
            text = match.group()
            if len(text) <= context.indent:
                context.stack.pop()
                context.stack.pop()
                return
            if text:
                yield match.start(), token_class, text
                context.pos = match.end()
        return callback

    tokens = {
        # the root rules
        'root': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Whitespace),
            # line breaks
            (r'\n+', Whitespace),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # the '%YAML' directive
            (r'^%YAML(?=[ ]|$)', reset_indent(Name.Tag), 'yaml-directive'),
            # the %TAG directive
            (r'^%TAG(?=[ ]|$)', reset_indent(Name.Tag), 'tag-directive'),
            # document start and document end indicators
            (r'^(?:---|\.\.\.)(?=[ ]|$)', reset_indent(Name.Namespace),
             'block-line'),
            # indentation spaces
            (r'[ ]*(?!\s|$)', save_indent(Whitespace, start=True),
             ('block-line', 'indentation')),
        ],

        # trailing whitespaces after directives or a block scalar indicator
        'ignored-line': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Whitespace),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # line break
            (r'\n', Whitespace, '#pop:2'),
        ],

        # the %YAML directive
        'yaml-directive': [
            # the version number
            (r'([ ]+)([0-9]+\.[0-9]+)',
             bygroups(Whitespace, Number), 'ignored-line'),
        ],

        # the %TAG directive
        'tag-directive': [
            # a tag handle and the corresponding prefix
            (r'([ ]+)(!|![\w-]*!)'
             r'([ ]+)(!|!?[\w;/?:@&=+$,.!~*\'()\[\]%-]+)',
             bygroups(Whitespace, Keyword.Type, Whitespace, Keyword.Type),
             'ignored-line'),
        ],

        # block scalar indicators and indentation spaces
        'indentation': [
            # trailing whitespaces are ignored
            (r'[ ]*$', something(Whitespace), '#pop:2'),
            # whitespaces preceding block collection indicators
            (r'[ ]+(?=[?:-](?:[ ]|$))', save_indent(Whitespace)),
            # block collection indicators
            (r'[?:-](?=[ ]|$)', set_indent(Punctuation.Indicator)),
            # the beginning of a block line
            (r'[ ]*', save_indent(Whitespace), '#pop'),
        ],

        # an indented line in the block context
        'block-line': [
            # the line end
            (r'[ ]*(?=#|$)', something(Whitespace), '#pop'),
            # whitespaces separating tokens
            (r'[ ]+', Whitespace),
            # key with colon
            (r'''([^#,?\[\]{}"'\n]+)(:)(?=[ ]|$)''',
             bygroups(Name.Tag, set_indent(Punctuation, implicit=True))),
            # tags, anchors and aliases
            include('descriptors'),
            # block collections and scalars
            include('block-nodes'),
            # flow collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar
            (r'(?=[^\s?:,\[\]{}#&*!|>\'"%@`-]|[?:-]\S)',
             something(Name.Variable),
             'plain-scalar-in-block-context'),
        ],

        # tags, anchors, aliases
        'descriptors': [
            # a full-form tag
            (r'!<[\w#;/?:@&=+$,.!~*\'()\[\]%-]+>', Keyword.Type),
            # a tag in the form '!', '!suffix' or '!handle!suffix'
            (r'!(?:[\w-]+!)?'
             r'[\w#;/?:@&=+$,.!~*\'()\[\]%-]*', Keyword.Type),
            # an anchor
            (r'&[\w-]+', Name.Label),
            # an alias
            (r'\*[\w-]+', Name.Variable),
        ],

        # block collections and scalars
        'block-nodes': [
            # implicit key
            (r':(?=[ ]|$)', set_indent(Punctuation.Indicator, implicit=True)),
            # literal and folded scalars
            (r'[|>]', Punctuation.Indicator,
             ('block-scalar-content', 'block-scalar-header')),
        ],

        # flow collections and quoted scalars
        'flow-nodes': [
            # a flow sequence
            (r'\[', Punctuation.Indicator, 'flow-sequence'),
            # a flow mapping
            (r'\{', Punctuation.Indicator, 'flow-mapping'),
            # a single-quoted scalar
            (r'\'', String, 'single-quoted-scalar'),
            # a double-quoted scalar
            (r'\"', String, 'double-quoted-scalar'),
        ],

        # the content of a flow collection
        'flow-collection': [
            # whitespaces
            (r'[ ]+', Whitespace),
            # line breaks
            (r'\n+', Whitespace),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # simple indicators
            (r'[?:,]', Punctuation.Indicator),
            # tags, anchors and aliases
            include('descriptors'),
            # nested collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar
            (r'(?=[^\s?:,\[\]{}#&*!|>\'"%@`])',
             something(Name.Variable),
             'plain-scalar-in-flow-context'),
        ],

        # a flow sequence indicated by '[' and ']'
        'flow-sequence': [
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\]', Punctuation.Indicator, '#pop'),
        ],

        # a flow mapping indicated by '{' and '}'
        'flow-mapping': [
            # key with colon
            (r'''([^,:?\[\]{}"'\n]+)(:)(?=[ ]|$)''',
             bygroups(Name.Tag, Punctuation)),
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\}', Punctuation.Indicator, '#pop'),
        ],

        # block scalar lines
        'block-scalar-content': [
            # line break
            (r'\n', Whitespace),
            # empty line
            (r'^[ ]+$',
             parse_block_scalar_empty_line(Whitespace, Name.Constant)),
            # indentation spaces (we may leave the state here)
            (r'^[ ]*', parse_block_scalar_indent(Whitespace)),
            # line content
            (r'[\S\t ]+', Name.Constant),
        ],

        # the header of a literal or folded scalar
        'block-scalar-header': [
            # indentation indicator followed by chomping flag
            (r'([1-9])?[+-]?(?=[ ]|$)',
             set_block_scalar_indent(Punctuation.Indicator),
             'ignored-line'),
            # chomping flag followed by indentation indicator
            (r'[+-]?([1-9])?(?=[ ]|$)',
             set_block_scalar_indent(Punctuation.Indicator),
             'ignored-line'),
        ],

        # ignored and regular whitespaces in quoted scalars
        'quoted-scalar-whitespaces': [
            # leading and trailing whitespaces are ignored
            (r'^[ ]+', Whitespace),
            (r'[ ]+$', Whitespace),
            # line breaks are ignored
            (r'\n+', Whitespace),
            # other whitespaces are a part of the value
            (r'[ ]+', Name.Variable),
        ],

        # single-quoted scalars
        'single-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of the quote character
            (r'\'\'', String.Escape),
            # regular non-whitespace characters
            (r'[^\s\']+', String),
            # the closing quote
            (r'\'', String, '#pop'),
        ],

        # double-quoted scalars
        'double-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of special characters
            (r'\\[0abt\tn\nvfre "\\N_LP]', String),
            # escape codes
            (r'\\(?:x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})',
             String.Escape),
            # regular non-whitespace characters
            (r'[^\s"\\]+', String),
            # the closing quote
            (r'"', String, '#pop'),
        ],

        # the beginning of a new line while scanning a plain scalar
        'plain-scalar-in-block-context-new-line': [
            # empty lines
            (r'^[ ]+$', Whitespace),
            # line breaks
            (r'\n+', Whitespace),
            # document start and document end indicators
            (r'^(?=---|\.\.\.)', something(Name.Namespace), '#pop:3'),
            # indentation spaces (we may leave the block line state here)
            (r'^[ ]*', parse_plain_scalar_indent(Whitespace), '#pop'),
        ],

        # a plain scalar in the block context
        'plain-scalar-in-block-context': [
            # the scalar ends with the ':' indicator
            (r'[ ]*(?=:[ ]|:$)', something(Whitespace), '#pop'),
            # the scalar ends with whitespaces followed by a comment
            (r'[ ]+(?=#)', Whitespace, '#pop'),
            # trailing whitespaces are ignored
            (r'[ ]+$', Whitespace),
            # line breaks are ignored
            (r'\n+', Whitespace, 'plain-scalar-in-block-context-new-line'),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Plain),
            # regular non-whitespace characters
            (r'(?::(?!\s)|[^\s:])+', Literal.Scalar.Plain),
        ],

        # a plain scalar in the flow context
        'plain-scalar-in-flow-context': [
            # the scalar ends with an indicator character
            (r'[ ]*(?=[,:?\[\]{}])', something(Whitespace), '#pop'),
            # the scalar ends with a comment
            (r'[ ]+(?=#)', Whitespace, '#pop'),
            # leading and trailing whitespaces are ignored
            (r'^[ ]+', Whitespace),
            (r'[ ]+$', Whitespace),
            # line breaks are ignored
            (r'\n+', Whitespace),
            # other whitespaces are a part of the value
            (r'[ ]+', Name.Variable),
            # regular non-whitespace characters
            (r'[^\s,:?\[\]{}]+', Name.Variable),
        ],

    }

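    # The callbacks above keep their indentation state on the lexer
    # context, so hand get_tokens_unprocessed() a YamlLexerContext
    # instead of the default LexerContext.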
    def get_tokens_unprocessed(self, text=None, context=None):
        if context is None:
            context = YamlLexerContext(text, 0)
        return super().get_tokens_unprocessed(text, context)
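

# A minimal usage sketch (illustrative, not part of the original module);
# get_tokens() is the generic entry point every Pygments lexer provides:
#
#     from pygments.lexers.data import YamlLexer
#     for token, value in YamlLexer().get_tokens('key: value\n'):
#         print(token, repr(value))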


class JsonLexer(Lexer):
    """
    For JSON data structures.

    JavaScript-style comments are supported (like ``/* */`` and ``//``),
    though comments are not part of the JSON specification.
    This allows users to highlight JSON as it is used in the wild.

    No validation is performed on the input JSON document.
    """

    name = 'JSON'
    url = 'https://www.json.org'
    aliases = ['json', 'json-object']
    filenames = ['*.json', '*.jsonl', '*.ndjson', 'Pipfile.lock']
    mimetypes = ['application/json', 'application/json-object',
                 'application/x-ndjson', 'application/jsonl',
                 'application/json-seq']
    version_added = '1.5'

    # No validation of integers, floats, or constants is done.
    # As long as the characters are members of the following
    # sets, the token will be considered valid. For example,
    #
    #     "--1--" is parsed as an integer
    #     "1...eee" is parsed as a float
    #     "trustful" is parsed as a constant
    #
    integers = set('-0123456789')
    floats = set('.eE+')
    constants = set('truefalsenull')  # true|false|null
    hexadecimals = set('0123456789abcdefABCDEF')
    punctuations = set('{}[],')
    whitespaces = {'\u0020', '\u000a', '\u000d', '\u0009'}

    def get_tokens_unprocessed(self, text):
        """Parse JSON data."""

        in_string = False
        in_escape = False
        in_unicode_escape = 0
        in_whitespace = False
        in_constant = False
        in_number = False
        in_float = False
        in_punctuation = False
        in_comment_single = False
        in_comment_multiline = False
        expecting_second_comment_opener = False  # // or /*
        expecting_second_comment_closer = False  # */

        start = 0

        # The queue is used to store data that may need to be tokenized
        # differently based on what follows. In particular, JSON object
        # keys are tokenized differently than string values, but cannot
        # be distinguished until punctuation is encountered outside the
        # string.
        #
        # A ":" character after the string indicates that the string is
        # an object key; any other character indicates the string is a
        # regular string value.
        #
        # The queue holds tuples that contain the following data:
        #
        #     (start_index, token_type, text)
        #
        # By default the token type of text in double quotes is
        # String.Double. The token type will be replaced if a colon
        # is encountered after the string closes.
        #
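        # For example (an added illustration): while lexing '{"a": 1}'
        # the token for '"a"' sits in the queue as String.Double; the
        # ':' that follows causes it to be re-emitted as Name.Tag,
        # marking it as an object key.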
        queue = []

        for stop, character in enumerate(text):
            if in_string:
                if in_unicode_escape:
                    if character in self.hexadecimals:
                        in_unicode_escape -= 1
                        if not in_unicode_escape:
                            in_escape = False
                    else:
                        in_unicode_escape = 0
                        in_escape = False

                elif in_escape:
                    if character == 'u':
                        in_unicode_escape = 4
                    else:
                        in_escape = False

                elif character == '\\':
                    in_escape = True

                elif character == '"':
                    queue.append((start, String.Double, text[start:stop + 1]))
                    in_string = False
                    in_escape = False
                    in_unicode_escape = 0

                continue

            elif in_whitespace:
                if character in self.whitespaces:
                    continue

                if queue:
                    queue.append((start, Whitespace, text[start:stop]))
                else:
                    yield start, Whitespace, text[start:stop]
                in_whitespace = False
                # Fall through so the new character can be evaluated.

            elif in_constant:
                if character in self.constants:
                    continue

                yield start, Keyword.Constant, text[start:stop]
                in_constant = False
                # Fall through so the new character can be evaluated.

            elif in_number:
                if character in self.integers:
                    continue
                elif character in self.floats:
                    in_float = True
                    continue

                if in_float:
                    yield start, Number.Float, text[start:stop]
                else:
                    yield start, Number.Integer, text[start:stop]
                in_number = False
                in_float = False
                # Fall through so the new character can be evaluated.

            elif in_punctuation:
                if character in self.punctuations:
                    continue

                yield start, Punctuation, text[start:stop]
                in_punctuation = False
                # Fall through so the new character can be evaluated.

            elif in_comment_single:
                if character != '\n':
                    continue

                if queue:
                    queue.append((start, Comment.Single, text[start:stop]))
                else:
                    yield start, Comment.Single, text[start:stop]

                in_comment_single = False
                # Fall through so the new character can be evaluated.

            elif in_comment_multiline:
                if character == '*':
                    expecting_second_comment_closer = True
                elif expecting_second_comment_closer:
                    expecting_second_comment_closer = False
                    if character == '/':
                        if queue:
                            queue.append((start, Comment.Multiline, text[start:stop + 1]))
                        else:
                            yield start, Comment.Multiline, text[start:stop + 1]

                        in_comment_multiline = False

                continue

            elif expecting_second_comment_opener:
                expecting_second_comment_opener = False
                if character == '/':
                    in_comment_single = True
                    continue
                elif character == '*':
                    in_comment_multiline = True
                    continue

                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                yield start, Error, text[start:stop]
                # Fall through so the new character can be evaluated.

            start = stop

            if character == '"':
                in_string = True

            elif character in self.whitespaces:
                in_whitespace = True

            elif character in {'f', 'n', 't'}:  # The first letters of true|false|null
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                in_constant = True

            elif character in self.integers:
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                in_number = True

            elif character == ':':
                # Yield from the queue. Replace string token types.
                for _start, _token, _text in queue:
                    # There can be only three types of tokens before a ':':
                    # Whitespace, Comment, or a quoted string.
                    #
                    # If it's a quoted string we emit Name.Tag.
                    # Otherwise, we yield the original token.
                    #
                    # In all other cases this would be invalid JSON,
                    # but this is not a validating JSON lexer, so it's OK.
                    if _token is String.Double:
                        yield _start, Name.Tag, _text
                    else:
                        yield _start, _token, _text
                queue.clear()

                in_punctuation = True

            elif character in self.punctuations:
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                in_punctuation = True

            elif character == '/':
                # This is the beginning of a comment.
                expecting_second_comment_opener = True

            else:
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                yield start, Error, character

        # Yield any remaining text.
        yield from queue
        if in_string:
            yield start, Error, text[start:]
        elif in_float:
            yield start, Number.Float, text[start:]
        elif in_number:
            yield start, Number.Integer, text[start:]
        elif in_constant:
            yield start, Keyword.Constant, text[start:]
        elif in_whitespace:
            yield start, Whitespace, text[start:]
        elif in_punctuation:
            yield start, Punctuation, text[start:]
        elif in_comment_single:
            yield start, Comment.Single, text[start:]
        elif in_comment_multiline:
            yield start, Error, text[start:]
        elif expecting_second_comment_opener:
            yield start, Error, text[start:]
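

# A minimal usage sketch (illustrative): JavaScript-style comments are
# tokenized even though they are not valid JSON:
#
#     from pygments.lexers.data import JsonLexer
#     for token, value in JsonLexer().get_tokens('{"a": 1}  // note\n'):
#         print(token, repr(value))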


class JsonBareObjectLexer(JsonLexer):
    """
    For JSON data structures (with missing object curly braces).

    .. deprecated:: 2.8.0

       Behaves the same as `JsonLexer` now.
    """

    name = 'JSONBareObject'
    aliases = []
    filenames = []
    mimetypes = []
    version_added = '2.2'


class JsonLdLexer(JsonLexer):
    """
    For JSON-LD linked data.
    """

    name = 'JSON-LD'
    url = 'https://json-ld.org/'
    aliases = ['jsonld', 'json-ld']
    filenames = ['*.jsonld']
    mimetypes = ['application/ld+json']
    version_added = '2.0'

    json_ld_keywords = {
        f'"@{keyword}"'
        for keyword in (
            'base',
            'container',
            'context',
            'direction',
            'graph',
            'id',
            'import',
            'included',
            'index',
            'json',
            'language',
            'list',
            'nest',
            'none',
            'prefix',
            'propagate',
            'protected',
            'reverse',
            'set',
            'type',
            'value',
            'version',
            'vocab',
        )
    }

    def get_tokens_unprocessed(self, text):
        for start, token, value in super().get_tokens_unprocessed(text):
            if token is Name.Tag and value in self.json_ld_keywords:
                yield start, Name.Decorator, value
            else:
                yield start, token, value
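

# A minimal usage sketch (illustrative): JSON-LD keywords such as
# "@context" come out as Name.Decorator rather than Name.Tag:
#
#     from pygments.lexers.data import JsonLdLexer
#     for token, value in JsonLdLexer().get_tokens('{"@context": {}}'):
#         print(token, repr(value))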