Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pip/_vendor/pygments/lexer.py: 18%
468 statements
coverage.py v7.2.7, created at 2023-06-07 06:48 +0000
1"""
2 pygments.lexer
3 ~~~~~~~~~~~~~~
5 Base lexer classes.
7 :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
8 :license: BSD, see LICENSE for details.
9"""
11import re
12import sys
13import time
15from pip._vendor.pygments.filter import apply_filters, Filter
16from pip._vendor.pygments.filters import get_filter_by_name
17from pip._vendor.pygments.token import Error, Text, Other, Whitespace, _TokenType
18from pip._vendor.pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
19 make_analysator, Future, guess_decode
20from pip._vendor.pygments.regexopt import regex_opt
22__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
23 'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
24 'default', 'words', 'line_re']
26line_re = re.compile('.*?\n')
28_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
29 (b'\xff\xfe\0\0', 'utf-32'),
30 (b'\0\0\xfe\xff', 'utf-32be'),
31 (b'\xff\xfe', 'utf-16'),
32 (b'\xfe\xff', 'utf-16be')]
34_default_analyse = staticmethod(lambda x: 0.0)


class LexerMeta(type):
    """
    This metaclass automagically converts ``analyse_text`` methods into
    static methods which always return float values.
    """

    def __new__(mcs, name, bases, d):
        if 'analyse_text' in d:
            d['analyse_text'] = make_analysator(d['analyse_text'])
        return type.__new__(mcs, name, bases, d)


class Lexer(metaclass=LexerMeta):
    """
    Lexer for a specific language.

    Basic options recognized:
    ``stripnl``
        Strip leading and trailing newlines from the input (default: True).
    ``stripall``
        Strip all leading and trailing whitespace from the input
        (default: False).
    ``ensurenl``
        Make sure that the input ends with a newline (default: True). This
        is required for some lexers that consume input linewise.

        .. versionadded:: 1.3

    ``tabsize``
        If given and greater than 0, expand tabs in the input (default: 0).
    ``encoding``
        If given, must be an encoding name. This encoding will be used to
        convert the input string to Unicode, if it is not already a Unicode
        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
        Latin1 detection). Can also be ``'chardet'`` to use the chardet
        library, if it is installed.
    ``inencoding``
        Overrides the ``encoding`` if given.
    """

    #: Name of the lexer
    name = None

    #: URL of the language specification/definition
    url = None

    #: Shortcuts for the lexer
    aliases = []

    #: File name globs
    filenames = []

    #: Secondary file name globs
    alias_filenames = []

    #: MIME types
    mimetypes = []

    #: Priority, should multiple lexers match and no content is provided
    priority = 0

    def __init__(self, **options):
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.ensurenl = get_bool_opt(options, 'ensurenl', True)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
        self.encoding = options.get('encoding', 'guess')
        self.encoding = options.get('inencoding') or self.encoding
        self.filters = []
        for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)

    def __repr__(self):
        if self.options:
            return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
                                                     self.options)
        else:
            return '<pygments.lexers.%s>' % self.__class__.__name__

    def add_filter(self, filter_, **options):
        """
        Add a new stream filter to this lexer.
        """
        if not isinstance(filter_, Filter):
            filter_ = get_filter_by_name(filter_, **options)
        self.filters.append(filter_)

    def analyse_text(text):
        """
        Has to return a float between ``0`` and ``1`` that indicates
        if a lexer wants to highlight this text. Used by ``guess_lexer``.
        If this method returns ``0`` it won't highlight it in any case, if
        it returns ``1`` highlighting with this lexer is guaranteed.

        The `LexerMeta` metaclass automatically wraps this function so
        that it works like a static method (no ``self`` or ``cls``
        parameter) and the return value is automatically converted to
        `float`. If the return value is an object that is boolean `False`
        it's the same as if the return value was ``0.0``.
        """

    def get_tokens(self, text, unfiltered=False):
        """
        Return an iterable of (tokentype, value) pairs generated from
        `text`. If `unfiltered` is set to `True`, the filtering mechanism
        is bypassed even if filters are defined.

        Also preprocess the text, i.e. expand tabs and strip it if
        wanted, and apply registered filters.
        """
        if not isinstance(text, str):
            if self.encoding == 'guess':
                text, _ = guess_decode(text)
            elif self.encoding == 'chardet':
                try:
                    from pip._vendor import chardet
                except ImportError as e:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/') from e
                # check for BOM first
                decoded = None
                for bom, encoding in _encoding_map:
                    if text.startswith(bom):
                        decoded = text[len(bom):].decode(encoding, 'replace')
                        break
                # no BOM found, so use chardet
                if decoded is None:
                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
                    decoded = text.decode(enc.get('encoding') or 'utf-8',
                                          'replace')
                text = decoded
            else:
                text = text.decode(self.encoding)
                if text.startswith('\ufeff'):
                    text = text[len('\ufeff'):]
        else:
            if text.startswith('\ufeff'):
                text = text[len('\ufeff'):]

        # text now *is* a unicode string
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')
        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        if self.ensurenl and not text.endswith('\n'):
            text += '\n'

        def streamer():
            for _, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream

    def get_tokens_unprocessed(self, text):
        """
        Return an iterable of (index, tokentype, value) pairs where "index"
        is the starting position of the token within the input text.

        In subclasses, implement this method as a generator to
        maximize effectiveness.
        """
        raise NotImplementedError
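

# Illustrative usage sketch (not part of the vendored module): any concrete
# lexer subclass is driven through Lexer.get_tokens(); ``SomeLexer`` below is
# a hypothetical placeholder for such a subclass.
#
#     lexer = SomeLexer(stripall=True, tabsize=4)
#     for tokentype, value in lexer.get_tokens(source_code):
#         print(tokentype, repr(value))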


class DelegatingLexer(Lexer):
    """
    This lexer takes two lexers as arguments: a root lexer and
    a language lexer. First everything is scanned using the language
    lexer, afterwards all ``Other`` tokens are lexed using the root
    lexer.

    The lexers from the ``template`` lexer package use this base lexer.
    """

    def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
        self.root_lexer = _root_lexer(**options)
        self.language_lexer = _language_lexer(**options)
        self.needle = _needle
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        buffered = ''
        insertions = []
        lng_buffer = []
        for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
            if t is self.needle:
                if lng_buffer:
                    insertions.append((len(buffered), lng_buffer))
                    lng_buffer = []
                buffered += v
            else:
                lng_buffer.append((i, t, v))
        if lng_buffer:
            insertions.append((len(buffered), lng_buffer))
        return do_insertions(insertions,
                             self.root_lexer.get_tokens_unprocessed(buffered))
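

# Illustrative sketch (not part of the vendored module): a template lexer is
# typically built by pairing a markup lexer with an embedded-language lexer.
# ``HtmlLexer`` and ``PhpLexer`` are used here only as a plausible example pair.
#
#     class HtmlPhpLexer(DelegatingLexer):
#         def __init__(self, **options):
#             super().__init__(HtmlLexer, PhpLexer, **options)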


# ------------------------------------------------------------------------------
# RegexLexer and ExtendedRegexLexer
#


class include(str):  # pylint: disable=invalid-name
    """
    Indicates that a state should include rules from another state.
    """
    pass


class _inherit:
    """
    Indicates that a state should inherit from its superclass.
    """
    def __repr__(self):
        return 'inherit'


inherit = _inherit()  # pylint: disable=invalid-name


class combined(tuple):  # pylint: disable=invalid-name
    """
    Indicates a state combined from multiple states.
    """

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass


class _PseudoMatch:
    """
    A pseudo match object constructed from a string.
    """

    def __init__(self, start, text):
        self._text = text
        self._start = start

    def start(self, arg=None):
        return self._start

    def end(self, arg=None):
        return self._start + len(self._text)

    def group(self, arg=None):
        if arg:
            raise IndexError('No such group')
        return self._text

    def groups(self):
        return (self._text,)

    def groupdict(self):
        return {}


def bygroups(*args):
    """
    Callback that yields multiple actions for each group in the match.
    """
    def callback(lexer, match, ctx=None):
        for i, action in enumerate(args):
            if action is None:
                continue
            elif type(action) is _TokenType:
                data = match.group(i + 1)
                if data:
                    yield match.start(i + 1), action, data
            else:
                data = match.group(i + 1)
                if data is not None:
                    if ctx:
                        ctx.pos = match.start(i + 1)
                    for item in action(lexer,
                                       _PseudoMatch(match.start(i + 1), data), ctx):
                        if item:
                            yield item
        if ctx:
            ctx.pos = match.end()
    return callback
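

# Illustrative sketch (not part of the vendored module): bygroups() is meant to
# be used inside a RegexLexer ``tokens`` rule so that each regex group gets its
# own token type; the regex and token types below are only examples.
#
#     (r'(def)(\s+)(\w+)', bygroups(Keyword, Whitespace, Name.Function))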


class _This:
    """
    Special singleton used for indicating the caller class.
    Used by ``using``.
    """


this = _This()


def using(_other, **kwargs):
    """
    Callback that processes the match with a different lexer.

    The keyword arguments are forwarded to the lexer, except `state` which
    is handled separately.

    `state` specifies the state that the new lexer will start in, and can
    be an enumerable such as ('root', 'inline', 'string') or a simple
    string which is assumed to be on top of the root state.

    Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
    """
    gt_kwargs = {}
    if 'state' in kwargs:
        s = kwargs.pop('state')
        if isinstance(s, (list, tuple)):
            gt_kwargs['stack'] = s
        else:
            gt_kwargs['stack'] = ('root', s)

    if _other is this:
        def callback(lexer, match, ctx=None):
            # if keyword arguments are given the callback
            # function has to create a new lexer instance
            if kwargs:
                # XXX: cache that somehow
                kwargs.update(lexer.options)
                lx = lexer.__class__(**kwargs)
            else:
                lx = lexer
            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    else:
        def callback(lexer, match, ctx=None):
            # XXX: cache that somehow
            kwargs.update(lexer.options)
            lx = _other(**kwargs)

            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    return callback
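

# Illustrative sketch (not part of the vendored module): using() also appears
# inside a ``tokens`` rule; the lexer class, state name and token pattern below
# are only example names.
#
#     (r'<\?php(.*?)\?>', using(PhpLexer)),          # delegate to another lexer
#     (r'`[^`]*`', using(this, state='backticks')),  # re-enter the same lexer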


class default:
    """
    Indicates a state or state action (e.g. #pop) to apply.
    For example, ``default('#pop')`` is equivalent to ``('', Token, '#pop')``.
    Note that state tuples may be used as well.

    .. versionadded:: 2.0
    """
    def __init__(self, state):
        self.state = state
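

# Illustrative sketch (not part of the vendored module): default() is used in a
# ``tokens`` state to take a transition without consuming any text; the state
# names and token type below are only examples.
#
#     'optional-block': [
#         (r'\{', Punctuation, 'block'),
#         default('#pop'),          # no brace: leave this state immediately
#     ]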


class words(Future):
    """
    Indicates a list of literal words that is transformed into an optimized
    regex that matches any of the words.

    .. versionadded:: 2.0
    """
    def __init__(self, words, prefix='', suffix=''):
        self.words = words
        self.prefix = prefix
        self.suffix = suffix

    def get(self):
        return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
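

# Illustrative sketch (not part of the vendored module): words() replaces a
# hand-written alternation regex inside a ``tokens`` rule; the keyword list and
# token type are only examples.
#
#     (words(('if', 'elif', 'else', 'while'), prefix=r'\b', suffix=r'\b'),
#      Keyword),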


class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_regex(cls, regex, rflags, state):
        """Preprocess the regular expression component of a token definition."""
        if isinstance(regex, Future):
            regex = regex.get()
        return re.compile(regex, rflags).match

    def _process_token(cls, token):
        """Preprocess the token component of a token definition."""
        assert type(token) is _TokenType or callable(token), \
            'token type must be simple type or callable, not %r' % (token,)
        return token

    def _process_new_state(cls, new_state, unprocessed, processed):
        """Preprocess the state transition action of a token definition."""
        if isinstance(new_state, str):
            # an existing state
            if new_state == '#pop':
                return -1
            elif new_state in unprocessed:
                return (new_state,)
            elif new_state == '#push':
                return new_state
            elif new_state[:5] == '#pop:':
                return -int(new_state[5:])
            else:
                assert False, 'unknown new state %r' % new_state
        elif isinstance(new_state, combined):
            # combine a new state from existing ones
            tmp_state = '_tmp_%d' % cls._tmpname
            cls._tmpname += 1
            itokens = []
            for istate in new_state:
                assert istate != new_state, 'circular state ref %r' % istate
                itokens.extend(cls._process_state(unprocessed,
                                                  processed, istate))
            processed[tmp_state] = itokens
            return (tmp_state,)
        elif isinstance(new_state, tuple):
            # push more than one state
            for istate in new_state:
                assert (istate in unprocessed or
                        istate in ('#pop', '#push')), \
                    'unknown new state ' + istate
            return new_state
        else:
            assert False, 'unknown new state def %r' % new_state

    def _process_state(cls, unprocessed, processed, state):
        """Preprocess a single state definition."""
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed,
                                                 str(tdef)))
                continue
            if isinstance(tdef, _inherit):
                # should be processed already, but may not in the case of:
                # 1. the state has no counterpart in any parent
                # 2. the state includes more than one 'inherit'
                continue
            if isinstance(tdef, default):
                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
                tokens.append((re.compile('').match, None, new_state))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = cls._process_regex(tdef[0], rflags, state)
            except Exception as err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err)) from err

            token = cls._process_token(tdef[1])

            if len(tdef) == 2:
                new_state = None
            else:
                new_state = cls._process_new_state(tdef[2],
                                                   unprocessed, processed)

            tokens.append((rex, token, new_state))
        return tokens

    def process_tokendef(cls, name, tokendefs=None):
        """Preprocess a dictionary of token definitions."""
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in list(tokendefs):
            cls._process_state(tokendefs, processed, state)
        return processed

    def get_tokendefs(cls):
        """
        Merge tokens from superclasses in MRO order, returning a single tokendef
        dictionary.

        Any state that is not defined by a subclass will be inherited
        automatically. States that *are* defined by subclasses will, by
        default, override that state in the superclass. If a subclass wishes to
        inherit definitions from a superclass, it can use the special value
        "inherit", which will cause the superclass' state definition to be
        included at that point in the state.
        """
        tokens = {}
        inheritable = {}
        for c in cls.__mro__:
            toks = c.__dict__.get('tokens', {})

            for state, items in toks.items():
                curitems = tokens.get(state)
                if curitems is None:
                    # N.b. because this is assigned by reference, sufficiently
                    # deep hierarchies are processed incrementally (e.g. for
                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
                    # will not see any inherits in B).
                    tokens[state] = items
                    try:
                        inherit_ndx = items.index(inherit)
                    except ValueError:
                        continue
                    inheritable[state] = inherit_ndx
                    continue

                inherit_ndx = inheritable.pop(state, None)
                if inherit_ndx is None:
                    continue

                # Replace the "inherit" value with the items
                curitems[inherit_ndx:inherit_ndx+1] = items
                try:
                    # N.b. this is the index in items (that is, the superclass
                    # copy), so offset required when storing below.
                    new_inh_ndx = items.index(inherit)
                except ValueError:
                    pass
                else:
                    inheritable[state] = inherit_ndx + new_inh_ndx

        return tokens
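
    # Illustrative sketch (not part of the vendored module): in a subclass, the
    # ``inherit`` marker controls where the parent's rules land after merging.
    # The lexer names, regexes and token types below are only examples.
    #
    #     class BaseLexer(RegexLexer):
    #         tokens = {'root': [(r'\d+', Number)]}
    #
    #     class ChildLexer(BaseLexer):
    #         tokens = {'root': [(r'#.*', Comment), inherit]}
    #
    # get_tokendefs() then yields a 'root' state equivalent to
    # [(r'#.*', Comment), (r'\d+', Number)].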

    def __call__(cls, *args, **kwds):
        """Instantiate cls after preprocessing its token definitions."""
        if '_tokens' not in cls.__dict__:
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())

        return type.__call__(cls, *args, **kwds)


class RegexLexer(Lexer, metaclass=RegexLexerMeta):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: At all times there is a stack of states. Initially, the stack contains
    #: a single state 'root'. The top of the stack is called "the current state".
    #:
    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: ``new_state`` can be omitted to signify no state transition.
    #: If ``new_state`` is a string, it is pushed on the stack. This ensures
    #: that the new current state is ``new_state``.
    #: If ``new_state`` is a tuple of strings, all of those strings are pushed
    #: on the stack and the current state will be the last element of the tuple.
    #: ``new_state`` can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again. Note that if you push while in a combined state, the combined
    #: state itself is pushed, and not only the state in which the rule is
    #: defined.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}
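
    # Illustrative sketch (not part of the vendored module): a concrete
    # subclass fills ``tokens`` with rule lists like the following; the state
    # names, regexes and token types are only examples.
    #
    #     tokens = {
    #         'root': [
    #             (r'\s+', Whitespace),
    #             (r'"', String, 'string'),     # push the 'string' state
    #             (r'[^\s"]+', Text),
    #         ],
    #         'string': [
    #             (r'[^"]+', String),
    #             (r'"', String, '#pop'),       # return to the previous state
    #         ],
    #     }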

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield pos, action, m.group()
                        else:
                            yield from action(self, m)
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(statestack) > 1:
                                        statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop, but keep at least one state on the stack
                            # (random code leading to unexpected pops should
                            # not allow exceptions)
                            if abs(new_state) >= len(statestack):
                                del statestack[1:]
                            else:
                                del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                # We are here only if all state tokens have been considered
                # and there was not a match on any of them.
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Whitespace, '\n'
                        pos += 1
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    break


class LexerContext:
    """
    A helper object that holds lexer position data.
    """

    def __init__(self, text, pos, stack=None, end=None):
        self.text = text
        self.pos = pos
        self.end = end or len(text)  # end=0 not supported ;-)
        self.stack = stack or ['root']

    def __repr__(self):
        return 'LexerContext(%r, %r, %r)' % (
            self.text, self.pos, self.stack)


class ExtendedRegexLexer(RegexLexer):
    """
    A RegexLexer that uses a context object to store its state.
    """

    def get_tokens_unprocessed(self, text=None, context=None):
        """
        Split ``text`` into (tokentype, text) pairs.
        If ``context`` is given, use this lexer context instead.
        """
        tokendefs = self._tokens
        if not context:
            ctx = LexerContext(text, 0)
            statetokens = tokendefs['root']
        else:
            ctx = context
            statetokens = tokendefs[ctx.stack[-1]]
            text = ctx.text
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield ctx.pos, action, m.group()
                            ctx.pos = m.end()
                        else:
                            yield from action(self, m, ctx)
                            if not new_state:
                                # altered the state stack?
                                statetokens = tokendefs[ctx.stack[-1]]
                                # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(ctx.stack) > 1:
                                        ctx.stack.pop()
                                elif state == '#push':
                                    ctx.stack.append(ctx.stack[-1])
                                else:
                                    ctx.stack.append(state)
                        elif isinstance(new_state, int):
                            # see RegexLexer for why this check is made
                            if abs(new_state) >= len(ctx.stack):
                                del ctx.stack[1:]
                            else:
                                del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, '\n'
                        ctx.pos += 1
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break
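

# Illustrative sketch (not part of the vendored module): an ExtendedRegexLexer
# subclass can be driven from an explicit LexerContext instead of a plain
# string; ``SomeExtendedLexer`` is a hypothetical placeholder.
#
#     ctx = LexerContext(source_text, 0)
#     for index, tokentype, value in \
#             SomeExtendedLexer().get_tokens_unprocessed(context=ctx):
#         ...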


def do_insertions(insertions, tokens):
    """
    Helper for lexers which must combine the results of several
    sublexers.

    ``insertions`` is a list of ``(index, itokens)`` pairs.
    Each ``itokens`` iterable should be inserted at position
    ``index`` into the token stream given by the ``tokens``
    argument.

    The result is a combined token stream.

    TODO: clean up the code here.
    """
    insertions = iter(insertions)
    try:
        index, itokens = next(insertions)
    except StopIteration:
        # no insertions
        yield from tokens
        return

    realpos = None
    insleft = True

    # iterate over the token stream where we want to insert
    # the tokens from the insertion list.
    for i, t, v in tokens:
        # first iteration. store the position of first item
        if realpos is None:
            realpos = i
        oldi = 0
        while insleft and i + len(v) >= index:
            tmpval = v[oldi:index - i]
            if tmpval:
                yield realpos, t, tmpval
                realpos += len(tmpval)
            for it_index, it_token, it_value in itokens:
                yield realpos, it_token, it_value
                realpos += len(it_value)
            oldi = index - i
            try:
                index, itokens = next(insertions)
            except StopIteration:
                insleft = False
                break  # not strictly necessary
        if oldi < len(v):
            yield realpos, t, v[oldi:]
            realpos += len(v) - oldi

    # leftover tokens
    while insleft:
        # no normal tokens, set realpos to zero
        realpos = realpos or 0
        for p, t, v in itokens:
            yield realpos, t, v
            realpos += len(v)
        try:
            index, itokens = next(insertions)
        except StopIteration:
            insleft = False
            break  # not strictly necessary
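

# Illustrative sketch (not part of the vendored module): do_insertions() splices
# pre-lexed token groups into another token stream at given character offsets;
# ``python_lexer`` and the token type below are only example names.
#
#     prompt = [(0, Generic.Prompt, '>>> ')]
#     merged = do_insertions([(0, prompt)],
#                            python_lexer.get_tokens_unprocessed('1 + 1\n'))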


class ProfilingRegexLexerMeta(RegexLexerMeta):
    """Metaclass for ProfilingRegexLexer, collects regex timing info."""

    def _process_regex(cls, regex, rflags, state):
        if isinstance(regex, words):
            rex = regex_opt(regex.words, prefix=regex.prefix,
                            suffix=regex.suffix)
        else:
            rex = regex
        compiled = re.compile(rex, rflags)

        def match_func(text, pos, endpos=sys.maxsize):
            info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
            t0 = time.time()
            res = compiled.match(text, pos, endpos)
            t1 = time.time()
            info[0] += 1
            info[1] += t1 - t0
            return res
        return match_func


class ProfilingRegexLexer(RegexLexer, metaclass=ProfilingRegexLexerMeta):
    """Drop-in replacement for RegexLexer that does profiling of its regexes."""

    _prof_data = []
    _prof_sort_index = 4  # defaults to time per call

    def get_tokens_unprocessed(self, text, stack=('root',)):
        # this needs to be a stack, since using(this) will produce nested calls
        self.__class__._prof_data.append({})
        yield from RegexLexer.get_tokens_unprocessed(self, text, stack)
        rawdata = self.__class__._prof_data.pop()
        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
                        n, 1000 * t, 1000 * t / n)
                       for ((s, r), (n, t)) in rawdata.items()),
                      key=lambda x: x[self._prof_sort_index],
                      reverse=True)
        sum_total = sum(x[3] for x in data)

        print()
        print('Profiling result for %s lexing %d chars in %.3f ms' %
              (self.__class__.__name__, len(text), sum_total))
        print('=' * 110)
        print('%-20s %-64s ncalls  tottime  percall' % ('state', 'regex'))
        print('-' * 110)
        for d in data:
            print('%-20s %-65s %5d %8.4f %8.4f' % d)
        print('=' * 110)