Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pygments/lexer.py: 21%


1""" 

2 pygments.lexer 

3 ~~~~~~~~~~~~~~ 

4 

5 Base lexer classes. 

6 

7 :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS. 

8 :license: BSD, see LICENSE for details. 

9""" 

10 

11import re 

12import sys 

13import time 

14 

15from pygments.filter import apply_filters, Filter 

16from pygments.filters import get_filter_by_name 

17from pygments.token import Error, Text, Other, Whitespace, _TokenType 

18from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \ 

19 make_analysator, Future, guess_decode 

20from pygments.regexopt import regex_opt 

21 

__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
           'default', 'words', 'line_re']

line_re = re.compile('.*?\n')

_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
                 (b'\xff\xfe\0\0', 'utf-32'),
                 (b'\0\0\xfe\xff', 'utf-32be'),
                 (b'\xff\xfe', 'utf-16'),
                 (b'\xfe\xff', 'utf-16be')]

_default_analyse = staticmethod(lambda x: 0.0)


class LexerMeta(type):
    """
    This metaclass automagically converts ``analyse_text`` methods into
    static methods which always return float values.
    """

    def __new__(mcs, name, bases, d):
        if 'analyse_text' in d:
            d['analyse_text'] = make_analysator(d['analyse_text'])
        return type.__new__(mcs, name, bases, d)


class Lexer(metaclass=LexerMeta):
    """
    Lexer for a specific language.

    Basic options recognized:
    ``stripnl``
        Strip leading and trailing newlines from the input (default: True).
    ``stripall``
        Strip all leading and trailing whitespace from the input
        (default: False).
    ``ensurenl``
        Make sure that the input ends with a newline (default: True). This
        is required for some lexers that consume input linewise.

        .. versionadded:: 1.3

    ``tabsize``
        If given and greater than 0, expand tabs in the input (default: 0).
    ``encoding``
        If given, must be an encoding name. This encoding will be used to
        convert the input string to Unicode, if it is not already a Unicode
        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
        Latin1 detection). Can also be ``'chardet'`` to use the chardet
        library, if it is installed.
    ``inencoding``
        Overrides the ``encoding`` if given.
    """

    #: Name of the lexer
    name = None

    #: URL of the language specification/definition
    url = None

    #: Shortcuts for the lexer
    aliases = []

    #: File name globs
    filenames = []

    #: Secondary file name globs
    alias_filenames = []

    #: MIME types
    mimetypes = []

    #: Priority, should multiple lexers match and no content is provided
    priority = 0

    def __init__(self, **options):
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.ensurenl = get_bool_opt(options, 'ensurenl', True)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
        self.encoding = options.get('encoding', 'guess')
        self.encoding = options.get('inencoding') or self.encoding
        self.filters = []
        for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)

    def __repr__(self):
        if self.options:
            return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
                                                     self.options)
        else:
            return '<pygments.lexers.%s>' % self.__class__.__name__

    def add_filter(self, filter_, **options):
        """
        Add a new stream filter to this lexer.
        """
        if not isinstance(filter_, Filter):
            filter_ = get_filter_by_name(filter_, **options)
        self.filters.append(filter_)

    def analyse_text(text):
        """
        Has to return a float between ``0`` and ``1`` that indicates
        if a lexer wants to highlight this text. Used by ``guess_lexer``.
        If this method returns ``0`` it won't highlight it in any case, if
        it returns ``1`` highlighting with this lexer is guaranteed.

        The `LexerMeta` metaclass automatically wraps this function so
        that it works like a static method (no ``self`` or ``cls``
        parameter) and the return value is automatically converted to
        `float`. If the return value is an object that is boolean `False`
        it's the same as if the return value was ``0.0``.
        """

    def get_tokens(self, text, unfiltered=False):
        """
        Return an iterable of (tokentype, value) pairs generated from
        `text`. If `unfiltered` is set to `True`, the filtering mechanism
        is bypassed even if filters are defined.

        Also preprocess the text, i.e. expand tabs and strip it if
        wanted, and apply registered filters.
        """
        if not isinstance(text, str):
            if self.encoding == 'guess':
                text, _ = guess_decode(text)
            elif self.encoding == 'chardet':
                try:
                    import chardet
                except ImportError as e:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/') from e
                # check for BOM first
                decoded = None
                for bom, encoding in _encoding_map:
                    if text.startswith(bom):
                        decoded = text[len(bom):].decode(encoding, 'replace')
                        break
                # no BOM found, so use chardet
                if decoded is None:
                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
                    decoded = text.decode(enc.get('encoding') or 'utf-8',
                                          'replace')
                text = decoded
            else:
                text = text.decode(self.encoding)
                if text.startswith('\ufeff'):
                    text = text[len('\ufeff'):]
        else:
            if text.startswith('\ufeff'):
                text = text[len('\ufeff'):]

        # text now *is* a unicode string
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')
        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        if self.ensurenl and not text.endswith('\n'):
            text += '\n'

        def streamer():
            for _, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream

    def get_tokens_unprocessed(self, text):
        """
        Return an iterable of (index, tokentype, value) pairs where "index"
        is the starting position of the token within the input text.

        In subclasses, implement this method as a generator to
        maximize effectiveness.
        """
        raise NotImplementedError

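
# --- Illustrative sketch (not part of Pygments): a minimal Lexer subclass. ---
# It shows the contract described above: ``get_tokens_unprocessed`` yields
# (index, tokentype, value) tuples, and ``analyse_text`` (wrapped by LexerMeta
# into a float-returning static method) reports how confident the lexer is.
# The class name and the trivial tokenization are illustrative assumptions.
class _ExampleWordLexer(Lexer):
    """Yield every run of non-space characters as a Text token."""
    name = 'ExampleWord'
    aliases = ['exampleword']

    def get_tokens_unprocessed(self, text):
        for match in re.finditer(r'\S+|\s+', text):
            ttype = Whitespace if match.group().isspace() else Text
            yield match.start(), ttype, match.group()

    def analyse_text(text):
        # any non-empty text is a weak match for this toy lexer
        return 0.1 if text.strip() else 0

# Typical use goes through ``get_tokens``, which handles decoding, newline
# normalization and filters:  list(_ExampleWordLexer().get_tokens('a b\n'))
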

class DelegatingLexer(Lexer):
    """
    This lexer takes two lexers as arguments. A root lexer and
    a language lexer. First everything is scanned using the language
    lexer, afterwards all ``Other`` tokens are lexed using the root
    lexer.

    The lexers from the ``template`` lexer package use this base lexer.
    """

    def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
        self.root_lexer = _root_lexer(**options)
        self.language_lexer = _language_lexer(**options)
        self.needle = _needle
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        buffered = ''
        insertions = []
        lng_buffer = []
        for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
            if t is self.needle:
                if lng_buffer:
                    insertions.append((len(buffered), lng_buffer))
                    lng_buffer = []
                buffered += v
            else:
                lng_buffer.append((i, t, v))
        if lng_buffer:
            insertions.append((len(buffered), lng_buffer))
        return do_insertions(insertions,
                             self.root_lexer.get_tokens_unprocessed(buffered))

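
# --- Illustrative sketch (not part of Pygments): combining two lexers. ---
# A template-style lexer is typically a thin DelegatingLexer subclass that
# names its root and language lexers. ``MyMarkupLexer`` and ``MyTemplateLexer``
# below are hypothetical placeholders; the language lexer must emit ``Other``
# tokens for the parts it does not understand.
#
#     class _ExampleTemplateLexer(DelegatingLexer):
#         def __init__(self, **options):
#             super().__init__(MyMarkupLexer, MyTemplateLexer, **options)
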

# -------------------------------------------------------------------------
# RegexLexer and ExtendedRegexLexer
#


class include(str):  # pylint: disable=invalid-name
    """
    Indicates that a state should include rules from another state.
    """
    pass


class _inherit:
    """
    Indicates that a state should inherit from its superclass.
    """
    def __repr__(self):
        return 'inherit'

inherit = _inherit()  # pylint: disable=invalid-name


class combined(tuple):  # pylint: disable=invalid-name
    """
    Indicates a state combined from multiple states.
    """

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass


class _PseudoMatch:
    """
    A pseudo match object constructed from a string.
    """

    def __init__(self, start, text):
        self._text = text
        self._start = start

    def start(self, arg=None):
        return self._start

    def end(self, arg=None):
        return self._start + len(self._text)

    def group(self, arg=None):
        if arg:
            raise IndexError('No such group')
        return self._text

    def groups(self):
        return (self._text,)

    def groupdict(self):
        return {}


def bygroups(*args):
    """
    Callback that yields multiple actions for each group in the match.
    """
    def callback(lexer, match, ctx=None):
        for i, action in enumerate(args):
            if action is None:
                continue
            elif type(action) is _TokenType:
                data = match.group(i + 1)
                if data:
                    yield match.start(i + 1), action, data
            else:
                data = match.group(i + 1)
                if data is not None:
                    if ctx:
                        ctx.pos = match.start(i + 1)
                    for item in action(lexer,
                                       _PseudoMatch(match.start(i + 1), data), ctx):
                        if item:
                            yield item
        if ctx:
            ctx.pos = match.end()
    return callback

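
# --- Illustrative sketch (not part of Pygments): a ``bygroups`` rule. ---
# In a RegexLexer ``tokens`` table (see further down in this module), a rule
# whose regex has several groups can give each group its own token type.
# The regex, the table name and the token choices are illustrative
# assumptions; Keyword and Name are real types from pygments.token.
from pygments.token import Keyword, Name

_EXAMPLE_BYGROUPS_RULES = {
    'root': [
        (r'(def)(\s+)([A-Za-z_]\w*)',
         bygroups(Keyword, Whitespace, Name.Function)),
    ],
}
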

class _This:
    """
    Special singleton used for indicating the caller class.
    Used by ``using``.
    """

this = _This()


def using(_other, **kwargs):
    """
    Callback that processes the match with a different lexer.

    The keyword arguments are forwarded to the lexer, except `state` which
    is handled separately.

    `state` specifies the state that the new lexer will start in, and can
    be an enumerable such as ('root', 'inline', 'string') or a simple
    string which is assumed to be on top of the root state.

    Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
    """
    gt_kwargs = {}
    if 'state' in kwargs:
        s = kwargs.pop('state')
        if isinstance(s, (list, tuple)):
            gt_kwargs['stack'] = s
        else:
            gt_kwargs['stack'] = ('root', s)

    if _other is this:
        def callback(lexer, match, ctx=None):
            # if keyword arguments are given the callback
            # function has to create a new lexer instance
            if kwargs:
                # XXX: cache that somehow
                kwargs.update(lexer.options)
                lx = lexer.__class__(**kwargs)
            else:
                lx = lexer
            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    else:
        def callback(lexer, match, ctx=None):
            # XXX: cache that somehow
            kwargs.update(lexer.options)
            lx = _other(**kwargs)

            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    return callback

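
# --- Illustrative sketch (not part of Pygments): ``using`` rules. ---
# ``using(SomeOtherLexer)`` builds a callback that re-lexes the matched text
# with another lexer; ``using(this)`` re-lexes it with the current lexer,
# optionally starting in a different state.  The regexes and the state name
# 'inline' are illustrative assumptions, not rules from a real lexer.
_EXAMPLE_USING_RULES = {
    'root': [
        # re-lex the braced text with the same lexer, starting in 'inline'
        (r'\{\{(.*?)\}\}', using(this, state='inline')),
        # with another lexer class it would read, e.g.:
        #   (r'`[^`]*`', using(MySubLexer)),   # MySubLexer is hypothetical
    ],
}
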

class default:
    """
    Indicates a state or state action (e.g. #pop) to apply.
    For example default('#pop') is equivalent to ('', Token, '#pop').
    Note that state tuples may be used as well.

    .. versionadded:: 2.0
    """
    def __init__(self, state):
        self.state = state

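
# --- Illustrative sketch (not part of Pygments): a ``default`` rule. ---
# ``default`` performs a state transition without consuming any input, which
# is handy as the last rule of a state.  The state name and rules below are
# illustrative assumptions.
_EXAMPLE_DEFAULT_RULES = {
    'maybe-comment': [
        (r'#.*$', Text),
        default('#pop'),          # nothing matched: silently leave the state
    ],
}
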

class words(Future):
    """
    Indicates a list of literal words that is transformed into an optimized
    regex that matches any of the words.

    .. versionadded:: 2.0
    """
    def __init__(self, words, prefix='', suffix=''):
        self.words = words
        self.prefix = prefix
        self.suffix = suffix

    def get(self):
        return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)

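
# --- Illustrative sketch (not part of Pygments): a ``words`` rule. ---
# ``words`` defers building the optimized alternation until the token
# definitions are processed (it is a ``Future``).  The keyword list, suffix
# and token choice are illustrative assumptions.
from pygments.token import Keyword

_EXAMPLE_WORDS_RULES = {
    'root': [
        (words(('if', 'elif', 'else', 'while'), suffix=r'\b'), Keyword),
    ],
}
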

class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_regex(cls, regex, rflags, state):
        """Preprocess the regular expression component of a token definition."""
        if isinstance(regex, Future):
            regex = regex.get()
        return re.compile(regex, rflags).match

    def _process_token(cls, token):
        """Preprocess the token component of a token definition."""
        assert type(token) is _TokenType or callable(token), \
            'token type must be simple type or callable, not %r' % (token,)
        return token

    def _process_new_state(cls, new_state, unprocessed, processed):
        """Preprocess the state transition action of a token definition."""
        if isinstance(new_state, str):
            # an existing state
            if new_state == '#pop':
                return -1
            elif new_state in unprocessed:
                return (new_state,)
            elif new_state == '#push':
                return new_state
            elif new_state[:5] == '#pop:':
                return -int(new_state[5:])
            else:
                assert False, 'unknown new state %r' % new_state
        elif isinstance(new_state, combined):
            # combine a new state from existing ones
            tmp_state = '_tmp_%d' % cls._tmpname
            cls._tmpname += 1
            itokens = []
            for istate in new_state:
                assert istate != new_state, 'circular state ref %r' % istate
                itokens.extend(cls._process_state(unprocessed,
                                                  processed, istate))
            processed[tmp_state] = itokens
            return (tmp_state,)
        elif isinstance(new_state, tuple):
            # push more than one state
            for istate in new_state:
                assert (istate in unprocessed or
                        istate in ('#pop', '#push')), \
                    'unknown new state ' + istate
            return new_state
        else:
            assert False, 'unknown new state def %r' % new_state

    def _process_state(cls, unprocessed, processed, state):
        """Preprocess a single state definition."""
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed,
                                                 str(tdef)))
                continue
            if isinstance(tdef, _inherit):
                # should be processed already, but may not in the case of:
                # 1. the state has no counterpart in any parent
                # 2. the state includes more than one 'inherit'
                continue
            if isinstance(tdef, default):
                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
                tokens.append((re.compile('').match, None, new_state))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = cls._process_regex(tdef[0], rflags, state)
            except Exception as err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err)) from err

            token = cls._process_token(tdef[1])

            if len(tdef) == 2:
                new_state = None
            else:
                new_state = cls._process_new_state(tdef[2],
                                                   unprocessed, processed)

            tokens.append((rex, token, new_state))
        return tokens

    def process_tokendef(cls, name, tokendefs=None):
        """Preprocess a dictionary of token definitions."""
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in list(tokendefs):
            cls._process_state(tokendefs, processed, state)
        return processed

    def get_tokendefs(cls):
        """
        Merge tokens from superclasses in MRO order, returning a single tokendef
        dictionary.

        Any state that is not defined by a subclass will be inherited
        automatically. States that *are* defined by subclasses will, by
        default, override that state in the superclass. If a subclass wishes to
        inherit definitions from a superclass, it can use the special value
        "inherit", which will cause the superclass' state definition to be
        included at that point in the state.
        """
        tokens = {}
        inheritable = {}
        for c in cls.__mro__:
            toks = c.__dict__.get('tokens', {})

            for state, items in toks.items():
                curitems = tokens.get(state)
                if curitems is None:
                    # N.b. because this is assigned by reference, sufficiently
                    # deep hierarchies are processed incrementally (e.g. for
                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
                    # will not see any inherits in B).
                    tokens[state] = items
                    try:
                        inherit_ndx = items.index(inherit)
                    except ValueError:
                        continue
                    inheritable[state] = inherit_ndx
                    continue

                inherit_ndx = inheritable.pop(state, None)
                if inherit_ndx is None:
                    continue

                # Replace the "inherit" value with the items
                curitems[inherit_ndx:inherit_ndx+1] = items
                try:
                    # N.b. this is the index in items (that is, the superclass
                    # copy), so offset required when storing below.
                    new_inh_ndx = items.index(inherit)
                except ValueError:
                    pass
                else:
                    inheritable[state] = inherit_ndx + new_inh_ndx

        return tokens

    def __call__(cls, *args, **kwds):
        """Instantiate cls after preprocessing its token definitions."""
        if '_tokens' not in cls.__dict__:
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())

        return type.__call__(cls, *args, **kwds)

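
# --- Illustrative sketch (not part of Pygments): reusing states via inherit.
# A subclass can splice the parent's rules for a state into its own list with
# the ``inherit`` marker instead of overriding the state completely.  The
# class and rule below are illustrative assumptions (``MyBaseLexer`` is
# hypothetical).
#
#     class _ExampleDialectLexer(MyBaseLexer):
#         tokens = {
#             'root': [
#                 (r'@[A-Za-z_]\w*', Name.Decorator),  # dialect-only rule
#                 inherit,                             # then the parent's rules
#             ],
#         }
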

class RegexLexer(Lexer, metaclass=RegexLexerMeta):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: At all times there is a stack of states. Initially, the stack contains
    #: a single state 'root'. The top of the stack is called "the current state".
    #:
    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: ``new_state`` can be omitted to signify no state transition.
    #: If ``new_state`` is a string, it is pushed on the stack. This ensures
    #: the new current state is ``new_state``.
    #: If ``new_state`` is a tuple of strings, all of those strings are pushed
    #: on the stack and the current state will be the last element of the list.
    #: ``new_state`` can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again. Note that if you push while in a combined state, the combined
    #: state itself is pushed, and not only the state in which the rule is
    #: defined.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield pos, action, m.group()
                        else:
                            yield from action(self, m)
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(statestack) > 1:
                                        statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop, but keep at least one state on the stack
                            # (random code leading to unexpected pops should
                            # not allow exceptions)
                            if abs(new_state) >= len(statestack):
                                del statestack[1:]
                            else:
                                del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                # We are here only if all state tokens have been considered
                # and there was not a match on any of them.
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Whitespace, '\n'
                        pos += 1
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    break

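
# --- Illustrative sketch (not part of Pygments): a minimal RegexLexer. ---
# States, regexes and token choices below are assumptions made up for this
# example; they show the shape of a ``tokens`` table, a state switch via a
# string, ``include`` and '#pop'.
class _ExampleIniLikeLexer(RegexLexer):
    """Toy lexer for an INI-ish format: [sections], key=value, ; comments."""
    name = 'ExampleIniLike'
    aliases = ['exampleini']

    tokens = {
        'whitespace': [
            (r'\s+', Whitespace),
        ],
        'root': [
            include('whitespace'),
            (r';[^\n]*', Text),                    # comment
            (r'\[[^\]\n]+\]', Text),               # [section]
            (r'[^=\s\[;][^=\n]*', Text, 'value'),  # key, then switch state
        ],
        'value': [
            (r'=', Text),
            (r'[^\n]*\n', Text, '#pop'),           # rest of line, back to root
        ],
    }

# e.g.:  list(_ExampleIniLikeLexer().get_tokens('[a]\nkey=value\n'))
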

class LexerContext:
    """
    A helper object that holds lexer position data.
    """

    def __init__(self, text, pos, stack=None, end=None):
        self.text = text
        self.pos = pos
        self.end = end or len(text)  # end=0 not supported ;-)
        self.stack = stack or ['root']

    def __repr__(self):
        return 'LexerContext(%r, %r, %r)' % (
            self.text, self.pos, self.stack)


class ExtendedRegexLexer(RegexLexer):
    """
    A RegexLexer that uses a context object to store its state.
    """

    def get_tokens_unprocessed(self, text=None, context=None):
        """
        Split ``text`` into (tokentype, text) pairs.
        If ``context`` is given, use this lexer context instead.
        """
        tokendefs = self._tokens
        if not context:
            ctx = LexerContext(text, 0)
            statetokens = tokendefs['root']
        else:
            ctx = context
            statetokens = tokendefs[ctx.stack[-1]]
            text = ctx.text
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield ctx.pos, action, m.group()
                            ctx.pos = m.end()
                        else:
                            yield from action(self, m, ctx)
                            if not new_state:
                                # altered the state stack?
                                statetokens = tokendefs[ctx.stack[-1]]
                            # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(ctx.stack) > 1:
                                        ctx.stack.pop()
                                elif state == '#push':
                                    ctx.stack.append(ctx.stack[-1])
                                else:
                                    ctx.stack.append(state)
                        elif isinstance(new_state, int):
                            # see RegexLexer for why this check is made
                            if abs(new_state) >= len(ctx.stack):
                                del ctx.stack[1:]
                            else:
                                del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, '\n'
                        ctx.pos += 1
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break

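
# --- Illustrative sketch (not part of Pygments): a context-aware callback. ---
# ExtendedRegexLexer passes the LexerContext to rule callbacks, which must
# advance ``ctx.pos`` themselves and may rewrite ``ctx.stack``.  The callback
# and state name below are illustrative assumptions.
def _example_heredoc_callback(lexer, match, ctx):
    yield match.start(), Text, match.group()
    ctx.pos = match.end()           # callbacks must move the position
    ctx.stack.append('heredoc')     # and may adjust the state stack

# used inside an ExtendedRegexLexer tokens table as:
#     (r'<<\w+', _example_heredoc_callback)
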

def do_insertions(insertions, tokens):
    """
    Helper for lexers which must combine the results of several
    sublexers.

    ``insertions`` is a list of ``(index, itokens)`` pairs.
    Each ``itokens`` iterable should be inserted at position
    ``index`` into the token stream given by the ``tokens``
    argument.

    The result is a combined token stream.

    TODO: clean up the code here.
    """
    insertions = iter(insertions)
    try:
        index, itokens = next(insertions)
    except StopIteration:
        # no insertions
        yield from tokens
        return

    realpos = None
    insleft = True

    # iterate over the token stream where we want to insert
    # the tokens from the insertion list.
    for i, t, v in tokens:
        # first iteration. store the position of first item
        if realpos is None:
            realpos = i
        oldi = 0
        while insleft and i + len(v) >= index:
            tmpval = v[oldi:index - i]
            if tmpval:
                yield realpos, t, tmpval
                realpos += len(tmpval)
            for it_index, it_token, it_value in itokens:
                yield realpos, it_token, it_value
                realpos += len(it_value)
            oldi = index - i
            try:
                index, itokens = next(insertions)
            except StopIteration:
                insleft = False
                break  # not strictly necessary
        if oldi < len(v):
            yield realpos, t, v[oldi:]
            realpos += len(v) - oldi

    # leftover tokens
    while insleft:
        # no normal tokens, set realpos to zero
        realpos = realpos or 0
        for p, t, v in itokens:
            yield realpos, t, v
            realpos += len(v)
        try:
            index, itokens = next(insertions)
        except StopIteration:
            insleft = False
            break  # not strictly necessary

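
# --- Illustrative sketch (not part of Pygments): using do_insertions. ---
# The helper splices pre-lexed token runs into another token stream at given
# character offsets; DelegatingLexer above is its main caller.  The data here
# is an illustrative assumption.
_example_base = [(0, Text, 'line one\nline two\n')]
_example_inserts = [(9, [(0, Error, 'INSERTED ')])]
# yields the base tokens split around the inserted run, with indices rewritten:
_example_combined = list(do_insertions(_example_inserts, iter(_example_base)))
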

class ProfilingRegexLexerMeta(RegexLexerMeta):
    """Metaclass for ProfilingRegexLexer, collects regex timing info."""

    def _process_regex(cls, regex, rflags, state):
        if isinstance(regex, words):
            rex = regex_opt(regex.words, prefix=regex.prefix,
                            suffix=regex.suffix)
        else:
            rex = regex
        compiled = re.compile(rex, rflags)

        def match_func(text, pos, endpos=sys.maxsize):
            info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
            t0 = time.time()
            res = compiled.match(text, pos, endpos)
            t1 = time.time()
            info[0] += 1
            info[1] += t1 - t0
            return res
        return match_func


class ProfilingRegexLexer(RegexLexer, metaclass=ProfilingRegexLexerMeta):
    """Drop-in replacement for RegexLexer that does profiling of its regexes."""

    _prof_data = []
    _prof_sort_index = 4  # defaults to time per call

    def get_tokens_unprocessed(self, text, stack=('root',)):
        # this needs to be a stack, since using(this) will produce nested calls
        self.__class__._prof_data.append({})
        yield from RegexLexer.get_tokens_unprocessed(self, text, stack)
        rawdata = self.__class__._prof_data.pop()
        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
                        n, 1000 * t, 1000 * t / n)
                       for ((s, r), (n, t)) in rawdata.items()),
                      key=lambda x: x[self._prof_sort_index],
                      reverse=True)
        sum_total = sum(x[3] for x in data)

        print()
        print('Profiling result for %s lexing %d chars in %.3f ms' %
              (self.__class__.__name__, len(text), sum_total))
        print('=' * 110)
        print('%-20s %-64s ncalls tottime percall' % ('state', 'regex'))
        print('-' * 110)
        for d in data:
            print('%-20s %-65s %5d %8.4f %8.4f' % d)
        print('=' * 110)
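

# --- Illustrative sketch (not part of Pygments): profiling a lexer. ---
# Mixing ProfilingRegexLexer into a RegexLexer subclass's bases makes the
# timing of every rule's regex get printed after lexing.  The subclass below
# reuses the toy lexer defined earlier in this file and is an assumption for
# illustration.
#
#     class _ProfiledIniLikeLexer(_ExampleIniLikeLexer, ProfilingRegexLexer):
#         pass
#
#     list(_ProfiledIniLikeLexer().get_tokens('[a]\nkey=value\n'))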

883 print('=' * 110)