Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pygments/lexer.py: 21%


1""" 

2 pygments.lexer 

3 ~~~~~~~~~~~~~~ 

4 

5 Base lexer classes. 

6 

7 :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS. 

8 :license: BSD, see LICENSE for details. 

9""" 

10 

11import re 

12import sys 

13import time 

14 

15from pygments.filter import apply_filters, Filter 

16from pygments.filters import get_filter_by_name 

17from pygments.token import Error, Text, Other, Whitespace, _TokenType 

18from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \ 

19 make_analysator, Future, guess_decode 

20from pygments.regexopt import regex_opt 

21 

__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
           'default', 'words', 'line_re']

line_re = re.compile('.*?\n')

_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
                 (b'\xff\xfe\0\0', 'utf-32'),
                 (b'\0\0\xfe\xff', 'utf-32be'),
                 (b'\xff\xfe', 'utf-16'),
                 (b'\xfe\xff', 'utf-16be')]

_default_analyse = staticmethod(lambda x: 0.0)


class LexerMeta(type):
    """
    This metaclass automagically converts ``analyse_text`` methods into
    static methods which always return float values.
    """

    def __new__(mcs, name, bases, d):
        if 'analyse_text' in d:
            d['analyse_text'] = make_analysator(d['analyse_text'])
        return type.__new__(mcs, name, bases, d)


class Lexer(metaclass=LexerMeta):
    """
    Lexer for a specific language.

    Basic options recognized:
    ``stripnl``
        Strip leading and trailing newlines from the input (default: True).
    ``stripall``
        Strip all leading and trailing whitespace from the input
        (default: False).
    ``ensurenl``
        Make sure that the input ends with a newline (default: True). This
        is required for some lexers that consume input linewise.

        .. versionadded:: 1.3

    ``tabsize``
        If given and greater than 0, expand tabs in the input (default: 0).
    ``encoding``
        If given, must be an encoding name. This encoding will be used to
        convert the input string to Unicode, if it is not already a Unicode
        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
        Latin1 detection). Can also be ``'chardet'`` to use the chardet
        library, if it is installed.
    ``inencoding``
        Overrides the ``encoding`` if given.
    """

    #: Name of the lexer
    name = None

    #: URL of the language specification/definition
    url = None

    #: Shortcuts for the lexer
    aliases = []

    #: File name globs
    filenames = []

    #: Secondary file name globs
    alias_filenames = []

    #: MIME types
    mimetypes = []

    #: Priority, should multiple lexers match and no content is provided
    priority = 0

    def __init__(self, **options):
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.ensurenl = get_bool_opt(options, 'ensurenl', True)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
        self.encoding = options.get('encoding', 'guess')
        self.encoding = options.get('inencoding') or self.encoding
        self.filters = []
        for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)

    def __repr__(self):
        if self.options:
            return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
                                                     self.options)
        else:
            return '<pygments.lexers.%s>' % self.__class__.__name__

    def add_filter(self, filter_, **options):
        """
        Add a new stream filter to this lexer.
        """
        if not isinstance(filter_, Filter):
            filter_ = get_filter_by_name(filter_, **options)
        self.filters.append(filter_)

    def analyse_text(text):
        """
        Has to return a float between ``0`` and ``1`` that indicates
        if a lexer wants to highlight this text. Used by ``guess_lexer``.
        If this method returns ``0`` it won't highlight it in any case, if
        it returns ``1`` highlighting with this lexer is guaranteed.

        The `LexerMeta` metaclass automatically wraps this function so
        that it works like a static method (no ``self`` or ``cls``
        parameter) and the return value is automatically converted to
        `float`. If the return value is an object that is boolean `False`
        it's the same as if the return value was ``0.0``.
        """

    def get_tokens(self, text, unfiltered=False):
        """
        Return an iterable of (tokentype, value) pairs generated from
        `text`. If `unfiltered` is set to `True`, the filtering mechanism
        is bypassed even if filters are defined.

        Also preprocess the text, i.e. expand tabs and strip it if
        wanted, and apply registered filters.
        """
        if not isinstance(text, str):
            if self.encoding == 'guess':
                text, _ = guess_decode(text)
            elif self.encoding == 'chardet':
                try:
                    import chardet
                except ImportError as e:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/') from e
                # check for BOM first
                decoded = None
                for bom, encoding in _encoding_map:
                    if text.startswith(bom):
                        decoded = text[len(bom):].decode(encoding, 'replace')
                        break
                # no BOM found, so use chardet
                if decoded is None:
                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
                    decoded = text.decode(enc.get('encoding') or 'utf-8',
                                          'replace')
                text = decoded
            else:
                text = text.decode(self.encoding)
                if text.startswith('\ufeff'):
                    text = text[len('\ufeff'):]
        else:
            if text.startswith('\ufeff'):
                text = text[len('\ufeff'):]

        # text now *is* a unicode string
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')
        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        if self.ensurenl and not text.endswith('\n'):
            text += '\n'

        def streamer():
            for _, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream

    def get_tokens_unprocessed(self, text):
        """
        Return an iterable of (index, tokentype, value) pairs where "index"
        is the starting position of the token within the input text.

        In subclasses, implement this method as a generator to
        maximize effectiveness.
        """
        raise NotImplementedError

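
# --- Illustrative sketch (not part of Pygments): a minimal Lexer subclass. ---
# It shows the contract described above: ``get_tokens_unprocessed`` yields
# (index, tokentype, value) tuples, and ``analyse_text`` (wrapped by LexerMeta
# into a float-returning static method) reports how confident the lexer is.
# The class name and the trivial tokenization are illustrative assumptions.
class _ExampleWordLexer(Lexer):
    """Yield every run of non-space characters as a Text token."""
    name = 'ExampleWord'
    aliases = ['exampleword']

    def get_tokens_unprocessed(self, text):
        for match in re.finditer(r'\S+|\s+', text):
            ttype = Whitespace if match.group().isspace() else Text
            yield match.start(), ttype, match.group()

    def analyse_text(text):
        # any non-empty text is a weak match for this toy lexer
        return 0.1 if text.strip() else 0

# Typical use goes through ``get_tokens``, which handles decoding, newline
# normalization and filters:  list(_ExampleWordLexer().get_tokens('a b\n'))
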

class DelegatingLexer(Lexer):
    """
    This lexer takes two lexers as arguments. A root lexer and
    a language lexer. First everything is scanned using the language
    lexer, afterwards all ``Other`` tokens are lexed using the root
    lexer.

    The lexers from the ``template`` lexer package use this base lexer.
    """

    def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
        self.root_lexer = _root_lexer(**options)
        self.language_lexer = _language_lexer(**options)
        self.needle = _needle
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        buffered = ''
        insertions = []
        lng_buffer = []
        for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
            if t is self.needle:
                if lng_buffer:
                    insertions.append((len(buffered), lng_buffer))
                    lng_buffer = []
                buffered += v
            else:
                lng_buffer.append((i, t, v))
        if lng_buffer:
            insertions.append((len(buffered), lng_buffer))
        return do_insertions(insertions,
                             self.root_lexer.get_tokens_unprocessed(buffered))

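
# --- Illustrative sketch (not part of Pygments): combining two lexers. ---
# A template-style lexer is typically a thin DelegatingLexer subclass that
# names its root and language lexers. ``MyMarkupLexer`` and ``MyTemplateLexer``
# below are hypothetical placeholders; the language lexer must emit ``Other``
# tokens for the parts it does not understand.
#
#     class _ExampleTemplateLexer(DelegatingLexer):
#         def __init__(self, **options):
#             super().__init__(MyMarkupLexer, MyTemplateLexer, **options)
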

# -------------------------------------------------------------------------
# RegexLexer and ExtendedRegexLexer
#


class include(str):  # pylint: disable=invalid-name
    """
    Indicates that a state should include rules from another state.
    """
    pass


class _inherit:
    """
    Indicates that a state should inherit from its superclass.
    """
    def __repr__(self):
        return 'inherit'

inherit = _inherit()  # pylint: disable=invalid-name


class combined(tuple):  # pylint: disable=invalid-name
    """
    Indicates a state combined from multiple states.
    """

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass


class _PseudoMatch:
    """
    A pseudo match object constructed from a string.
    """

    def __init__(self, start, text):
        self._text = text
        self._start = start

    def start(self, arg=None):
        return self._start

    def end(self, arg=None):
        return self._start + len(self._text)

    def group(self, arg=None):
        if arg:
            raise IndexError('No such group')
        return self._text

    def groups(self):
        return (self._text,)

    def groupdict(self):
        return {}


def bygroups(*args):
    """
    Callback that yields multiple actions for each group in the match.
    """
    def callback(lexer, match, ctx=None):
        for i, action in enumerate(args):
            if action is None:
                continue
            elif type(action) is _TokenType:
                data = match.group(i + 1)
                if data:
                    yield match.start(i + 1), action, data
            else:
                data = match.group(i + 1)
                if data is not None:
                    if ctx:
                        ctx.pos = match.start(i + 1)
                    for item in action(lexer,
                                       _PseudoMatch(match.start(i + 1), data), ctx):
                        if item:
                            yield item
        if ctx:
            ctx.pos = match.end()
    return callback

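
# --- Illustrative sketch (not part of Pygments): a ``bygroups`` rule. ---
# In a RegexLexer ``tokens`` table (see further down in this module), a rule
# whose regex has several groups can give each group its own token type.
# The regex, the table name and the token choices are illustrative
# assumptions; Keyword and Name are real types from pygments.token.
from pygments.token import Keyword, Name

_EXAMPLE_BYGROUPS_RULES = {
    'root': [
        (r'(def)(\s+)([A-Za-z_]\w*)',
         bygroups(Keyword, Whitespace, Name.Function)),
    ],
}
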

class _This:
    """
    Special singleton used for indicating the caller class.
    Used by ``using``.
    """

this = _This()


def using(_other, **kwargs):
    """
    Callback that processes the match with a different lexer.

    The keyword arguments are forwarded to the lexer, except `state` which
    is handled separately.

    `state` specifies the state that the new lexer will start in, and can
    be an enumerable such as ('root', 'inline', 'string') or a simple
    string which is assumed to be on top of the root state.

    Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
    """
    gt_kwargs = {}
    if 'state' in kwargs:
        s = kwargs.pop('state')
        if isinstance(s, (list, tuple)):
            gt_kwargs['stack'] = s
        else:
            gt_kwargs['stack'] = ('root', s)

    if _other is this:
        def callback(lexer, match, ctx=None):
            # if keyword arguments are given the callback
            # function has to create a new lexer instance
            if kwargs:
                # XXX: cache that somehow
                kwargs.update(lexer.options)
                lx = lexer.__class__(**kwargs)
            else:
                lx = lexer
            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    else:
        def callback(lexer, match, ctx=None):
            # XXX: cache that somehow
            kwargs.update(lexer.options)
            lx = _other(**kwargs)

            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    return callback

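
# --- Illustrative sketch (not part of Pygments): ``using`` rules. ---
# ``using(SomeOtherLexer)`` builds a callback that re-lexes the matched text
# with another lexer; ``using(this)`` re-lexes it with the current lexer,
# optionally starting in a different state.  The regexes and the state name
# 'inline' are illustrative assumptions, not rules from a real lexer.
_EXAMPLE_USING_RULES = {
    'root': [
        # re-lex the braced text with the same lexer, starting in 'inline'
        (r'\{\{(.*?)\}\}', using(this, state='inline')),
        # with another lexer class it would read, e.g.:
        #   (r'`[^`]*`', using(MySubLexer)),   # MySubLexer is hypothetical
    ],
}
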

class default:
    """
    Indicates a state or state action (e.g. #pop) to apply.
    For example default('#pop') is equivalent to ('', Token, '#pop').
    Note that state tuples may be used as well.

    .. versionadded:: 2.0
    """
    def __init__(self, state):
        self.state = state

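
# --- Illustrative sketch (not part of Pygments): a ``default`` rule. ---
# ``default`` performs a state transition without consuming any input, which
# is handy as the last rule of a state.  The state name and rules below are
# illustrative assumptions.
_EXAMPLE_DEFAULT_RULES = {
    'maybe-comment': [
        (r'#.*$', Text),
        default('#pop'),          # nothing matched: silently leave the state
    ],
}
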

class words(Future):
    """
    Indicates a list of literal words that is transformed into an optimized
    regex that matches any of the words.

    .. versionadded:: 2.0
    """
    def __init__(self, words, prefix='', suffix=''):
        self.words = words
        self.prefix = prefix
        self.suffix = suffix

    def get(self):
        return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)

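
# --- Illustrative sketch (not part of Pygments): a ``words`` rule. ---
# ``words`` defers building the optimized alternation until the token
# definitions are processed (it is a ``Future``).  The keyword list, suffix
# and token choice are illustrative assumptions.
from pygments.token import Keyword

_EXAMPLE_WORDS_RULES = {
    'root': [
        (words(('if', 'elif', 'else', 'while'), suffix=r'\b'), Keyword),
    ],
}
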

class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_regex(cls, regex, rflags, state):
        """Preprocess the regular expression component of a token definition."""
        if isinstance(regex, Future):
            regex = regex.get()
        return re.compile(regex, rflags).match

    def _process_token(cls, token):
        """Preprocess the token component of a token definition."""
        assert type(token) is _TokenType or callable(token), \
            'token type must be simple type or callable, not %r' % (token,)
        return token

    def _process_new_state(cls, new_state, unprocessed, processed):
        """Preprocess the state transition action of a token definition."""
        if isinstance(new_state, str):
            # an existing state
            if new_state == '#pop':
                return -1
            elif new_state in unprocessed:
                return (new_state,)
            elif new_state == '#push':
                return new_state
            elif new_state[:5] == '#pop:':
                return -int(new_state[5:])
            else:
                assert False, 'unknown new state %r' % new_state
        elif isinstance(new_state, combined):
            # combine a new state from existing ones
            tmp_state = '_tmp_%d' % cls._tmpname
            cls._tmpname += 1
            itokens = []
            for istate in new_state:
                assert istate != new_state, 'circular state ref %r' % istate
                itokens.extend(cls._process_state(unprocessed,
                                                  processed, istate))
            processed[tmp_state] = itokens
            return (tmp_state,)
        elif isinstance(new_state, tuple):
            # push more than one state
            for istate in new_state:
                assert (istate in unprocessed or
                        istate in ('#pop', '#push')), \
                    'unknown new state ' + istate
            return new_state
        else:
            assert False, 'unknown new state def %r' % new_state

    def _process_state(cls, unprocessed, processed, state):
        """Preprocess a single state definition."""
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed,
                                                 str(tdef)))
                continue
            if isinstance(tdef, _inherit):
                # should be processed already, but may not in the case of:
                # 1. the state has no counterpart in any parent
                # 2. the state includes more than one 'inherit'
                continue
            if isinstance(tdef, default):
                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
                tokens.append((re.compile('').match, None, new_state))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = cls._process_regex(tdef[0], rflags, state)
            except Exception as err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err)) from err

            token = cls._process_token(tdef[1])

            if len(tdef) == 2:
                new_state = None
            else:
                new_state = cls._process_new_state(tdef[2],
                                                   unprocessed, processed)

            tokens.append((rex, token, new_state))
        return tokens

    def process_tokendef(cls, name, tokendefs=None):
        """Preprocess a dictionary of token definitions."""
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in list(tokendefs):
            cls._process_state(tokendefs, processed, state)
        return processed

    def get_tokendefs(cls):
        """
        Merge tokens from superclasses in MRO order, returning a single tokendef
        dictionary.

        Any state that is not defined by a subclass will be inherited
        automatically. States that *are* defined by subclasses will, by
        default, override that state in the superclass. If a subclass wishes to
        inherit definitions from a superclass, it can use the special value
        "inherit", which will cause the superclass' state definition to be
        included at that point in the state.
        """
        tokens = {}
        inheritable = {}
        for c in cls.__mro__:
            toks = c.__dict__.get('tokens', {})

            for state, items in toks.items():
                curitems = tokens.get(state)
                if curitems is None:
                    # N.b. because this is assigned by reference, sufficiently
                    # deep hierarchies are processed incrementally (e.g. for
                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
                    # will not see any inherits in B).
                    tokens[state] = items
                    try:
                        inherit_ndx = items.index(inherit)
                    except ValueError:
                        continue
                    inheritable[state] = inherit_ndx
                    continue

                inherit_ndx = inheritable.pop(state, None)
                if inherit_ndx is None:
                    continue

                # Replace the "inherit" value with the items
                curitems[inherit_ndx:inherit_ndx+1] = items
                try:
                    # N.b. this is the index in items (that is, the superclass
                    # copy), so offset required when storing below.
                    new_inh_ndx = items.index(inherit)
                except ValueError:
                    pass
                else:
                    inheritable[state] = inherit_ndx + new_inh_ndx

        return tokens

    def __call__(cls, *args, **kwds):
        """Instantiate cls after preprocessing its token definitions."""
        if '_tokens' not in cls.__dict__:
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())

        return type.__call__(cls, *args, **kwds)

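
# --- Illustrative sketch (not part of Pygments): reusing states via inherit.
# A subclass can splice the parent's rules for a state into its own list with
# the ``inherit`` marker instead of overriding the state completely.  The
# class and rule below are illustrative assumptions (``MyBaseLexer`` is
# hypothetical).
#
#     class _ExampleDialectLexer(MyBaseLexer):
#         tokens = {
#             'root': [
#                 (r'@[A-Za-z_]\w*', Name.Decorator),  # dialect-only rule
#                 inherit,                             # then the parent's rules
#             ],
#         }
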

class RegexLexer(Lexer, metaclass=RegexLexerMeta):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: At all times there is a stack of states. Initially, the stack contains
    #: a single state 'root'. The top of the stack is called "the current state".
    #:
    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: ``new_state`` can be omitted to signify no state transition.
    #: If ``new_state`` is a string, it is pushed on the stack. This ensures
    #: the new current state is ``new_state``.
    #: If ``new_state`` is a tuple of strings, all of those strings are pushed
    #: on the stack and the current state will be the last element of the list.
    #: ``new_state`` can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again. Note that if you push while in a combined state, the combined
    #: state itself is pushed, and not only the state in which the rule is
    #: defined.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield pos, action, m.group()
                        else:
                            yield from action(self, m)
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(statestack) > 1:
                                        statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop, but keep at least one state on the stack
                            # (random code leading to unexpected pops should
                            # not allow exceptions)
                            if abs(new_state) >= len(statestack):
                                del statestack[1:]
                            else:
                                del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                # We are here only if all state tokens have been considered
                # and there was not a match on any of them.
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Whitespace, '\n'
                        pos += 1
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    break

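
# --- Illustrative sketch (not part of Pygments): a minimal RegexLexer. ---
# States, regexes and token choices below are assumptions made up for this
# example; they show the shape of a ``tokens`` table, a state switch via a
# string, ``include`` and '#pop'.
class _ExampleIniLikeLexer(RegexLexer):
    """Toy lexer for an INI-ish format: [sections], key=value, ; comments."""
    name = 'ExampleIniLike'
    aliases = ['exampleini']

    tokens = {
        'whitespace': [
            (r'\s+', Whitespace),
        ],
        'root': [
            include('whitespace'),
            (r';[^\n]*', Text),                    # comment
            (r'\[[^\]\n]+\]', Text),               # [section]
            (r'[^=\s\[;][^=\n]*', Text, 'value'),  # key, then switch state
        ],
        'value': [
            (r'=', Text),
            (r'[^\n]*\n', Text, '#pop'),           # rest of line, back to root
        ],
    }

# e.g.:  list(_ExampleIniLikeLexer().get_tokens('[a]\nkey=value\n'))
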

class LexerContext:
    """
    A helper object that holds lexer position data.
    """

    def __init__(self, text, pos, stack=None, end=None):
        self.text = text
        self.pos = pos
        self.end = end or len(text)  # end=0 not supported ;-)
        self.stack = stack or ['root']

    def __repr__(self):
        return 'LexerContext(%r, %r, %r)' % (
            self.text, self.pos, self.stack)


class ExtendedRegexLexer(RegexLexer):
    """
    A RegexLexer that uses a context object to store its state.
    """

    def get_tokens_unprocessed(self, text=None, context=None):
        """
        Split ``text`` into (tokentype, text) pairs.
        If ``context`` is given, use this lexer context instead.
        """
        tokendefs = self._tokens
        if not context:
            ctx = LexerContext(text, 0)
            statetokens = tokendefs['root']
        else:
            ctx = context
            statetokens = tokendefs[ctx.stack[-1]]
            text = ctx.text
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield ctx.pos, action, m.group()
                            ctx.pos = m.end()
                        else:
                            yield from action(self, m, ctx)
                            if not new_state:
                                # altered the state stack?
                                statetokens = tokendefs[ctx.stack[-1]]
                            # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(ctx.stack) > 1:
                                        ctx.stack.pop()
                                elif state == '#push':
                                    ctx.stack.append(ctx.stack[-1])
                                else:
                                    ctx.stack.append(state)
                        elif isinstance(new_state, int):
                            # see RegexLexer for why this check is made
                            if abs(new_state) >= len(ctx.stack):
                                del ctx.stack[1:]
                            else:
                                del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, '\n'
                        ctx.pos += 1
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break

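
# --- Illustrative sketch (not part of Pygments): a context-aware callback. ---
# ExtendedRegexLexer passes the LexerContext to rule callbacks, which must
# advance ``ctx.pos`` themselves and may rewrite ``ctx.stack``.  The callback
# and state name below are illustrative assumptions.
def _example_heredoc_callback(lexer, match, ctx):
    yield match.start(), Text, match.group()
    ctx.pos = match.end()           # callbacks must move the position
    ctx.stack.append('heredoc')     # and may adjust the state stack

# used inside an ExtendedRegexLexer tokens table as:
#     (r'<<\w+', _example_heredoc_callback)
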

def do_insertions(insertions, tokens):
    """
    Helper for lexers which must combine the results of several
    sublexers.

    ``insertions`` is a list of ``(index, itokens)`` pairs.
    Each ``itokens`` iterable should be inserted at position
    ``index`` into the token stream given by the ``tokens``
    argument.

    The result is a combined token stream.

    TODO: clean up the code here.
    """
    insertions = iter(insertions)
    try:
        index, itokens = next(insertions)
    except StopIteration:
        # no insertions
        yield from tokens
        return

    realpos = None
    insleft = True

    # iterate over the token stream where we want to insert
    # the tokens from the insertion list.
    for i, t, v in tokens:
        # first iteration. store the position of first item
        if realpos is None:
            realpos = i
        oldi = 0
        while insleft and i + len(v) >= index:
            tmpval = v[oldi:index - i]
            if tmpval:
                yield realpos, t, tmpval
                realpos += len(tmpval)
            for it_index, it_token, it_value in itokens:
                yield realpos, it_token, it_value
                realpos += len(it_value)
            oldi = index - i
            try:
                index, itokens = next(insertions)
            except StopIteration:
                insleft = False
                break  # not strictly necessary
        if oldi < len(v):
            yield realpos, t, v[oldi:]
            realpos += len(v) - oldi

    # leftover tokens
    while insleft:
        # no normal tokens, set realpos to zero
        realpos = realpos or 0
        for p, t, v in itokens:
            yield realpos, t, v
            realpos += len(v)
        try:
            index, itokens = next(insertions)
        except StopIteration:
            insleft = False
            break  # not strictly necessary

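
# --- Illustrative sketch (not part of Pygments): using do_insertions. ---
# The helper splices pre-lexed token runs into another token stream at given
# character offsets; DelegatingLexer above is its main caller.  The data here
# is an illustrative assumption.
_example_base = [(0, Text, 'line one\nline two\n')]
_example_inserts = [(9, [(0, Error, 'INSERTED ')])]
# yields the base tokens split around the inserted run, with indices rewritten:
_example_combined = list(do_insertions(_example_inserts, iter(_example_base)))
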

class ProfilingRegexLexerMeta(RegexLexerMeta):
    """Metaclass for ProfilingRegexLexer, collects regex timing info."""

    def _process_regex(cls, regex, rflags, state):
        if isinstance(regex, words):
            rex = regex_opt(regex.words, prefix=regex.prefix,
                            suffix=regex.suffix)
        else:
            rex = regex
        compiled = re.compile(rex, rflags)

        def match_func(text, pos, endpos=sys.maxsize):
            info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
            t0 = time.time()
            res = compiled.match(text, pos, endpos)
            t1 = time.time()
            info[0] += 1
            info[1] += t1 - t0
            return res
        return match_func


class ProfilingRegexLexer(RegexLexer, metaclass=ProfilingRegexLexerMeta):
    """Drop-in replacement for RegexLexer that does profiling of its regexes."""

    _prof_data = []
    _prof_sort_index = 4  # defaults to time per call

    def get_tokens_unprocessed(self, text, stack=('root',)):
        # this needs to be a stack, since using(this) will produce nested calls
        self.__class__._prof_data.append({})
        yield from RegexLexer.get_tokens_unprocessed(self, text, stack)
        rawdata = self.__class__._prof_data.pop()
        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
                        n, 1000 * t, 1000 * t / n)
                       for ((s, r), (n, t)) in rawdata.items()),
                      key=lambda x: x[self._prof_sort_index],
                      reverse=True)
        sum_total = sum(x[3] for x in data)

        print()
        print('Profiling result for %s lexing %d chars in %.3f ms' %
              (self.__class__.__name__, len(text), sum_total))
        print('=' * 110)
        print('%-20s %-64s ncalls tottime percall' % ('state', 'regex'))
        print('-' * 110)
        for d in data:
            print('%-20s %-65s %5d %8.4f %8.4f' % d)
        print('=' * 110)
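

# --- Illustrative sketch (not part of Pygments): profiling a lexer. ---
# Mixing ProfilingRegexLexer into a RegexLexer subclass's bases makes the
# timing of every rule's regex get printed after lexing.  The subclass below
# reuses the toy lexer defined earlier in this file and is an assumption for
# illustration.
#
#     class _ProfiledIniLikeLexer(_ExampleIniLikeLexer, ProfilingRegexLexer):
#         pass
#
#     list(_ProfiledIniLikeLexer().get_tokens('[a]\nkey=value\n'))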

883 print('=' * 110)