Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pycparser/ply/lex.py: 41%

1# -----------------------------------------------------------------------------

2# ply: lex.py

5# David M. Beazley (Dabeaz LLC)

8# Redistribution and use in source and binary forms, with or without

9# modification, are permitted provided that the following conditions are

10# met:

11#

12# * Redistributions of source code must retain the above copyright notice,

13# this list of conditions and the following disclaimer.

14# * Redistributions in binary form must reproduce the above copyright notice,

15# this list of conditions and the following disclaimer in the documentation

16# and/or other materials provided with the distribution.

17# * Neither the name of the David Beazley or Dabeaz LLC may be used to

18# endorse or promote products derived from this software without

19# specific prior written permission.

20#

21# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

22# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

23# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

24# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

25# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

26# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

27# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

28# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

29# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

30# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

31# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

32# -----------------------------------------------------------------------------

34__version__ = '3.10'

35__tabversion__ = '3.10'

37import re

38import sys

39import types

40import copy

41import os

42import inspect

44# This tuple contains known string types

45try:

46 # Python 2.6

47 StringTypes = (types.StringType, types.UnicodeType)

48except AttributeError:

49 # Python 3.0

50 StringTypes = (str, bytes)

52# This regular expression is used to match valid token names

53_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')

55# Exception thrown when invalid token encountered and no default error

56# handler is defined.

class LexError(Exception):
    """Error raised by the lexer when it cannot make progress on the input.

    ``args`` carries only the message; the text handed in as ``s`` is kept
    on the ``text`` attribute.
    """

    def __init__(self, message, s):
        # Only the message goes into args, so str(exc) is just the message.
        super(LexError, self).__init__(message)
        self.text = s

63# Token class. This class is used to represent the tokens produced.

class LexToken(object):
    """Plain record for one token.

    Instances start empty; the code that creates a token assigns ``type``,
    ``value``, ``lineno`` and ``lexpos`` before it is printed.
    """

    def __str__(self):
        fields = (self.type, self.value, self.lineno, self.lexpos)
        return 'LexToken(%s,%r,%d,%d)' % fields

    def __repr__(self):
        # repr and str are deliberately the same text.
        return str(self)

72# This object is a stand-in for a logging object created by the

73# logging module.

class PlyLogger(object):
    """Minimal logger look-alike that writes formatted lines to a file object.

    ``f`` only needs a ``write`` method.  Messages are formatted with
    ``msg % args``; keyword arguments are accepted and ignored.
    """

    def __init__(self, f):
        self.f = f

    def critical(self, msg, *args, **kwargs):
        text = msg % args
        self.f.write(text + '\n')

    def warning(self, msg, *args, **kwargs):
        text = msg % args
        self.f.write('WARNING: ' + text + '\n')

    def error(self, msg, *args, **kwargs):
        text = msg % args
        self.f.write('ERROR: ' + text + '\n')

    # info and debug produce the same un-prefixed output as critical.
    info = critical
    debug = critical

92# Null logger is used when no output is generated. Does nothing.

class NullLogger(object):
    """Logger stand-in that swallows everything.

    Any attribute lookup yields the logger itself, and calling it yields the
    logger again, so ``log.info(...)``, ``log.error(...)`` and so on are all
    silent no-ops.
    """

    def __getattribute__(self, name):
        # Whatever was asked for, hand back this same object.
        return self

    def __call__(self, *args, **kwargs):
        # Arguments are ignored; returning self keeps call chains working.
        return self

100

101# -----------------------------------------------------------------------------

102# === Lexing Engine ===

103#

104# The following Lexer class implements the lexer runtime. There are only

105# a few public methods and attributes:

106#

107# input() - Store a new string in the lexer

108# token() - Get the next token

109# clone() - Clone the lexer

110#

111# lineno - Current line number

112# lexpos - Current position in the input string

113# -----------------------------------------------------------------------------

114

class Lexer:
    """Lexer runtime: owns the per-state rule tables and produces tokens.

    Public methods are input(), token(), clone(), begin(), push_state(),
    pop_state(), current_state() and skip(); the object is also its own
    iterator.  ``lineno`` is the current line number and ``lexpos`` the
    current position in the input string.
    """

    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re, findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}     # Dictionary mapping lexer states to symbol names
        self.lexstate = 'INITIAL'     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexstateeoff = {}        # Dictionary of eof functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lexeoff = None           # EOF rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ''           # Ignored characters
        self.lexliterals = ''         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexoptimize = False      # Optimized mode

    def clone(self, object=None):
        """Return a shallow copy of this lexer.

        If ``object`` is given, the copy is attached to it: every rule
        function in the per-state regex tables and every per-state error
        function is replaced by the attribute of the same name looked up on
        ``object``, and ``lexmodule`` is set to ``object``.
        """
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object.  In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.
        if object:
            newtab = {}
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            # Empty slot or string rule: nothing to rebind.
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object, f[0].__name__), f[1]))
                    # One entry per master regex of this state.
                    newre.append((cre, newfindex))
                newtab[key] = newre
            c.lexstatere = newtab

            c.lexstateerrorf = {}
            for key, ef in self.lexstateerrorf.items():
                # A state may have no error handler (writetab() allows for
                # that too); carry the None over instead of failing on
                # None.__name__.
                c.lexstateerrorf[key] = getattr(object, ef.__name__) if ef else None
            c.lexmodule = object
            # NOTE(review): the copy's cached current-state fields (lexre,
            # lexerrorf, ...) still come from copy.copy() and are only
            # refreshed by the next begin() -- confirm whether that is intended.
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self, lextab, outputdir=''):
        """Write the lexer tables as a Python module ``<lextab>.py``.

        ``lextab`` is a (possibly dotted) module name; only its last
        component names the file, which is created inside ``outputdir``.
        Raises IOError if ``lextab`` is a module object rather than a name.
        """
        if isinstance(lextab, types.ModuleType):
            raise IOError("Won't overwrite existing lextab module")
        basetabmodule = lextab.split('.')[-1]
        filename = os.path.join(outputdir, basetabmodule) + '.py'
        with open(filename, 'w') as tf:
            tf.write('# %s.py. This file automatically created by PLY (version %s). Don\'t edit!\n' % (basetabmodule, __version__))
            tf.write('_tabversion = %s\n' % repr(__tabversion__))
            tf.write('_lextokens = set(%s)\n' % repr(tuple(sorted(self.lextokens))))
            tf.write('_lexreflags = %s\n' % repr(self.lexreflags))
            tf.write('_lexliterals = %s\n' % repr(self.lexliterals))
            tf.write('_lexstateinfo = %s\n' % repr(self.lexstateinfo))

            # Rewrite the lexstatere table, replacing function objects with function names
            tabre = {}
            for statename, lre in self.lexstatere.items():
                titem = []
                for (pat, func), retext, renames in zip(lre, self.lexstateretext[statename], self.lexstaterenames[statename]):
                    titem.append((retext, _funcs_to_names(func, renames)))
                tabre[statename] = titem

            tf.write('_lexstatere = %s\n' % repr(tabre))
            tf.write('_lexstateignore = %s\n' % repr(self.lexstateignore))

            # Handlers are stored by name; a missing handler is stored as None.
            taberr = {}
            for statename, ef in self.lexstateerrorf.items():
                taberr[statename] = ef.__name__ if ef else None
            tf.write('_lexstateerrorf = %s\n' % repr(taberr))

            tabeof = {}
            for statename, ef in self.lexstateeoff.items():
                tabeof[statename] = ef.__name__ if ef else None
            tf.write('_lexstateeoff = %s\n' % repr(tabeof))

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self, tabfile, fdict):
        """Load lexer tables from a table module and enter state 'INITIAL'.

        ``tabfile`` is either an already imported module or a module name to
        import.  ``fdict`` maps rule names to the rule functions; it is used
        to turn the names stored in the table back into callables.

        Raises ImportError if the module cannot be imported or its
        ``_tabversion`` differs from ``__tabversion__``.
        """
        if isinstance(tabfile, types.ModuleType):
            lextab = tabfile
        else:
            # Import by name without evaluating the name as source text.
            # __import__ returns the top-level package for dotted names, so
            # the module itself is fetched from sys.modules.
            __import__(tabfile)
            lextab = sys.modules[tabfile]

        if getattr(lextab, '_tabversion', '0.0') != __tabversion__:
            raise ImportError('Inconsistent PLY version')

        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lextokens_all = self.lextokens | set(self.lexliterals)
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = {}
        self.lexstateretext = {}
        for statename, lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for pat, func_name in lre:
                titem.append((re.compile(pat, lextab._lexreflags), _names_to_funcs(func_name, fdict)))
                # Keep the pattern text alongside the compiled regex so that
                # lexstateretext lines up with lexstatere.
                txtitem.append(pat)

            self.lexstatere[statename] = titem
            self.lexstateretext[statename] = txtitem

        # writetab() stores None for a state without a handler; map that back
        # to None instead of looking None up in fdict.
        self.lexstateerrorf = {}
        for statename, ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[statename] = fdict[ef] if ef else None

        self.lexstateeoff = {}
        for statename, ef in lextab._lexstateeoff.items():
            self.lexstateeoff[statename] = fdict[ef] if ef else None

        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self, s):
        """Store ``s`` as the text to tokenize and rewind to position 0.

        Raises ValueError if slicing ``s`` does not yield a string type.
        """
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c, StringTypes):
            raise ValueError('Expected a string')
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self, state):
        """Switch to lexing state ``state``.

        Loads that state's regexes, regex texts, ignore characters, error
        function and eof function into the current-state attributes.
        Raises ValueError if the state has no entry in ``lexstatere``.
        """
        if state not in self.lexstatere:
            raise ValueError('Undefined state')
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state, '')
        self.lexerrorf = self.lexstateerrorf.get(state, None)
        self.lexeoff = self.lexstateeoff.get(state, None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self, state):
        """Remember the current state on the stack, then begin ``state``."""
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        """Return to the state most recently saved by push_state()."""
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        """Return the name of the current lexing state."""
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self, n):
        """Advance the input position by ``n`` characters."""
        self.lexpos += n

    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible.  Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        """Return the next token, or None when the input is exhausted.

        Characters in the current ignore set are skipped.  A match on a
        string rule returns the token directly (or is dropped when the rule
        has no token type); a match on a function rule passes the token to
        the function and returns what it returns, moving on to the next
        match if it returns nothing.  Unmatched characters are tried against
        the literals, then handed to the error function if one is set.  At
        end of input the eof function, if any, supplies the return value.

        Raises LexError for an illegal character that is not handled, for an
        error function that does not advance the position, and (outside
        optimized mode) for a rule returning an unknown token type.  Raises
        RuntimeError if no input was ever supplied.
        """
        # Make local copies of frequently referenced attributes
        lexpos = self.lexpos
        lexlen = self.lexlen
        lexignore = self.lexignore
        lexdata = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre, lexindexfunc in self.lexre:
                m = lexre.match(lexdata, lexpos)
                if not m:
                    continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                # The last matched group number selects the rule entry.
                i = m.lastindex
                func, tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If token is processed by a function, call it

                tok.lexer = self      # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every function must return a token, if nothing, we just move to next token
                if not newtok:
                    lexpos = self.lexpos         # This is here in case user has updated lexpos.
                    lexignore = self.lexignore   # This is here in case there was a state change
                    break

                # Verify type of the token.  If not in the token map, raise an error
                if not self.lexoptimize:
                    if newtok.type not in self.lextokens_all:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func.__code__.co_filename, func.__code__.co_firstlineno,
                            func.__name__, newtok.type), lexdata[lexpos:])

                return newtok
            else:
                # Reached only when no master regex matched at this position.
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = 'error'
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok:
                        continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:])

        if self.lexeoff:
            tok = LexToken()
            tok.type = 'eof'
            tok.value = ''
            tok.lineno = self.lineno
            tok.lexpos = lexpos
            tok.lexer = self
            self.lexpos = lexpos
            newtok = self.lexeoff(tok)
            return newtok

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError('No input string given with input()')
        return None

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        """Return the next token; raise StopIteration when token() gives None."""
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next

425

426# -----------------------------------------------------------------------------

427# ==== Lex Builder ===

428#

429# The functions and classes below are used to collect lexing information

430# and build a Lexer object from it.

431# -----------------------------------------------------------------------------

432

433# -----------------------------------------------------------------------------

434# _get_regex(func)

435#

436# Returns the regular expression assigned to a function either as a doc string

437# or as a .regex attribute attached by the @TOKEN decorator.

438# -----------------------------------------------------------------------------

439def _get_regex(func):

440 return getattr(func, 'regex', func.__doc__)

441

442# -----------------------------------------------------------------------------

443# get_caller_module_dict()

444#

445# This function returns a dictionary containing all of the symbols defined within

446# a caller further down the call stack. This is used to get the environment

447# associated with the lex() call if none was provided.

448# -----------------------------------------------------------------------------

def get_caller_module_dict(levels):
    """Return the symbols visible in the frame ``levels`` steps up the stack.

    The result is a fresh dict holding that frame's globals, overlaid with
    its locals whenever the two namespaces differ.
    """
    f = sys._getframe(levels)
    ldict = dict(f.f_globals)
    if f.f_globals != f.f_locals:
        # Locals shadow globals of the same name.
        ldict.update(f.f_locals)
    return ldict

455

456# -----------------------------------------------------------------------------

457# _funcs_to_names()

458#

459# Given a list of regular expression functions, this converts it to a list

460# suitable for output to a table file

461# -----------------------------------------------------------------------------

462def _funcs_to_names(funclist, namelist):

463 result = []

464 for f, name in zip(funclist, namelist):

465 if f and f[0]:

466 result.append((name, f[1]))

467 else:

468 result.append(f)

469 return result

470

471# -----------------------------------------------------------------------------

472# _names_to_funcs()

473#

474# Given a list of regular expression function names, this converts it back to

475# functions.

476# -----------------------------------------------------------------------------

477def _names_to_funcs(namelist, fdict):

478 result = []

479 for n in namelist:

480 if n and n[0]:

481 result.append((fdict[n[0]], n[1]))

482 else:

483 result.append(n)

484 return result

485

486# -----------------------------------------------------------------------------

487# _form_master_re()

488#

489# This function takes a list of all of the regex components and attempts to

490# form the master regular expression. Given limitations in the Python re

491# module, it may be necessary to break the master regex into separate expressions.

492# -----------------------------------------------------------------------------

493def _form_master_re(relist, reflags, ldict, toknames):

494 if not relist:

495 return []

496 regex = '|'.join(relist)

497 try:

498 lexre = re.compile(regex, reflags)

499

500 # Build the index to function map for the matching engine

501 lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1)

502 lexindexnames = lexindexfunc[:]

503

504 for f, i in lexre.groupindex.items():

505 handle = ldict.get(f, None)

506 if type(handle) in (types.FunctionType, types.MethodType):

507 lexindexfunc[i] = (handle, toknames[f])

508 lexindexnames[i] = f

509 elif handle is not None:

510 lexindexnames[i] = f

511 if f.find('ignore_') > 0:

512 lexindexfunc[i] = (None, None)

513 else:

514 lexindexfunc[i] = (None, toknames[f])

515

516 return [(lexre, lexindexfunc)], [regex], [lexindexnames]

517 except Exception:

518 m = int(len(relist)/2)

519 if m == 0:

520 m = 1

521 llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames)

522 rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames)

523 return (llist+rlist), (lre+rre), (lnames+rnames)

524

525# -----------------------------------------------------------------------------

526# def _statetoken(s,names)

527#

528# Given a declaration name s of the form "t_" and a dictionary whose keys are

529# state names, this function returns a tuple (states,tokenname) where states

530# is a tuple of state names and tokenname is the name of the token. For example,

531# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')

532# -----------------------------------------------------------------------------

533def _statetoken(s, names):

534 nonstate = 1

535 parts = s.split('_')

536 for i, part in enumerate(parts[1:], 1):

537 if part not in names and part != 'ANY':

538 break

539

540 if i > 1:

541 states = tuple(parts[1:i])

542 else:

543 states = ('INITIAL',)

544

545 if 'ANY' in states:

546 states = tuple(names)

547

548 tokenname = '_'.join(parts[i:])

549 return (states, tokenname)

550

551

552# -----------------------------------------------------------------------------

553# LexerReflect()

554#

555# This class represents information needed to build a lexer as extracted from a

556# user's input file.

557# -----------------------------------------------------------------------------

class LexerReflect(object):
    """Collects and validates a lexer specification held in a dictionary.

    ``ldict`` is the namespace to inspect (``tokens``, ``literals``,
    ``states`` and every ``t_`` symbol).  Problems are reported through
    ``log`` and recorded by setting ``self.error`` to True.
    """

    def __init__(self, ldict, log=None, reflags=0):
        self.ldict = ldict
        self.error_func = None
        self.tokens = []
        self.reflags = reflags
        self.stateinfo = {'INITIAL': 'inclusive'}
        self.modules = set()
        self.error = False
        self.log = PlyLogger(sys.stderr) if log is None else log

    # Get all of the basic information
    def get_all(self):
        """Read tokens, literals, states and rules out of ``ldict``."""
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        """Run every validation pass; return True if any error was recorded."""
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error

    # Get the tokens map
    def get_tokens(self):
        """Fetch ``tokens`` from ``ldict``; it must be a non-empty list or tuple."""
        tokens = self.ldict.get('tokens', None)
        # The three failure modes are checked separately so that each one
        # reports its own message.
        if tokens is None:
            self.log.error('No token list is defined')
            self.error = True
            return

        if not isinstance(tokens, (list, tuple)):
            self.log.error('tokens must be a list or tuple')
            self.error = True
            return

        if not tokens:
            self.log.error('tokens is empty')
            self.error = True
            return

        self.tokens = tokens

    # Validate the tokens
    def validate_tokens(self):
        """Flag token names that are not identifiers; warn about duplicates."""
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'", n)
                self.error = True
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1

    # Get the literals specifier
    def get_literals(self):
        """Fetch ``literals`` from ``ldict``; any false value becomes ''."""
        self.literals = self.ldict.get('literals', '')
        if not self.literals:
            self.literals = ''

    # Validate literals
    def validate_literals(self):
        """Check that ``literals`` is an iterable of single characters."""
        try:
            for c in self.literals:
                if not isinstance(c, StringTypes) or len(c) > 1:
                    self.log.error('Invalid literal %s. Must be a single character', repr(c))
                    self.error = True

        except TypeError:
            self.log.error('Invalid literals specification. literals must be a sequence of characters')
            self.error = True

    def get_states(self):
        """Fetch ``states`` from ``ldict`` and record each valid one in stateinfo."""
        self.states = self.ldict.get('states', None)
        # Build statemap
        if self.states:
            if not isinstance(self.states, (tuple, list)):
                self.log.error('states must be defined as a tuple or list')
                self.error = True
            else:
                for s in self.states:
                    if not isinstance(s, tuple) or len(s) != 2:
                        self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')", repr(s))
                        self.error = True
                        continue
                    name, statetype = s
                    if not isinstance(name, StringTypes):
                        self.log.error('State name %s must be a string', repr(name))
                        self.error = True
                        continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'", name)
                        self.error = True
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined", name)
                        self.error = True
                        continue
                    self.stateinfo[name] = statetype

    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)

    def get_rules(self):
        """Sort every ``t_`` symbol of ``ldict`` into per-state rule tables."""
        tsymbols = [f for f in self.ldict if f[:2] == 't_']

        # Now build up a list of functions and a list of strings
        self.toknames = {}        # Mapping of symbols to token names
        self.funcsym = {}         # Symbols defined as functions
        self.strsym = {}          # Symbols defined as strings
        self.ignore = {}          # Ignore strings by state
        self.errorf = {}          # Error functions by state
        self.eoff = {}            # EOF functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error('No rules of the form t_rulename are defined')
            self.error = True
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f, self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t, '__call__'):
                if tokname == 'error':
                    for s in states:
                        self.errorf[s] = t
                elif tokname == 'eof':
                    for s in states:
                        self.eoff[s] = t
                elif tokname == 'ignore':
                    line = t.__code__.co_firstlineno
                    file = t.__code__.co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string", file, line, t.__name__)
                    self.error = True
                else:
                    for s in states:
                        self.funcsym[s].append((f, t))
            elif isinstance(t, StringTypes):
                if tokname == 'ignore':
                    for s in states:
                        self.ignore[s] = t
                    if '\\' in t:
                        self.log.warning("%s contains a literal backslash '\\'", f)

                elif tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = True
                else:
                    for s in states:
                        self.strsym[s].append((f, t))
            else:
                self.log.error('%s not defined as a function or string', f)
                self.error = True

        # Sort the functions by line number
        for f in self.funcsym.values():
            f.sort(key=lambda x: x[1].__code__.co_firstlineno)

        # Sort the strings by regular expression length
        for s in self.strsym.values():
            s.sort(key=lambda x: len(x[1]), reverse=True)

    def _check_argcount(self, f, file, line):
        """Log an error and return False unless ``f`` takes exactly one token argument.

        Bound methods are expected to declare two parameters (self and the
        token), plain functions one.
        """
        reqargs = 2 if isinstance(f, types.MethodType) else 1
        nargs = f.__code__.co_argcount
        if nargs > reqargs:
            self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
            self.error = True
            return False
        if nargs < reqargs:
            self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
            self.error = True
            return False
        return True

    # Validate all of the t_rules collected
    def validate_rules(self):
        """Validate every collected rule, then scan the defining modules for duplicates."""
        for state in self.stateinfo:
            # Validate all rules defined by functions

            for fname, f in self.funcsym[state]:
                line = f.__code__.co_firstlineno
                file = f.__code__.co_filename
                module = inspect.getmodule(f)
                self.modules.add(module)

                if not self._check_argcount(f, file, line):
                    continue

                if not _get_regex(f):
                    self.log.error("%s:%d: No regular expression defined for rule '%s'", file, line, f.__name__)
                    self.error = True
                    continue

                try:
                    c = re.compile('(?P<%s>%s)' % (fname, _get_regex(f)), self.reflags)
                    if c.match(''):
                        self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file, line, f.__name__)
                        self.error = True
                except re.error as e:
                    self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e)
                    if '#' in _get_regex(f):
                        self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__)
                    self.error = True

            # Validate all rules defined by strings
            for name, r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = True
                    continue

                if tokname not in self.tokens and tokname.find('ignore_') < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname)
                    self.error = True
                    continue

                try:
                    c = re.compile('(?P<%s>%s)' % (name, r), self.reflags)
                    if (c.match('')):
                        self.log.error("Regular expression for rule '%s' matches empty string", name)
                        self.error = True
                except re.error as e:
                    self.log.error("Invalid regular expression for rule '%s'. %s", name, e)
                    if '#' in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name)
                    self.error = True

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'", state)
                self.error = True

            # Validate the error function
            efunc = self.errorf.get(state, None)
            if efunc:
                f = efunc
                line = f.__code__.co_firstlineno
                file = f.__code__.co_filename
                module = inspect.getmodule(f)
                self.modules.add(module)
                self._check_argcount(f, file, line)

        for module in self.modules:
            self.validate_module(module)

    # -----------------------------------------------------------------------------
    # validate_module()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the parser input file.  This is done using a simple regular expression
    # match on each line in the source code of the given module.
    # -----------------------------------------------------------------------------

    def validate_module(self, module):
        """Report ``t_`` names that are defined more than once in ``module``'s source.

        Does nothing when the source cannot be retrieved.
        """
        try:
            lines, linen = inspect.getsourcelines(module)
        except (IOError, TypeError):
            # IOError: source not available.  TypeError: built-in module, or
            # None when inspect.getmodule() could not identify the module.
            return

        fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
        sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

        counthash = {}
        linen += 1
        for line in lines:
            m = fre.match(line)
            if not m:
                m = sre.match(line)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    filename = inspect.getsourcefile(module)
                    self.log.error('%s:%d: Rule %s redefined. Previously defined on line %d', filename, linen, name, prev)
                    self.error = True
            linen += 1

857

858# -----------------------------------------------------------------------------

859# lex(module)

860#

861# Build all of the regular expression rules from definitions in the supplied module

862# -----------------------------------------------------------------------------

def lex(module=None, object=None, debug=False, optimize=False, lextab='lextab',
        reflags=int(re.VERBOSE), nowarn=False, outputdir=None, debuglog=None, errorlog=None):
    """Build a Lexer from the rule definitions found in a module or object.

    Parameters:
        module:    Object whose attributes (enumerated with dir()/getattr())
                   supply the lexer definitions.  When neither ``module`` nor
                   ``object`` is given, the definitions are obtained from
                   get_caller_module_dict(2).
        object:    If truthy, used in place of ``module``.
        debug:     When true, informational messages are written to
                   ``debuglog``.
        optimize:  When true, rule validation is skipped and a previously
                   written table is loaded with Lexer.readtab() if it can be
                   imported; otherwise the lexer is built normally and the
                   table is written with Lexer.writetab().
        lextab:    Table module name (or module object).  ``None`` is treated
                   as 'lextab'.  A dot-free string name is prefixed with the
                   ``__package__`` found in the definitions, when there is one.
        reflags:   Regular expression flags handed to LexerReflect and
                   _form_master_re, and stored on the lexer as ``lexreflags``.
        nowarn:    Accepted for interface compatibility; not used here.
        outputdir: Directory passed to Lexer.writetab().  When ``None`` it is
                   derived from the table module/package or from the
                   ``__file__`` of the defining module.
        debuglog:  Logger for debug output; defaults to PlyLogger(sys.stderr)
                   when ``debug`` is true.
        errorlog:  Logger for warnings/errors; defaults to
                   PlyLogger(sys.stderr).

    Returns:
        The configured Lexer instance.

    Raises:
        SyntaxError: if ``optimize`` is false and LexerReflect.validate_all()
                     reports a problem.

    Side effects:
        Rebinds the module-level names ``lexer``, ``token`` and ``input`` to
        the new lexer and its bound methods.
    """
    global lexer, token, input

    if lextab is None:
        lextab = 'lextab'

    lexobj = Lexer()
    lexobj.lexoptimize = optimize

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object:
        module = object

    if module:
        _items = [(k, getattr(module, k)) for k in dir(module)]
        ldict = dict(_items)
        # If no __file__ attribute is available, try to obtain it from the __module__ instead
        if '__file__' not in ldict:
            ldict['__file__'] = sys.modules[ldict['__module__']].__file__
    else:
        # NOTE(review): the argument 2 looks like a stack depth relative to
        # this call site; keep this call directly inside lex() -- TODO confirm.
        ldict = get_caller_module_dict(2)

    # Determine if the module is part of a package or not.
    # If so, fix the tabmodule setting so that tables load correctly
    pkg = ldict.get('__package__')
    if pkg and isinstance(lextab, str):
        if '.' not in lextab:
            lextab = pkg + '.' + lextab

    # Collect lexer information from the dictionary
    linfo = LexerReflect(ldict, log=errorlog, reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")

    # In optimize mode, try to reuse a previously generated table first
    if optimize and lextab:
        try:
            lexobj.readtab(lextab, ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            # No usable table module: fall through and build from scratch
            pass

    # Dump some basic debugging information
    if debug:
        debuglog.info('lex: tokens = %r', linfo.tokens)
        debuglog.info('lex: literals = %r', linfo.literals)
        debuglog.info('lex: states = %r', linfo.stateinfo)

    # Build the set of valid token names
    lexobj.lextokens = set(linfo.tokens)

    # Get literals specification.  A list/tuple of literals is joined into a
    # single string of the same string type as its first element.
    if isinstance(linfo.literals, (list, tuple)):
        lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
    else:
        lexobj.lexliterals = linfo.literals

    lexobj.lextokens_all = lexobj.lextokens | set(lexobj.lexliterals)

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo

    # Collect the named-group regex fragments for every state
    regexs = {}
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            regex_list.append('(?P<%s>%s)' % (fname, _get_regex(f)))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, _get_regex(f), state)

        # Now add all of the simple rules
        for name, r in linfo.strsym[state]:
            regex_list.append('(?P<%s>%s)' % (name, r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state)

        regexs[state] = regex_list

    # Build the master regular expressions

    if debug:
        debuglog.info('lex: ==== MASTER REGEXS FOLLOW ====')

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i, text in enumerate(re_text):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, text)

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state, stype in stateinfo.items():
        if state != 'INITIAL' and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere['INITIAL']
    lexobj.lexretext = lexobj.lexstateretext['INITIAL']
    lexobj.lexreflags = reflags

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get('INITIAL', '')

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get('INITIAL', None)
    if not lexobj.lexerrorf:
        errorlog.warning('No t_error rule is defined')

    # Set up eof functions
    lexobj.lexstateeoff = linfo.eoff
    lexobj.lexeoff = linfo.eoff.get('INITIAL', None)

    # Check state information for ignore and error rules.  Exclusive states
    # only get a warning; inclusive states inherit the INITIAL settings.
    # linfo.errorf / linfo.ignore are the same dict objects stored on lexobj
    # above, so the defaults filled in here are visible to the lexer.
    for s, stype in stateinfo.items():
        if stype == 'exclusive':
            if s not in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if s not in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == 'inclusive':
            if s not in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get('INITIAL', None)
            if s not in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get('INITIAL', '')

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        if outputdir is None:
            # If no output directory is set, the location of the output files
            # is determined according to the following rules:
            #     - If lextab specifies a package, files go into that package directory
            #     - Otherwise, files go in the same directory as the specifying module
            if isinstance(lextab, types.ModuleType):
                srcfile = lextab.__file__
            else:
                if '.' not in lextab:
                    srcfile = ldict['__file__']
                else:
                    parts = lextab.split('.')
                    pkgname = '.'.join(parts[:-1])
                    # Import the package so that it is present in sys.modules;
                    # done with __import__ rather than exec() on built source.
                    __import__(pkgname)
                    srcfile = getattr(sys.modules[pkgname], '__file__', '')
            outputdir = os.path.dirname(srcfile)
        try:
            lexobj.writetab(lextab, outputdir)
        except IOError as e:
            # Failing to cache the table is not fatal.  Arguments are passed
            # separately (like every other log call here) so that a '%' in
            # the path or error text cannot be misread as a format directive.
            errorlog.warning("Couldn't write lextab module %r. %s", lextab, e)

    return lexobj

1048

1049# -----------------------------------------------------------------------------

1050# runmain()

1051#

1052# This runs the lexer as a main program

1053# -----------------------------------------------------------------------------

1054

def runmain(lexer=None, data=None):
    """Run a lexer over some input and print every token to standard output.

    Parameters:
        lexer: Object providing ``input(data)`` and ``token()``.  When falsy,
               the module-level ``input`` and ``token`` names are used
               instead (lex() rebinds these to the lexer it builds).
        data:  Text to tokenize.  When falsy, the text is read from the file
               named by ``sys.argv[1]``; if there is no such argument it is
               read from standard input.

    Each token is written as ``(type,value,lineno,lexpos)`` on its own line.
    Tokenizing stops at the first falsy value returned by ``token()``.
    """
    if not data:
        try:
            filename = sys.argv[1]
        except IndexError:
            sys.stdout.write('Reading from standard input (type EOF to end):\n')
            data = sys.stdin.read()
        else:
            # The context manager guarantees the file is closed even if
            # read() raises.
            with open(filename) as f:
                data = f.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while True:
        tok = _token()
        if not tok:
            break
        sys.stdout.write('(%s,%r,%d,%d)\n' % (tok.type, tok.value, tok.lineno, tok.lexpos))

1081

1082# -----------------------------------------------------------------------------

1083# @TOKEN(regex)

1084#

1085# This decorator function can be used to set the regex expression on a function

1086# when its docstring might need to be set in an alternative way

1087# -----------------------------------------------------------------------------

1088

def TOKEN(r):
    """Return a decorator that attaches a regular expression to a function.

    The decorated function is returned unchanged apart from a new ``regex``
    attribute.  If ``r`` has a ``__call__`` attribute, the attribute is set
    to ``_get_regex(r)``; otherwise ``r`` itself is stored.
    """
    def set_regex(f):
        # Callable argument: take its regex via _get_regex; else use r as given
        f.regex = _get_regex(r) if hasattr(r, '__call__') else r
        return f

    return set_regex

1097

1098# Alternative spelling of the TOKEN decorator

1099Token = TOKEN