#------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#------------------------------------------------------------------------------
import re

from .ply import lex
from .ply.lex import TOKEN


class CLexer(object):
16 """ A lexer for the C language. After building it, set the 

17 input text with input(), and call token() to get new 

18 tokens. 

19 

20 The public attribute filename can be set to an initial 

21 filename, but the lexer will update it upon #line 

22 directives. 

23 """ 

24 def __init__(self, error_func, on_lbrace_func, on_rbrace_func, 

25 type_lookup_func): 

26 """ Create a new Lexer. 

27 

28 error_func: 

29 An error function. Will be called with an error 

30 message, line and column as arguments, in case of 

31 an error during lexing. 

32 

33 on_lbrace_func, on_rbrace_func: 

34 Called when an LBRACE or RBRACE is encountered 

35 (likely to push/pop type_lookup_func's scope) 

36 

37 type_lookup_func: 

38 A type lookup function. Given a string, it must 

39 return True IFF this string is a name of a type 

40 that was defined with a typedef earlier. 

41 """ 

42 self.error_func = error_func 

43 self.on_lbrace_func = on_lbrace_func 

44 self.on_rbrace_func = on_rbrace_func 

45 self.type_lookup_func = type_lookup_func 

46 self.filename = '' 

47 

48 # Keeps track of the last token returned from self.token() 

49 self.last_token = None 

50 

51 # Allow either "# line" or "# <num>" to support GCC's 

52 # cpp output 

53 # 

54 self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)') 

55 self.pragma_pattern = re.compile(r'[ \t]*pragma\W') 

56 

57 def build(self, **kwargs): 

58 """ Builds the lexer from the specification. Must be 

59 called after the lexer object is created. 

60 

61 This method exists separately, because the PLY 

62 manual warns against calling lex.lex inside 

63 __init__ 

64 """ 

65 self.lexer = lex.lex(object=self, **kwargs) 

66 

67 def reset_lineno(self): 

68 """ Resets the internal line number counter of the lexer. 

69 """ 

70 self.lexer.lineno = 1 

71 

    def input(self, text):
        self.lexer.input(text)

    def token(self):
        self.last_token = self.lexer.token()
        return self.last_token

    def find_tok_column(self, token):
        """ Find the column of the token in its line.
        """
        last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
        return token.lexpos - last_cr
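
    # A minimal usage sketch of the public API above (illustrative only; the
    # stub callbacks are hypothetical, a real caller such as pycparser's
    # CParser wires them into its scope handling):
    #
    #   clex = CLexer(
    #       error_func=lambda msg, line, col: print('%s:%s %s' % (line, col, msg)),
    #       on_lbrace_func=lambda: None,
    #       on_rbrace_func=lambda: None,
    #       type_lookup_func=lambda name: False)
    #   clex.build()
    #   clex.input('int x = 42;')
    #   tok = clex.token()
    #   while tok is not None:
    #       print(tok.type, tok.value)  # INT 'int', ID 'x', EQUALS '=', ...
    #       tok = clex.token()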

    ######################-- PRIVATE --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _make_tok_location(self, token):
        return (token.lineno, self.find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
        'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG',
        'REGISTER', 'OFFSETOF',
        'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE', '__INT128',
    )

    keywords_new = (
        '_BOOL', '_COMPLEX',
        '_NORETURN', '_THREAD_LOCAL', '_STATIC_ASSERT',
        '_ATOMIC', '_ALIGNOF', '_ALIGNAS',
        '_PRAGMA',
    )

    keyword_map = {}

    for keyword in keywords:
        keyword_map[keyword.lower()] = keyword

    for keyword in keywords_new:
        keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword
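
    # For illustration, the lower()/slicing above produces entries such as
    # (hypothetical REPL checks, not part of the module):
    #   keyword_map['int']   == 'INT'
    #   keyword_map['_Bool'] == '_BOOL'    # '_B' kept upper, rest lowered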

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + keywords_new + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN', 'INT_CONST_CHAR',
        'FLOAT_CONST', 'HEX_FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',
        'U8CHAR_CONST',
        'U16CHAR_CONST',
        'U32CHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',
        'U8STRING_LITERAL',
        'U16STRING_LITERAL',
        'U32STRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL', 'RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # , .
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
        'PPPRAGMA',     # 'pragma'
        'PPPRAGMASTR',
    )

    ##
    ## Regexes for use in tokens
    ##

    # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
    identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'

    hex_prefix = '0[xX]'
    hex_digits = '[0-9a-fA-F]+'
    bin_prefix = '0[bB]'
    bin_digits = '[01]+'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = hex_prefix+hex_digits+integer_suffix_opt
    bin_constant = bin_prefix+bin_digits+integer_suffix_opt

    bad_octal_constant = '0[0-7]*[89]'
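
    # A few illustrative matches for the regexes above (not exhaustive):
    #   decimal_constant:   '0', '42', '42u', '42ull'
    #   octal_constant:     '052', '052l'
    #   hex_constant:       '0x2A', '0X2aU'
    #   bin_constant:       '0b101010'
    #   bad_octal_constant: '018' (8 and 9 are not octal digits)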

    # comments are not supported
    unsupported_c_style_comment = r'\/\*'
    unsupported_cxx_style_comment = r'\/\/'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    # For the same reason, decimal_escape allows all digit sequences. We want to
    # parse all correct code, even if it means sometimes parsing incorrect
    # code as well.
    #
    # The original regexes were taken verbatim from the C syntax definition,
    # and were later modified to avoid worst-case exponential running time.
    #
    #   simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    #   decimal_escape = r"""(\d+)"""
    #   hex_escape = r"""(x[0-9a-fA-F]+)"""
    #   bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
    #
    # The following modifications were made to avoid the ambiguity that
    # allowed backtracking (https://github.com/eliben/pycparser/issues/61):
    #
    # - \x was removed from simple_escape; an 'x' now counts as a simple escape
    #   only when it is not followed by a hex digit, which avoids ambiguity
    #   with hex_escape.
    # - hex_escape allows one or more hex characters, but requires that the
    #   next character (if any) is not hex
    # - decimal_escape allows one or more decimal characters, but requires that
    #   the next character (if any) is not a decimal
    # - bad_escape does not allow any decimals (8-9), to avoid conflicting with
    #   the permissive decimal_escape.
    #
    # Without this change, Python's `re` module would recursively try parsing
    # each ambiguous escape sequence in multiple ways.
    # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`.

    simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
    decimal_escape = r"""(\d+)(?!\d)"""
    hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
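
    # A quick illustrative check of the lookahead behavior (hypothetical
    # doctest-style snippet, not part of the module): each escape regex now
    # matches maximally or not at all, so `re` cannot backtrack into shorter
    # alternative parses.
    #   >>> import re
    #   >>> re.match(decimal_escape, '123').group()
    #   '123'
    #   >>> re.match(hex_escape, 'x1F').group()
    #   'x1F'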

    # The complicated regexes with lookahead above might be slow for strings.
    # Because all of the valid escapes (including \x) allow 0 or more
    # non-escaped characters after the first character,
    # simple_escape+decimal_escape+hex_escape can be simplified for strings to:

    escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    u8char_const = 'u8'+char_const
    u16char_const = 'u'+char_const
    u32char_const = 'U'+char_const
    multicharacter_constant = "'"+cconst_char+"{2,4}'"
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    u8string_literal = 'u8'+string_literal
    u16string_literal = 'u'+string_literal
    u32string_literal = 'U'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
    binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
    hex_fractional_constant = '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
    hex_floating_constant = '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+binary_exponent_part+'[FfLl]?)'
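
    # Illustrative matches (not exhaustive): floating_constant accepts '1.5',
    # '.5e-3f' and '1e10L'; hex_floating_constant accepts '0x1.8p3' (the
    # binary exponent is mandatory for hex floats, as in the C grammar).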

    ##
    ## Lexer states: used for preprocessor \n-terminated directives
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),

        # pppragma: pragma
        #
        ('pppragma', 'exclusive'),
    )

    def t_PPHASH(self, t):
        r'[ \t]*\#'
        if self.line_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
        elif self.pragma_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('pppragma')
        else:
            t.type = 'PPHASH'
            return t
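
    # Illustrative example (not part of the module): given GCC cpp output
    # containing the directive
    #   # 42 "foo.c"
    # t_PPHASH switches the lexer into the 'ppline' state; the rules below
    # then record pp_line = '42' and pp_filename = 'foo.c', and the directive
    # itself yields no tokens.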

    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    def t_ppline_NEWLINE(self, t):
        r'\n'
        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        self._error('invalid #line directive', t)

    ##
    ## Rules for the pppragma state
    ##
    def t_pppragma_NEWLINE(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.begin('INITIAL')

    def t_pppragma_PPPRAGMA(self, t):
        r'pragma'
        return t

    t_pppragma_ignore = ' \t'

    def t_pppragma_STR(self, t):
        '.+'
        t.type = 'PPPRAGMASTR'
        return t

    def t_pppragma_error(self, t):
        self._error('invalid #pragma directive', t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimiters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    # Scope delimiters
    # To see why on_lbrace_func is needed, consider:
    #   typedef char TT;
    #   void foo(int TT) { TT = 10; }
    #   TT x = 5;
    # Outside the function, TT is a typedef, but inside (starting and ending
    # with the braces) it's a parameter. The trouble begins with yacc's
    # lookahead token. If we open a new scope in brace_open, then TT has
    # already been read and incorrectly interpreted as TYPEID. So, we need
    # to open and close scopes from within the lexer.
    # Similar for the TT immediately outside the end of the function.
    #
    @TOKEN(r'\{')
    def t_LBRACE(self, t):
        self.on_lbrace_func()
        return t

    @TOKEN(r'\}')
    def t_RBRACE(self, t):
        self.on_rbrace_func()
        return t
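
    # A sketch of how a caller might wire these callbacks to a scope stack
    # (hypothetical names, shown for clarity; pycparser's CParser does
    # something similar with its own scope machinery):
    #
    #   scopes = [set()]                                  # innermost scope last
    #   on_lbrace_func = lambda: scopes.append(set())     # '{' opens a scope
    #   on_rbrace_func = lambda: scopes.pop()             # '}' closes it
    #   type_lookup_func = lambda name: any(name in s for s in scopes)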

    t_STRING_LITERAL = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_floating_constant)
    def t_HEX_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bin_constant)
    def t_INT_CONST_BIN(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(unsupported_c_style_comment)
    def t_UNSUPPORTED_C_STYLE_COMMENT(self, t):
        msg = "Comments are not supported, see https://github.com/eliben/pycparser#3using."
        self._error(msg, t)

    @TOKEN(unsupported_cxx_style_comment)
    def t_UNSUPPORTED_CXX_STYLE_COMMENT(self, t):
        msg = "Comments are not supported, see https://github.com/eliben/pycparser#3using."
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(multicharacter_constant)
    def t_INT_CONST_CHAR(self, t):
        return t

    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(u8char_const)
    def t_U8CHAR_CONST(self, t):
        return t

    @TOKEN(u16char_const)
    def t_U16CHAR_CONST(self, t):
        return t

    @TOKEN(u32char_const)
    def t_U32CHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    @TOKEN(u8string_literal)
    def t_U8STRING_LITERAL(self, t):
        return t

    @TOKEN(u16string_literal)
    def t_U16STRING_LITERAL(self, t):
        return t

    @TOKEN(u32string_literal)
    def t_U32STRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(identifier)
    def t_ID(self, t):
        t.type = self.keyword_map.get(t.value, "ID")
        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"
        return t
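
    # For example (illustrative): after the parser has seen "typedef char TT;"
    # and recorded 'TT' in its scope, type_lookup_func('TT') returns True and
    # the lexer emits TYPEID for 'TT' instead of ID, while 'int' still
    # resolves through keyword_map to the INT keyword token.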

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)