Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pycparser/c_lexer.py: 85%


#------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#------------------------------------------------------------------------------
import re

from .ply import lex
from .ply.lex import TOKEN


class CLexer(object):
    """ A lexer for the C language. After building it, set the
        input text with input(), and call token() to get new
        tokens.

        The public attribute filename can be set to an initial
        filename, but the lexer will update it upon #line
        directives.
    """
    def __init__(self, error_func, on_lbrace_func, on_rbrace_func,
                 type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            on_lbrace_func, on_rbrace_func:
                Called when an LBRACE or RBRACE is encountered
                (likely to push/pop type_lookup_func's scope)

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Keeps track of the last token returned from self.token()
        self.last_token = None

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        #
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')
        self.pragma_pattern = re.compile(r'[ \t]*pragma\W')
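
        # For example (illustrative): after the '#', line_pattern matches
        # the rest of both '#line 5 "x.c"' and GCC-cpp style '# 5 "x.c"',
        # while pragma_pattern matches the rest of '#pragma once'.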


    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately, because the PLY
            manual warns against calling lex.lex inside
            __init__
        """
        self.lexer = lex.lex(object=self, **kwargs)
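
    # Minimal usage sketch (hedged; the callbacks below are illustrative
    # placeholders, not part of pycparser's API):
    #
    #   def on_error(msg, line, column):
    #       print('Lex error: %s at %d:%d' % (msg, line, column))
    #
    #   clex = CLexer(on_error,
    #                 on_lbrace_func=lambda: None,
    #                 on_rbrace_func=lambda: None,
    #                 type_lookup_func=lambda name: False)
    #   clex.build()
    #   clex.input('int x = 42;')
    #   tok = clex.token()
    #   while tok is not None:
    #       print(tok)
    #       tok = clex.token()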


    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        self.lexer.input(text)

    def token(self):
        self.last_token = self.lexer.token()
        return self.last_token

    def find_tok_column(self, token):
        """ Find the column of the token in its line.
        """
        last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
        return token.lexpos - last_cr
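
    # Worked example (illustrative): for lexdata 'ab\ncd' and a token at
    # lexpos 3 (the 'c'), rfind returns 2 (the newline), so the column is
    # 3 - 2 = 1. On the first line rfind returns -1, giving lexpos + 1;
    # columns are therefore 1-based.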


    ######################-- PRIVATE --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _make_tok_location(self, token):
        return (token.lineno, self.find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
        'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG',
        'REGISTER', 'OFFSETOF',
        'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE', '__INT128',
    )

    keywords_new = (
        '_BOOL', '_COMPLEX',
        '_NORETURN', '_THREAD_LOCAL', '_STATIC_ASSERT',
        '_ATOMIC', '_ALIGNOF', '_ALIGNAS',
        '_PRAGMA',
    )

    keyword_map = {}

    for keyword in keywords:
        keyword_map[keyword.lower()] = keyword

    for keyword in keywords_new:
        keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword
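
    # For example: keyword_map['while'] == 'WHILE'. For the underscore
    # keywords only the first two characters keep their case, so
    # keyword_map['_Bool'] == '_BOOL' and keyword_map['_Alignas'] == '_ALIGNAS'.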


    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + keywords_new + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN', 'INT_CONST_CHAR',
        'FLOAT_CONST', 'HEX_FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',
        'U8CHAR_CONST',
        'U16CHAR_CONST',
        'U32CHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',
        'U8STRING_LITERAL',
        'U16STRING_LITERAL',
        'U32STRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL', 'RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # , .
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
        'PPPRAGMA',     # 'pragma'
        'PPPRAGMASTR',
    )

    ##
    ## Regexes for use in tokens
    ##
    ##

    # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
    identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'

    hex_prefix = '0[xX]'
    hex_digits = '[0-9a-fA-F]+'
    bin_prefix = '0[bB]'
    bin_digits = '[01]+'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = hex_prefix+hex_digits+integer_suffix_opt
    bin_constant = bin_prefix+bin_digits+integer_suffix_opt
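
    # Illustrative matches: decimal_constant covers '0' and '42ULL',
    # octal_constant covers '0755', hex_constant covers '0x1fU', and
    # bin_constant covers '0b1010' (a GCC extension).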


    bad_octal_constant = '0[0-7]*[89]'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    # For the same reason, decimal_escape allows all digit sequences. We want to
    # parse all correct code, even if it means to sometimes parse incorrect
    # code.
    #
    # The original regexes were taken verbatim from the C syntax definition,
    # and were later modified to avoid worst-case exponential running time.
    #
    # simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    # decimal_escape = r"""(\d+)"""
    # hex_escape = r"""(x[0-9a-fA-F]+)"""
    # bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
    #
    # The following modifications were made to avoid the ambiguity that allowed
    # backtracking (https://github.com/eliben/pycparser/issues/61):
    #
    # - \x was removed from simple_escape (it is now matched only when *not*
    #   followed by a hex digit), to avoid ambiguity with hex_escape.
    # - hex_escape allows one or more hex characters, but requires that the
    #   next character (if any) is not hex.
    # - decimal_escape allows one or more decimal characters, but requires that
    #   the next character (if any) is not a decimal digit.
    # - bad_escape does not allow any decimals (8-9), to avoid conflicting with
    #   the permissive decimal_escape.
    #
    # Without this change, Python's `re` module would recursively try parsing
    # each ambiguous escape sequence in multiple ways. e.g. `\123` could be
    # parsed as `\1`+`23`, `\12`+`3`, and `\123`.

    simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
    decimal_escape = r"""(\d+)(?!\d)"""
    hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'

    # This complicated regex with lookahead might be slow for strings, but
    # since all of the valid escapes (including \x) allow 0 or more
    # non-escaped characters after the first character,
    # simple_escape+decimal_escape+hex_escape can be simplified for use
    # inside strings to:

    escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""
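
    # Illustrative check of the lookahead fix (hypothetical snippet, not part
    # of this module):
    #
    #   import re
    #   assert re.fullmatch(CLexer.char_const, r"'\123'")
    #
    # The (?!\d) lookahead makes '\123' match as a single escape sequence, so
    # the engine never backtracks through the '\1'+'23' and '\12'+'3' splits
    # described above.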


    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    u8char_const = 'u8'+char_const
    u16char_const = 'u'+char_const
    u32char_const = 'U'+char_const
    multicharacter_constant = "'"+cconst_char+"{2,4}'"
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    u8string_literal = 'u8'+string_literal
    u16string_literal = 'u'+string_literal
    u32string_literal = 'U'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
    binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
    hex_fractional_constant = '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
    hex_floating_constant = '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+binary_exponent_part+'[FfLl]?)'
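
    # Illustrative matches: floating_constant covers '1.5', '.5e-3' and
    # '10e4f'; hex_floating_constant covers '0x1.8p3' (the binary exponent
    # is mandatory for hex floats).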


    ##
    ## Lexer states: used for preprocessor \n-terminated directives
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),

        # pppragma: pragma
        #
        ('pppragma', 'exclusive'),
    )

    def t_PPHASH(self, t):
        r'[ \t]*\#'
        if self.line_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
        elif self.pragma_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('pppragma')
        else:
            t.type = 'PPHASH'
            return t
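
    # Dispatch example (illustrative): for input '# 5 "foo.c"\n' the hash
    # sends the lexer into the ppline state, whose rules below record the
    # line number and filename; '#pragma once\n' enters the pppragma state;
    # any other '#' is returned as a PPHASH token.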


    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    def t_ppline_NEWLINE(self, t):
        r'\n'
        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        self._error('invalid #line directive', t)

    ##
    ## Rules for the pppragma state
    ##
    def t_pppragma_NEWLINE(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.begin('INITIAL')

    def t_pppragma_PPPRAGMA(self, t):
        r'pragma'
        return t

    t_pppragma_ignore = ' \t'

    def t_pppragma_STR(self, t):
        '.+'
        t.type = 'PPPRAGMASTR'
        return t

    def t_pppragma_error(self, t):
        self._error('invalid #pragma directive', t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimiters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    # Scope delimiters
    # To see why on_lbrace_func is needed, consider:
    #   typedef char TT;
    #   void foo(int TT) { TT = 10; }
    #   TT x = 5;
    # Outside the function, TT is a typedef, but inside (starting and ending
    # with the braces) it's a parameter. The trouble begins with yacc's
    # lookahead token. If we open a new scope in brace_open, then TT has
    # already been read and incorrectly interpreted as TYPEID. So, we need
    # to open and close scopes from within the lexer.
    # Similar for the TT immediately outside the end of the function.
    #
    @TOKEN(r'\{')
    def t_LBRACE(self, t):
        self.on_lbrace_func()
        return t
    @TOKEN(r'\}')
    def t_RBRACE(self, t):
        self.on_rbrace_func()
        return t

    t_STRING_LITERAL = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
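    # For example, for the input '0x10' the hex rule must be tried first:
    # with string-based rules ordered by regex length, the (longer) decimal
    # regex would win, matching just '0' and leaving 'x10' to be lexed as
    # an identifier.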

    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_floating_constant)
    def t_HEX_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bin_constant)
    def t_INT_CONST_BIN(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(multicharacter_constant)
    def t_INT_CONST_CHAR(self, t):
        return t

    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(u8char_const)
    def t_U8CHAR_CONST(self, t):
        return t

    @TOKEN(u16char_const)
    def t_U16CHAR_CONST(self, t):
        return t

    @TOKEN(u32char_const)
    def t_U32CHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    @TOKEN(u8string_literal)
    def t_U8STRING_LITERAL(self, t):
        return t

    @TOKEN(u16string_literal)
    def t_U16STRING_LITERAL(self, t):
        return t

    @TOKEN(u32string_literal)
    def t_U32STRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(identifier)
    def t_ID(self, t):
        t.type = self.keyword_map.get(t.value, "ID")
        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"
        return t
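
    # Resolution example (illustrative): 'while' maps to WHILE via
    # keyword_map; a name for which type_lookup_func returns True (i.e. one
    # introduced by an earlier typedef) becomes TYPEID; anything else is ID.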


    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)