Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pycparser/c_lexer.py: 82%

249 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:10 +0000

1#------------------------------------------------------------------------------ 

2# pycparser: c_lexer.py 

3# 

4# CLexer class: lexer for the C language 

5# 

6# Eli Bendersky [https://eli.thegreenplace.net/] 

7# License: BSD 

8#------------------------------------------------------------------------------ 

9import re 

10 

11from .ply import lex 

12from .ply.lex import TOKEN 

13 

14 

class CLexer(object):
    """ A lexer for the C language. After building it, set the
        input text with input(), and call token() to get new
        tokens.

        The public attribute filename can be set to an initial
        filename, but the lexer will update it upon #line
        directives.
    """
    def __init__(self, error_func, on_lbrace_func, on_rbrace_func,
                 type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            on_lbrace_func, on_rbrace_func:
                Called when an LBRACE or RBRACE is encountered
                (likely to push/pop type_lookup_func's scope)

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        # Current filename; starts empty, updated by #line directives
        self.filename = ''

        # Keeps track of the last token returned from self.token()
        self.last_token = None

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        #
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')
        self.pragma_pattern = re.compile(r'[ \t]*pragma\W')

    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately, because the PLY
            manual warns against calling lex.lex inside
            __init__
        """
        self.lexer = lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        """ Feed *text* (the full C source string) to the lexer. """
        self.lexer.input(text)

    def token(self):
        """ Return the next token (or None at end of input),
            remembering it in self.last_token.
        """
        self.last_token = self.lexer.token()
        return self.last_token

    def find_tok_column(self, token):
        """ Find the column of the token in its line.
        """
        # rfind returns -1 when there's no preceding newline, which
        # conveniently makes the first line's columns start at 1.
        last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
        return token.lexpos - last_cr

    ######################--   PRIVATE   --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        # Report *msg* at the token's (line, column) via the user's
        # error callback, then skip one character and keep lexing.
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _make_tok_location(self, token):
        # (line, column) pair for error reporting.
        return (token.lineno, self.find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
        'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG',
        'REGISTER', 'OFFSETOF',
        'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE', '__INT128',
    )

    # C11 keywords spelled _Xxx in source (e.g. _Bool, _Alignas)
    keywords_new = (
        '_BOOL', '_COMPLEX',
        '_NORETURN', '_THREAD_LOCAL', '_STATIC_ASSERT',
        '_ATOMIC', '_ALIGNOF', '_ALIGNAS',
    )

    # Maps the source spelling of each keyword to its token type
    keyword_map = {}

    # Classic keywords are all-lowercase in source: 'auto' -> 'AUTO'
    for keyword in keywords:
        keyword_map[keyword.lower()] = keyword

    # _Xxx keywords keep '_' + capital first letter: '_Bool' -> '_BOOL'
    for keyword in keywords_new:
        keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + keywords_new + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN', 'INT_CONST_CHAR',
        'FLOAT_CONST', 'HEX_FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',
        'U8CHAR_CONST',
        'U16CHAR_CONST',
        'U32CHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',
        'U8STRING_LITERAL',
        'U16STRING_LITERAL',
        'U32STRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # , .
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
        'PPPRAGMA',     # 'pragma'
        'PPPRAGMASTR',
    )

    ##
    ## Regexes for use in tokens
    ##
    ##

    # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
    identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'

    hex_prefix = '0[xX]'
    hex_digits = '[0-9a-fA-F]+'
    bin_prefix = '0[bB]'
    bin_digits = '[01]+'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = hex_prefix+hex_digits+integer_suffix_opt
    bin_constant = bin_prefix+bin_digits+integer_suffix_opt

    # A '0' followed by octal digits that eventually hits an 8 or 9 --
    # matched (and rejected) before octal_constant gets a chance.
    bad_octal_constant = '0[0-7]*[89]'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    # For the same reason, decimal_escape allows all digit sequences. We want to
    # parse all correct code, even if it means to sometimes parse incorrect
    # code.
    #
    # The original regexes were taken verbatim from the C syntax definition,
    # and were later modified to avoid worst-case exponential running time.
    #
    #   simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    #   decimal_escape = r"""(\d+)"""
    #   hex_escape = r"""(x[0-9a-fA-F]+)"""
    #   bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
    #
    # The following modifications were made to avoid the ambiguity that allowed backtracking:
    # (https://github.com/eliben/pycparser/issues/61)
    #
    # - \x was removed from simple_escape, unless it was not followed by a hex digit, to avoid ambiguity with hex_escape.
    # - hex_escape allows one or more hex characters, but requires that the next character(if any) is not hex
    # - decimal_escape allows one or more decimal characters, but requires that the next character(if any) is not a decimal
    # - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape.
    #
    # Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways.
    # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`.

    simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
    decimal_escape = r"""(\d+)(?!\d)"""
    hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'

    # This complicated regex with lookahead might be slow for strings, so because all of the valid escapes (including \x) allowed
    # 0 or more non-escaped characters after the first character, simple_escape+decimal_escape+hex_escape got simplified to

    escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    u8char_const = 'u8'+char_const
    u16char_const = 'u'+char_const
    u32char_const = 'U'+char_const
    multicharacter_constant = "'"+cconst_char+"{2,4}'"
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    u8string_literal = 'u8'+string_literal
    u16string_literal = 'u'+string_literal
    u32string_literal = 'U'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
    binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
    hex_fractional_constant = '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
    hex_floating_constant = '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+binary_exponent_part+'[FfLl]?)'

    ##
    ## Lexer states: used for preprocessor \n-terminated directives
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),

        # pppragma: pragma
        #
        ('pppragma', 'exclusive'),
    )

    # Dispatch on '#': enter the ppline state for '#line'/'# <num>',
    # the pppragma state for '#pragma', else emit a plain PPHASH token.
    # (Rule-function docstrings are the PLY regexes; do not edit them.)
    def t_PPHASH(self, t):
        r'[ \t]*\#'
        if self.line_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('ppline')
            # Reset the #line bookkeeping collected by the ppline rules
            self.pp_line = self.pp_filename = None
        elif self.pragma_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('pppragma')
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        # The quoted filename of a #line directive; must follow the number.
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    def t_ppline_NEWLINE(self, t):
        r'\n'
        # End of the #line directive: apply the collected line/filename
        # and return to normal lexing.
        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        # The 'line' keyword itself carries no information; discard it.
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        self._error('invalid #line directive', t)

    ##
    ## Rules for the pppragma state
    ##
    def t_pppragma_NEWLINE(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.begin('INITIAL')

    def t_pppragma_PPPRAGMA(self, t):
        r'pragma'
        return t

    t_pppragma_ignore = ' \t'

    def t_pppragma_STR(self, t):
        '.+'
        # Everything after 'pragma' up to the newline, as one token.
        t.type = 'PPPRAGMASTR'
        return t

    def t_pppragma_error(self, t):
        self._error('invalid #pragma directive', t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimiters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    # Scope delimiters
    # To see why on_lbrace_func is needed, consider:
    #   typedef char TT;
    #   void foo(int TT) { TT = 10; }
    #   TT x = 5;
    # Outside the function, TT is a typedef, but inside (starting and ending
    # with the braces) it's a parameter.  The trouble begins with yacc's
    # lookahead token.  If we open a new scope in brace_open, then TT has
    # already been read and incorrectly interpreted as TYPEID.  So, we need
    # to open and close scopes from within the lexer.
    # Similar for the TT immediately outside the end of the function.
    #
    @TOKEN(r'\{')
    def t_LBRACE(self, t):
        self.on_lbrace_func()
        return t
    @TOKEN(r'\}')
    def t_RBRACE(self, t):
        self.on_rbrace_func()
        return t

    t_STRING_LITERAL = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_floating_constant)
    def t_HEX_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bin_constant)
    def t_INT_CONST_BIN(self, t):
        return t

    # Must come before t_INT_CONST_OCT so that e.g. '018' is reported
    # as an error instead of lexing as '01' followed by '8'.
    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(multicharacter_constant)
    def t_INT_CONST_CHAR(self, t):
        return t

    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(u8char_const)
    def t_U8CHAR_CONST(self, t):
        return t

    @TOKEN(u16char_const)
    def t_U16CHAR_CONST(self, t):
        return t

    @TOKEN(u32char_const)
    def t_U32CHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    @TOKEN(u8string_literal)
    def t_U8STRING_LITERAL(self, t):
        return t

    @TOKEN(u16string_literal)
    def t_U16STRING_LITERAL(self, t):
        return t

    @TOKEN(u32string_literal)
    def t_U32STRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(identifier)
    def t_ID(self, t):
        # Keywords win over plain identifiers; then known typedef
        # names (per type_lookup_func) become TYPEID.
        t.type = self.keyword_map.get(t.value, "ID")
        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"
        return t

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)