Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dissect/cstruct/parser.py: 57% (416 statements)

from __future__ import annotations

import ast
import re
from typing import TYPE_CHECKING, Any

from dissect.cstruct import compiler
from dissect.cstruct.exceptions import (
    ExpressionParserError,
    ExpressionTokenizerError,
    ParserError,
)
from dissect.cstruct.expression import Expression
from dissect.cstruct.types import BaseArray, BaseType, Field, Structure

if TYPE_CHECKING:
    from dissect.cstruct import cstruct


class Parser:
    """Base class for definition parsers.

    Args:
        cs: An instance of cstruct.
    """

    def __init__(self, cs: cstruct):
        self.cstruct = cs

    def parse(self, data: str) -> None:
        """This function should parse definitions to cstruct types.

        Args:
            data: Data to parse definitions from, usually a string.
        """
        raise NotImplementedError


class TokenParser(Parser):
    """Token-based definition parser.

    Args:
        cs: An instance of cstruct.
        compiled: Whether structs should be compiled or not.
        align: Whether structs should be aligned or not.
    """

    def __init__(self, cs: cstruct, compiled: bool = True, align: bool = False):
        super().__init__(cs)

        self.compiled = compiled
        self.align = align
        self.TOK = self._tokencollection()

    @staticmethod
    def _tokencollection() -> TokenCollection:
        TOK = TokenCollection()
        TOK.add(r"#\[(?P<values>[^\]]+)\](?=\s*)", "CONFIG_FLAG")
        TOK.add(r"#define\s+(?P<name>[^\s]+)\s+(?P<value>[^\r\n]+)\s*", "DEFINE")
        TOK.add(r"typedef(?=\s)", "TYPEDEF")
        TOK.add(r"(?:struct|union)(?=\s|{)", "STRUCT")
        TOK.add(
            r"(?P<enumtype>enum|flag)\s+(?P<name>[^\s:{]+)?\s*(:\s"
            r"*(?P<type>[^{]+?)\s*)?\{(?P<values>[^}]+)\}\s*(?=;)",
            "ENUM",
        )
        TOK.add(r"(?<=})\s*(?P<defs>(?:[a-zA-Z0-9_]+\s*,\s*)+[a-zA-Z0-9_]+)\s*(?=;)", "DEFS")
        TOK.add(r"(?P<name>\**?\s*[a-zA-Z0-9_]+)(?:\s*:\s*(?P<bits>\d+))?(?:\[(?P<count>[^;\n]*)\])?\s*(?=;)", "NAME")
        TOK.add(r"[a-zA-Z_][a-zA-Z0-9_]*", "IDENTIFIER")
        TOK.add(r"[{}]", "BLOCK")
        TOK.add(r"\$(?P<name>[^\s]+) = (?P<value>{[^}]+})\w*[\r\n]+", "LOOKUP")
        TOK.add(r";", "EOL")
        TOK.add(r"\s+", None)
        TOK.add(r".", None)

        return TOK
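    # Illustrative note (not in the original source): scanning a definition like
    #
    #   #define MAGIC 0xcafe
    #   struct header { uint16 magic; };
    #
    # with this table yields the token stream DEFINE, STRUCT,
    # IDENTIFIER("header"), BLOCK("{"), IDENTIFIER("uint16"), NAME("magic"),
    # EOL, BLOCK("}"), EOL; whitespace and stray characters fall through to
    # the two None patterns and are dropped.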

    def _identifier(self, tokens: TokenConsumer) -> str:
        idents = []
        while tokens.next == self.TOK.IDENTIFIER:
            idents.append(tokens.consume())
        return " ".join([i.value for i in idents])

    def _constant(self, tokens: TokenConsumer) -> None:
        const = tokens.consume()
        pattern = self.TOK.patterns[self.TOK.DEFINE]
        match = pattern.match(const.value).groupdict()

        value = match["value"]
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            pass

        if isinstance(value, str):
            try:
                value = Expression(self.cstruct, value).evaluate()
            except (ExpressionParserError, ExpressionTokenizerError):
                pass

        self.cstruct.consts[match["name"]] = value
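    # Illustrative note (not in the original source): `#define FOO 0x10` stores
    # cs.consts["FOO"] = 16 via ast.literal_eval, while `#define BAR FOO + 1`
    # falls through to the Expression evaluator and stores 17; a value neither
    # step can evaluate is kept as its raw string.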

    def _enum(self, tokens: TokenConsumer) -> None:
        # We cheat with enums because the entire enum is in the token
        etok = tokens.consume()

        pattern = self.TOK.patterns[self.TOK.ENUM]
        # Dirty trick because the regex expects a ; but we don't want it to be part of the value
        d = pattern.match(etok.value + ";").groupdict()
        enumtype = d["enumtype"]

        nextval = 0
        if enumtype == "flag":
            nextval = 1

        values = {}
        for line in d["values"].splitlines():
            for v in line.split(","):
                key, _, val = v.partition("=")
                key = key.strip()
                val = val.strip()
                if not key:
                    continue

                val = nextval if not val else Expression(self.cstruct, val).evaluate(values)

                if enumtype == "flag":
                    high_bit = val.bit_length() - 1
                    nextval = 2 ** (high_bit + 1)
                else:
                    nextval = val + 1

                values[key] = val

        if not d["type"]:
            d["type"] = "uint32"

        factory = self.cstruct._make_flag if enumtype == "flag" else self.cstruct._make_enum

        enum = factory(d["name"] or "", self.cstruct.resolve(d["type"]), values)
        if not enum.__name__:
            self.cstruct.consts.update(enum.__members__)
        else:
            self.cstruct.add_type(enum.__name__, enum)

        tokens.eol()
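    # Illustrative note (not in the original source): enum members auto-increment
    # from the previous value, while flag members continue at the next power of two:
    #
    #   enum color : uint8 { RED = 2, GREEN, BLUE };  // GREEN = 3, BLUE = 4
    #   flag perms { R = 1, W, X };                   // W = 2, X = 4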

    def _typedef(self, tokens: TokenConsumer) -> None:
        tokens.consume()
        type_ = None

        names = []

        if tokens.next == self.TOK.IDENTIFIER:
            type_ = self.cstruct.resolve(self._identifier(tokens))
        elif tokens.next == self.TOK.STRUCT:
            type_ = self._struct(tokens)
            if not type_.__anonymous__:
                names.append(type_.__name__)

        names.extend(self._names(tokens))
        for name in names:
            if issubclass(type_, Structure) and type_.__anonymous__:
                type_.__anonymous__ = False
                type_.__name__ = name
                type_.__qualname__ = name

            type_, name, bits = self._parse_field_type(type_, name)
            if bits is not None:
                raise ParserError(f"line {self._lineno(tokens.previous)}: typedefs cannot have bitfields")
            self.cstruct.add_type(name, type_)
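    # Illustrative note (not in the original source): `typedef uint32 DWORD;`
    # aliases an existing type, while `typedef struct { uint16 a; } pair_t;`
    # takes the anonymous struct returned by _struct() and renames it to
    # pair_t before registering it.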

    def _struct(self, tokens: TokenConsumer, register: bool = False) -> type[Structure]:
        stype = tokens.consume()

        factory = self.cstruct._make_union if stype.value.startswith("union") else self.cstruct._make_struct

        st = None
        names = []
        registered = False

        if tokens.next == self.TOK.IDENTIFIER:
            ident = tokens.consume()
            if register:
                # Pre-register an empty struct for self-referencing
                # We update this instance later with the fields
                st = factory(ident.value, [], align=self.align)
                if self.compiled and "nocompile" not in tokens.flags:
                    st = compiler.compile(st)
                self.cstruct.add_type(ident.value, st)
                registered = True
            else:
                names.append(ident.value)

        if tokens.next == self.TOK.NAME:
            # As part of a struct field
            # struct type_name field_name;
            if not len(names):
                raise ParserError(f"line {self._lineno(tokens.next)}: unexpected anonymous struct")
            return self.cstruct.resolve(names[0])

        if tokens.next != self.TOK.BLOCK:
            raise ParserError(f"line {self._lineno(tokens.next)}: expected start of block '{tokens.next}'")

        fields = []
        tokens.consume()
        while len(tokens):
            if tokens.next == self.TOK.BLOCK and tokens.next.value == "}":
                tokens.consume()
                break

            field = self._parse_field(tokens)
            fields.append(field)

        if register:
            names.extend(self._names(tokens))

        # If the next token is EOL, consume it
        # Otherwise we're part of a typedef or field definition
        if tokens.next == self.TOK.EOL:
            tokens.eol()

        name = names[0] if names else None

        if st is None:
            is_anonymous = False
            if not name:
                is_anonymous = True
                name = self.cstruct._next_anonymous()

            st = factory(name, fields, align=self.align, anonymous=is_anonymous)
            if self.compiled and "nocompile" not in tokens.flags:
                st = compiler.compile(st)
        else:
            st.__fields__.extend(fields)
            st.commit()

        # This is pretty dirty
        if register:
            if not names and not registered:
                raise ParserError(f"line {self._lineno(stype)}: struct has no name")

            for name in names:
                self.cstruct.add_type(name, st)

        tokens.reset_flags()
        return st
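    # Illustrative note (not in the original source): this method handles all of
    #
    #   struct foo { uint32 a; };           // top level (register=True); "foo" is
    #                                       //   pre-registered so fields may self-reference
    #   struct foo bar;                     // as a field type; resolves the existing "foo"
    #   typedef struct { uint32 a; } name;  // anonymous; named by the surrounding typedef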

    def _lookup(self, tokens: TokenConsumer) -> None:
        # Just like enums, we cheat and have the entire lookup in the token
        ltok = tokens.consume()

        pattern = self.TOK.patterns[self.TOK.LOOKUP]
        # Dirty trick because the regex expects a ; but we don't want it to be part of the value
        m = pattern.match(ltok.value + ";")
        d = ast.literal_eval(m.group(2))
        self.cstruct.lookups[m.group(1)] = {self.cstruct.consts[k]: v for k, v in d.items()}
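    # Illustrative note (not in the original source): with consts FOO = 1 and
    # BAR = 2 already defined,
    #
    #   $errors = {"FOO": "foo", "BAR": "bar"}
    #
    # is literal_eval'd and its keys resolved through cs.consts, producing
    # cs.lookups["errors"] == {1: "foo", 2: "bar"}.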

    def _parse_field(self, tokens: TokenConsumer) -> Field:
        type_ = None
        if tokens.next == self.TOK.IDENTIFIER:
            type_ = self.cstruct.resolve(self._identifier(tokens))
        elif tokens.next == self.TOK.STRUCT:
            type_ = self._struct(tokens)

            if tokens.next != self.TOK.NAME:
                return Field(None, type_, None)

        if tokens.next != self.TOK.NAME:
            raise ParserError(f"line {self._lineno(tokens.next)}: expected name")
        nametok = tokens.consume()

        type_, name, bits = self._parse_field_type(type_, nametok.value)

        tokens.eol()
        return Field(name.strip(), type_, bits)

    def _parse_field_type(self, type_: type[BaseType], name: str) -> tuple[type[BaseType], str, int | None]:
        pattern = self.TOK.patterns[self.TOK.NAME]
        # Dirty trick because the regex expects a ; but we don't want it to be part of the value
        d = pattern.match(name + ";").groupdict()

        name = d["name"]
        count_expression = d["count"]

        while name.startswith("*"):
            name = name[1:]
            type_ = self.cstruct._make_pointer(type_)

        if count_expression is not None:
            # Poor man's multi-dimensional array by abusing the eager regex match of count
            counts = count_expression.split("][") if "][" in count_expression else [count_expression]

            for count in reversed(counts):
                if count == "":
                    count = None
                else:
                    count = Expression(self.cstruct, count)
                    try:
                        count = count.evaluate()
                    except Exception:
                        pass

                if issubclass(type_, BaseArray) and count is None:
                    raise ParserError("Depth required for multi-dimensional array")

                type_ = self.cstruct._make_array(type_, count)

        return type_, name.strip(), int(d["bits"]) if d["bits"] else None
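    # Illustrative note (not in the original source): for `uint32 *ptr;` the
    # leading star wraps uint32 in a pointer; for `char buf[2][4];` the eager
    # NAME regex captures the count as "2][4", which is split on "][" and
    # applied in reverse, making buf a 2-element array of 4-char arrays.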

    def _names(self, tokens: TokenConsumer) -> list[str]:
        names = []
        while True:
            if tokens.next == self.TOK.EOL:
                tokens.eol()
                break

            if tokens.next not in (self.TOK.NAME, self.TOK.DEFS):
                break

            ntoken = tokens.consume()
            if ntoken == self.TOK.NAME:
                names.append(ntoken.value.strip())
            elif ntoken == self.TOK.DEFS:
                names.extend([name.strip() for name in ntoken.value.strip().split(",")])

        return names

    @staticmethod
    def _remove_comments(string: str) -> str:
        # https://stackoverflow.com/a/18381470
        pattern = r"(\".*?\"|\'.*?\')|(/\*.*?\*/|//[^\r\n]*$)"
        # first group captures quoted strings (double or single)
        # second group captures comments (//single-line or /* multi-line */)
        regex = re.compile(pattern, re.MULTILINE | re.DOTALL)

        def _replacer(match: re.Match) -> str:
            # if the 2nd group (capturing comments) is not None,
            # it means we have captured a non-quoted (real) comment string.
            if comment := match.group(2):
                return "\n" * comment.count("\n")  # replace the comment with its newlines only
            # otherwise, we will return the 1st group
            return match.group(1)  # captured quoted-string

        return regex.sub(_replacer, string)
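    # Illustrative note (not in the original source): comments are replaced by
    # an equal number of newlines rather than stripped outright, so a
    # /* multi-line */ comment does not shift the lines after it and
    # _lineno() keeps reporting accurate positions.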

    @staticmethod
    def _lineno(tok: Token) -> int:
        """Quick and dirty line number calculator"""

        match = tok.match
        return match.string.count("\n", 0, match.start()) + 1

    def _config_flag(self, tokens: TokenConsumer) -> None:
        flag_token = tokens.consume()
        pattern = self.TOK.patterns[self.TOK.CONFIG_FLAG]
        tok_dict = pattern.match(flag_token.value).groupdict()
        tokens.flags.extend(tok_dict["values"].split(","))
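    # Illustrative note (not in the original source): a line like `#[nocompile]`
    # in front of a struct adds "nocompile" to tokens.flags, which _struct()
    # checks to skip compiler.compile() for that definition; reset_flags()
    # clears it again afterwards.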

    def parse(self, data: str) -> None:
        scanner = re.Scanner(self.TOK.tokens)
        data = self._remove_comments(data)
        tokens, remaining = scanner.scan(data)

        if len(remaining):
            # +1 so the reported line number is 1-based, consistent with _lineno
            lineno = data.count("\n", 0, len(data) - len(remaining)) + 1
            raise ParserError(f"line {lineno}: invalid syntax in definition")

        tokens = TokenConsumer(tokens)
        while True:
            token = tokens.next
            if token is None:
                break

            if token == self.TOK.CONFIG_FLAG:
                self._config_flag(tokens)
            elif token == self.TOK.DEFINE:
                self._constant(tokens)
            elif token == self.TOK.TYPEDEF:
                self._typedef(tokens)
            elif token == self.TOK.STRUCT:
                self._struct(tokens, register=True)
            elif token == self.TOK.ENUM:
                self._enum(tokens)
            elif token == self.TOK.LOOKUP:
                self._lookup(tokens)
            else:
                raise ParserError(f"line {self._lineno(token)}: unexpected token {token!r}")
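# Usage sketch (illustrative, not part of the original module): TokenParser is
# normally driven through the public cstruct API rather than instantiated
# directly:
#
#   from dissect.cstruct import cstruct
#
#   cs = cstruct()
#   cs.load("""
#   #define MAGIC 0xcafe
#   struct header {
#       uint16 magic;
#       char   data[4];
#   };
#   """)
#   obj = cs.header(b"\xfe\xca" + b"data")
#   assert obj.magic == 0xCAFE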

class CStyleParser(Parser):
    """Definition parser for C-like structure syntax.

    Args:
        cs: An instance of cstruct.
        compiled: Whether structs should be compiled or not.
    """

    def __init__(self, cs: cstruct, compiled: bool = True):
        self.compiled = compiled
        super().__init__(cs)

    def _constants(self, data: str) -> None:
        r = re.finditer(r"#define\s+(?P<name>[^\s]+)\s+(?P<value>[^\r\n]+)\s*\n", data)
        for t in r:
            d = t.groupdict()
            v = d["value"].rsplit("//")[0]

            try:
                v = ast.literal_eval(v)
            except (ValueError, SyntaxError):
                pass

            self.cstruct.consts[d["name"]] = v

    def _enums(self, data: str) -> None:
        r = re.finditer(
            r"(?P<enumtype>enum|flag)\s+(?P<name>[^\s:{]+)\s*(:\s*(?P<type>[^\s]+)\s*)?\{(?P<values>[^}]+)\}\s*;",
            data,
        )
        for t in r:
            d = t.groupdict()
            enumtype = d["enumtype"]

            nextval = 0
            if enumtype == "flag":
                nextval = 1

            values = {}
            for line in d["values"].split("\n"):
                line, _, _ = line.partition("//")
                for v in line.split(","):
                    key, _, val = v.partition("=")
                    key = key.strip()
                    val = val.strip()
                    if not key:
                        continue

                    val = nextval if not val else Expression(self.cstruct, val).evaluate()

                    if enumtype == "flag":
                        high_bit = val.bit_length() - 1
                        nextval = 2 ** (high_bit + 1)
                    else:
                        nextval = val + 1

                    values[key] = val

            if not d["type"]:
                d["type"] = "uint32"

            factory = self.cstruct._make_enum
            if enumtype == "flag":
                factory = self.cstruct._make_flag

            enum = factory(d["name"], self.cstruct.resolve(d["type"]), values)
            self.cstruct.add_type(enum.__name__, enum)

    def _structs(self, data: str) -> None:
        r = re.finditer(
            r"(#(?P<flags>(?:compile))\s+)?"
            r"((?P<typedef>typedef)\s+)?"
            r"(?P<type>[^\s]+)\s+"
            r"(?P<name>[^\s]+)?"
            r"(?P<fields>"
            r"\s*{[^}]+\}(?P<defs>\s+[^;\n]+)?"
            r")?\s*;",
            data,
        )
        for t in r:
            d = t.groupdict()

            if d["name"]:
                name = d["name"]
            elif d["defs"]:
                name = d["defs"].strip().split(",")[0].strip()
            else:
                raise ParserError("No name for struct")

            if d["type"] == "struct":
                data = self._parse_fields(d["fields"][1:-1].strip())
                st = self.cstruct._make_struct(name, data)
                if d["flags"] == "compile" or self.compiled:
                    st = compiler.compile(st)
            elif d["typedef"] == "typedef":
                st = d["type"]
            else:
                continue

            if d["name"]:
                self.cstruct.add_type(d["name"], st)

            if d["defs"]:
                for td in d["defs"].strip().split(","):
                    td = td.strip()
                    self.cstruct.add_type(td, st)

    def _parse_fields(self, data: str) -> list[Field]:
        fields = re.finditer(
            r"(?P<type>[^\s]+)\s+(?P<name>[^\s\[:]+)(:(?P<bits>\d+))?(\[(?P<count>[^;\n]*)\])?;",
            data,
        )

        result = []
        for f in fields:
            d = f.groupdict()
            if d["type"].startswith("//"):
                continue

            type_ = self.cstruct.resolve(d["type"])

            d["name"] = d["name"].replace("(", "").replace(")", "")

            # Maybe reimplement lazy type references later
            # _type = TypeReference(self, d['type'])
            if d["count"] is not None:
                if d["count"] == "":
                    count = None
                else:
                    count = Expression(self.cstruct, d["count"])
                    try:
                        count = count.evaluate()
                    except Exception:
                        pass

                type_ = self.cstruct._make_array(type_, count)

            if d["name"].startswith("*"):
                d["name"] = d["name"][1:]
                type_ = self.cstruct._make_pointer(type_)

            field = Field(d["name"], type_, int(d["bits"]) if d["bits"] else None)
            result.append(field)

        return result

    def _lookups(self, data: str, consts: dict[str, int]) -> None:
        r = re.finditer(r"\$(?P<name>[^\s]+) = ({[^}]+})\w*\n", data)

        for t in r:
            d = ast.literal_eval(t.group(2))
            self.cstruct.lookups[t.group(1)] = {self.cstruct.consts[k]: v for k, v in d.items()}

    def parse(self, data: str) -> None:
        self._constants(data)
        self._enums(data)
        self._structs(data)
        self._lookups(data, self.cstruct.consts)


class Token:
    __slots__ = ("match", "token", "value")

    def __init__(self, token: str, value: str, match: re.Match):
        self.token = token
        self.value = value
        self.match = match

    def __eq__(self, other: object) -> bool:
        if isinstance(other, Token):
            other = other.token

        return self.token == other

    def __ne__(self, other: object) -> bool:
        return not self == other

    def __repr__(self) -> str:
        return f"<Token.{self.token} value={self.value!r}>"


class TokenCollection:
    def __init__(self):
        # Scanner token specs: (regex, action) pairs, not Token instances
        self.tokens: list[tuple[str, Any]] = []
        self.lookup: dict[str, str] = {}
        self.patterns: dict[str, re.Pattern] = {}

    def __getattr__(self, attr: str) -> str | Any:
        try:
            return self.lookup[attr]
        except KeyError:  # a dict lookup raises KeyError, not AttributeError
            pass

        return object.__getattribute__(self, attr)

    def add(self, regex: str, name: str | None) -> None:
        if name is None:
            self.tokens.append((regex, None))
        else:
            self.lookup[name] = name
            self.patterns[name] = re.compile(regex)
            self.tokens.append((regex, lambda s, t: Token(name, t, s.match)))


class TokenConsumer:
    def __init__(self, tokens: list[Token]):
        self.tokens = tokens
        self.flags = []
        self.previous = None

    def __contains__(self, token: Token) -> bool:
        return token in self.tokens

    def __len__(self) -> int:
        return len(self.tokens)

    def __repr__(self) -> str:
        return f"<TokenConsumer next={self.next!r}>"

    @property
    def next(self) -> Token | None:
        try:
            return self.tokens[0]
        except IndexError:
            return None

    def consume(self) -> Token:
        self.previous = self.tokens.pop(0)
        return self.previous

    def reset_flags(self) -> None:
        self.flags = []

    def eol(self) -> None:
        token = self.consume()
        if token.token != "EOL":
            # TokenConsumer defines no _lineno helper of its own, so reuse
            # TokenParser's static line calculator
            raise ParserError(f"line {TokenParser._lineno(token)}: expected EOL")