Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/dissect/cstruct/parser.py: 57% (416 statements)

from __future__ import annotations

import ast
import re
from typing import TYPE_CHECKING, Any

from dissect.cstruct import compiler
from dissect.cstruct.exceptions import (
    ExpressionParserError,
    ExpressionTokenizerError,
    ParserError,
)
from dissect.cstruct.expression import Expression
from dissect.cstruct.types import BaseArray, BaseType, Field, Structure

if TYPE_CHECKING:
    from dissect.cstruct import cstruct


class Parser:
    """Base class for definition parsers.

    Args:
        cs: An instance of cstruct.
    """

    def __init__(self, cs: cstruct):
        self.cstruct = cs

    def parse(self, data: str) -> None:
        """This function should parse definitions to cstruct types.

        Args:
            data: Data to parse definitions from, usually a string.
        """
        raise NotImplementedError


class TokenParser(Parser):
    """Token-based definition parser.

    Args:
        cs: An instance of cstruct.
        compiled: Whether structs should be compiled or not.
        align: Whether structs should be aligned or not.
    """

    def __init__(self, cs: cstruct, compiled: bool = True, align: bool = False):
        super().__init__(cs)

        self.compiled = compiled
        self.align = align
        self.TOK = self._tokencollection()

    @staticmethod
    def _tokencollection() -> TokenCollection:
        TOK = TokenCollection()
        TOK.add(r"#\[(?P<values>[^\]]+)\](?=\s*)", "CONFIG_FLAG")
        TOK.add(r"#define\s+(?P<name>[^\s]+)\s+(?P<value>[^\r\n]+)\s*", "DEFINE")
        TOK.add(r"typedef(?=\s)", "TYPEDEF")
        TOK.add(r"(?:struct|union)(?=\s|{)", "STRUCT")
        TOK.add(
            r"(?P<enumtype>enum|flag)\s+(?P<name>[^\s:{]+)?\s*(:\s"
            r"*(?P<type>[^{]+?)\s*)?\{(?P<values>[^}]+)\}\s*(?=;)",
            "ENUM",
        )
        TOK.add(r"(?<=})\s*(?P<defs>(?:[a-zA-Z0-9_]+\s*,\s*)+[a-zA-Z0-9_]+)\s*(?=;)", "DEFS")
        TOK.add(r"(?P<name>\**?\s*[a-zA-Z0-9_]+)(?:\s*:\s*(?P<bits>\d+))?(?:\[(?P<count>[^;\n]*)\])?\s*(?=;)", "NAME")
        TOK.add(r"[a-zA-Z_][a-zA-Z0-9_]*", "IDENTIFIER")
        TOK.add(r"[{}]", "BLOCK")
        TOK.add(r"\$(?P<name>[^\s]+) = (?P<value>{[^}]+})\w*[\r\n]+", "LOOKUP")
        TOK.add(r";", "EOL")
        TOK.add(r"\s+", None)
        TOK.add(r".", None)

        return TOK
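    # Illustrative note (not in the original source): scanning a definition like
    #
    #   #define MAGIC 0xcafe
    #   struct header { uint16 magic; };
    #
    # with this table yields the token stream DEFINE, STRUCT,
    # IDENTIFIER("header"), BLOCK("{"), IDENTIFIER("uint16"), NAME("magic"),
    # EOL, BLOCK("}"), EOL; whitespace and stray characters fall through to
    # the two None patterns and are dropped.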

    def _identifier(self, tokens: TokenConsumer) -> str:
        idents = []
        while tokens.next == self.TOK.IDENTIFIER:
            idents.append(tokens.consume())
        return " ".join([i.value for i in idents])

    def _constant(self, tokens: TokenConsumer) -> None:
        const = tokens.consume()
        pattern = self.TOK.patterns[self.TOK.DEFINE]
        match = pattern.match(const.value).groupdict()

        value = match["value"]
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            pass

        if isinstance(value, str):
            try:
                value = Expression(self.cstruct, value).evaluate()
            except (ExpressionParserError, ExpressionTokenizerError):
                pass

        self.cstruct.consts[match["name"]] = value
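    # Illustrative note (not in the original source): `#define FOO 0x10` stores
    # cs.consts["FOO"] = 16 via ast.literal_eval, while `#define BAR FOO + 1`
    # falls through to the Expression evaluator and stores 17; a value neither
    # step can evaluate is kept as its raw string.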

    def _enum(self, tokens: TokenConsumer) -> None:
        # We cheat with enums because the entire enum is in the token
        etok = tokens.consume()

        pattern = self.TOK.patterns[self.TOK.ENUM]
        # Dirty trick because the regex expects a ; but we don't want it to be part of the value
        d = pattern.match(etok.value + ";").groupdict()
        enumtype = d["enumtype"]

        nextval = 0
        if enumtype == "flag":
            nextval = 1

        values = {}
        for line in d["values"].splitlines():
            for v in line.split(","):
                key, _, val = v.partition("=")
                key = key.strip()
                val = val.strip()
                if not key:
                    continue

                val = nextval if not val else Expression(self.cstruct, val).evaluate(values)

                if enumtype == "flag":
                    high_bit = val.bit_length() - 1
                    nextval = 2 ** (high_bit + 1)
                else:
                    nextval = val + 1

                values[key] = val

        if not d["type"]:
            d["type"] = "uint32"

        factory = self.cstruct._make_flag if enumtype == "flag" else self.cstruct._make_enum

        enum = factory(d["name"] or "", self.cstruct.resolve(d["type"]), values)
        if not enum.__name__:
            self.cstruct.consts.update(enum.__members__)
        else:
            self.cstruct.add_type(enum.__name__, enum)

        tokens.eol()
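    # Illustrative note (not in the original source): enum members auto-increment
    # from the previous value, while flag members continue at the next power of two:
    #
    #   enum color : uint8 { RED = 2, GREEN, BLUE };  // GREEN = 3, BLUE = 4
    #   flag perms { R = 1, W, X };                   // W = 2, X = 4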

    def _typedef(self, tokens: TokenConsumer) -> None:
        tokens.consume()
        type_ = None

        names = []

        if tokens.next == self.TOK.IDENTIFIER:
            type_ = self.cstruct.resolve(self._identifier(tokens))
        elif tokens.next == self.TOK.STRUCT:
            type_ = self._struct(tokens)
            if not type_.__anonymous__:
                names.append(type_.__name__)

        names.extend(self._names(tokens))
        for name in names:
            if issubclass(type_, Structure) and type_.__anonymous__:
                type_.__anonymous__ = False
                type_.__name__ = name
                type_.__qualname__ = name

            type_, name, bits = self._parse_field_type(type_, name)
            if bits is not None:
                raise ParserError(f"line {self._lineno(tokens.previous)}: typedefs cannot have bitfields")
            self.cstruct.add_type(name, type_)
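    # Illustrative note (not in the original source): `typedef uint32 DWORD;`
    # aliases an existing type, while `typedef struct { uint16 a; } pair_t;`
    # takes the anonymous struct returned by _struct() and renames it to
    # pair_t before registering it.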

    def _struct(self, tokens: TokenConsumer, register: bool = False) -> type[Structure]:
        stype = tokens.consume()

        factory = self.cstruct._make_union if stype.value.startswith("union") else self.cstruct._make_struct

        st = None
        names = []
        registered = False

        if tokens.next == self.TOK.IDENTIFIER:
            ident = tokens.consume()
            if register:
                # Pre-register an empty struct for self-referencing
                # We update this instance later with the fields
                st = factory(ident.value, [], align=self.align)
                if self.compiled and "nocompile" not in tokens.flags:
                    st = compiler.compile(st)
                self.cstruct.add_type(ident.value, st)
                registered = True
            else:
                names.append(ident.value)

        if tokens.next == self.TOK.NAME:
            # As part of a struct field
            # struct type_name field_name;
            if not len(names):
                raise ParserError(f"line {self._lineno(tokens.next)}: unexpected anonymous struct")
            return self.cstruct.resolve(names[0])

        if tokens.next != self.TOK.BLOCK:
            raise ParserError(f"line {self._lineno(tokens.next)}: expected start of block '{tokens.next}'")

        fields = []
        tokens.consume()
        while len(tokens):
            if tokens.next == self.TOK.BLOCK and tokens.next.value == "}":
                tokens.consume()
                break

            field = self._parse_field(tokens)
            fields.append(field)

        if register:
            names.extend(self._names(tokens))

        # If the next token is EOL, consume it
        # Otherwise we're part of a typedef or field definition
        if tokens.next == self.TOK.EOL:
            tokens.eol()

        name = names[0] if names else None

        if st is None:
            is_anonymous = False
            if not name:
                is_anonymous = True
                name = self.cstruct._next_anonymous()

            st = factory(name, fields, align=self.align, anonymous=is_anonymous)
            if self.compiled and "nocompile" not in tokens.flags:
                st = compiler.compile(st)
        else:
            st.__fields__.extend(fields)
            st.commit()

        # This is pretty dirty
        if register:
            if not names and not registered:
                raise ParserError(f"line {self._lineno(stype)}: struct has no name")

            for name in names:
                self.cstruct.add_type(name, st)

        tokens.reset_flags()
        return st
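    # Illustrative note (not in the original source): this method handles all of
    #
    #   struct foo { uint32 a; };           // top level (register=True); "foo" is
    #                                       //   pre-registered so fields may self-reference
    #   struct foo bar;                     // as a field type; resolves the existing "foo"
    #   typedef struct { uint32 a; } name;  // anonymous; named by the surrounding typedef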

    def _lookup(self, tokens: TokenConsumer) -> None:
        # Just like enums, we cheat and have the entire lookup in the token
        ltok = tokens.consume()

        pattern = self.TOK.patterns[self.TOK.LOOKUP]
        # Dirty trick because the regex expects a ; but we don't want it to be part of the value
        m = pattern.match(ltok.value + ";")
        d = ast.literal_eval(m.group(2))
        self.cstruct.lookups[m.group(1)] = {self.cstruct.consts[k]: v for k, v in d.items()}
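    # Illustrative note (not in the original source): with consts FOO = 1 and
    # BAR = 2 already defined,
    #
    #   $errors = {"FOO": "foo", "BAR": "bar"}
    #
    # is literal_eval'd and its keys resolved through cs.consts, producing
    # cs.lookups["errors"] == {1: "foo", 2: "bar"}.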

    def _parse_field(self, tokens: TokenConsumer) -> Field:
        type_ = None
        if tokens.next == self.TOK.IDENTIFIER:
            type_ = self.cstruct.resolve(self._identifier(tokens))
        elif tokens.next == self.TOK.STRUCT:
            type_ = self._struct(tokens)

            if tokens.next != self.TOK.NAME:
                return Field(None, type_, None)

        if tokens.next != self.TOK.NAME:
            raise ParserError(f"line {self._lineno(tokens.next)}: expected name")
        nametok = tokens.consume()

        type_, name, bits = self._parse_field_type(type_, nametok.value)

        tokens.eol()
        return Field(name.strip(), type_, bits)

    def _parse_field_type(self, type_: type[BaseType], name: str) -> tuple[type[BaseType], str, int | None]:
        pattern = self.TOK.patterns[self.TOK.NAME]
        # Dirty trick because the regex expects a ; but we don't want it to be part of the value
        d = pattern.match(name + ";").groupdict()

        name = d["name"]
        count_expression = d["count"]

        while name.startswith("*"):
            name = name[1:]
            type_ = self.cstruct._make_pointer(type_)

        if count_expression is not None:
            # Poor man's multi-dimensional array by abusing the eager regex match of count
            counts = count_expression.split("][") if "][" in count_expression else [count_expression]

            for count in reversed(counts):
                if count == "":
                    count = None
                else:
                    count = Expression(self.cstruct, count)
                    try:
                        count = count.evaluate()
                    except Exception:
                        pass

                if issubclass(type_, BaseArray) and count is None:
                    raise ParserError("Depth required for multi-dimensional array")

                type_ = self.cstruct._make_array(type_, count)

        return type_, name.strip(), int(d["bits"]) if d["bits"] else None
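    # Illustrative note (not in the original source): for `uint32 *ptr;` the
    # leading star wraps uint32 in a pointer; for `char buf[2][4];` the eager
    # NAME regex captures the count as "2][4", which is split on "][" and
    # applied in reverse, making buf a 2-element array of 4-char arrays.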

    def _names(self, tokens: TokenConsumer) -> list[str]:
        names = []
        while True:
            if tokens.next == self.TOK.EOL:
                tokens.eol()
                break

            if tokens.next not in (self.TOK.NAME, self.TOK.DEFS):
                break

            ntoken = tokens.consume()
            if ntoken == self.TOK.NAME:
                names.append(ntoken.value.strip())
            elif ntoken == self.TOK.DEFS:
                names.extend([name.strip() for name in ntoken.value.strip().split(",")])

        return names

    @staticmethod
    def _remove_comments(string: str) -> str:
        # https://stackoverflow.com/a/18381470
        pattern = r"(\".*?\"|\'.*?\')|(/\*.*?\*/|//[^\r\n]*$)"
        # first group captures quoted strings (double or single)
        # second group captures comments (//single-line or /* multi-line */)
        regex = re.compile(pattern, re.MULTILINE | re.DOTALL)

        def _replacer(match: re.Match) -> str:
            # if the 2nd group (capturing comments) is not None,
            # it means we have captured a non-quoted (real) comment string.
            if comment := match.group(2):
                return "\n" * comment.count("\n")  # replace the comment with its newlines only
            # otherwise, we will return the 1st group
            return match.group(1)  # captured quoted-string

        return regex.sub(_replacer, string)
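    # Illustrative note (not in the original source): comments are replaced by
    # an equal number of newlines rather than stripped outright, so a
    # /* multi-line */ comment does not shift the lines after it and
    # _lineno() keeps reporting accurate positions.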

    @staticmethod
    def _lineno(tok: Token) -> int:
        """Quick and dirty line number calculator"""

        match = tok.match
        return match.string.count("\n", 0, match.start()) + 1

    def _config_flag(self, tokens: TokenConsumer) -> None:
        flag_token = tokens.consume()
        pattern = self.TOK.patterns[self.TOK.CONFIG_FLAG]
        tok_dict = pattern.match(flag_token.value).groupdict()
        tokens.flags.extend(tok_dict["values"].split(","))
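    # Illustrative note (not in the original source): a line like `#[nocompile]`
    # in front of a struct adds "nocompile" to tokens.flags, which _struct()
    # checks to skip compiler.compile() for that definition; reset_flags()
    # clears it again afterwards.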

    def parse(self, data: str) -> None:
        scanner = re.Scanner(self.TOK.tokens)
        data = self._remove_comments(data)
        tokens, remaining = scanner.scan(data)

        if len(remaining):
            # +1 so the reported line number is 1-based, consistent with _lineno
            lineno = data.count("\n", 0, len(data) - len(remaining)) + 1
            raise ParserError(f"line {lineno}: invalid syntax in definition")

        tokens = TokenConsumer(tokens)
        while True:
            token = tokens.next
            if token is None:
                break

            if token == self.TOK.CONFIG_FLAG:
                self._config_flag(tokens)
            elif token == self.TOK.DEFINE:
                self._constant(tokens)
            elif token == self.TOK.TYPEDEF:
                self._typedef(tokens)
            elif token == self.TOK.STRUCT:
                self._struct(tokens, register=True)
            elif token == self.TOK.ENUM:
                self._enum(tokens)
            elif token == self.TOK.LOOKUP:
                self._lookup(tokens)
            else:
                raise ParserError(f"line {self._lineno(token)}: unexpected token {token!r}")
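# Usage sketch (illustrative, not part of the original module): TokenParser is
# normally driven through the public cstruct API rather than instantiated
# directly:
#
#   from dissect.cstruct import cstruct
#
#   cs = cstruct()
#   cs.load("""
#   #define MAGIC 0xcafe
#   struct header {
#       uint16 magic;
#       char   data[4];
#   };
#   """)
#   obj = cs.header(b"\xfe\xca" + b"data")
#   assert obj.magic == 0xCAFE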

class CStyleParser(Parser):
    """Definition parser for C-like structure syntax.

    Args:
        cs: An instance of cstruct.
        compiled: Whether structs should be compiled or not.
    """

    def __init__(self, cs: cstruct, compiled: bool = True):
        self.compiled = compiled
        super().__init__(cs)

    def _constants(self, data: str) -> None:
        r = re.finditer(r"#define\s+(?P<name>[^\s]+)\s+(?P<value>[^\r\n]+)\s*\n", data)
        for t in r:
            d = t.groupdict()
            v = d["value"].rsplit("//")[0]

            try:
                v = ast.literal_eval(v)
            except (ValueError, SyntaxError):
                pass

            self.cstruct.consts[d["name"]] = v

    def _enums(self, data: str) -> None:
        r = re.finditer(
            r"(?P<enumtype>enum|flag)\s+(?P<name>[^\s:{]+)\s*(:\s*(?P<type>[^\s]+)\s*)?\{(?P<values>[^}]+)\}\s*;",
            data,
        )
        for t in r:
            d = t.groupdict()
            enumtype = d["enumtype"]

            nextval = 0
            if enumtype == "flag":
                nextval = 1

            values = {}
            for line in d["values"].split("\n"):
                line, _, _ = line.partition("//")
                for v in line.split(","):
                    key, _, val = v.partition("=")
                    key = key.strip()
                    val = val.strip()
                    if not key:
                        continue

                    val = nextval if not val else Expression(self.cstruct, val).evaluate()

                    if enumtype == "flag":
                        high_bit = val.bit_length() - 1
                        nextval = 2 ** (high_bit + 1)
                    else:
                        nextval = val + 1

                    values[key] = val

            if not d["type"]:
                d["type"] = "uint32"

            factory = self.cstruct._make_enum
            if enumtype == "flag":
                factory = self.cstruct._make_flag

            enum = factory(d["name"], self.cstruct.resolve(d["type"]), values)
            self.cstruct.add_type(enum.__name__, enum)

    def _structs(self, data: str) -> None:
        r = re.finditer(
            r"(#(?P<flags>(?:compile))\s+)?"
            r"((?P<typedef>typedef)\s+)?"
            r"(?P<type>[^\s]+)\s+"
            r"(?P<name>[^\s]+)?"
            r"(?P<fields>"
            r"\s*{[^}]+\}(?P<defs>\s+[^;\n]+)?"
            r")?\s*;",
            data,
        )
        for t in r:
            d = t.groupdict()

            if d["name"]:
                name = d["name"]
            elif d["defs"]:
                name = d["defs"].strip().split(",")[0].strip()
            else:
                raise ParserError("No name for struct")

            if d["type"] == "struct":
                data = self._parse_fields(d["fields"][1:-1].strip())
                st = self.cstruct._make_struct(name, data)
                if d["flags"] == "compile" or self.compiled:
                    st = compiler.compile(st)
            elif d["typedef"] == "typedef":
                st = d["type"]
            else:
                continue

            if d["name"]:
                self.cstruct.add_type(d["name"], st)

            if d["defs"]:
                for td in d["defs"].strip().split(","):
                    td = td.strip()
                    self.cstruct.add_type(td, st)

    def _parse_fields(self, data: str) -> list[Field]:
        fields = re.finditer(
            r"(?P<type>[^\s]+)\s+(?P<name>[^\s\[:]+)(:(?P<bits>\d+))?(\[(?P<count>[^;\n]*)\])?;",
            data,
        )

        result = []
        for f in fields:
            d = f.groupdict()
            if d["type"].startswith("//"):
                continue

            type_ = self.cstruct.resolve(d["type"])

            d["name"] = d["name"].replace("(", "").replace(")", "")

            # Maybe reimplement lazy type references later
            # _type = TypeReference(self, d['type'])
            if d["count"] is not None:
                if d["count"] == "":
                    count = None
                else:
                    count = Expression(self.cstruct, d["count"])
                    try:
                        count = count.evaluate()
                    except Exception:
                        pass

                type_ = self.cstruct._make_array(type_, count)

            if d["name"].startswith("*"):
                d["name"] = d["name"][1:]
                type_ = self.cstruct._make_pointer(type_)

            field = Field(d["name"], type_, int(d["bits"]) if d["bits"] else None)
            result.append(field)

        return result

    def _lookups(self, data: str, consts: dict[str, int]) -> None:
        r = re.finditer(r"\$(?P<name>[^\s]+) = ({[^}]+})\w*\n", data)

        for t in r:
            d = ast.literal_eval(t.group(2))
            self.cstruct.lookups[t.group(1)] = {self.cstruct.consts[k]: v for k, v in d.items()}

    def parse(self, data: str) -> None:
        self._constants(data)
        self._enums(data)
        self._structs(data)
        self._lookups(data, self.cstruct.consts)


class Token:
    __slots__ = ("match", "token", "value")

    def __init__(self, token: str, value: str, match: re.Match):
        self.token = token
        self.value = value
        self.match = match

    def __eq__(self, other: object) -> bool:
        if isinstance(other, Token):
            other = other.token

        return self.token == other

    def __ne__(self, other: object) -> bool:
        return not self == other

    def __repr__(self) -> str:
        return f"<Token.{self.token} value={self.value!r}>"


class TokenCollection:
    def __init__(self):
        # Scanner token specs: (regex, action) pairs, not Token instances
        self.tokens: list[tuple[str, Any]] = []
        self.lookup: dict[str, str] = {}
        self.patterns: dict[str, re.Pattern] = {}

    def __getattr__(self, attr: str) -> str | Any:
        try:
            return self.lookup[attr]
        except KeyError:  # a dict lookup raises KeyError, not AttributeError
            pass

        return object.__getattribute__(self, attr)

    def add(self, regex: str, name: str | None) -> None:
        if name is None:
            self.tokens.append((regex, None))
        else:
            self.lookup[name] = name
            self.patterns[name] = re.compile(regex)
            self.tokens.append((regex, lambda s, t: Token(name, t, s.match)))


class TokenConsumer:
    def __init__(self, tokens: list[Token]):
        self.tokens = tokens
        self.flags = []
        self.previous = None

    def __contains__(self, token: Token) -> bool:
        return token in self.tokens

    def __len__(self) -> int:
        return len(self.tokens)

    def __repr__(self) -> str:
        return f"<TokenConsumer next={self.next!r}>"

    @property
    def next(self) -> Token | None:
        try:
            return self.tokens[0]
        except IndexError:
            return None

    def consume(self) -> Token:
        self.previous = self.tokens.pop(0)
        return self.previous

    def reset_flags(self) -> None:
        self.flags = []

    def eol(self) -> None:
        token = self.consume()
        if token.token != "EOL":
            # TokenConsumer defines no _lineno helper of its own, so reuse
            # TokenParser's static line calculator
            raise ParserError(f"line {TokenParser._lineno(token)}: expected EOL")