from __future__ import annotations

import ast
import re
from typing import TYPE_CHECKING, Any

from dissect.cstruct import compiler
from dissect.cstruct.exceptions import (
    ExpressionParserError,
    ExpressionTokenizerError,
    ParserError,
)
from dissect.cstruct.expression import Expression
from dissect.cstruct.types import BaseArray, BaseType, Field, Structure

if TYPE_CHECKING:
    from dissect.cstruct import cstruct


class Parser:
    """Base class for definition parsers.

    Args:
        cs: An instance of cstruct.
    """

    def __init__(self, cs: cstruct):
        self.cstruct = cs

    def parse(self, data: str) -> None:
        """Parse definitions into cstruct types.

        Args:
            data: Data to parse definitions from, usually a string.
        """
        raise NotImplementedError


class TokenParser(Parser):
    """Token-based definition parser.

    Args:
        cs: An instance of cstruct.
        compiled: Whether structs should be compiled or not.
        align: Whether structs should be aligned.
    """

    def __init__(self, cs: cstruct, compiled: bool = True, align: bool = False):
        super().__init__(cs)

        self.compiled = compiled
        self.align = align
        self.TOK = self._tokencollection()

    @staticmethod
    def _tokencollection() -> TokenCollection:
        TOK = TokenCollection()
        TOK.add(r"#\[(?P<values>[^\]]+)\](?=\s*)", "CONFIG_FLAG")
        TOK.add(r"#define\s+(?P<name>[^\s]+)\s+(?P<value>[^\r\n]+)\s*", "DEFINE")
        TOK.add(r"typedef(?=\s)", "TYPEDEF")
        TOK.add(r"(?:struct|union)(?=\s|{)", "STRUCT")
        TOK.add(
            r"(?P<enumtype>enum|flag)\s+(?P<name>[^\s:{]+)?\s*"
            r"(:\s*(?P<type>[^{]+?)\s*)?\{(?P<values>[^}]+)\}\s*(?=;)",
            "ENUM",
        )
        TOK.add(r"(?<=})\s*(?P<defs>(?:[a-zA-Z0-9_]+\s*,\s*)+[a-zA-Z0-9_]+)\s*(?=;)", "DEFS")
        TOK.add(r"(?P<name>\**?\s*[a-zA-Z0-9_]+)(?:\s*:\s*(?P<bits>\d+))?(?:\[(?P<count>[^;\n]*)\])?\s*(?=;)", "NAME")
        TOK.add(r"[a-zA-Z_][a-zA-Z0-9_]*", "IDENTIFIER")
        TOK.add(r"[{}]", "BLOCK")
        TOK.add(r"\$(?P<name>[^\s]+) = (?P<value>{[^}]+})\w*[\r\n]+", "LOOKUP")
        TOK.add(r";", "EOL")
        TOK.add(r"\s+", None)
        TOK.add(r".", None)

        return TOK
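
    # A sketch of the kind of definition these patterns are meant to tokenize
    # (hypothetical input for illustration; the names are made up):
    #
    #   #define SECTOR_SIZE 512
    #   enum Color : uint8 { RED, GREEN = 5 };
    #   typedef struct _data {
    #       uint32  magic;
    #       char    buf[SECTOR_SIZE];
    #   } data, data_t;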

    def _identifier(self, tokens: TokenConsumer) -> str:
        idents = []
        while tokens.next == self.TOK.IDENTIFIER:
            idents.append(tokens.consume())
        return " ".join([i.value for i in idents])

    def _constant(self, tokens: TokenConsumer) -> None:
        const = tokens.consume()
        pattern = self.TOK.patterns[self.TOK.DEFINE]
        match = pattern.match(const.value).groupdict()

        value = match["value"]
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            pass

        if isinstance(value, str):
            try:
                value = Expression(self.cstruct, value).evaluate()
            except (ExpressionParserError, ExpressionTokenizerError):
                pass

        self.cstruct.consts[match["name"]] = value
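
    # For example (illustrative): '#define FOO 0x10' stores consts["FOO"] = 16 via
    # ast.literal_eval, while '#define BAR FOO + 1' falls through to Expression
    # evaluation and stores 17. Values that parse as neither are kept as strings.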

    def _enum(self, tokens: TokenConsumer) -> None:
        # We cheat with enums because the entire enum is in the token
        etok = tokens.consume()

        pattern = self.TOK.patterns[self.TOK.ENUM]
        # Dirty trick because the regex expects a ; but we don't want it to be part of the value
        d = pattern.match(etok.value + ";").groupdict()
        enumtype = d["enumtype"]

        nextval = 0
        if enumtype == "flag":
            nextval = 1

        values = {}
        for line in d["values"].splitlines():
            for v in line.split(","):
                key, _, val = v.partition("=")
                key = key.strip()
                val = val.strip()
                if not key:
                    continue

                val = nextval if not val else Expression(self.cstruct, val).evaluate(values)

                if enumtype == "flag":
                    high_bit = val.bit_length() - 1
                    nextval = 2 ** (high_bit + 1)
                else:
                    nextval = val + 1

                values[key] = val

        if not d["type"]:
            d["type"] = "uint32"

        factory = self.cstruct._make_flag if enumtype == "flag" else self.cstruct._make_enum

        enum = factory(d["name"] or "", self.cstruct.resolve(d["type"]), values)
        if not enum.__name__:
            self.cstruct.consts.update(enum.__members__)
        else:
            self.cstruct.add_type(enum.__name__, enum)

        tokens.eol()
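
    # Auto-numbering sketch (hypothetical input): for 'enum E { A, B = 5, C };'
    # this yields A=0, B=5, C=6; for 'flag F { X, Y, Z };' the next value is
    # always the next power of two, yielding X=1, Y=2, Z=4.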

    def _typedef(self, tokens: TokenConsumer) -> None:
        tokens.consume()
        type_ = None

        names = []

        if tokens.next == self.TOK.IDENTIFIER:
            type_ = self.cstruct.resolve(self._identifier(tokens))
        elif tokens.next == self.TOK.STRUCT:
            type_ = self._struct(tokens)
            if not type_.__anonymous__:
                names.append(type_.__name__)

        names.extend(self._names(tokens))
        for name in names:
            if issubclass(type_, Structure) and type_.__anonymous__:
                type_.__anonymous__ = False
                type_.__name__ = name
                type_.__qualname__ = name

            type_, name, bits = self._parse_field_type(type_, name)
            if bits is not None:
                raise ParserError(f"line {self._lineno(tokens.previous)}: typedefs cannot have bitfields")
            self.cstruct.add_type(name, type_)
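
    # For example (illustrative): 'typedef uint32 DWORD;' registers an alias, and
    # 'typedef struct { uint32 a; } foo;' promotes the anonymous struct to the
    # name 'foo'. A bitfield such as 'typedef uint8 blob : 4;' raises ParserError.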

    def _struct(self, tokens: TokenConsumer, register: bool = False) -> type[Structure]:
        stype = tokens.consume()

        factory = self.cstruct._make_union if stype.value.startswith("union") else self.cstruct._make_struct

        st = None
        names = []
        registered = False

        if tokens.next == self.TOK.IDENTIFIER:
            ident = tokens.consume()
            if register:
                # Pre-register an empty struct for self-referencing
                # We update this instance later with the fields
                st = factory(ident.value, [], align=self.align)
                if self.compiled and "nocompile" not in tokens.flags:
                    st = compiler.compile(st)
                self.cstruct.add_type(ident.value, st)
                registered = True
            else:
                names.append(ident.value)

        if tokens.next == self.TOK.NAME:
            # As part of a struct field
            # struct type_name field_name;
            if not len(names):
                raise ParserError(f"line {self._lineno(tokens.next)}: unexpected anonymous struct")
            return self.cstruct.resolve(names[0])

        if tokens.next != self.TOK.BLOCK:
            raise ParserError(f"line {self._lineno(tokens.next)}: expected start of block '{tokens.next}'")

        fields = []
        tokens.consume()
        while len(tokens):
            if tokens.next == self.TOK.BLOCK and tokens.next.value == "}":
                tokens.consume()
                break

            field = self._parse_field(tokens)
            fields.append(field)

        if register:
            names.extend(self._names(tokens))

        # If the next token is EOL, consume it
        # Otherwise we're part of a typedef or field definition
        if tokens.next == self.TOK.EOL:
            tokens.eol()

        name = names[0] if names else None

        if st is None:
            is_anonymous = False
            if not name:
                is_anonymous = True
                name = self.cstruct._next_anonymous()

            st = factory(name, fields, align=self.align, anonymous=is_anonymous)
            if self.compiled and "nocompile" not in tokens.flags:
                st = compiler.compile(st)
        else:
            st.__fields__.extend(fields)
            st.commit()

        # This is pretty dirty
        if register:
            if not names and not registered:
                raise ParserError(f"line {self._lineno(stype)}: struct has no name")

            for name in names:
                self.cstruct.add_type(name, st)

        tokens.reset_flags()
        return st
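
    # The pre-registration above is what makes self-referencing structs work, e.g.
    # (illustrative): 'struct node { uint32 value; node *next; };' resolves 'node'
    # inside its own body because an empty 'node' was registered first.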

    def _lookup(self, tokens: TokenConsumer) -> None:
        # Just like enums, we cheat and have the entire lookup in the token
        ltok = tokens.consume()

        pattern = self.TOK.patterns[self.TOK.LOOKUP]
        # Dirty trick because the regex expects a ; but we don't want it to be part of the value
        m = pattern.match(ltok.value + ";")
        d = ast.literal_eval(m.group(2))
        self.cstruct.lookups[m.group(1)] = {self.cstruct.consts[k]: v for k, v in d.items()}
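
    # Lookup syntax sketch (hypothetical, assuming A and B were #define'd):
    #
    #   $mapping = {"A": "first", "B": "second"}
    #
    # stores lookups["mapping"] keyed by the *values* of consts A and B.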

    def _parse_field(self, tokens: TokenConsumer) -> Field:
        type_ = None
        if tokens.next == self.TOK.IDENTIFIER:
            type_ = self.cstruct.resolve(self._identifier(tokens))
        elif tokens.next == self.TOK.STRUCT:
            type_ = self._struct(tokens)

        if tokens.next != self.TOK.NAME:
            # Anonymous field, e.g. an unnamed nested struct or union
            # (_struct has already consumed the trailing EOL in that case)
            return Field(None, type_, None)

        nametok = tokens.consume()

        type_, name, bits = self._parse_field_type(type_, nametok.value)

        tokens.eol()
        return Field(name.strip(), type_, bits)

    def _parse_field_type(self, type_: type[BaseType], name: str) -> tuple[type[BaseType], str, int | None]:
        pattern = self.TOK.patterns[self.TOK.NAME]
        # Dirty trick because the regex expects a ; but we don't want it to be part of the value
        d = pattern.match(name + ";").groupdict()

        name = d["name"]
        count_expression = d["count"]

        while name.startswith("*"):
            name = name[1:]
            type_ = self.cstruct._make_pointer(type_)

        if count_expression is not None:
            # Poor man's multi-dimensional array by abusing the eager regex match of count
            counts = count_expression.split("][") if "][" in count_expression else [count_expression]

            for count in reversed(counts):
                if count == "":
                    count = None
                else:
                    count = Expression(self.cstruct, count)
                    try:
                        count = count.evaluate()
                    except Exception:
                        pass

                if issubclass(type_, BaseArray) and count is None:
                    raise ParserError("Depth required for multi-dimensional array")

                type_ = self.cstruct._make_array(type_, count)

        return type_, name.strip(), int(d["bits"]) if d["bits"] else None
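
    # For example (illustrative): for type_=uint8 and name='*matrix[2][4]' this
    # returns the name 'matrix' with a pointer-to-uint8 wrapped in an array of 4,
    # wrapped in an array of 2; a name like 'flags : 3' instead returns bits=3.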

    def _names(self, tokens: TokenConsumer) -> list[str]:
        names = []
        while True:
            if tokens.next == self.TOK.EOL:
                tokens.eol()
                break

            if tokens.next not in (self.TOK.NAME, self.TOK.DEFS):
                break

            ntoken = tokens.consume()
            if ntoken == self.TOK.NAME:
                names.append(ntoken.value.strip())
            elif ntoken == self.TOK.DEFS:
                names.extend([name.strip() for name in ntoken.value.strip().split(",")])

        return names

    @staticmethod
    def _remove_comments(string: str) -> str:
        # https://stackoverflow.com/a/18381470
        pattern = r"(\".*?\"|\'.*?\')|(/\*.*?\*/|//[^\r\n]*$)"
        # first group captures quoted strings (double or single)
        # second group captures comments (//single-line or /* multi-line */)
        regex = re.compile(pattern, re.MULTILINE | re.DOTALL)

        def _replacer(match: re.Match) -> str:
            # if the 2nd group (capturing comments) is not None,
            # it means we have captured a non-quoted (real) comment string
            if comment := match.group(2):
                # replace the comment with its own newlines so line numbers stay intact
                return "\n" * comment.count("\n")
            # otherwise, return the 1st group: a captured quoted string
            return match.group(1)

        return regex.sub(_replacer, string)
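
    # For example (illustrative): a '/* two\nline */' comment is replaced by a
    # single '\n', keeping _lineno accurate, while the quoted string
    # "// not a comment" is left untouched.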

    @staticmethod
    def _lineno(tok: Token) -> int:
        """Quick and dirty line number calculator"""

        match = tok.match
        return match.string.count("\n", 0, match.start()) + 1

    def _config_flag(self, tokens: TokenConsumer) -> None:
        flag_token = tokens.consume()
        pattern = self.TOK.patterns[self.TOK.CONFIG_FLAG]
        tok_dict = pattern.match(flag_token.value).groupdict()
        tokens.flags.extend(tok_dict["values"].split(","))

    def parse(self, data: str) -> None:
        scanner = re.Scanner(self.TOK.tokens)
        data = self._remove_comments(data)
        tokens, remaining = scanner.scan(data)

        if len(remaining):
            # +1 because line numbers are 1-based, matching _lineno
            lineno = data.count("\n", 0, len(data) - len(remaining)) + 1
            raise ParserError(f"line {lineno}: invalid syntax in definition")

        tokens = TokenConsumer(tokens)
        while True:
            token = tokens.next
            if token is None:
                break

            if token == self.TOK.CONFIG_FLAG:
                self._config_flag(tokens)
            elif token == self.TOK.DEFINE:
                self._constant(tokens)
            elif token == self.TOK.TYPEDEF:
                self._typedef(tokens)
            elif token == self.TOK.STRUCT:
                self._struct(tokens, register=True)
            elif token == self.TOK.ENUM:
                self._enum(tokens)
            elif token == self.TOK.LOOKUP:
                self._lookup(tokens)
            else:
                raise ParserError(f"line {self._lineno(token)}: unexpected token {token!r}")
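
    # Minimal usage sketch (assumes the public dissect.cstruct API; illustrative only):
    #
    #   from dissect.cstruct import cstruct
    #
    #   cs = cstruct()
    #   cs.load("struct test { uint32 a; uint16 b[2]; };")  # dispatches to a parser like this one
    #   obj = cs.test(b"\x01\x00\x00\x00\x02\x00\x03\x00")  # expect a=1, b=[2, 3]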


class CStyleParser(Parser):
    """Definition parser for C-like structure syntax.

    Args:
        cs: An instance of cstruct.
        compiled: Whether structs should be compiled or not.
    """

    def __init__(self, cs: cstruct, compiled: bool = True):
        self.compiled = compiled
        super().__init__(cs)

    def _constants(self, data: str) -> None:
        r = re.finditer(r"#define\s+(?P<name>[^\s]+)\s+(?P<value>[^\r\n]+)\s*\n", data)
        for t in r:
            d = t.groupdict()
            v = d["value"].rsplit("//")[0]

            try:
                v = ast.literal_eval(v)
            except (ValueError, SyntaxError):
                pass

            self.cstruct.consts[d["name"]] = v

    def _enums(self, data: str) -> None:
        r = re.finditer(
            r"(?P<enumtype>enum|flag)\s+(?P<name>[^\s:{]+)\s*(:\s*(?P<type>[^\s]+)\s*)?\{(?P<values>[^}]+)\}\s*;",
            data,
        )
        for t in r:
            d = t.groupdict()
            enumtype = d["enumtype"]

            nextval = 0
            if enumtype == "flag":
                nextval = 1

            values = {}
            for line in d["values"].split("\n"):
                line, _, _ = line.partition("//")
                for v in line.split(","):
                    key, _, val = v.partition("=")
                    key = key.strip()
                    val = val.strip()
                    if not key:
                        continue

                    val = nextval if not val else Expression(self.cstruct, val).evaluate()

                    if enumtype == "flag":
                        high_bit = val.bit_length() - 1
                        nextval = 2 ** (high_bit + 1)
                    else:
                        nextval = val + 1

                    values[key] = val

            if not d["type"]:
                d["type"] = "uint32"

            factory = self.cstruct._make_enum
            if enumtype == "flag":
                factory = self.cstruct._make_flag

            enum = factory(d["name"], self.cstruct.resolve(d["type"]), values)
            self.cstruct.add_type(enum.__name__, enum)

    def _structs(self, data: str) -> None:
        r = re.finditer(
            r"(#(?P<flags>(?:compile))\s+)?"
            r"((?P<typedef>typedef)\s+)?"
            r"(?P<type>[^\s]+)\s+"
            r"(?P<name>[^\s]+)?"
            r"(?P<fields>"
            r"\s*{[^}]+\}(?P<defs>\s+[^;\n]+)?"
            r")?\s*;",
            data,
        )
        for t in r:
            d = t.groupdict()

            if d["name"]:
                name = d["name"]
            elif d["defs"]:
                name = d["defs"].strip().split(",")[0].strip()
            else:
                raise ParserError("No name for struct")

            if d["type"] == "struct":
                # Use a separate variable so we don't shadow the outer `data`
                fields = self._parse_fields(d["fields"][1:-1].strip())
                st = self.cstruct._make_struct(name, fields)
                if d["flags"] == "compile" or self.compiled:
                    st = compiler.compile(st)
            elif d["typedef"] == "typedef":
                st = d["type"]
            else:
                continue

            if d["name"]:
                self.cstruct.add_type(d["name"], st)

            if d["defs"]:
                for td in d["defs"].strip().split(","):
                    td = td.strip()
                    self.cstruct.add_type(td, st)

    def _parse_fields(self, data: str) -> list[Field]:
        fields = re.finditer(
            r"(?P<type>[^\s]+)\s+(?P<name>[^\s\[:]+)(:(?P<bits>\d+))?(\[(?P<count>[^;\n]*)\])?;",
            data,
        )

        result = []
        for f in fields:
            d = f.groupdict()
            if d["type"].startswith("//"):
                continue

            type_ = self.cstruct.resolve(d["type"])

            d["name"] = d["name"].replace("(", "").replace(")", "")

            # Maybe reimplement lazy type references later
            # _type = TypeReference(self, d['type'])
            if d["count"] is not None:
                if d["count"] == "":
                    count = None
                else:
                    count = Expression(self.cstruct, d["count"])
                    try:
                        count = count.evaluate()
                    except Exception:
                        pass

                type_ = self.cstruct._make_array(type_, count)

            if d["name"].startswith("*"):
                d["name"] = d["name"][1:]
                type_ = self.cstruct._make_pointer(type_)

            field = Field(d["name"], type_, int(d["bits"]) if d["bits"] else None)
            result.append(field)

        return result

    def _lookups(self, data: str, consts: dict[str, int]) -> None:
        r = re.finditer(r"\$(?P<name>[^\s]+) = ({[^}]+})\w*\n", data)

        for t in r:
            d = ast.literal_eval(t.group(2))
            self.cstruct.lookups[t.group(1)] = {self.cstruct.consts[k]: v for k, v in d.items()}

    def parse(self, data: str) -> None:
        self._constants(data)
        self._enums(data)
        self._structs(data)
        self._lookups(data, self.cstruct.consts)
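
    # A sketch of the intent behind this ordering: constants are parsed first so
    # enum values and array counts can reference them, and lookups run last since
    # their keys are resolved through cstruct.consts.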


class Token:
    __slots__ = ("match", "token", "value")

    def __init__(self, token: str, value: str, match: re.Match):
        self.token = token
        self.value = value
        self.match = match

    def __eq__(self, other: object) -> bool:
        if isinstance(other, Token):
            other = other.token

        return self.token == other

    def __ne__(self, other: object) -> bool:
        return not self == other

    def __repr__(self) -> str:
        return f"<Token.{self.token} value={self.value!r}>"


class TokenCollection:
    def __init__(self):
        # (regex, token factory) pairs in the lexicon format re.Scanner expects
        self.tokens: list[tuple[str, Any]] = []
        self.lookup: dict[str, str] = {}
        self.patterns: dict[str, re.Pattern] = {}

    def __getattr__(self, attr: str) -> str | Any:
        try:
            return self.lookup[attr]
        except KeyError:
            pass

        return object.__getattribute__(self, attr)

    def add(self, regex: str, name: str | None) -> None:
        if name is None:
            self.tokens.append((regex, None))
        else:
            self.lookup[name] = name
            self.patterns[name] = re.compile(regex)
            self.tokens.append((regex, lambda s, t: Token(name, t, s.match)))
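
    # A sketch of what add() produces for re.Scanner (illustrative): a None action
    # silently drops the match (whitespace, stray characters), while a named token
    # yields a Token carrying the match object so line numbers can be recovered
    # later via TokenParser._lineno.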


class TokenConsumer:
    def __init__(self, tokens: list[Token]):
        self.tokens = tokens
        self.flags = []
        self.previous = None

    def __contains__(self, token: Token) -> bool:
        return token in self.tokens

    def __len__(self) -> int:
        return len(self.tokens)

    def __repr__(self) -> str:
        return f"<TokenConsumer next={self.next!r}>"

    @property
    def next(self) -> Token | None:
        try:
            return self.tokens[0]
        except IndexError:
            return None

    def consume(self) -> Token:
        self.previous = self.tokens.pop(0)
        return self.previous

    def reset_flags(self) -> None:
        self.flags = []

    def eol(self) -> None:
        token = self.consume()
        if token.token != "EOL":
            # TokenConsumer has no _lineno helper, so derive the line number from
            # the token's match object, mirroring TokenParser._lineno
            lineno = token.match.string.count("\n", 0, token.match.start()) + 1
            raise ParserError(f"line {lineno}: expected EOL")