1import io
2
3from ...extractors import Command
4from ...file_utils import Endian
5from ...models import (
6 File,
7 HandlerDoc,
8 HandlerType,
9 Reference,
10 Regex,
11 StructHandler,
12 ValidChunk,
13)
14
# Added to the uint8 header_size field of level 0/1 headers when computing the
# full header length in calculate_chunk (presumably the two leading bytes the
# field itself does not count -- TODO confirm against the LHA header spec).
PADDING_LEN = 2
# Smallest total header length calculate_chunk accepts; anything shorter is
# rejected as not being an LZH header. Value taken from:
# CPP/7zip/Archive/LzhHandler.cpp
HEADER_MIN_SIZE = 2 + 22
18
19
class LZHHandler(StructHandler):
    """Handler for LZH (LHA) archive members.

    Candidates are located through the ``-l??-`` method identifiers listed in
    ``PATTERNS``; ``calculate_chunk`` then parses the fixed-size header to find
    where the member ends. Extraction is delegated to the ``7z`` command.
    """

    NAME = "lzh"

    # One pattern per recognised compression method identifier.
    PATTERNS = [
        Regex(r"-lh0-"),
        Regex(r"-lzs-"),
        Regex(r"-lz4-"),
        Regex(r"-lh1-"),
        Regex(r"-lh2-"),
        Regex(r"-lh3-"),
        Regex(r"-lh4-"),
        Regex(r"-lh5-"),
        Regex(r"-lh6-"),
        Regex(r"-lh7-"),
        Regex(r"-lh8-"),
        Regex(r"-lhd-"),
    ]

    # method_id sits two bytes into the header (see C_DEFINITIONS below);
    # presumably this offset moves the chunk start back from the matched
    # method id to the first header byte -- confirm against the framework.
    PATTERN_MATCH_OFFSET = -2

    C_DEFINITIONS = r"""
        typedef struct lzh_default_header {
            uint8 header_size; // excludes extended headers size
            uint8 header_checksum;
            char method_id[5];
            uint32 compressed_size; // includes all extended headers size (if level 1)
            uint32 uncompressed_size;
            uint32 timestamp;
            uint8 fd_attribute;
            uint8 level_identifier;
        } lzh_default_header_t;

        typedef struct level_2_header {
            uint16 header_size; // includes all extended headers
            char method_id[5];
            uint32 compressed_size; // excludes all extended headers
            uint32 uncompressed_size;
            uint32 timestamp;
            uint8 fd_attribute;
            uint8 level_identifier;
        } level_2_header_t;
    """
    HEADER_STRUCT = "lzh_default_header_t"

    EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")

    DOC = HandlerDoc(
        name="LZH",
        description="LZH is a legacy archive format that uses various compression methods such as '-lh0-' and '-lh5-'. It was widely used in Japan and on older systems for compressing and archiving files.",
        handler_type=HandlerType.COMPRESSION,
        vendor=None,
        references=[
            Reference(
                title="LZH Compression Format",
                url="https://en.wikipedia.org/wiki/LHA_(file_format)",
            ),
        ],
        limitations=[],
    )

    def calculate_chunk(self, file: File, start_offset: int) -> ValidChunk | None:
        """Determine the extent of the LZH member whose header is parsed from *file*.

        The header is always read with the ``lzh_default_header_t`` layout and
        reinterpreted for level 2 headers. Returns a ``ValidChunk`` running
        from *start_offset* to the end of the compressed data, or ``None`` when
        the header level is above 2 or the computed header size is below
        ``HEADER_MIN_SIZE``.
        """
        header = self.parse_header(file, Endian.LITTLE)
        level = header.level_identifier

        # Only header levels 0, 1 and 2 are handled.
        if level > 0x2:
            return None

        if level < 0x2:
            header_size = PADDING_LEN + header.header_size
        else:
            # Level 2 stores the header size as a uint16 and has no checksum
            # (see level_2_header in C_DEFINITIONS). The two uint8 fields that
            # were already parsed are its low and high bytes, so recombining
            # them avoids parsing the header a second time.
            header_size = (header.header_checksum << 8) + header.header_size

        if header_size < HEADER_MIN_SIZE:
            return None

        # Net effect of rewinding to the start of the header and then skipping
        # the full header plus the compressed payload.
        file.seek(header_size + header.compressed_size - len(header), io.SEEK_CUR)
        end_offset = file.tell()

        # LZH files are null terminated: when this is the last stream in the
        # file, exactly one byte is left over, and it is folded into the chunk.
        file.seek(0, io.SEEK_END)
        file_size = file.tell()
        if file_size - end_offset == 1:
            end_offset = file_size

        return ValidChunk(start_offset=start_offset, end_offset=end_offset)