1import io
2from typing import Optional
3
4from ...extractors import Command
5from ...file_utils import Endian
6from ...models import (
7 File,
8 HandlerDoc,
9 HandlerType,
10 Reference,
11 Regex,
12 StructHandler,
13 ValidChunk,
14)
15
# Number of bytes added to the 8-bit ``header_size`` field of level 0/1
# headers to obtain the full header length (see LZHHandler.calculate_chunk).
PADDING_LEN = 2

# Smallest full header length the handler accepts.
# See CPP/7zip/Archive/LzhHandler.cpp.
HEADER_MIN_SIZE = 24  # 2 + 22
19
20
class LZHHandler(StructHandler):
    """Handler that locates LZH chunks by their compression method ID.

    A match on one of the ``-lXX-`` method IDs is used to parse the header
    that surrounds it; the chunk end is then derived from the header size and
    the compressed size stored in that header.
    """

    NAME = "lzh"

    # One pattern per supported compression method ID.
    PATTERNS = [
        Regex(r"-lh0-"),
        Regex(r"-lzs-"),
        Regex(r"-lz4-"),
        Regex(r"-lh1-"),
        Regex(r"-lh2-"),
        Regex(r"-lh3-"),
        Regex(r"-lh4-"),
        Regex(r"-lh5-"),
        Regex(r"-lh6-"),
        Regex(r"-lh7-"),
        Regex(r"-lh8-"),
        Regex(r"-lhd-"),
    ]

    # In both header layouts below, method_id is preceded by two bytes
    # (size + checksum, or a 16-bit size), hence the -2 offset.
    PATTERN_MATCH_OFFSET = -2

    C_DEFINITIONS = r"""
        typedef struct lzh_default_header {
            uint8 header_size; // excludes extended headers size
            uint8 header_checksum;
            char method_id[5];
            uint32 compressed_size; // includes all extended headers size (if level 1)
            uint32 uncompressed_size;
            uint32 timestamp;
            uint8 fd_attribute;
            uint8 level_identifier;
        } lzh_default_header_t;

        typedef struct level_2_header {
            uint16 header_size; // includes all extended headers
            char method_id[5];
            uint32 compressed_size; // excludes all extended headers
            uint32 uncompressed_size;
            uint32 timestamp;
            uint8 fd_attribute;
            uint8 level_identifier;
        } level_2_header_t;
    """
    # Every header is parsed with the level 0/1 layout; level 2 headers are
    # reinterpreted in calculate_chunk instead of being parsed a second time.
    HEADER_STRUCT = "lzh_default_header_t"

    # Extraction is delegated to the 7z command line tool.
    EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")

    DOC = HandlerDoc(
        name="LZH",
        description="LZH is a legacy archive format that uses various compression methods such as '-lh0-' and '-lh5-'. It was widely used in Japan and on older systems for compressing and archiving files.",
        handler_type=HandlerType.COMPRESSION,
        vendor=None,
        references=[
            Reference(
                title="LZH Compression Format",
                url="https://en.wikipedia.org/wiki/LHA_(file_format)",
            ),
        ],
        limitations=[],
    )

    def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
        """Compute the extent of the LZH chunk whose header starts at the match.

        Args:
            file: File positioned at the start of the candidate header.
            start_offset: Offset reported back as the start of the chunk.

        Returns:
            A ``ValidChunk`` spanning the header and the compressed data, or
            ``None`` when the level identifier is greater than 2 or the
            computed header size is below ``HEADER_MIN_SIZE``.
        """
        header = self.parse_header(file, Endian.LITTLE)
        level = header.level_identifier

        # Only header levels 0, 1 and 2 are accepted.
        if level > 0x2:
            return None

        # A level 2 header stores its size as a uint16 and has no checksum, so
        # the byte parsed as ``header_checksum`` is really the high byte of the
        # size (see level_2_header in C_DEFINITIONS). This avoids parsing the
        # header a second time. Other levels need PADDING_LEN added to the
        # stored 8-bit size.
        total_header_len = (
            header.header_size + (header.header_checksum << 8)
            if level == 0x2
            else header.header_size + PADDING_LEN
        )

        if total_header_len < HEADER_MIN_SIZE:
            return None

        # Step back over the bytes covered by the parsed header, then skip
        # forward over the full header plus the compressed payload.
        file.seek(-len(header), io.SEEK_CUR)
        file.seek(total_header_len + header.compressed_size, io.SEEK_CUR)
        chunk_end = file.tell()

        # LZH files are null terminated: when exactly one byte is left after
        # the computed end, this was the last stream and the chunk is extended
        # to the end of the file.
        file.seek(0, io.SEEK_END)
        file_size = file.tell()
        if file_size - chunk_end == 1:
            chunk_end = file_size

        return ValidChunk(start_offset=start_offset, end_offset=chunk_end)