1import binascii
2import io
3
4from structlog import get_logger
5
6from ...extractors import Command
7from ...file_utils import Endian, convert_int32
8from ...models import (
9 File,
10 HandlerDoc,
11 HandlerType,
12 HexString,
13 Reference,
14 StructHandler,
15 ValidChunk,
16)
17
logger = get_logger()

# Bounds applied to the main header's basic_header.size field.
# Reference: CPP/7zip/Archive/ArjHandler.cpp IsArc_Arj()
MIN_BLOCK_SIZE = 30
MAX_BLOCK_SIZE = 2600
# Byte length of basic_header (uint16 id + uint16 size); added to header
# offsets to step over that prefix.
BASIC_HEADER_SIZE = 4
24
25
# Base class for the ARJ parsing errors in this module. calculate_chunk
# catches this type and logs the raised exception's __doc__ as the reason,
# so the subclasses' docstrings double as log messages.
class ARJError(Exception):
    pass
28
29
# Raised by _read_arj_main_header when the basic header size is outside
# [MIN_BLOCK_SIZE, MAX_BLOCK_SIZE] or smaller than first_hdr_size.
# The docstring below is what gets logged as the rejection reason.
class InvalidARJSize(ARJError):
    """Invalid size fields in ARJ header."""
32
33
# Raised by _read_arj_main_header when the CRC32 computed over the main
# header bytes differs from the CRC stored right after them.
# The docstring below is what gets logged as the rejection reason.
class ARJChecksumError(ARJError):
    """Main ARJ header checksum mismatch."""
36
37
# Raised by _read_headers when the extended header size field is non-zero.
# The docstring below is what gets logged as the rejection reason.
class ARJExtendedHeader(ARJError):
    """Main ARJ header contains extended_header, which we don't handle."""
40
41
class ARJHandler(StructHandler):
    """Locate ARJ archives and work out where each one ends.

    The handler validates the main archive header (size bounds and CRC32),
    then walks the per-file headers until it reaches the terminating header
    whose size field is zero. Extraction itself is delegated to ``7z``.
    """

    NAME = "arj"

    PATTERNS = [HexString("60 EA [5] 0? [2] 0?")]

    # Value of basic_header.id when the bytes ``60 EA`` are read as a
    # little-endian uint16. Every ARJ header block starts with it.
    _HEADER_ID = 0xEA60

    # https://docs.fileformat.com/compression/arj/
    # https://github.com/tripsin/unarj/blob/master/UNARJ.H#L203
    C_DEFINITIONS = r"""
        typedef struct basic_header {
            uint16 id;
            uint16 size;
        } basic_header_t;

        typedef struct arj_header
        {
            basic_header_t header;
            uint8 first_hdr_size; // size up to "extra data"
            uint8 archive_version;
            uint8 min_version;
            uint8 host_os; // 0-9
            uint8 arj_flags; // 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40
            uint8 security_version; // "2 = current"
            uint8 file_type; // 0-4
            uint8 garble_password;
            uint32 datetime_created;
            uint32 datetime_modified;
            uint32 archive_size;
            uint32 filepos_security_env_data;
            uint16 reserved1;
            uint16 reserved2;
            uint16 security_env_length;
            uint16 host_data;
        } arj_header_t;

        typedef struct file_header {
            basic_header_t header;
            uint8 first_hdr_size; // size up to "extra data"
            uint8 archive_version;
            uint8 min_version;
            uint8 host_os; // 0-9
            uint8 arj_flags; // 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40
            uint8 method; // 0-4
            uint8 file_type;
            uint8 garble_password;
            uint32 datetime_modified;
            uint32 compressed_size;
            uint32 original_size;
            uint32 original_file_crc;
            uint16 entryname_pos_in_filename;
            uint16 file_access_mode;
            uint16 host_data;
        } file_header_t;

        typedef struct metadata {
            char filename[];
            char comment[];
            uint32 crc;
        } metadata_t;

        typedef struct extended_header {
            uint16 size;
            // More would go here if there were an extended header
        } extended_header_t;
    """

    HEADER_STRUCT = "arj_header_t"

    EXTRACTOR = Command("7z", "x", "-y", "{inpath}", "-o{outdir}")

    DOC = HandlerDoc(
        name="ARJ",
        description="ARJ is a legacy compressed archive formats used to store multiple files with metadata such as file size, creation date, and CRC.",
        handler_type=HandlerType.ARCHIVE,
        vendor=None,
        references=[
            Reference(
                title="ARJ File Format Documentation",
                url="https://docs.fileformat.com/compression/arj/",
            ),
            Reference(
                title="ARJ Technical Information",
                url="https://github.com/tripsin/unarj/blob/master/UNARJ.H#L203",
            ),
        ],
        limitations=[],
    )

    def _read_arj_main_header(self, file: File, start_offset: int) -> int:
        """Validate the main archive header and position the file after it.

        Args:
            file: File being scanned.
            start_offset: Offset at which the main header starts.

        Returns:
            The file position after the main header's metadata and
            extended header size field have been read.

        Raises:
            InvalidARJSize: The basic header size is outside
                [MIN_BLOCK_SIZE, MAX_BLOCK_SIZE] or smaller than
                first_hdr_size.
            ARJChecksumError: The stored CRC does not match the CRC32 of
                the header bytes.
            ARJExtendedHeader: The extended header size is non-zero.
        """
        file.seek(start_offset)
        main_header = self.cparser_le.arj_header(file)
        logger.debug("Main header parsed", header=main_header, _verbosity=3)

        header_size = main_header.header.size
        if (
            header_size < MIN_BLOCK_SIZE
            or header_size > MAX_BLOCK_SIZE
            or header_size < main_header.first_hdr_size
        ):
            raise InvalidARJSize

        # The CRC32 covers the `header_size` bytes that follow the basic
        # header; the stored little-endian CRC comes right after them.
        file.seek(start_offset + BASIC_HEADER_SIZE)
        content = file.read(header_size)
        calculated_crc = binascii.crc32(content)
        crc = convert_int32(file.read(4), endian=Endian.LITTLE)

        if crc != calculated_crc:
            raise ARJChecksumError

        # Re-parse from the end of the fixed part of the header so that the
        # file ends up positioned after the metadata and extended header.
        file.seek(start_offset + main_header.first_hdr_size + BASIC_HEADER_SIZE)
        self._read_headers(file)
        return file.tell()

    def _read_arj_files(self, file: File) -> int:
        """Walk the per-file headers until the terminating empty header.

        The file must be positioned at the first file header, which is
        where _read_arj_main_header leaves it.

        Args:
            file: File being scanned.

        Returns:
            The file position right after the terminating basic header,
            i.e. the end of the archive.

        Raises:
            ARJError: A header does not start with the ARJ header id.
            ARJExtendedHeader: A file header has a non-zero extended
                header size.
        """
        while True:
            start = file.tell()
            basic_header = self.cparser_le.basic_header(file)
            logger.debug("Basic header parsed", header=basic_header, _verbosity=3)

            # Every header block starts with the ARJ header id. Without this
            # check, data following a truncated or corrupt member would be
            # interpreted as sizes and yield a bogus end offset.
            if basic_header.id != self._HEADER_ID:
                raise ARJError("ARJ file header does not start with the ARJ header id.")

            if basic_header.size == 0:
                # We've reached the final empty file header. This is where we want to be.
                return file.tell()

            file.seek(start)
            file_header = self.cparser_le.file_header_t(file)

            # Skip to the end of the fixed part of the header, where the
            # filename/comment/crc metadata starts.
            file.seek(start + file_header.first_hdr_size + BASIC_HEADER_SIZE)
            self._read_headers(file)
            # Seek past the file contents
            file.seek(file_header.compressed_size, io.SEEK_CUR)

    def _read_headers(self, file: File) -> None:
        """Read the metadata block and the extended header size field.

        Parses ``metadata_t`` (filename, comment, crc) followed by
        ``extended_header_t`` from the current file position, leaving the
        file positioned right after the extended header size.

        Args:
            file: File positioned at the start of the metadata block.

        Raises:
            ARJExtendedHeader: The extended header size is non-zero.
        """
        metadata = self.cparser_le.metadata_t(file)
        logger.debug("Metadata header parsed", header=metadata, _verbosity=3)

        # Lack of support for extended header is ok given that no versions of ARJ use the extended header.
        # Source: 'ARJ TECHNICAL INFORMATION', September 2001
        extended_header = self.cparser_le.extended_header_t(file)
        logger.debug("Extended header parsed", header=extended_header, _verbosity=3)
        if extended_header.size != 0:
            raise ARJExtendedHeader

    def calculate_chunk(self, file: File, start_offset: int) -> ValidChunk | None:
        """Return the chunk spanned by the ARJ archive at ``start_offset``.

        Args:
            file: File being scanned.
            start_offset: Offset of the candidate ARJ main header.

        Returns:
            A ValidChunk from ``start_offset`` to the end of the archive, or
            None when parsing raises an ARJError.
        """
        try:
            # Read past the main header.
            self._read_arj_main_header(file, start_offset)
            end_of_arj = self._read_arj_files(file)
        except ARJError as exc:
            logger.debug(
                "Invalid ARJ file",
                start_offset=start_offset,
                # Subclasses carry their reason in the docstring; a bare
                # ARJError carries it in its message instead.
                reason=exc.__doc__ or str(exc),
                _verbosity=2,
            )
            return None

        return ValidChunk(
            start_offset=start_offset,
            end_offset=end_of_arj,
        )