Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/unblob/handlers/archive/arj.py: 98%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

64 statements  

1import binascii 

2import io 

3 

4from structlog import get_logger 

5 

6from ...extractors import Command 

7from ...file_utils import Endian, convert_int32 

8from ...models import ( 

9 File, 

10 HandlerDoc, 

11 HandlerType, 

12 HexString, 

13 Reference, 

14 StructHandler, 

15 ValidChunk, 

16) 

17 

logger = get_logger()

# Header size sanity limits, ported from 7-Zip:
# CPP/7zip/Archive/ArjHandler.cpp IsArc_Arj()
MIN_BLOCK_SIZE = 30  # smallest plausible value of basic_header.size for a main header
MAX_BLOCK_SIZE = 2600  # largest plausible value of basic_header.size
BASIC_HEADER_SIZE = 4  # sizeof(basic_header_t): uint16 id + uint16 size

24 

25 

# Base class for all ARJ parsing failures in this handler. Subclasses carry a
# docstring that doubles as the human-readable failure reason logged by
# calculate_chunk (via exc.__doc__); this base is never raised directly.
class ARJError(Exception):
    pass

28 

29 

class InvalidARJSize(ARJError):
    """Invalid size fields in ARJ header."""

    # NOTE: the docstring above is logged as the failure reason (exc.__doc__)
    # in calculate_chunk — do not reword it casually.

32 

33 

class ARJChecksumError(ARJError):
    # Fixed typo ("missmatch" -> "mismatch"): the docstring is user-visible,
    # logged as the failure reason (exc.__doc__) in calculate_chunk.
    """Main ARJ header checksum mismatch."""

36 

37 

class ARJExtendedHeader(ARJError):
    """Main ARJ header contains extended_header, which we don't handle."""

    # NOTE: the docstring above is logged as the failure reason (exc.__doc__)
    # in calculate_chunk — do not reword it casually.

40 

41 

class ARJHandler(StructHandler):
    """Handler for ARJ archives.

    Locates the ARJ magic, validates the main archive header (size bounds and
    a CRC32 over the header payload), then walks the chained per-file headers
    until the terminating zero-size header to determine where the archive
    ends. Extraction itself is delegated to the ``7z`` command line tool.
    """

    NAME = "arj"

    # 0x60 0xEA magic followed by loosely constrained size/version bytes.
    PATTERNS = [HexString("60 EA [5] 0? [2] 0?")]

    # https://docs.fileformat.com/compression/arj/
    # https://github.com/tripsin/unarj/blob/master/UNARJ.H#L203
    C_DEFINITIONS = r"""
        typedef struct basic_header {
            uint16 id;
            uint16 size;
        } basic_header_t;

        typedef struct arj_header
        {
            basic_header_t header;
            uint8 first_hdr_size; // size up to "extra data"
            uint8 archive_version;
            uint8 min_version;
            uint8 host_os; // 0-9
            uint8 arj_flags; // 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40
            uint8 security_version; // "2 = current"
            uint8 file_type; // 0-4
            uint8 garble_password;
            uint32 datetime_created;
            uint32 datetime_modified;
            uint32 archive_size;
            uint32 filepos_security_env_data;
            uint16 reserved1;
            uint16 reserved2;
            uint16 security_env_length;
            uint16 host_data;
        } arj_header_t;

        typedef struct file_header {
            basic_header_t header;
            uint8 first_hdr_size; // size up to "extra data"
            uint8 archive_version;
            uint8 min_version;
            uint8 host_os; // 0-9
            uint8 arj_flags; // 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40
            uint8 method; // 0-4
            uint8 file_type;
            uint8 garble_password;
            uint32 datetime_modified;
            uint32 compressed_size;
            uint32 original_size;
            uint32 original_file_crc;
            uint16 entryname_pos_in_filename;
            uint16 file_access_mode;
            uint16 host_data;
        } file_header_t;

        typedef struct metadata {
            char filename[];
            char comment[];
            uint32 crc;
        } metadata_t;

        typedef struct extended_header {
            uint16 size;
            // More would go here if there were an extended header
        } extended_header_t;
    """

    HEADER_STRUCT = "arj_header_t"

    EXTRACTOR = Command("7z", "x", "-y", "{inpath}", "-o{outdir}")

    DOC = HandlerDoc(
        name="ARJ",
        # Fixed grammar in user-facing description ("a ... formats" -> "format").
        description="ARJ is a legacy compressed archive format used to store multiple files with metadata such as file size, creation date, and CRC.",
        handler_type=HandlerType.ARCHIVE,
        vendor=None,
        references=[
            Reference(
                title="ARJ File Format Documentation",
                url="https://docs.fileformat.com/compression/arj/",
            ),
            Reference(
                title="ARJ Technical Information",
                url="https://github.com/tripsin/unarj/blob/master/UNARJ.H#L203",
            ),
        ],
        limitations=[],
    )

    def _read_arj_main_header(self, file: File, start_offset: int) -> int:
        """Parse and validate the main archive header at *start_offset*.

        Returns the offset immediately after the main header and its trailing
        filename/comment/CRC metadata, i.e. where the first file header starts.

        Raises:
            InvalidARJSize: the reported header size is out of bounds.
            ARJChecksumError: stored CRC32 does not match the header payload.
            ARJExtendedHeader: a non-empty extended header is present.
        """
        file.seek(start_offset)
        main_header = self.cparser_le.arj_header(file)
        logger.debug("Main header parsed", header=main_header, _verbosity=3)

        if (
            main_header.header.size < MIN_BLOCK_SIZE
            or main_header.header.size > MAX_BLOCK_SIZE
            or main_header.header.size < main_header.first_hdr_size
        ):
            raise InvalidARJSize

        # The stored CRC32 covers `header.size` bytes that follow the 4-byte
        # basic header; the CRC itself is stored right after that payload.
        file.seek(start_offset + BASIC_HEADER_SIZE)
        content = file.read(main_header.header.size)
        calculated_crc = binascii.crc32(content)
        crc = convert_int32(file.read(4), endian=Endian.LITTLE)

        if crc != calculated_crc:
            raise ARJChecksumError

        # Skip the fixed portion of the header, then consume the variable
        # metadata (filename, comment, CRC) and the (empty) extended header.
        file.seek(start_offset + main_header.first_hdr_size + BASIC_HEADER_SIZE)
        self._read_headers(file)
        return file.tell()

    def _read_arj_files(self, file: File) -> int:
        """Walk the chained file headers and return the archive end offset."""
        while True:
            start = file.tell()
            basic_header = self.cparser_le.basic_header(file)
            logger.debug("Basic header parsed", header=basic_header, _verbosity=3)

            if basic_header.size == 0:
                # We've reached the final empty file header. This is where we want to be.
                return file.tell()

            # Rewind and re-parse the whole file header (the basic header is
            # its first field).
            file.seek(start)
            file_header = self.cparser_le.file_header_t(file)

            file.seek(start + file_header.first_hdr_size + len(basic_header))
            self._read_headers(file)
            # Seek past the file contents
            file.seek(file_header.compressed_size, io.SEEK_CUR)

    def _read_headers(self, file):
        """Consume the variable-length metadata and the extended header.

        Raises ARJExtendedHeader if a non-empty extended header is found.
        """
        metadata = self.cparser_le.metadata_t(file)
        logger.debug("Metadata header parsed", header=metadata, _verbosity=3)

        # Lack of support for extended header is ok given that no versions of ARJ use the extended header.
        # Source: 'ARJ TECHNICAL INFORMATION', September 2001
        extended_header = self.cparser_le.extended_header_t(file)
        logger.debug("Extended header parsed", header=extended_header, _verbosity=3)
        if extended_header.size != 0:
            raise ARJExtendedHeader

    def calculate_chunk(self, file: File, start_offset: int) -> ValidChunk | None:
        """Return the chunk spanning the whole ARJ archive, or None if invalid."""
        try:
            # Read past the main header.
            self._read_arj_main_header(file, start_offset)
            end_of_arj = self._read_arj_files(file)
        except ARJError as exc:
            logger.debug(
                "Invalid ARJ file",
                start_offset=start_offset,
                reason=exc.__doc__,
                _verbosity=2,
            )
            return None

        return ValidChunk(
            start_offset=start_offset,
            end_offset=end_of_arj,
        )

199 )