1import os
2from pathlib import Path
3from typing import Optional
4
5import arpy
6from structlog import get_logger
7
8from ...file_utils import FileSystem, OffsetFile, iterate_file
9from ...models import (
10 Extractor,
11 ExtractResult,
12 File,
13 Handler,
14 HandlerDoc,
15 HandlerType,
16 HexString,
17 Reference,
18 ValidChunk,
19)
20from ...report import ExtractionProblem
21
# Module-level structlog logger used by the handler below.
logger = get_logger()


# 0x44 == 68 bytes: the span matched by the handler's hex pattern, i.e. the
# 8-byte "!<arch>\n" signature plus the 60 bytes that follow it (58 wildcard
# bytes and the 2-byte 0x60 0x0A terminator).
HEADER_LENGTH = 0x44
# Length in bytes of the "!<arch>\n" signature that starts the hex pattern.
SIGNATURE_LENGTH = 0x8
27
28
class ArExtractor(Extractor):
    """Extractor that unpacks the members of an ar archive using ``arpy``."""

    def extract(self, inpath: Path, outdir: Path) -> Optional[ExtractResult]:
        """Write every member of the archive at *inpath* through a ``FileSystem`` rooted at *outdir*.

        Members are visited in sorted order of their raw ``bytes`` names.
        When a name is not valid UTF-8 it is decoded again with
        ``errors="replace"`` and an ``ExtractionProblem`` describing the
        conversion is recorded on the ``FileSystem``.

        Returns:
            An ``ExtractResult`` whose reports are the problems collected by
            the ``FileSystem`` during extraction.
        """
        fs = FileSystem(outdir)

        with arpy.Archive(inpath.as_posix()) as archive:
            archive.read_all_headers()
            members = archive.archived_files

            for raw_name in sorted(members):
                member = members[raw_name]

                try:
                    path = Path(raw_name.decode())
                except UnicodeDecodeError:
                    # Fall back to a lossy decode so the member is still
                    # written, and report what was done to its name.
                    path = Path(raw_name.decode(errors="replace"))
                    problem = ExtractionProblem(
                        path=repr(raw_name),
                        problem="Path is not a valid UTF/8 string",
                        resolution=f"Converted to {path}",
                    )
                    fs.record_problem(problem)

                # Feed the member's content, from offset 0 up to the size
                # stated in its header, to the output file.
                content = iterate_file(member, 0, member.header.size)
                fs.write_chunks(path, chunks=content)

        return ExtractResult(reports=fs.problems)
61
62
class ARHandler(Handler):
    """Handler for Unix ar archives: declares the match pattern and computes the chunk extent."""

    NAME = "ar"

    # The pattern is 68 bytes long: the 8-byte "!<arch>\n" signature, 58
    # arbitrary bytes, then the 2-byte 0x60 0x0A sequence.
    PATTERNS = [
        HexString(
            """
            // "!<arch>\\n", 58 chars of whatever, then the ARFMAG
            21 3C 61 72 63 68 3E 0A [58] 60 0A
        """
        )
    ]

    EXTRACTOR = ArExtractor()

    DOC = HandlerDoc(
        name="AR",
        description="Unix AR (archive) files are used to store multiple files in a single archive with a simple header format.",
        handler_type=HandlerType.ARCHIVE,
        vendor=None,
        references=[
            Reference(
                title="Unix AR File Format Documentation",
                url="https://en.wikipedia.org/wiki/Ar_(Unix)",
            )
        ],
        limitations=[],
    )

    def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
        """Determine where the ar archive starting at *start_offset* ends.

        An ``arpy.Archive`` is built on an ``OffsetFile`` view of *file* and
        asked to read all member headers. The end of the chunk is whatever
        position ``file.tell()`` reports afterwards.

        If ``arpy`` raises ``ArchiveFormatError`` the cursor is adjusted with
        two relative seeks (see the inline comments) before the end offset is
        taken.

        NOTE(review): this relies on seeks made through ``ar.file`` (the
        ``OffsetFile``) being visible via ``file.tell()`` -- presumably both
        share one cursor; confirm against ``OffsetFile`` in ``file_utils``.

        Returns:
            ``None`` when, after rewinding ``HEADER_LENGTH`` bytes from the
            failure position, the cursor is back at *start_offset*; otherwise
            a ``ValidChunk`` spanning *start_offset* to ``file.tell()``.
        """
        offset_file = OffsetFile(file, start_offset)
        ar = arpy.Archive(fileobj=offset_file)  # type: ignore

        try:
            ar.read_all_headers()
        except arpy.ArchiveFormatError as exc:
            # Not treated as fatal: the error is logged at debug level and
            # the end of the chunk is derived from the cursor position below.
            logger.debug(
                "Hit an ArchiveFormatError, we've probably hit some other kind of data",
                exc_info=exc,
            )

            # wind the cursor back the whole header length to check if we failed on
            # the first match, which means malformed AR archive
            ar.file.seek(-HEADER_LENGTH, os.SEEK_CUR)
            # we check if we failed on the first match
            if start_offset == file.tell():
                return None
            # otherwise we seek past the signature (failure on malformed AR archive
            # within the whole file, not at the start)
            ar.file.seek(SIGNATURE_LENGTH, os.SEEK_CUR)

        # Reached both on success and after the cursor adjustment above.
        return ValidChunk(
            start_offset=start_offset,
            end_offset=file.tell(),
        )