1import os
2from pathlib import Path
3
4import arpy
5from structlog import get_logger
6
7from ...file_utils import FileSystem, OffsetFile, iterate_file
8from ...models import (
9 Extractor,
10 ExtractResult,
11 File,
12 Handler,
13 HandlerDoc,
14 HandlerType,
15 HexString,
16 Reference,
17 ValidChunk,
18)
19from ...report import ExtractionProblem
20
21logger = get_logger()
22
23
# Size of the global AR magic, "!<arch>\n".
SIGNATURE_LENGTH = 8
# Global magic followed by one member header: 58 bytes of fields plus the
# 2-byte ARFMAG terminator, i.e. 8 + 60 = 0x44 bytes in total.
HEADER_LENGTH = SIGNATURE_LENGTH + 60
26
27
class ArExtractor(Extractor):
    """Unpack the members of an AR archive with ``arpy``."""

    def extract(self, inpath: Path, outdir: Path) -> ExtractResult | None:
        """Write every member of the archive at *inpath* below *outdir*.

        Members are handled in sorted order of their raw names. A name that
        fails to decode is decoded a second time with ``errors="replace"``
        and the substitution is recorded as an extraction problem.

        Returns an ``ExtractResult`` whose reports are the problems collected
        by the ``FileSystem`` helper during extraction.
        """
        fs = FileSystem(outdir)

        with arpy.Archive(inpath.as_posix()) as archive:
            archive.read_all_headers()
            members = archive.archived_files

            for raw_name in sorted(members):
                member = members[raw_name]

                try:
                    target = Path(raw_name.decode())
                except UnicodeDecodeError:
                    # Carry on with a lossy name, but leave a trace in the report.
                    target = Path(raw_name.decode(errors="replace"))
                    problem = ExtractionProblem(
                        path=repr(raw_name),
                        problem="Path is not a valid UTF/8 string",
                        resolution=f"Converted to {target}",
                    )
                    fs.record_problem(problem)

                # Pass the member's content on from offset 0 up to the size
                # declared in its header.
                content = iterate_file(member, 0, member.header.size)
                fs.write_chunks(target, chunks=content)

        return ExtractResult(reports=fs.problems)
60
61
class ARHandler(Handler):
    """Handler for AR archives: signature pattern, extractor and chunk sizing."""

    NAME = "ar"

    PATTERNS = [
        HexString(
            """
            // "!<arch>\\n", 58 chars of whatever, then the ARFMAG
            21 3C 61 72 63 68 3E 0A [58] 60 0A
            """
        )
    ]

    EXTRACTOR = ArExtractor()

    DOC = HandlerDoc(
        name="AR",
        description="Unix AR (archive) files are used to store multiple files in a single archive with a simple header format.",
        handler_type=HandlerType.ARCHIVE,
        vendor=None,
        references=[
            Reference(
                title="Unix AR File Format Documentation",
                url="https://en.wikipedia.org/wiki/Ar_(Unix)",
            )
        ],
        limitations=[],
    )

    def calculate_chunk(self, file: File, start_offset: int) -> ValidChunk | None:
        """Work out where the AR archive that starts at *start_offset* ends.

        ``arpy`` is asked to read all member headers through an ``OffsetFile``
        wrapping *file* at *start_offset*. When that succeeds, the chunk ends
        at the position *file* reports afterwards.

        When ``arpy`` raises ``ArchiveFormatError``, the cursor is rewound to
        decide between two cases:

        * the failure happened on the very first header, so there is no
          usable archive here and ``None`` is returned;
        * the failure happened further in, so the chunk is cut off in front
          of the header that could not be parsed.
        """
        # NOTE(review): the position checks below read ``file.tell()`` while
        # the seeks go through ``archive.file``; this presumes OffsetFile
        # shares its cursor with the underlying file -- confirm.
        archive = arpy.Archive(fileobj=OffsetFile(file, start_offset))  # type: ignore

        try:
            archive.read_all_headers()
        except arpy.ArchiveFormatError as exc:
            logger.debug(
                "Hit an ArchiveFormatError, we've probably hit some other kind of data",
                exc_info=exc,
            )

            # Rewind by signature + one member header. Presumably arpy has
            # just consumed the header it rejected, so landing exactly on
            # start_offset means the first header itself was malformed.
            archive.file.seek(-HEADER_LENGTH, os.SEEK_CUR)
            if file.tell() == start_offset:
                return None
            # The failure was deeper inside the file: step forward again by
            # the signature length, which leaves a net rewind of one member
            # header.
            archive.file.seek(SIGNATURE_LENGTH, os.SEEK_CUR)

        chunk_end = file.tell()
        return ValidChunk(start_offset=start_offset, end_offset=chunk_end)