1from __future__ import annotations
2
3from functools import cached_property
4from typing import TYPE_CHECKING, Generator, cast
5
6from pypdf._utils import format_iso8824_date, parse_iso8824_date
7from pypdf.constants import CatalogAttributes as CA
8from pypdf.constants import FileSpecificationDictionaryEntries
9from pypdf.constants import PageAttributes as PA
10from pypdf.errors import PdfReadError
11from pypdf.generic import (
12 ArrayObject,
13 ByteStringObject,
14 DecodedStreamObject,
15 DictionaryObject,
16 NameObject,
17 NullObject,
18 NumberObject,
19 StreamObject,
20 TextStringObject,
21 is_null_or_none,
22)
23
24if TYPE_CHECKING:
25 import datetime
26
27 from pypdf._writer import PdfWriter
28
29
30class EmbeddedFile:
31 """
32 Container holding the information on an embedded file.
33
34 Attributes are evaluated lazily if possible.
35
36 Further information on embedded files can be found in section 7.11 of the PDF 2.0 specification.
37 """
38 def __init__(self, name: str, pdf_object: DictionaryObject) -> None:
39 """
40 Args:
41 name: The (primary) name as provided in the name tree.
42 pdf_object: The corresponding PDF object to allow retrieving further data.
43 """
44 self._name = name
45 self.pdf_object = pdf_object
46
47 @property
48 def name(self) -> str:
49 """The (primary) name of the embedded file as provided in the name tree."""
50 return self._name
51
52 @classmethod
53 def _create_new(cls, writer: PdfWriter, name: str, content: str | bytes) -> EmbeddedFile:
54 """
55 Create a new embedded file and add it to the PdfWriter.
56
57 Args:
58 writer: The PdfWriter instance to add the embedded file to.
59 name: The filename to display.
60 content: The data in the file.
61
62 Returns:
63 EmbeddedFile instance for the newly created embedded file.
64 """
65 # Convert string content to bytes if needed
66 if isinstance(content, str):
67 content = content.encode("latin-1")
68
69 # Create the file entry (the actual embedded file stream)
70 file_entry = DecodedStreamObject()
71 file_entry.set_data(content)
72 file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")})
73
74 # Create the /EF entry
75 ef_entry = DictionaryObject()
76 ef_entry.update({NameObject("/F"): writer._add_object(file_entry)})
77
78 # Create the filespec dictionary
79 from pypdf.generic import create_string_object # noqa: PLC0415
80 filespec = DictionaryObject()
81 filespec.update(
82 {
83 NameObject(PA.TYPE): NameObject("/Filespec"),
84 NameObject(FileSpecificationDictionaryEntries.F): create_string_object(name),
85 NameObject(FileSpecificationDictionaryEntries.EF): ef_entry,
86 }
87 )
88
89 # Add to the catalog's names tree
90 if CA.NAMES not in writer._root_object:
91 writer._root_object[NameObject(CA.NAMES)] = writer._add_object(DictionaryObject())
92
93 names_dict = cast(DictionaryObject, writer._root_object[CA.NAMES])
94 if "/EmbeddedFiles" not in names_dict:
95 embedded_files_names_dictionary = DictionaryObject(
96 {NameObject(CA.NAMES): ArrayObject()}
97 )
98 names_dict[NameObject("/EmbeddedFiles")] = writer._add_object(embedded_files_names_dictionary)
99 else:
100 embedded_files_names_dictionary = cast(DictionaryObject, names_dict["/EmbeddedFiles"])
101
102 # Add the name and filespec to the names array
103 names_array = cast(ArrayObject, embedded_files_names_dictionary[CA.NAMES])
104 names_array.extend([create_string_object(name), filespec])
105
106 # Return an EmbeddedFile instance
107 return cls(name=name, pdf_object=filespec)
108
109 @property
110 def alternative_name(self) -> str | None:
111 """Retrieve the alternative name (file specification)."""
112 for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]:
113 # PDF 2.0 reference, table 43:
114 # > A PDF reader shall use the value of the UF key, when present, instead of the F key.
115 if key in self.pdf_object:
116 value = self.pdf_object[key].get_object()
117 if not is_null_or_none(value):
118 return cast(str, value)
119 return None
120
121 @alternative_name.setter
122 def alternative_name(self, value: TextStringObject | None) -> None:
123 """Set the alternative name (file specification)."""
124 if value is None:
125 if FileSpecificationDictionaryEntries.UF in self.pdf_object:
126 self.pdf_object[NameObject(FileSpecificationDictionaryEntries.UF)] = NullObject()
127 if FileSpecificationDictionaryEntries.F in self.pdf_object:
128 self.pdf_object[NameObject(FileSpecificationDictionaryEntries.F)] = NullObject()
129 else:
130 if FileSpecificationDictionaryEntries.UF in self.pdf_object:
131 self.pdf_object[NameObject(FileSpecificationDictionaryEntries.UF)] = value
132 if FileSpecificationDictionaryEntries.F in self.pdf_object:
133 self.pdf_object[NameObject(FileSpecificationDictionaryEntries.F)] = value
134
135 @property
136 def description(self) -> str | None:
137 """Retrieve the description."""
138 value = self.pdf_object.get(FileSpecificationDictionaryEntries.DESC)
139 if is_null_or_none(value):
140 return None
141 return value
142
143 @description.setter
144 def description(self, value: TextStringObject | None) -> None:
145 """Set the description."""
146 if value is None:
147 self.pdf_object[NameObject(FileSpecificationDictionaryEntries.DESC)] = NullObject()
148 else:
149 self.pdf_object[NameObject(FileSpecificationDictionaryEntries.DESC)] = value
150
151 @property
152 def associated_file_relationship(self) -> str:
153 """Retrieve the relationship of the referring document to this embedded file."""
154 return self.pdf_object.get("/AFRelationship", "/Unspecified")
155
156 @associated_file_relationship.setter
157 def associated_file_relationship(self, value: NameObject) -> None:
158 """Set the relationship of the referring document to this embedded file."""
159 self.pdf_object[NameObject("/AFRelationship")] = value
160
161 @property
162 def _embedded_file(self) -> StreamObject:
163 """Retrieve the actual embedded file stream."""
164 if "/EF" not in self.pdf_object:
165 raise PdfReadError(f"/EF entry not found: {self.pdf_object}")
166 ef = cast(DictionaryObject, self.pdf_object["/EF"])
167 for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]:
168 if key in ef:
169 return cast(StreamObject, ef[key].get_object())
170 raise PdfReadError(f"No /(U)F key found in file dictionary: {ef}")
171
172 @property
173 def _params(self) -> DictionaryObject:
174 """Retrieve the file-specific parameters."""
175 return self._embedded_file.get("/Params", DictionaryObject()).get_object()
176
177 @cached_property
178 def _ensure_params(self) -> DictionaryObject:
179 """Ensure the /Params dictionary exists and return it."""
180 embedded_file = self._embedded_file
181 if "/Params" not in embedded_file:
182 embedded_file[NameObject("/Params")] = DictionaryObject()
183 return cast(DictionaryObject, embedded_file["/Params"])
184
185 @property
186 def subtype(self) -> str | None:
187 """Retrieve the subtype. This is a MIME media type, prefixed by a slash."""
188 value = self._embedded_file.get("/Subtype")
189 if is_null_or_none(value):
190 return None
191 return value
192
193 @subtype.setter
194 def subtype(self, value: NameObject | None) -> None:
195 """Set the subtype. This should be a MIME media type, prefixed by a slash."""
196 embedded_file = self._embedded_file
197 if value is None:
198 embedded_file[NameObject("/Subtype")] = NullObject()
199 else:
200 embedded_file[NameObject("/Subtype")] = value
201
202 @property
203 def content(self) -> bytes:
204 """Retrieve the actual file content."""
205 return self._embedded_file.get_data()
206
207 @content.setter
208 def content(self, value: str | bytes) -> None:
209 """Set the file content."""
210 if isinstance(value, str):
211 value = value.encode("latin-1")
212 self._embedded_file.set_data(value)
213
214 @property
215 def size(self) -> int | None:
216 """Retrieve the size of the uncompressed file in bytes."""
217 value = self._params.get("/Size")
218 if is_null_or_none(value):
219 return None
220 return value
221
222 @size.setter
223 def size(self, value: NumberObject | None) -> None:
224 """Set the size of the uncompressed file in bytes."""
225 params = self._ensure_params
226 if value is None:
227 params[NameObject("/Size")] = NullObject()
228 else:
229 params[NameObject("/Size")] = value
230
231 @property
232 def creation_date(self) -> datetime.datetime | None:
233 """Retrieve the file creation datetime."""
234 return parse_iso8824_date(self._params.get("/CreationDate"))
235
236 @creation_date.setter
237 def creation_date(self, value: datetime.datetime | None) -> None:
238 """Set the file creation datetime."""
239 params = self._ensure_params
240 if value is None:
241 params[NameObject("/CreationDate")] = NullObject()
242 else:
243 date_str = format_iso8824_date(value)
244 params[NameObject("/CreationDate")] = TextStringObject(date_str)
245
246 @property
247 def modification_date(self) -> datetime.datetime | None:
248 """Retrieve the datetime of the last file modification."""
249 return parse_iso8824_date(self._params.get("/ModDate"))
250
251 @modification_date.setter
252 def modification_date(self, value: datetime.datetime | None) -> None:
253 """Set the datetime of the last file modification."""
254 params = self._ensure_params
255 if value is None:
256 params[NameObject("/ModDate")] = NullObject()
257 else:
258 date_str = format_iso8824_date(value)
259 params[NameObject("/ModDate")] = TextStringObject(date_str)
260
261 @property
262 def checksum(self) -> bytes | None:
263 """Retrieve the MD5 checksum of the (uncompressed) file."""
264 value = self._params.get("/CheckSum")
265 if is_null_or_none(value):
266 return None
267 return value
268
269 @checksum.setter
270 def checksum(self, value: ByteStringObject | None) -> None:
271 """Set the MD5 checksum of the (uncompressed) file."""
272 params = self._ensure_params
273 if value is None:
274 params[NameObject("/CheckSum")] = NullObject()
275 else:
276 params[NameObject("/CheckSum")] = value
277
278 def __repr__(self) -> str:
279 return f"<{self.__class__.__name__} name={self.name!r}>"
280
281 @classmethod
282 def _load_from_names(cls, names: ArrayObject) -> Generator[EmbeddedFile]:
283 """
284 Convert the given name tree into class instances.
285
286 Args:
287 names: The name tree to load the data from.
288
289 Returns:
290 Iterable of class instances for the files found.
291 """
292 # This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
293 for i, name in enumerate(names):
294 if not isinstance(name, str):
295 # Skip plain strings and retrieve them as `direct_name` by index.
296 file_dictionary = name.get_object()
297 direct_name = names[i - 1].get_object()
298 yield EmbeddedFile(name=direct_name, pdf_object=file_dictionary)
299
300 @classmethod
301 def _load(cls, catalog: DictionaryObject) -> Generator[EmbeddedFile]:
302 """
303 Load the embedded files for the given document catalog.
304
305 This method and its signature are considered internal API and thus not exposed publicly for now.
306
307 Args:
308 catalog: The document catalog to load from.
309
310 Returns:
311 Iterable of class instances for the files found.
312 """
313 try:
314 container = cast(
315 DictionaryObject,
316 cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"],
317 )
318 except KeyError:
319 return
320
321 if "/Kids" in container:
322 for kid in cast(ArrayObject, container["/Kids"].get_object()):
323 # There might be further (nested) kids here.
324 # Wait for an example before evaluating an implementation.
325 kid = kid.get_object()
326 if "/Names" in kid:
327 yield from cls._load_from_names(cast(ArrayObject, kid["/Names"]))
328 if "/Names" in container:
329 yield from cls._load_from_names(cast(ArrayObject, container["/Names"]))