Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_files.py: 31%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

226 statements  

1from __future__ import annotations 

2 

3import bisect 

4from functools import cached_property 

5from typing import TYPE_CHECKING, cast 

6 

7from pypdf._utils import format_iso8824_date, parse_iso8824_date 

8from pypdf.constants import CatalogAttributes as CA 

9from pypdf.constants import FileSpecificationDictionaryEntries 

10from pypdf.constants import PageAttributes as PG 

11from pypdf.errors import PdfReadError, PyPdfError 

12from pypdf.generic import ( 

13 ArrayObject, 

14 ByteStringObject, 

15 DecodedStreamObject, 

16 DictionaryObject, 

17 NameObject, 

18 NullObject, 

19 NumberObject, 

20 StreamObject, 

21 TextStringObject, 

22 is_null_or_none, 

23) 

24 

25if TYPE_CHECKING: 

26 import datetime 

27 from collections.abc import Generator 

28 

29 from pypdf._writer import PdfWriter 

30 

31 

class EmbeddedFile:
    """
    Container holding the information on an embedded file.

    Attributes are evaluated lazily if possible.

    Further information on embedded files can be found in section 7.11 of the PDF 2.0 specification.
    """

    def __init__(self, name: str, pdf_object: DictionaryObject, parent: ArrayObject | None = None) -> None:
        """
        Args:
            name: The (primary) name as provided in the name tree.
            pdf_object: The corresponding PDF object to allow retrieving further data.
            parent: The parent list.
        """
        self._name = name
        self.pdf_object = pdf_object
        self._parent = parent

    @property
    def name(self) -> str:
        """The (primary) name of the embedded file as provided in the name tree."""
        return self._name

    @classmethod
    def _create_new(cls, writer: PdfWriter, name: str, content: str | bytes) -> EmbeddedFile:
        """
        Create a new embedded file and add it to the PdfWriter.

        Args:
            writer: The PdfWriter instance to add the embedded file to.
            name: The filename to display.
            content: The data in the file. Strings are encoded as latin-1.

        Returns:
            EmbeddedFile instance for the newly created embedded file.
        """
        # Convert string content to bytes if needed
        if isinstance(content, str):
            content = content.encode("latin-1")

        # Create the file entry (the actual embedded file stream)
        file_entry = DecodedStreamObject()
        file_entry.set_data(content)
        file_entry.update({NameObject(PG.TYPE): NameObject("/EmbeddedFile")})

        # Create the /EF entry
        ef_entry = DictionaryObject()
        ef_entry.update({NameObject("/F"): writer._add_object(file_entry)})

        # Create the filespec dictionary
        from pypdf.generic import create_string_object  # noqa: PLC0415
        filespec = DictionaryObject()
        filespec_reference = writer._add_object(filespec)
        name_object = cast(TextStringObject, create_string_object(name))
        filespec.update(
            {
                NameObject(PG.TYPE): NameObject("/Filespec"),
                NameObject(FileSpecificationDictionaryEntries.F): name_object,
                NameObject(FileSpecificationDictionaryEntries.EF): ef_entry,
            }
        )

        # Add the name and filespec to the names array.
        # We use the inverse order for insertion, as this allows us to re-use the
        # same index.
        names_array = cls._get_names_array(writer)
        insertion_index = cls._get_insertion_index(names_array, name_object)
        names_array.insert(insertion_index, filespec_reference)
        names_array.insert(insertion_index, name_object)

        # Return an EmbeddedFile instance
        return cls(name=name, pdf_object=filespec, parent=names_array)

    @classmethod
    def _get_names_array(cls, writer: PdfWriter) -> ArrayObject:
        """
        Get the names array for embedded files, possibly creating and flattening it.

        Args:
            writer: The PdfWriter whose catalog should be inspected/updated.

        Returns:
            The flat ``[name_1, reference_1, name_2, reference_2, ...]`` array.

        Raises:
            PdfReadError: The existing embedded files entry has neither /Names nor /Kids.
        """
        if CA.NAMES not in writer.root_object:
            # Add the /Names entry to the catalog.
            writer.root_object[NameObject(CA.NAMES)] = writer._add_object(DictionaryObject())

        names_dict = cast(DictionaryObject, writer.root_object[CA.NAMES])
        if "/EmbeddedFiles" not in names_dict:
            # We do not yet have an entry for embedded files. Create and return it.
            names = ArrayObject()
            embedded_files_names_dictionary = DictionaryObject(
                {NameObject(CA.NAMES): names}
            )
            names_dict[NameObject("/EmbeddedFiles")] = writer._add_object(embedded_files_names_dictionary)
            return names

        # We have an existing embedded files entry.
        embedded_files_names_tree = cast(DictionaryObject, names_dict["/EmbeddedFiles"])
        if "/Names" in embedded_files_names_tree:
            # Simple case: We already have a flat list.
            return cast(ArrayObject, embedded_files_names_tree[NameObject(CA.NAMES)])
        if "/Kids" not in embedded_files_names_tree:
            # Invalid case: This is no name tree.
            raise PdfReadError("Got neither Names nor Kids in embedded files tree.")

        # Complex case: Convert a /Kids-based name tree to a /Names-based one.
        # /Name-based ones are much easier to handle and allow us to simplify the
        # actual insertion logic by only having to consider one case.
        names = ArrayObject()
        kids = cast(ArrayObject, embedded_files_names_tree["/Kids"].get_object())
        embedded_files_names_dictionary = DictionaryObject(
            {NameObject(CA.NAMES): names}
        )
        names_dict[NameObject("/EmbeddedFiles")] = writer._add_object(embedded_files_names_dictionary)
        for kid in kids:
            # Write the flattened file entries. As we do not change the actual files,
            # this should not have any impact on references to them.
            # There might be further (nested) kids here.
            # Wait for an example before evaluating an implementation.
            names.extend(kid.get_object().get("/Names", []))
        return names

    @staticmethod
    def _name_tree_sort_key(key: str | bytes) -> bytes:
        """Return the bytes used to order a name tree key; byte strings are used as they are."""
        return key if isinstance(key, bytes) else key.encode("utf-8")

    @classmethod
    def _get_insertion_index(cls, names_array: ArrayObject, name: str) -> int:
        """
        Determine where a new ``(name, reference)`` pair has to be inserted.

        The keys sit at the even positions of ``names_array`` and are compared by
        their UTF-8 encoded bytes. The new pair is placed after any existing keys
        which compare equal.

        Args:
            names_array: The flat names array; it is expected to be sorted already.
            name: The name of the entry to insert.

        Returns:
            The index inside ``names_array`` (always even) to insert the name at.
        """
        keys = [cls._name_tree_sort_key(key) for key in names_array[::2]]
        # `bisect_right` already covers all cases: empty array, insertion at the
        # start or end, and insertion after existing equal keys.
        # Each key occupies two slots (key + value), thus the factor.
        return bisect.bisect_right(keys, cls._name_tree_sort_key(name)) * 2

    @property
    def alternative_name(self) -> str | None:
        """Retrieve the alternative name (file specification)."""
        for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]:
            # PDF 2.0 reference, table 43:
            #   > A PDF reader shall use the value of the UF key, when present, instead of the F key.
            if key in self.pdf_object:
                value = self.pdf_object[key].get_object()
                if not is_null_or_none(value):
                    return cast(str, value)
        return None

    @alternative_name.setter
    def alternative_name(self, value: TextStringObject | None) -> None:
        """Set the alternative name (file specification)."""
        if value is None:
            # Only null out the entries which are already there.
            if FileSpecificationDictionaryEntries.UF in self.pdf_object:
                self.pdf_object[NameObject(FileSpecificationDictionaryEntries.UF)] = NullObject()
            if FileSpecificationDictionaryEntries.F in self.pdf_object:
                self.pdf_object[NameObject(FileSpecificationDictionaryEntries.F)] = NullObject()
        else:
            self.pdf_object[NameObject(FileSpecificationDictionaryEntries.UF)] = value
            self.pdf_object[NameObject(FileSpecificationDictionaryEntries.F)] = value

    @property
    def description(self) -> str | None:
        """Retrieve the description."""
        value = self.pdf_object.get(FileSpecificationDictionaryEntries.DESC)
        if is_null_or_none(value):
            return None
        return value

    @description.setter
    def description(self, value: TextStringObject | None) -> None:
        """Set the description."""
        if value is None:
            self.pdf_object[NameObject(FileSpecificationDictionaryEntries.DESC)] = NullObject()
        else:
            self.pdf_object[NameObject(FileSpecificationDictionaryEntries.DESC)] = value

    @property
    def associated_file_relationship(self) -> str:
        """Retrieve the relationship of the referring document to this embedded file."""
        return cast(
            NameObject,
            self.pdf_object.get("/AFRelationship", NameObject("/Unspecified")),
        )

    @associated_file_relationship.setter
    def associated_file_relationship(self, value: NameObject) -> None:
        """Set the relationship of the referring document to this embedded file."""
        self.pdf_object[NameObject("/AFRelationship")] = value

    @property
    def _embedded_file(self) -> StreamObject:
        """
        Retrieve the actual embedded file stream.

        Raises:
            PdfReadError: There is no /EF entry or it has neither a /UF nor a /F key.
        """
        if "/EF" not in self.pdf_object:
            raise PdfReadError(f"/EF entry not found: {self.pdf_object}")
        ef = cast(DictionaryObject, self.pdf_object["/EF"])
        for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]:
            if key in ef:
                return cast(StreamObject, ef[key].get_object())
        raise PdfReadError(f"No /(U)F key found in file dictionary: {ef}")

    @property
    def _params(self) -> DictionaryObject:
        """Retrieve the file-specific parameters."""
        return cast(DictionaryObject, self._embedded_file.get("/Params", DictionaryObject()).get_object())

    @property
    def _ensure_params(self) -> DictionaryObject:
        """
        Ensure the /Params dictionary exists and return it.

        This is deliberately not cached: the result depends on the current
        ``pdf_object``, which is replaced by ``delete()`` and can be reassigned
        by callers. A cached value would keep pointing to the old dictionary.
        """
        embedded_file = self._embedded_file
        if "/Params" not in embedded_file:
            embedded_file[NameObject("/Params")] = DictionaryObject()
        return cast(DictionaryObject, embedded_file["/Params"])

    @property
    def subtype(self) -> str | None:
        """Retrieve the subtype. This is a MIME media type, prefixed by a slash."""
        value = self._embedded_file.get("/Subtype")
        if is_null_or_none(value):
            return None
        return value

    @subtype.setter
    def subtype(self, value: NameObject | None) -> None:
        """Set the subtype. This should be a MIME media type, prefixed by a slash."""
        embedded_file = self._embedded_file
        if value is None:
            embedded_file[NameObject("/Subtype")] = NullObject()
        else:
            embedded_file[NameObject("/Subtype")] = value

    @property
    def content(self) -> bytes:
        """Retrieve the actual file content."""
        return self._embedded_file.get_data()

    @content.setter
    def content(self, value: str | bytes) -> None:
        """Set the file content. Strings are encoded as latin-1."""
        if isinstance(value, str):
            value = value.encode("latin-1")
        self._embedded_file.set_data(value)

    @property
    def size(self) -> int | None:
        """Retrieve the size of the uncompressed file in bytes."""
        value = self._params.get("/Size")
        if is_null_or_none(value):
            return None
        return value

    @size.setter
    def size(self, value: NumberObject | None) -> None:
        """Set the size of the uncompressed file in bytes."""
        params = self._ensure_params
        if value is None:
            params[NameObject("/Size")] = NullObject()
        else:
            params[NameObject("/Size")] = value

    @property
    def creation_date(self) -> datetime.datetime | None:
        """Retrieve the file creation datetime."""
        return parse_iso8824_date(self._params.get("/CreationDate"))

    @creation_date.setter
    def creation_date(self, value: datetime.datetime | None) -> None:
        """Set the file creation datetime."""
        params = self._ensure_params
        if value is None:
            params[NameObject("/CreationDate")] = NullObject()
        else:
            date_str = format_iso8824_date(value)
            params[NameObject("/CreationDate")] = TextStringObject(date_str)

    @property
    def modification_date(self) -> datetime.datetime | None:
        """Retrieve the datetime of the last file modification."""
        return parse_iso8824_date(self._params.get("/ModDate"))

    @modification_date.setter
    def modification_date(self, value: datetime.datetime | None) -> None:
        """Set the datetime of the last file modification."""
        params = self._ensure_params
        if value is None:
            params[NameObject("/ModDate")] = NullObject()
        else:
            date_str = format_iso8824_date(value)
            params[NameObject("/ModDate")] = TextStringObject(date_str)

    @property
    def checksum(self) -> bytes | None:
        """Retrieve the MD5 checksum of the (uncompressed) file."""
        value = self._params.get("/CheckSum")
        if is_null_or_none(value):
            return None
        return value

    @checksum.setter
    def checksum(self, value: ByteStringObject | None) -> None:
        """Set the MD5 checksum of the (uncompressed) file."""
        params = self._ensure_params
        if value is None:
            params[NameObject("/CheckSum")] = NullObject()
        else:
            params[NameObject("/CheckSum")] = value

    def delete(self) -> None:
        """
        Delete the file from the document.

        Removes the name and the file reference from the parent array and
        replaces ``pdf_object`` by an empty dictionary afterwards.

        Raises:
            PyPdfError: No parent is available, the file is not part of the
                parent, or the file has no name in front of it inside the parent.
        """
        # Compare against None explicitly: an empty parent array is still a parent.
        if self._parent is None:
            raise PyPdfError("Parent required to delete file from document.")
        if self.pdf_object in self._parent:
            index = self._parent.index(self.pdf_object)
        elif (
            (indirect_reference := getattr(self.pdf_object, "indirect_reference", None)) is not None
            and indirect_reference in self._parent
        ):
            index = self._parent.index(indirect_reference)
        else:
            raise PyPdfError("File not found in parent object.")
        if index == 0:
            # Without this guard, `index - 1` would address the *last* element
            # of the array and remove an unrelated entry.
            raise PyPdfError("File entry has no preceding name in parent object.")
        self._parent.pop(index)  # Reference.
        self._parent.pop(index - 1)  # Name.
        self.pdf_object = DictionaryObject()  # Invalidate.

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} name={self.name!r}>"

    @classmethod
    def _load_from_names(cls, names: ArrayObject) -> Generator[EmbeddedFile]:
        """
        Convert the given name tree into class instances.

        Args:
            names: The name tree to load the data from.

        Returns:
            Iterable of class instances for the files found.
        """
        # This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
        for i, name in enumerate(names):
            if isinstance(name, (str, bytes)):
                # Skip plain (byte) strings and retrieve them as `direct_name` by index.
                continue
            if i == 0:
                # No name in front of this entry. `names[i - 1]` would wrap around
                # to the last element and produce a bogus name.
                continue
            file_dictionary = name.get_object()
            direct_name = names[i - 1].get_object()
            yield cls(name=direct_name, pdf_object=file_dictionary, parent=names)

    @classmethod
    def _load(cls, catalog: DictionaryObject) -> Generator[EmbeddedFile]:
        """
        Load the embedded files for the given document catalog.

        This method and its signature are considered internal API and thus not exposed publicly for now.

        Args:
            catalog: The document catalog to load from.

        Returns:
            Iterable of class instances for the files found.
        """
        try:
            container = cast(
                DictionaryObject,
                cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"],
            )
        except KeyError:
            # No embedded files in this document.
            return

        if "/Kids" in container:
            for kid in cast(ArrayObject, container["/Kids"].get_object()):
                # There might be further (nested) kids here.
                # Wait for an example before evaluating an implementation.
                kid = kid.get_object()
                if "/Names" in kid:
                    yield from cls._load_from_names(cast(ArrayObject, kid["/Names"]))
        if "/Names" in container:
            yield from cls._load_from_names(cast(ArrayObject, container["/Names"]))