Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_files.py: 35%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

182 statements  

1from __future__ import annotations 

2 

3from functools import cached_property 

4from typing import TYPE_CHECKING, cast 

5 

6from pypdf._utils import format_iso8824_date, parse_iso8824_date 

7from pypdf.constants import CatalogAttributes as CA 

8from pypdf.constants import FileSpecificationDictionaryEntries 

9from pypdf.constants import PageAttributes as PG 

10from pypdf.errors import PdfReadError 

11from pypdf.generic import ( 

12 ArrayObject, 

13 ByteStringObject, 

14 DecodedStreamObject, 

15 DictionaryObject, 

16 NameObject, 

17 NullObject, 

18 NumberObject, 

19 StreamObject, 

20 TextStringObject, 

21 is_null_or_none, 

22) 

23 

24if TYPE_CHECKING: 

25 import datetime 

26 from collections.abc import Generator 

27 

28 from pypdf._writer import PdfWriter 

29 

30 

class EmbeddedFile:
    """
    Container holding the information on an embedded file.

    Attributes are evaluated lazily if possible: every property reads from or
    writes to the wrapped file specification dictionary (``pdf_object``) or the
    embedded file stream it references at the moment of access.

    Further information on embedded files can be found in section 7.11 of the PDF 2.0 specification.
    """

    def __init__(self, name: str, pdf_object: DictionaryObject) -> None:
        """
        Args:
            name: The (primary) name as provided in the name tree.
            pdf_object: The corresponding PDF object to allow retrieving further data.
        """
        self._name = name
        self.pdf_object = pdf_object

    @property
    def name(self) -> str:
        """The (primary) name of the embedded file as provided in the name tree."""
        return self._name

    @classmethod
    def _create_new(cls, writer: PdfWriter, name: str, content: str | bytes) -> EmbeddedFile:
        """
        Create a new embedded file and add it to the PdfWriter.

        Args:
            writer: The PdfWriter instance to add the embedded file to.
            name: The filename to display.
            content: The data in the file. A ``str`` is encoded as latin-1 first.

        Returns:
            EmbeddedFile instance for the newly created embedded file.
        """
        # Convert string content to bytes if needed
        if isinstance(content, str):
            content = content.encode("latin-1")

        # Create the file entry (the actual embedded file stream)
        file_entry = DecodedStreamObject()
        file_entry.set_data(content)
        file_entry.update({NameObject(PG.TYPE): NameObject("/EmbeddedFile")})

        # Create the /EF entry; the stream is registered with the writer and
        # whatever `_add_object` returns is stored under /F.
        ef_entry = DictionaryObject()
        ef_entry.update({NameObject("/F"): writer._add_object(file_entry)})

        # Create the filespec dictionary
        from pypdf.generic import create_string_object  # noqa: PLC0415
        filespec = DictionaryObject()
        filespec.update(
            {
                NameObject(PG.TYPE): NameObject("/Filespec"),
                NameObject(FileSpecificationDictionaryEntries.F): create_string_object(name),
                NameObject(FileSpecificationDictionaryEntries.EF): ef_entry,
            }
        )

        # Add to the catalog's names tree, creating the names dictionary on demand
        if CA.NAMES not in writer._root_object:
            writer._root_object[NameObject(CA.NAMES)] = writer._add_object(DictionaryObject())

        names_dict = cast(DictionaryObject, writer._root_object[CA.NAMES])
        if "/EmbeddedFiles" not in names_dict:
            embedded_files_names_dictionary = DictionaryObject(
                {NameObject(CA.NAMES): ArrayObject()}
            )
            names_dict[NameObject("/EmbeddedFiles")] = writer._add_object(embedded_files_names_dictionary)
        else:
            embedded_files_names_dictionary = cast(DictionaryObject, names_dict["/EmbeddedFiles"])

        # Add the name and filespec to the names array.
        # NOTE(review): an already existing /EmbeddedFiles node is assumed to
        # carry its own names array; if it does not (for example a node that
        # only has /Kids), the lookup below raises KeyError - confirm whether
        # such documents need to be supported here.
        names_array = cast(ArrayObject, embedded_files_names_dictionary[CA.NAMES])
        names_array.extend([create_string_object(name), filespec])

        # Return an EmbeddedFile instance
        return cls(name=name, pdf_object=filespec)

    @property
    def alternative_name(self) -> str | None:
        """
        Retrieve the alternative name (file specification).

        Returns:
            The first non-null value of the /UF and /F entries (checked in this
            order), or None if neither provides one.
        """
        for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]:
            # PDF 2.0 reference, table 43:
            # > A PDF reader shall use the value of the UF key, when present, instead of the F key.
            if key in self.pdf_object:
                value = self.pdf_object[key].get_object()
                if not is_null_or_none(value):
                    return cast(str, value)
        return None

    @alternative_name.setter
    def alternative_name(self, value: TextStringObject | None) -> None:
        """
        Set the alternative name (file specification).

        Args:
            value: The new name. ``None`` replaces the entries which are
                already present by null objects and does not add missing ones.
                Any other value is stored under both /UF and /F.
        """
        uf_key = FileSpecificationDictionaryEntries.UF
        f_key = FileSpecificationDictionaryEntries.F
        if value is None:
            # Only null out what is there; do not create fresh null entries.
            if uf_key in self.pdf_object:
                self.pdf_object[NameObject(uf_key)] = NullObject()
            if f_key in self.pdf_object:
                self.pdf_object[NameObject(f_key)] = NullObject()
        else:
            # Always write both entries. Guarding these assignments on the key
            # already being present would silently discard the new name for a
            # file specification which has neither entry yet, and would never
            # add the /UF entry which the getter prefers.
            self.pdf_object[NameObject(uf_key)] = value
            self.pdf_object[NameObject(f_key)] = value

    @property
    def description(self) -> str | None:
        """Retrieve the description, or None if it is missing or null."""
        value = self.pdf_object.get(FileSpecificationDictionaryEntries.DESC)
        if is_null_or_none(value):
            return None
        return value

    @description.setter
    def description(self, value: TextStringObject | None) -> None:
        """Set the description. ``None`` stores a null object."""
        key = NameObject(FileSpecificationDictionaryEntries.DESC)
        self.pdf_object[key] = NullObject() if value is None else value

    @property
    def associated_file_relationship(self) -> str:
        """
        Retrieve the relationship of the referring document to this embedded file.

        Falls back to ``"/Unspecified"`` if there is no /AFRelationship entry.
        """
        return self.pdf_object.get("/AFRelationship", "/Unspecified")

    @associated_file_relationship.setter
    def associated_file_relationship(self, value: NameObject) -> None:
        """Set the relationship of the referring document to this embedded file."""
        self.pdf_object[NameObject("/AFRelationship")] = value

    @property
    def _embedded_file(self) -> StreamObject:
        """
        Retrieve the actual embedded file stream.

        Returns:
            The object referenced by the /UF entry of the /EF dictionary if
            present, otherwise the one referenced by its /F entry.

        Raises:
            PdfReadError: If there is no /EF entry, or if it has neither of
                the two keys.
        """
        if "/EF" not in self.pdf_object:
            raise PdfReadError(f"/EF entry not found: {self.pdf_object}")
        ef = cast(DictionaryObject, self.pdf_object["/EF"])
        for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]:
            if key in ef:
                return cast(StreamObject, ef[key].get_object())
        raise PdfReadError(f"No /(U)F key found in file dictionary: {ef}")

    @property
    def _params(self) -> DictionaryObject:
        """
        Retrieve the file-specific parameters.

        If the stream has no /Params entry, a new empty dictionary is returned
        which is not attached to the stream. This is sufficient for reading;
        the setters use `_ensure_params` instead.
        """
        return self._embedded_file.get("/Params", DictionaryObject()).get_object()

    @cached_property
    def _ensure_params(self) -> DictionaryObject:
        """
        Ensure the /Params dictionary exists and return it.

        As this is a cached property, the lookup (and the creation of a missing
        dictionary) happens on the first access only; subsequent accesses on
        the same instance return the very same dictionary.
        """
        embedded_file = self._embedded_file
        if "/Params" not in embedded_file:
            embedded_file[NameObject("/Params")] = DictionaryObject()
        return cast(DictionaryObject, embedded_file["/Params"])

    @property
    def subtype(self) -> str | None:
        """
        Retrieve the subtype. This is a MIME media type, prefixed by a slash.

        Returns None if the entry is missing or null.
        """
        value = self._embedded_file.get("/Subtype")
        if is_null_or_none(value):
            return None
        return value

    @subtype.setter
    def subtype(self, value: NameObject | None) -> None:
        """
        Set the subtype. This should be a MIME media type, prefixed by a slash.

        ``None`` stores a null object.
        """
        embedded_file = self._embedded_file
        embedded_file[NameObject("/Subtype")] = NullObject() if value is None else value

    @property
    def content(self) -> bytes:
        """Retrieve the actual file content."""
        return self._embedded_file.get_data()

    @content.setter
    def content(self, value: str | bytes) -> None:
        """Set the file content. A ``str`` is encoded as latin-1 first."""
        if isinstance(value, str):
            value = value.encode("latin-1")
        self._embedded_file.set_data(value)

    @property
    def size(self) -> int | None:
        """
        Retrieve the size of the uncompressed file in bytes.

        This is the value of the /Size parameter and not derived from the
        content; None is returned if it is missing or null.
        """
        value = self._params.get("/Size")
        if is_null_or_none(value):
            return None
        return value

    @size.setter
    def size(self, value: NumberObject | None) -> None:
        """Set the size of the uncompressed file in bytes. ``None`` stores a null object."""
        params = self._ensure_params
        params[NameObject("/Size")] = NullObject() if value is None else value

    @property
    def creation_date(self) -> datetime.datetime | None:
        """Retrieve the file creation datetime, parsed from the /CreationDate parameter."""
        return parse_iso8824_date(self._params.get("/CreationDate"))

    @creation_date.setter
    def creation_date(self, value: datetime.datetime | None) -> None:
        """
        Set the file creation datetime.

        ``None`` stores a null object; any other value is formatted with
        `format_iso8824_date` and stored as a text string.
        """
        params = self._ensure_params
        if value is None:
            params[NameObject("/CreationDate")] = NullObject()
        else:
            date_str = format_iso8824_date(value)
            params[NameObject("/CreationDate")] = TextStringObject(date_str)

    @property
    def modification_date(self) -> datetime.datetime | None:
        """Retrieve the datetime of the last file modification, parsed from the /ModDate parameter."""
        return parse_iso8824_date(self._params.get("/ModDate"))

    @modification_date.setter
    def modification_date(self, value: datetime.datetime | None) -> None:
        """
        Set the datetime of the last file modification.

        ``None`` stores a null object; any other value is formatted with
        `format_iso8824_date` and stored as a text string.
        """
        params = self._ensure_params
        if value is None:
            params[NameObject("/ModDate")] = NullObject()
        else:
            date_str = format_iso8824_date(value)
            params[NameObject("/ModDate")] = TextStringObject(date_str)

    @property
    def checksum(self) -> bytes | None:
        """
        Retrieve the MD5 checksum of the (uncompressed) file.

        This is the value of the /CheckSum parameter and not computed from the
        content; None is returned if it is missing or null.
        """
        value = self._params.get("/CheckSum")
        if is_null_or_none(value):
            return None
        return value

    @checksum.setter
    def checksum(self, value: ByteStringObject | None) -> None:
        """Set the MD5 checksum of the (uncompressed) file. ``None`` stores a null object."""
        params = self._ensure_params
        params[NameObject("/CheckSum")] = NullObject() if value is None else value

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} name={self.name!r}>"

    @classmethod
    def _load_from_names(cls, names: ArrayObject) -> Generator[EmbeddedFile]:
        """
        Convert the given name tree into class instances.

        Args:
            names: The name tree to load the data from.

        Returns:
            Iterable of class instances for the files found.
        """
        # This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
        for index, entry in enumerate(names):
            if isinstance(entry, str):
                # Plain strings are the keys; they are picked up by index
                # when the value following them is processed.
                continue
            if index == 0:
                # A value in the first slot has no preceding key. `names[-1]`
                # would wrap around to the last element of the array and
                # report that as the name, so skip the malformed entry.
                continue
            file_dictionary = entry.get_object()
            direct_name = names[index - 1].get_object()
            # Use `cls` (as `_create_new` does) to let subclasses get
            # instances of their own type.
            yield cls(name=direct_name, pdf_object=file_dictionary)

    @classmethod
    def _load(cls, catalog: DictionaryObject) -> Generator[EmbeddedFile]:
        """
        Load the embedded files for the given document catalog.

        This method and its signature are considered internal API and thus not exposed publicly for now.

        Args:
            catalog: The document catalog to load from.

        Returns:
            Iterable of class instances for the files found. Nothing is
            yielded if the catalog has no /Names or no /EmbeddedFiles entry.
        """
        try:
            container = cast(
                DictionaryObject,
                cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"],
            )
        except KeyError:
            return

        if "/Kids" in container:
            for kid in cast(ArrayObject, container["/Kids"].get_object()):
                # There might be further (nested) kids here.
                # Wait for an example before evaluating an implementation.
                kid = kid.get_object()
                if "/Names" in kid:
                    yield from cls._load_from_names(cast(ArrayObject, kid["/Names"]))
        if "/Names" in container:
            yield from cls._load_from_names(cast(ArrayObject, container["/Names"]))

329 if "/Names" in container: 

330 yield from cls._load_from_names(cast(ArrayObject, container["/Names"]))