Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_files.py: 35%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

181 statements  

1from __future__ import annotations 

2 

3from functools import cached_property 

4from typing import TYPE_CHECKING, Generator, cast 

5 

6from pypdf._utils import format_iso8824_date, parse_iso8824_date 

7from pypdf.constants import CatalogAttributes as CA 

8from pypdf.constants import FileSpecificationDictionaryEntries 

9from pypdf.constants import PageAttributes as PA 

10from pypdf.errors import PdfReadError 

11from pypdf.generic import ( 

12 ArrayObject, 

13 ByteStringObject, 

14 DecodedStreamObject, 

15 DictionaryObject, 

16 NameObject, 

17 NullObject, 

18 NumberObject, 

19 StreamObject, 

20 TextStringObject, 

21 is_null_or_none, 

22) 

23 

24if TYPE_CHECKING: 

25 import datetime 

26 

27 from pypdf._writer import PdfWriter 

28 

29 

30class EmbeddedFile: 

31 """ 

32 Container holding the information on an embedded file. 

33 

34 Attributes are evaluated lazily if possible. 

35 

36 Further information on embedded files can be found in section 7.11 of the PDF 2.0 specification. 

37 """ 

38 def __init__(self, name: str, pdf_object: DictionaryObject) -> None: 

39 """ 

40 Args: 

41 name: The (primary) name as provided in the name tree. 

42 pdf_object: The corresponding PDF object to allow retrieving further data. 

43 """ 

44 self._name = name 

45 self.pdf_object = pdf_object 

46 

47 @property 

48 def name(self) -> str: 

49 """The (primary) name of the embedded file as provided in the name tree.""" 

50 return self._name 

51 

52 @classmethod 

53 def _create_new(cls, writer: PdfWriter, name: str, content: str | bytes) -> EmbeddedFile: 

54 """ 

55 Create a new embedded file and add it to the PdfWriter. 

56 

57 Args: 

58 writer: The PdfWriter instance to add the embedded file to. 

59 name: The filename to display. 

60 content: The data in the file. 

61 

62 Returns: 

63 EmbeddedFile instance for the newly created embedded file. 

64 """ 

65 # Convert string content to bytes if needed 

66 if isinstance(content, str): 

67 content = content.encode("latin-1") 

68 

69 # Create the file entry (the actual embedded file stream) 

70 file_entry = DecodedStreamObject() 

71 file_entry.set_data(content) 

72 file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")}) 

73 

74 # Create the /EF entry 

75 ef_entry = DictionaryObject() 

76 ef_entry.update({NameObject("/F"): writer._add_object(file_entry)}) 

77 

78 # Create the filespec dictionary 

79 from pypdf.generic import create_string_object # noqa: PLC0415 

80 filespec = DictionaryObject() 

81 filespec.update( 

82 { 

83 NameObject(PA.TYPE): NameObject("/Filespec"), 

84 NameObject(FileSpecificationDictionaryEntries.F): create_string_object(name), 

85 NameObject(FileSpecificationDictionaryEntries.EF): ef_entry, 

86 } 

87 ) 

88 

89 # Add to the catalog's names tree 

90 if CA.NAMES not in writer._root_object: 

91 writer._root_object[NameObject(CA.NAMES)] = writer._add_object(DictionaryObject()) 

92 

93 names_dict = cast(DictionaryObject, writer._root_object[CA.NAMES]) 

94 if "/EmbeddedFiles" not in names_dict: 

95 embedded_files_names_dictionary = DictionaryObject( 

96 {NameObject(CA.NAMES): ArrayObject()} 

97 ) 

98 names_dict[NameObject("/EmbeddedFiles")] = writer._add_object(embedded_files_names_dictionary) 

99 else: 

100 embedded_files_names_dictionary = cast(DictionaryObject, names_dict["/EmbeddedFiles"]) 

101 

102 # Add the name and filespec to the names array 

103 names_array = cast(ArrayObject, embedded_files_names_dictionary[CA.NAMES]) 

104 names_array.extend([create_string_object(name), filespec]) 

105 

106 # Return an EmbeddedFile instance 

107 return cls(name=name, pdf_object=filespec) 

108 

109 @property 

110 def alternative_name(self) -> str | None: 

111 """Retrieve the alternative name (file specification).""" 

112 for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]: 

113 # PDF 2.0 reference, table 43: 

114 # > A PDF reader shall use the value of the UF key, when present, instead of the F key. 

115 if key in self.pdf_object: 

116 value = self.pdf_object[key].get_object() 

117 if not is_null_or_none(value): 

118 return cast(str, value) 

119 return None 

120 

121 @alternative_name.setter 

122 def alternative_name(self, value: TextStringObject | None) -> None: 

123 """Set the alternative name (file specification).""" 

124 if value is None: 

125 if FileSpecificationDictionaryEntries.UF in self.pdf_object: 

126 self.pdf_object[NameObject(FileSpecificationDictionaryEntries.UF)] = NullObject() 

127 if FileSpecificationDictionaryEntries.F in self.pdf_object: 

128 self.pdf_object[NameObject(FileSpecificationDictionaryEntries.F)] = NullObject() 

129 else: 

130 if FileSpecificationDictionaryEntries.UF in self.pdf_object: 

131 self.pdf_object[NameObject(FileSpecificationDictionaryEntries.UF)] = value 

132 if FileSpecificationDictionaryEntries.F in self.pdf_object: 

133 self.pdf_object[NameObject(FileSpecificationDictionaryEntries.F)] = value 

134 

135 @property 

136 def description(self) -> str | None: 

137 """Retrieve the description.""" 

138 value = self.pdf_object.get(FileSpecificationDictionaryEntries.DESC) 

139 if is_null_or_none(value): 

140 return None 

141 return value 

142 

143 @description.setter 

144 def description(self, value: TextStringObject | None) -> None: 

145 """Set the description.""" 

146 if value is None: 

147 self.pdf_object[NameObject(FileSpecificationDictionaryEntries.DESC)] = NullObject() 

148 else: 

149 self.pdf_object[NameObject(FileSpecificationDictionaryEntries.DESC)] = value 

150 

151 @property 

152 def associated_file_relationship(self) -> str: 

153 """Retrieve the relationship of the referring document to this embedded file.""" 

154 return self.pdf_object.get("/AFRelationship", "/Unspecified") 

155 

156 @associated_file_relationship.setter 

157 def associated_file_relationship(self, value: NameObject) -> None: 

158 """Set the relationship of the referring document to this embedded file.""" 

159 self.pdf_object[NameObject("/AFRelationship")] = value 

160 

161 @property 

162 def _embedded_file(self) -> StreamObject: 

163 """Retrieve the actual embedded file stream.""" 

164 if "/EF" not in self.pdf_object: 

165 raise PdfReadError(f"/EF entry not found: {self.pdf_object}") 

166 ef = cast(DictionaryObject, self.pdf_object["/EF"]) 

167 for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]: 

168 if key in ef: 

169 return cast(StreamObject, ef[key].get_object()) 

170 raise PdfReadError(f"No /(U)F key found in file dictionary: {ef}") 

171 

172 @property 

173 def _params(self) -> DictionaryObject: 

174 """Retrieve the file-specific parameters.""" 

175 return self._embedded_file.get("/Params", DictionaryObject()).get_object() 

176 

177 @cached_property 

178 def _ensure_params(self) -> DictionaryObject: 

179 """Ensure the /Params dictionary exists and return it.""" 

180 embedded_file = self._embedded_file 

181 if "/Params" not in embedded_file: 

182 embedded_file[NameObject("/Params")] = DictionaryObject() 

183 return cast(DictionaryObject, embedded_file["/Params"]) 

184 

185 @property 

186 def subtype(self) -> str | None: 

187 """Retrieve the subtype. This is a MIME media type, prefixed by a slash.""" 

188 value = self._embedded_file.get("/Subtype") 

189 if is_null_or_none(value): 

190 return None 

191 return value 

192 

193 @subtype.setter 

194 def subtype(self, value: NameObject | None) -> None: 

195 """Set the subtype. This should be a MIME media type, prefixed by a slash.""" 

196 embedded_file = self._embedded_file 

197 if value is None: 

198 embedded_file[NameObject("/Subtype")] = NullObject() 

199 else: 

200 embedded_file[NameObject("/Subtype")] = value 

201 

202 @property 

203 def content(self) -> bytes: 

204 """Retrieve the actual file content.""" 

205 return self._embedded_file.get_data() 

206 

207 @content.setter 

208 def content(self, value: str | bytes) -> None: 

209 """Set the file content.""" 

210 if isinstance(value, str): 

211 value = value.encode("latin-1") 

212 self._embedded_file.set_data(value) 

213 

214 @property 

215 def size(self) -> int | None: 

216 """Retrieve the size of the uncompressed file in bytes.""" 

217 value = self._params.get("/Size") 

218 if is_null_or_none(value): 

219 return None 

220 return value 

221 

222 @size.setter 

223 def size(self, value: NumberObject | None) -> None: 

224 """Set the size of the uncompressed file in bytes.""" 

225 params = self._ensure_params 

226 if value is None: 

227 params[NameObject("/Size")] = NullObject() 

228 else: 

229 params[NameObject("/Size")] = value 

230 

231 @property 

232 def creation_date(self) -> datetime.datetime | None: 

233 """Retrieve the file creation datetime.""" 

234 return parse_iso8824_date(self._params.get("/CreationDate")) 

235 

236 @creation_date.setter 

237 def creation_date(self, value: datetime.datetime | None) -> None: 

238 """Set the file creation datetime.""" 

239 params = self._ensure_params 

240 if value is None: 

241 params[NameObject("/CreationDate")] = NullObject() 

242 else: 

243 date_str = format_iso8824_date(value) 

244 params[NameObject("/CreationDate")] = TextStringObject(date_str) 

245 

246 @property 

247 def modification_date(self) -> datetime.datetime | None: 

248 """Retrieve the datetime of the last file modification.""" 

249 return parse_iso8824_date(self._params.get("/ModDate")) 

250 

251 @modification_date.setter 

252 def modification_date(self, value: datetime.datetime | None) -> None: 

253 """Set the datetime of the last file modification.""" 

254 params = self._ensure_params 

255 if value is None: 

256 params[NameObject("/ModDate")] = NullObject() 

257 else: 

258 date_str = format_iso8824_date(value) 

259 params[NameObject("/ModDate")] = TextStringObject(date_str) 

260 

261 @property 

262 def checksum(self) -> bytes | None: 

263 """Retrieve the MD5 checksum of the (uncompressed) file.""" 

264 value = self._params.get("/CheckSum") 

265 if is_null_or_none(value): 

266 return None 

267 return value 

268 

269 @checksum.setter 

270 def checksum(self, value: ByteStringObject | None) -> None: 

271 """Set the MD5 checksum of the (uncompressed) file.""" 

272 params = self._ensure_params 

273 if value is None: 

274 params[NameObject("/CheckSum")] = NullObject() 

275 else: 

276 params[NameObject("/CheckSum")] = value 

277 

278 def __repr__(self) -> str: 

279 return f"<{self.__class__.__name__} name={self.name!r}>" 

280 

281 @classmethod 

282 def _load_from_names(cls, names: ArrayObject) -> Generator[EmbeddedFile]: 

283 """ 

284 Convert the given name tree into class instances. 

285 

286 Args: 

287 names: The name tree to load the data from. 

288 

289 Returns: 

290 Iterable of class instances for the files found. 

291 """ 

292 # This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...] 

293 for i, name in enumerate(names): 

294 if not isinstance(name, str): 

295 # Skip plain strings and retrieve them as `direct_name` by index. 

296 file_dictionary = name.get_object() 

297 direct_name = names[i - 1].get_object() 

298 yield EmbeddedFile(name=direct_name, pdf_object=file_dictionary) 

299 

300 @classmethod 

301 def _load(cls, catalog: DictionaryObject) -> Generator[EmbeddedFile]: 

302 """ 

303 Load the embedded files for the given document catalog. 

304 

305 This method and its signature are considered internal API and thus not exposed publicly for now. 

306 

307 Args: 

308 catalog: The document catalog to load from. 

309 

310 Returns: 

311 Iterable of class instances for the files found. 

312 """ 

313 try: 

314 container = cast( 

315 DictionaryObject, 

316 cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"], 

317 ) 

318 except KeyError: 

319 return 

320 

321 if "/Kids" in container: 

322 for kid in cast(ArrayObject, container["/Kids"].get_object()): 

323 # There might be further (nested) kids here. 

324 # Wait for an example before evaluating an implementation. 

325 kid = kid.get_object() 

326 if "/Names" in kid: 

327 yield from cls._load_from_names(cast(ArrayObject, kid["/Names"])) 

328 if "/Names" in container: 

329 yield from cls._load_from_names(cast(ArrayObject, container["/Names"]))