Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_files.py: 47%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

76 statements  

1from __future__ import annotations 

2 

3from typing import TYPE_CHECKING, Generator, cast 

4 

5from pypdf._utils import parse_iso8824_date 

6from pypdf.constants import FileSpecificationDictionaryEntries 

7from pypdf.errors import PdfReadError 

8from pypdf.generic import ArrayObject, DictionaryObject, StreamObject 

9 

10if TYPE_CHECKING: 

11 import datetime 

12 

13 

class EmbeddedFile:
    """
    Container holding the information on an embedded file.

    Attributes are evaluated lazily if possible: every property reads from
    ``pdf_object`` at access time, nothing is cached on the instance.

    Further information on embedded files can be found in section 7.11 of the PDF 2.0 specification.
    """

    def __init__(self, name: str, pdf_object: DictionaryObject) -> None:
        """
        Args:
            name: The (primary) name as provided in the name tree.
            pdf_object: The corresponding PDF object to allow retrieving further data.
        """
        self.name = name
        self.pdf_object = pdf_object

    @property
    def alternative_name(self) -> str | None:
        """
        Retrieve the alternative name (file specification).

        Returns:
            The value stored under the UF key if present, otherwise the value
            stored under the F key, otherwise ``None``.
        """
        for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]:
            # PDF 2.0 reference, table 43:
            #   > A PDF reader shall use the value of the UF key, when present, instead of the F key.
            if key in self.pdf_object:
                return cast(str, self.pdf_object[key].get_object())
        return None

    @property
    def description(self) -> str | None:
        """Retrieve the description, or ``None`` if the file specification has none."""
        return self.pdf_object.get(FileSpecificationDictionaryEntries.DESC)

    @property
    def associated_file_relationship(self) -> str:
        """
        Retrieve the relationship of the referring document to this embedded file.

        Falls back to ``"/Unspecified"`` if there is no ``/AFRelationship`` entry.
        """
        return self.pdf_object.get("/AFRelationship", "/Unspecified")

    @property
    def _embedded_file(self) -> StreamObject:
        """
        Retrieve the actual embedded file stream.

        Raises:
            PdfReadError: If the file specification has no ``/EF`` entry or the
                ``/EF`` dictionary has neither of the UF/F keys.
        """
        if "/EF" not in self.pdf_object:
            raise PdfReadError(f"/EF entry not found: {self.pdf_object}")
        ef = cast("DictionaryObject", self.pdf_object["/EF"])
        # Same precedence as for the alternative name: UF wins over F.
        for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]:
            if key in ef:
                return cast("StreamObject", ef[key].get_object())
        raise PdfReadError(f"No /(U)F key found in file dictionary: {ef}")

    @property
    def _params(self) -> DictionaryObject:
        """
        Retrieve the file-specific parameters.

        An empty dictionary is used if the stream has no ``/Params`` entry, so
        the properties built on top of this can use plain ``.get()`` lookups.
        """
        return self._embedded_file.get("/Params", DictionaryObject()).get_object()

    @property
    def subtype(self) -> str | None:
        """Retrieve the subtype. This is a MIME media type, prefixed by a slash."""
        return self._embedded_file.get("/Subtype")

    @property
    def content(self) -> bytes:
        """Retrieve the actual file content."""
        return self._embedded_file.get_data()

    @property
    def size(self) -> int | None:
        """Retrieve the size of the uncompressed file in bytes."""
        return self._params.get("/Size")

    @property
    def creation_date(self) -> datetime.datetime | None:
        """Retrieve the file creation datetime (``/CreationDate``, parsed by ``parse_iso8824_date``)."""
        return parse_iso8824_date(self._params.get("/CreationDate"))

    @property
    def modification_date(self) -> datetime.datetime | None:
        """Retrieve the datetime of the last file modification (``/ModDate``, parsed by ``parse_iso8824_date``)."""
        return parse_iso8824_date(self._params.get("/ModDate"))

    @property
    def checksum(self) -> bytes | None:
        """Retrieve the MD5 checksum of the (uncompressed) file."""
        return self._params.get("/CheckSum")

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} name={self.name!r}>"

    @classmethod
    def _load_from_names(cls, names: ArrayObject) -> Generator[EmbeddedFile]:
        """
        Convert the given name tree into class instances.

        Args:
            names: The name tree to load the data from.

        Returns:
            Iterable of class instances for the files found. A non-string entry
            at the very start of the array has no name in front of it and is
            skipped.
        """
        # This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
        for i, entry in enumerate(names):
            if isinstance(entry, str):
                # Plain strings are the names; they are picked up by index once
                # the entry following them is reached.
                continue
            if i == 0:
                # Malformed array: there is no preceding name. Without this guard,
                # `names[i - 1]` would wrap around to the *last* element.
                continue
            file_dictionary = entry.get_object()
            direct_name = names[i - 1].get_object()
            # Use `cls` so that subclasses get instances of their own type.
            yield cls(name=direct_name, pdf_object=file_dictionary)

    @classmethod
    def _load(cls, catalog: DictionaryObject) -> Generator[EmbeddedFile]:
        """
        Load the embedded files for the given document catalog.

        This method and its signature are considered internal API and thus not exposed publicly for now.

        Args:
            catalog: The document catalog to load from.

        Returns:
            Iterable of class instances for the files found. Nothing is yielded
            if the catalog has no ``/Names`` -> ``/EmbeddedFiles`` entry.
        """
        try:
            container = cast(
                "DictionaryObject",
                cast("DictionaryObject", catalog["/Names"])["/EmbeddedFiles"],
            )
        except KeyError:
            return

        if "/Kids" in container:
            for kid in cast("ArrayObject", container["/Kids"].get_object()):
                # There might be further (nested) kids here.
                # Wait for an example before evaluating an implementation.
                kid = kid.get_object()
                if "/Names" in kid:
                    yield from cls._load_from_names(cast("ArrayObject", kid["/Names"]))
        if "/Names" in container:
            yield from cls._load_from_names(cast("ArrayObject", container["/Names"]))