Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfpage.py: 91%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

117 statements  

1import itertools 

2import logging 

3from collections.abc import Container, Iterator 

4from typing import Any, BinaryIO, ClassVar 

5 

6from pdfminer import settings 

7from pdfminer.pdfdocument import ( 

8 PDFDocument, 

9 PDFNoPageLabels, 

10 PDFTextExtractionNotAllowed, 

11) 

12from pdfminer.pdfexceptions import PDFObjectNotFound, PDFValueError 

13from pdfminer.pdfparser import PDFParser 

14from pdfminer.pdftypes import dict_value, int_value, list_value, resolve1 

15from pdfminer.psparser import LIT 

16from pdfminer.utils import Rect, parse_rect 

17 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Predefined name literals for the /Type values of page-tree nodes.
# The page-tree walk compares against these objects by identity (`is`).
LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT("Pages")

23 

24 

class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes
    ----------
    doc: a PDFDocument object.
    pageid: any Python object that can uniquely identify the page.
    attrs: a dictionary of page attributes.
    contents: a list of PDFStream objects that represents the page content.
    lastmod: the last modified time of the page.
    resources: a dictionary of resources used by the page.
    mediabox: the physical size of the page.
    cropbox: the crop rectangle of the page.
    rotate: the page rotation (in degree).
    annots: the page annotations.
    beads: a chain that represents natural reading order.
    label: the page's label (typically, the logical page number).

    """

    def __init__(
        self,
        doc: PDFDocument,
        pageid: object,
        attrs: object,
        label: str | None,
    ) -> None:
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        label: page label string.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.label = label
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        self.resources: dict[object, object] = resolve1(
            self.attrs.get("Resources", {}),
        )

        # The crop box falls back to the media box, so parse the media box
        # first.
        self.mediabox = self._parse_mediabox(self.attrs.get("MediaBox"))
        self.cropbox = self._parse_cropbox(self.attrs.get("CropBox"), self.mediabox)
        self.contents = self._parse_contents(self.attrs.get("Contents"))

        # Normalize the rotation into the range [0, 360).
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        # Stored exactly as found in the page dictionary (not resolved here).
        self.annots = self.attrs.get("Annots")
        self.beads = self.attrs.get("B")

    def __repr__(self) -> str:
        return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

    # Attributes that a page-tree node takes over from its parent when it
    # does not define them itself (see create_pages).
    INHERITABLE_ATTRS: ClassVar[set[str]] = {
        "Resources",
        "MediaBox",
        "CropBox",
        "Rotate",
    }

    @classmethod
    def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
        """Yield a PDFPage for every page found in ``document``.

        The page tree referenced by the catalog's "Pages" entry is walked
        depth-first, copying the inheritable attributes from each parent
        node into children that lack them. If that walk yields no page at
        all (including when the catalog has no "Pages" entry), every object
        listed in the document's xrefs is scanned instead and each
        dictionary whose "Type" is the Page literal is yielded.

        Each page receives the next label from
        ``document.get_page_labels()``, or ``None`` when the document
        raises PDFNoPageLabels.
        """

        def depth_first_search(
            obj: Any,
            parent: dict[str, Any],
            visited: set[Any] | None = None,
        ) -> Iterator[tuple[int, dict[Any, dict[Any, Any]]]]:
            """Yield ``(object_id, properties)`` for each page below ``obj``."""
            if isinstance(obj, int):
                object_id = obj
                object_properties = dict_value(document.getobj(object_id)).copy()
            else:
                # This looks broken. obj.objid means obj could be either
                # PDFObjRef or PDFStream, but neither is valid for dict_value.
                object_id = obj.objid  # type: ignore[attr-defined]
                object_properties = dict_value(obj).copy()

            # Avoid recursion errors by keeping track of visited nodes
            if visited is None:
                visited = set()
            if object_id in visited:
                return
            visited.add(object_id)

            # Inherit attributes from the parent node unless overridden here.
            for k, v in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in object_properties:
                    object_properties[k] = v

            object_type = object_properties.get("Type")
            if object_type is None and not settings.STRICT:  # See #64
                object_type = object_properties.get("type")

            if object_type is LITERAL_PAGES and "Kids" in object_properties:
                log.debug("Pages: Kids=%r", object_properties["Kids"])
                for child in list_value(object_properties["Kids"]):
                    yield from depth_first_search(child, object_properties, visited)

            elif object_type is LITERAL_PAGE:
                log.debug("Page: %r", object_properties)
                yield (object_id, object_properties)

        try:
            page_labels: Iterator[str | None] = document.get_page_labels()
        except PDFNoPageLabels:
            page_labels = itertools.repeat(None)

        pages = False
        if "Pages" in document.catalog:
            objects = depth_first_search(document.catalog["Pages"], document.catalog)
            for objid, tree in objects:
                # next() gets a default so that a label iterator running dry
                # cannot raise StopIteration inside this generator, which
                # Python would surface as a RuntimeError.
                yield cls(document, objid, tree, next(page_labels, None))
                pages = True
        if not pages:
            # Fallback when the page tree produced no pages (e.g. /Pages is
            # missing): scan every object listed in the xrefs.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
                            yield cls(document, objid, obj, next(page_labels, None))
                    except PDFObjectNotFound:
                        pass

    @classmethod
    def get_pages(
        cls,
        fp: BinaryIO,
        pagenos: Container[int] | None = None,
        maxpages: int = 0,
        password: str = "",
        caching: bool = True,
        check_extractable: bool = False,
    ) -> Iterator["PDFPage"]:
        """Parse the PDF in ``fp`` and yield its pages.

        fp: binary file object containing the PDF.
        pagenos: zero-based page indices to yield; a falsy value (None or
            an empty container) selects every page.
        maxpages: when non-zero, iteration stops right after a page is
            yielded whose zero-based index plus one is at least this value.
            Pages skipped through ``pagenos`` do not trigger the stop.
        password: passed to PDFDocument.
        caching: passed to PDFDocument.
        check_extractable: when true, raise PDFTextExtractionNotAllowed if
            the document is not marked extractable; otherwise only log a
            warning and continue.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                error_msg = f"Text extraction is not allowed: {fp!r}"
                raise PDFTextExtractionNotAllowed(error_msg)
            else:
                warning_msg = (
                    f"The PDF {fp!r} contains a metadata field "
                    "indicating that it should not allow "
                    "text extraction. Ignoring this field "
                    "and proceeding. Use the check_extractable "
                    "if you want to raise an error in this case"
                )
                log.warning(warning_msg)
        # Process each page contained in the document.
        for pageno, page in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno + 1:
                break

    def _parse_mediabox(self, value: Any) -> Rect:
        """Return the page's MediaBox, defaulting to US Letter.

        The default (0, 0, 612, 792) is used, with a warning, when ``value``
        is None or cannot be parsed as a rectangle.
        """
        us_letter = (0.0, 0.0, 612.0, 792.0)

        if value is None:
            log.warning(
                "MediaBox missing from /Page (and not inherited), "
                "defaulting to US Letter"
            )
            return us_letter

        try:
            return parse_rect(resolve1(val) for val in resolve1(value))

        # TypeError covers a box that resolves to something that is not
        # iterable (e.g. a bare number), which would otherwise escape the
        # fallback below.
        except (PDFValueError, TypeError):
            log.warning("Invalid MediaBox in /Page, defaulting to US Letter")
            return us_letter

    def _parse_cropbox(self, value: Any, mediabox: Rect) -> Rect:
        """Return the page's CropBox, defaulting to ``mediabox``.

        ``mediabox`` is returned when ``value`` is None or, with a warning,
        when it cannot be parsed as a rectangle.
        """
        if value is None:
            # CropBox is optional, and MediaBox is used if not specified.
            return mediabox

        try:
            return parse_rect(resolve1(val) for val in resolve1(value))

        # TypeError covers a box that resolves to something that is not
        # iterable (e.g. a bare number), which would otherwise escape the
        # fallback below.
        except (PDFValueError, TypeError):
            log.warning("Invalid CropBox in /Page, defaulting to MediaBox")
            return mediabox

    def _parse_contents(self, value: Any) -> list[Any]:
        """Return the page contents as a list.

        None yields an empty list; a resolved value that is not already a
        list is wrapped in a single-element list.
        """
        contents: list[Any] = []
        if value is not None:
            contents = resolve1(value)
            if not isinstance(contents, list):
                contents = [contents]
        return contents