Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfpage.py: 97%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

116 statements  

1import itertools 

2import logging 

3from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple 

4 

5from pdfminer import settings 

6from pdfminer.pdfdocument import ( 

7 PDFDocument, 

8 PDFNoPageLabels, 

9 PDFTextExtractionNotAllowed, 

10) 

11from pdfminer.pdfexceptions import PDFObjectNotFound, PDFValueError 

12from pdfminer.pdfparser import PDFParser 

13from pdfminer.pdftypes import dict_value, int_value, list_value, resolve1 

14from pdfminer.psparser import LIT 

15from pdfminer.utils import Rect, parse_rect 

16 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# some predefined literals and keywords.
# These are the values that an object's "Type" entry is compared against
# (by identity) when deciding whether it is a single page or a page-tree node.
LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT("Pages")

22 

23 

class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes
    ----------
    doc: a PDFDocument object.
    pageid: any Python object that can uniquely identify the page.
    attrs: a dictionary of page attributes.
    contents: a list of PDFStream objects that represents the page content.
    lastmod: the last modified time of the page.
    resources: a dictionary of resources used by the page.
    mediabox: the physical size of the page.
    cropbox: the crop rectangle of the page.
    rotate: the page rotation (in degree).
    annots: the page annotations.
    beads: a chain that represents natural reading order.
    label: the page's label (typically, the logical page number).

    """

    # Keys that a page-tree object takes over from its parent when it does
    # not define them itself (applied while walking the tree in create_pages).
    INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}

    def __init__(
        self,
        doc: PDFDocument,
        pageid: object,
        attrs: object,
        label: Optional[str],
    ) -> None:
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        label: page label string.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.label = label
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        self.resources: Dict[object, object] = resolve1(
            self.attrs.get("Resources", {}),
        )

        # The crop box falls back to the media box, so the media box is
        # parsed first.
        self.mediabox = self._parse_mediabox(self.attrs.get("MediaBox"))
        self.cropbox = self._parse_cropbox(self.attrs.get("CropBox"), self.mediabox)
        self.contents = self._parse_contents(self.attrs.get("Contents"))

        # A missing /Rotate counts as 0; the value is reduced modulo 360.
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        self.annots = self.attrs.get("Annots")
        self.beads = self.attrs.get("B")

    def __repr__(self) -> str:
        """Return a debug string showing the page's resources and media box."""
        return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

    @classmethod
    def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
        """Yield a PDFPage for every page object found in *document*.

        When the catalog has a "Pages" entry, the page tree below it is walked
        depth-first. If that walk produces no page (or the entry is absent),
        every object id listed in the document's xrefs is scanned instead and
        each dictionary whose "Type" is the Page literal becomes a page.

        Each page is paired with the next value from
        ``document.get_page_labels()``; pages get ``None`` as label when the
        document has no page labels or the label iterator runs out.
        """

        def depth_first_search(
            obj: Any,
            parent: Dict[str, Any],
            visited: Optional[Set[Any]] = None,
        ) -> Iterator[Tuple[int, Dict[Any, Dict[Any, Any]]]]:
            """Yield ``(object_id, properties)`` for each page below *obj*.

            *parent* supplies the inheritable attributes; *visited* holds the
            object ids already seen in this walk.
            """
            if isinstance(obj, int):
                object_id = obj
                object_properties = dict_value(document.getobj(object_id)).copy()
            else:
                # This looks broken. obj.objid means obj could be either
                # PDFObjRef or PDFStream, but neither is valid for dict_value.
                object_id = obj.objid  # type: ignore[attr-defined]
                object_properties = dict_value(obj).copy()

            # Avoid recursion errors by keeping track of visited nodes
            if visited is None:
                visited = set()
            if object_id in visited:
                return
            visited.add(object_id)

            # Fill in inheritable attributes the object does not set itself.
            # Because the enriched dict is handed down as the next "parent",
            # values propagate through every level of the tree.
            for k, v in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in object_properties:
                    object_properties[k] = v

            object_type = object_properties.get("Type")
            if object_type is None and not settings.STRICT:  # See #64
                object_type = object_properties.get("type")

            if object_type is LITERAL_PAGES and "Kids" in object_properties:
                log.debug("Pages: Kids=%r", object_properties["Kids"])
                for child in list_value(object_properties["Kids"]):
                    yield from depth_first_search(child, object_properties, visited)

            elif object_type is LITERAL_PAGE:
                log.debug("Page: %r", object_properties)
                yield (object_id, object_properties)

        try:
            page_labels: Iterator[Optional[str]] = document.get_page_labels()
        except PDFNoPageLabels:
            page_labels = itertools.repeat(None)

        # NOTE: labels are fetched with next(page_labels, None). A bare next()
        # inside a generator would turn an exhausted label iterator into
        # "RuntimeError: generator raised StopIteration" (PEP 479) and abort
        # page iteration; an unlabeled page is the safer outcome.
        pages = False
        if "Pages" in document.catalog:
            objects = depth_first_search(document.catalog["Pages"], document.catalog)
            for objid, tree in objects:
                yield cls(document, objid, tree, next(page_labels, None))
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
                            yield cls(document, objid, obj, next(page_labels, None))
                    except PDFObjectNotFound:
                        pass

    @classmethod
    def get_pages(
        cls,
        fp: BinaryIO,
        pagenos: Optional[Container[int]] = None,
        maxpages: int = 0,
        password: str = "",
        caching: bool = True,
        check_extractable: bool = False,
    ) -> Iterator["PDFPage"]:
        """Parse the PDF read from *fp* and yield its pages.

        fp: binary file object handed to PDFParser.
        pagenos: zero-based page indexes to yield; a falsy value (None or an
            empty container) selects every page.
        maxpages: when non-zero, iteration stops right after yielding a page
            whose one-based position in the document is >= maxpages.
        password: passed through to PDFDocument.
        caching: passed through to PDFDocument.
        check_extractable: when true, raise PDFTextExtractionNotAllowed if
            the document reports that it is not extractable; otherwise only
            a warning is logged and processing continues.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                raise PDFTextExtractionNotAllowed(
                    f"Text extraction is not allowed: {fp!r}"
                )
            # Lazy %-style argument: the message is only rendered when the
            # warning is actually emitted.
            log.warning(
                "The PDF %r contains a metadata field "
                "indicating that it should not allow "
                "text extraction. Ignoring this field "
                "and proceeding. Use the check_extractable "
                "if you want to raise an error in this case",
                fp,
            )
        # Process each page contained in the document.
        for pageno, page in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno + 1:
                break

    def _parse_mediabox(self, value: Any) -> Rect:
        """Return the page's media box, defaulting to US Letter.

        The default (0, 0, 612, 792) is used, with a warning, when *value*
        is None or when parse_rect raises PDFValueError.
        """
        us_letter = (0.0, 0.0, 612.0, 792.0)

        if value is None:
            log.warning(
                "MediaBox missing from /Page (and not inherited), "
                "defaulting to US Letter"
            )
            return us_letter

        try:
            # Resolve the container first, then each element inside it.
            return parse_rect(resolve1(val) for val in resolve1(value))

        except PDFValueError:
            log.warning("Invalid MediaBox in /Page, defaulting to US Letter")
            return us_letter

    def _parse_cropbox(self, value: Any, mediabox: Rect) -> Rect:
        """Return the page's crop box, falling back to *mediabox*.

        *mediabox* is returned when *value* is None (silently) or when
        parse_rect raises PDFValueError (with a warning).
        """
        if value is None:
            # CropBox is optional, and MediaBox is used if not specified.
            return mediabox

        try:
            # Resolve the container first, then each element inside it.
            return parse_rect(resolve1(val) for val in resolve1(value))

        except PDFValueError:
            log.warning("Invalid CropBox in /Page, defaulting to MediaBox")
            return mediabox

    def _parse_contents(self, value: Any) -> List[Any]:
        """Return the resolved /Contents entry as a list.

        None gives an empty list; a resolved value that is not already a
        list is wrapped in a one-element list.
        """
        contents: List[Any] = []
        if value is not None:
            contents = resolve1(value)
            if not isinstance(contents, list):
                contents = [contents]
        return contents