Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfpage.py: 97%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

102 statements  

1import itertools 

2import logging 

3from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple 

4 

5from pdfminer import settings 

6from pdfminer.pdfdocument import ( 

7 PDFDocument, 

8 PDFNoPageLabels, 

9 PDFTextExtractionNotAllowed, 

10) 

11from pdfminer.pdfexceptions import PDFObjectNotFound, PDFValueError 

12from pdfminer.pdfparser import PDFParser 

13from pdfminer.pdftypes import dict_value, int_value, list_value, resolve1 

14from pdfminer.psparser import LIT 

15from pdfminer.utils import parse_rect 

16 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Predefined literals for the /Type values of page-tree nodes.
# The page-tree code in this module compares against them by identity (`is`).
LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT("Pages")

22 

23 

class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes
    ----------
    doc: a PDFDocument object.
    pageid: any Python object that can uniquely identify the page.
    attrs: a dictionary of page attributes.
    contents: a list of PDFStream objects that represents the page content.
    lastmod: the last modified time of the page.
    resources: a dictionary of resources used by the page.
    mediabox: the physical size of the page (US Letter when the page has
        no usable MediaBox).
    cropbox: the crop rectangle of the page (the mediabox when the page has
        no usable CropBox).
    rotate: the page rotation (in degree), normalized to 0..359.
    annots: the page annotations.
    beads: a chain that represents natural reading order.
    label: the page's label (typically, the logical page number).

    """

    # Attributes a page takes over from its ancestors in the page tree when
    # it does not define them itself (see create_pages).
    INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}

    # Fallback media box: US Letter, 612 x 792 points (8.5 x 11 inches at
    # 72 points per inch).
    _DEFAULT_MEDIABOX: Tuple[float, float, float, float] = (0.0, 0.0, 612.0, 792.0)

    def __init__(
        self,
        doc: PDFDocument,
        pageid: object,
        attrs: object,
        label: Optional[str],
    ) -> None:
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        label: page label string.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.label = label
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        self.resources: Dict[object, object] = resolve1(
            self.attrs.get("Resources", {}),
        )
        self.mediabox = self._parse_mediabox(self.attrs.get("MediaBox"))
        self.cropbox = self._parse_cropbox(self.attrs.get("CropBox"), self.mediabox)

        # Adding 360 before the modulo keeps the result in 0..359.
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        self.annots = self.attrs.get("Annots")
        self.beads = self.attrs.get("B")

        # /Contents may be a single object or a list; always store a list.
        if "Contents" in self.attrs:
            contents = resolve1(self.attrs["Contents"])
        else:
            contents = []
        if not isinstance(contents, list):
            contents = [contents]
        self.contents: List[object] = contents

    def __repr__(self) -> str:
        return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

    @classmethod
    def _parse_mediabox(cls, value: Any) -> Tuple[float, float, float, float]:
        """Return the rectangle described by a /MediaBox value.

        The value itself and each of its elements are resolved before being
        handed to parse_rect, so indirect references are accepted at both
        levels. A missing value, or one parse_rect rejects with
        PDFValueError, is reported with a warning and replaced by US Letter
        instead of aborting the whole page.
        """
        box = resolve1(value)
        if box is None:
            log.warning(
                "MediaBox missing from /Page (and not inherited), "
                "defaulting to US Letter",
            )
            return cls._DEFAULT_MEDIABOX
        try:
            return parse_rect([resolve1(coord) for coord in box])
        except PDFValueError:
            log.warning("Invalid MediaBox in /Page, defaulting to US Letter")
            return cls._DEFAULT_MEDIABOX

    @classmethod
    def _parse_cropbox(
        cls,
        value: Any,
        mediabox: Tuple[float, float, float, float],
    ) -> Tuple[float, float, float, float]:
        """Return the rectangle described by a /CropBox value.

        Falls back to ``mediabox`` when the value is missing, and (with a
        warning) when parse_rect rejects it with PDFValueError.
        """
        box = resolve1(value)
        if box is None:
            return mediabox
        try:
            return parse_rect([resolve1(coord) for coord in box])
        except PDFValueError:
            log.warning("Invalid CropBox in /Page, defaulting to MediaBox")
            return mediabox

    @classmethod
    def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
        """Yield a PDFPage for every page found in ``document``.

        The page tree rooted at the catalog's "Pages" entry is walked
        depth-first. If the catalog has no "Pages" entry, or the walk yields
        no page at all, every object id listed in the document's xrefs is
        inspected instead and each dictionary typed as a page is yielded.
        Labels are taken from ``document.get_page_labels()``; a page gets
        ``None`` when the document has no page labels.
        """

        def depth_first_search(
            obj: Any,
            parent: Dict[str, Any],
            visited: Optional[Set[Any]] = None,
        ) -> Iterator[Tuple[int, Dict[Any, Dict[Any, Any]]]]:
            """Yield ``(object_id, attributes)`` for each page below ``obj``.

            ``parent`` supplies the inheritable attributes; ``visited``
            collects the object ids already seen during this walk.
            """
            if isinstance(obj, int):
                object_id = obj
                object_properties = dict_value(document.getobj(object_id)).copy()
            else:
                # This looks broken. obj.objid means obj could be either
                # PDFObjRef or PDFStream, but neither is valid for dict_value.
                object_id = obj.objid  # type: ignore[attr-defined]
                object_properties = dict_value(obj).copy()

            # Avoid recursion errors by keeping track of visited nodes
            if visited is None:
                visited = set()
            if object_id in visited:
                return
            visited.add(object_id)

            # Copy down inheritable attributes the node does not set itself.
            for k, v in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in object_properties:
                    object_properties[k] = v

            object_type = object_properties.get("Type")
            if object_type is None and not settings.STRICT:  # See #64
                object_type = object_properties.get("type")

            if object_type is LITERAL_PAGES and "Kids" in object_properties:
                log.debug("Pages: Kids=%r", object_properties["Kids"])
                for child in list_value(object_properties["Kids"]):
                    yield from depth_first_search(child, object_properties, visited)

            elif object_type is LITERAL_PAGE:
                log.debug("Page: %r", object_properties)
                yield (object_id, object_properties)

        try:
            page_labels: Iterator[Optional[str]] = document.get_page_labels()
        except PDFNoPageLabels:
            page_labels = itertools.repeat(None)

        # next() is given a default so that a label iterator running dry
        # cannot raise StopIteration inside this generator, which Python
        # would convert into RuntimeError.
        pages = False
        if "Pages" in document.catalog:
            objects = depth_first_search(document.catalog["Pages"], document.catalog)
            for objid, tree in objects:
                yield cls(document, objid, tree, next(page_labels, None))
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
                            yield cls(document, objid, obj, next(page_labels, None))
                    except PDFObjectNotFound:
                        # Listed in the xref but not retrievable: skip it.
                        pass

    @classmethod
    def get_pages(
        cls,
        fp: BinaryIO,
        pagenos: Optional[Container[int]] = None,
        maxpages: int = 0,
        password: str = "",
        caching: bool = True,
        check_extractable: bool = False,
    ) -> Iterator["PDFPage"]:
        """Parse the PDF read from ``fp`` and yield its pages.

        This is a generator: nothing is parsed until iteration starts.

        fp: binary file object handed to PDFParser.
        pagenos: zero-based page indices to yield; a falsy value (None or an
            empty container) selects every page.
        maxpages: iteration stops right after yielding a page whose
            one-based position in the document is at least this value;
            0 means no limit.
        password: passed on to PDFDocument.
        caching: passed on to PDFDocument.
        check_extractable: when true, raise PDFTextExtractionNotAllowed if
            the document is not marked as extractable; otherwise only log a
            warning and proceed.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                raise PDFTextExtractionNotAllowed(
                    f"Text extraction is not allowed: {fp!r}",
                )
            # Logger arguments are formatted lazily, only if the record is emitted.
            log.warning(
                "The PDF %r contains a metadata field "
                "indicating that it should not allow "
                "text extraction. Ignoring this field "
                "and proceeding. Use the check_extractable "
                "if you want to raise an error in this case",
                fp,
            )
        # Process each page contained in the document.
        for pageno, page in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno + 1:
                break

194 break