Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfpage.py: 97%

1import itertools

2import logging

3from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple

5from pdfminer import settings

6from pdfminer.pdfdocument import (

7 PDFDocument,

8 PDFNoPageLabels,

9 PDFTextExtractionNotAllowed,

10)

11from pdfminer.pdfexceptions import PDFObjectNotFound, PDFValueError

12from pdfminer.pdfparser import PDFParser

13from pdfminer.pdftypes import dict_value, int_value, list_value, resolve1

14from pdfminer.psparser import LIT

15from pdfminer.utils import Rect, parse_rect

17log = logging.getLogger(__name__)  # module-level logger, named after this module

19# Predefined literals built with LIT(); page discovery compares a node's Type entry against them using `is`.

20LITERAL_PAGE = LIT("Page")

21LITERAL_PAGES = LIT("Pages")

class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes
    ----------
    doc: a PDFDocument object.
    pageid: any Python object that can uniquely identify the page.
    attrs: a dictionary of page attributes.
    contents: a list of PDFStream objects that represents the page content.
    lastmod: the last modified time of the page.
    resources: a dictionary of resources used by the page.
    mediabox: the physical size of the page.
    cropbox: the crop rectangle of the page.
    rotate: the page rotation (in degree), reduced modulo 360.
    annots: the page annotations.
    beads: a chain that represents natural reading order.
    label: the page's label (typically, the logical page number).

    """

    def __init__(
        self,
        doc: PDFDocument,
        pageid: object,
        attrs: object,
        label: Optional[str],
    ) -> None:
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        label: page label string.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.label = label
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        self.resources: Dict[object, object] = resolve1(
            self.attrs.get("Resources", dict()),
        )

        # The crop box falls back to the media box, so the media box is
        # parsed first.
        self.mediabox = self._parse_mediabox(self.attrs.get("MediaBox"))
        self.cropbox = self._parse_cropbox(self.attrs.get("CropBox"), self.mediabox)
        self.contents = self._parse_contents(self.attrs.get("Contents"))

        # Adding 360 before the modulo keeps the intent explicit for
        # negative rotations; the stored value is always in 0..359.
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        # Annots and B are stored exactly as found (not resolved here).
        self.annots = self.attrs.get("Annots")
        self.beads = self.attrs.get("B")

    def __repr__(self) -> str:
        return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

    # Attributes a page-tree node passes down to children that do not
    # define them themselves (see depth_first_search in create_pages).
    INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}

    @classmethod
    def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
        """Yield a PDFPage for every page object found in ``document``.

        The page tree referenced by the catalog's "Pages" entry is walked
        depth-first. If the catalog has no "Pages" entry, or the walk
        produces no page at all, every object id listed in the document's
        xrefs is scanned instead and each dict whose "Type" is the Page
        literal is yielded.

        Each page receives the next value from the document's page labels,
        or ``None`` when the document has no page labels or the label
        iterator has run out.
        """

        def depth_first_search(
            obj: Any,
            parent: Dict[str, Any],
            visited: Optional[Set[Any]] = None,
        ) -> Iterator[Tuple[int, Dict[Any, Dict[Any, Any]]]]:
            """Yield ``(object_id, properties)`` for each page below ``obj``.

            ``parent`` supplies the inheritable attributes; ``visited``
            holds the object ids already seen so that a cyclic tree
            cannot recurse forever.
            """
            if isinstance(obj, int):
                object_id = obj
                object_properties = dict_value(document.getobj(object_id)).copy()
            else:
                # This looks broken. obj.objid means obj could be either
                # PDFObjRef or PDFStream, but neither is valid for dict_value.
                object_id = obj.objid  # type: ignore[attr-defined]
                object_properties = dict_value(obj).copy()

            # Avoid recursion errors by keeping track of visited nodes
            if visited is None:
                visited = set()
            if object_id in visited:
                return
            visited.add(object_id)

            # Work on the copy made above so the inherited entries do not
            # leak into the dict the document handed out.
            for k, v in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in object_properties:
                    object_properties[k] = v

            object_type = object_properties.get("Type")
            if object_type is None and not settings.STRICT:  # See #64
                object_type = object_properties.get("type")

            if object_type is LITERAL_PAGES and "Kids" in object_properties:
                log.debug("Pages: Kids=%r", object_properties["Kids"])
                for child in list_value(object_properties["Kids"]):
                    yield from depth_first_search(child, object_properties, visited)

            elif object_type is LITERAL_PAGE:
                log.debug("Page: %r", object_properties)
                yield (object_id, object_properties)

        try:
            page_labels: Iterator[Optional[str]] = document.get_page_labels()
        except PDFNoPageLabels:
            page_labels = itertools.repeat(None)

        pages = False
        if "Pages" in document.catalog:
            objects = depth_first_search(document.catalog["Pages"], document.catalog)
            for objid, tree in objects:
                # next() with a default: a bare next() on an exhausted
                # iterator would raise StopIteration inside this generator,
                # which Python turns into a RuntimeError (PEP 479).
                yield cls(document, objid, tree, next(page_labels, None))
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
                            yield cls(document, objid, obj, next(page_labels, None))
                    except PDFObjectNotFound:
                        # Best effort: ids that cannot be resolved are skipped.
                        pass

    @classmethod
    def get_pages(
        cls,
        fp: BinaryIO,
        pagenos: Optional[Container[int]] = None,
        maxpages: int = 0,
        password: str = "",
        caching: bool = True,
        check_extractable: bool = False,
    ) -> Iterator["PDFPage"]:
        """Parse the PDF in ``fp`` and yield its pages.

        fp: binary file object handed to PDFParser.
        pagenos: zero-based page indices to yield; a falsy value (None or
            an empty container) disables the filter.
        maxpages: when non-zero, iteration stops after yielding a page
            whose one-based position in the document is at least this
            value. The check only runs on pages that are yielded, so it
            is a bound on document position, not a count of yielded pages.
        password: passed to PDFDocument.
        caching: passed to PDFDocument.
        check_extractable: if true, raise PDFTextExtractionNotAllowed when
            the document is not marked extractable; otherwise only a
            warning is logged.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                error_msg = "Text extraction is not allowed: %r" % fp
                raise PDFTextExtractionNotAllowed(error_msg)
            else:
                warning_msg = (
                    "The PDF %r contains a metadata field "
                    "indicating that it should not allow "
                    "text extraction. Ignoring this field "
                    "and proceeding. Use the check_extractable "
                    "if you want to raise an error in this case" % fp
                )
                log.warning(warning_msg)
        # Process each page contained in the document.
        for pageno, page in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno + 1:
                break

    def _parse_mediabox(self, value: Any) -> Rect:
        """Return the page's media box, or US Letter if it is unusable.

        value: the raw "MediaBox" entry, or None when the page has none.
        """
        us_letter = (0.0, 0.0, 612.0, 792.0)

        if value is None:
            log.warning(
                "MediaBox missing from /Page (and not inherited), "
                "defaulting to US Letter"
            )
            return us_letter

        try:
            return parse_rect(resolve1(val) for val in resolve1(value))

        except (PDFValueError, TypeError):
            # TypeError covers an entry that resolves to something that
            # cannot be iterated (e.g. a bare number); treat it like any
            # other invalid box instead of aborting page construction.
            log.warning("Invalid MediaBox in /Page, defaulting to US Letter")
            return us_letter

    def _parse_cropbox(self, value: Any, mediabox: Rect) -> Rect:
        """Return the page's crop box, or ``mediabox`` if absent or unusable.

        value: the raw "CropBox" entry, or None when the page has none.
        mediabox: the already parsed media box used as the fallback.
        """
        if value is None:
            # CropBox is optional, and MediaBox is used if not specified.
            return mediabox

        try:
            return parse_rect(resolve1(val) for val in resolve1(value))

        except (PDFValueError, TypeError):
            # TypeError covers an entry that resolves to something that
            # cannot be iterated (e.g. a bare number); treat it like any
            # other invalid box instead of aborting page construction.
            log.warning("Invalid CropBox in /Page, defaulting to MediaBox")
            return mediabox

    def _parse_contents(self, value: Any) -> List[Any]:
        """Return the page's "Contents" entry as a list.

        value: the raw "Contents" entry, or None when the page has none.
        A missing entry gives an empty list; a resolved value that is not
        already a list is wrapped in a one-element list.
        """
        contents: List[Any] = []
        if value is not None:
            contents = resolve1(value)
            if not isinstance(contents, list):
                contents = [contents]
        return contents

218 return contents