Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfpage.py: 91%

1import itertools

2import logging

3from collections.abc import Container, Iterator

4from typing import Any, BinaryIO, ClassVar

6from pdfminer import settings

7from pdfminer.pdfdocument import (

8 PDFDocument,

9 PDFNoPageLabels,

10 PDFTextExtractionNotAllowed,

11)

12from pdfminer.pdfexceptions import PDFObjectNotFound, PDFValueError

13from pdfminer.pdfparser import PDFParser

14from pdfminer.pdftypes import dict_value, int_value, list_value, resolve1

15from pdfminer.psparser import LIT

16from pdfminer.utils import Rect, parse_rect

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Predefined literals: the page-tree walk in PDFPage.create_pages compares
# a node's "Type" entry against these by identity.
LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT("Pages")

class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes
    ----------
    doc: a PDFDocument object.
    pageid: any Python object that can uniquely identify the page.
    attrs: a dictionary of page attributes.
    contents: a list of PDFStream objects that represents the page content.
    lastmod: the last modified time of the page.
    resources: a dictionary of resources used by the page.
    mediabox: the physical size of the page.
    cropbox: the crop rectangle of the page.
    rotate: the page rotation (in degree).
    annots: the page annotations.
    beads: a chain that represents natural reading order.
    label: the page's label (typically, the logical page number).

    """

    def __init__(
        self,
        doc: PDFDocument,
        pageid: object,
        attrs: object,
        label: str | None,
    ) -> None:
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        label: page label string.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.label = label
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        self.resources: dict[object, object] = resolve1(
            self.attrs.get("Resources", {}),
        )

        # The crop box falls back to the media box, so parse the media box first.
        self.mediabox = self._parse_mediabox(self.attrs.get("MediaBox"))
        self.cropbox = self._parse_cropbox(self.attrs.get("CropBox"), self.mediabox)
        self.contents = self._parse_contents(self.attrs.get("Contents"))

        # Normalise the rotation into the range [0, 360).
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        self.annots = self.attrs.get("Annots")
        self.beads = self.attrs.get("B")

    def __repr__(self) -> str:
        return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

    # Attributes a page-tree node takes over from its parent when it does not
    # define them itself (applied in ``create_pages``).
    INHERITABLE_ATTRS: ClassVar[set[str]] = {
        "Resources",
        "MediaBox",
        "CropBox",
        "Rotate",
    }

    @classmethod
    def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
        """Yield a PDFPage for every page object found in ``document``.

        The tree referenced by the catalog's "Pages" entry is walked
        depth-first; inheritable attributes are copied from each parent node
        down to children that lack them. If that walk produces no page at all
        (or the catalog has no "Pages" entry), every object id listed in the
        document's xrefs is inspected and each dictionary whose "Type" is the
        Page literal is yielded instead.

        Labels are taken, in order, from ``document.get_page_labels()``; when
        the document raises PDFNoPageLabels every page gets ``None``.
        """

        def depth_first_search(
            obj: Any,
            parent: dict[str, Any],
            visited: set[Any] | None = None,
        ) -> Iterator[tuple[int, dict[Any, dict[Any, Any]]]]:
            """Yield ``(object_id, properties)`` for each page below ``obj``."""
            if isinstance(obj, int):
                object_id = obj
                object_properties = dict_value(document.getobj(object_id)).copy()
            else:
                # This looks broken. obj.objid means obj could be either
                # PDFObjRef or PDFStream, but neither is valid for dict_value.
                object_id = obj.objid  # type: ignore[attr-defined]
                object_properties = dict_value(obj).copy()

            # Avoid recursion errors by keeping track of visited nodes
            if visited is None:
                visited = set()
            if object_id in visited:
                return
            visited.add(object_id)

            # Inherit attributes the node does not define itself.
            for k, v in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in object_properties:
                    object_properties[k] = v

            object_type = object_properties.get("Type")
            if object_type is None and not settings.STRICT:  # See #64
                object_type = object_properties.get("type")

            if object_type is LITERAL_PAGES and "Kids" in object_properties:
                log.debug("Pages: Kids=%r", object_properties["Kids"])
                for child in list_value(object_properties["Kids"]):
                    yield from depth_first_search(child, object_properties, visited)

            elif object_type is LITERAL_PAGE:
                log.debug("Page: %r", object_properties)
                yield (object_id, object_properties)

        try:
            page_labels: Iterator[str | None] = document.get_page_labels()
        except PDFNoPageLabels:
            page_labels = itertools.repeat(None)

        # NOTE: next() is given a default because a bare StopIteration raised
        # inside a generator is turned into RuntimeError (PEP 479). Should the
        # label iterator ever run out, remaining pages simply get no label.
        pages = False
        if "Pages" in document.catalog:
            objects = depth_first_search(document.catalog["Pages"], document.catalog)
            for objid, tree in objects:
                yield cls(document, objid, tree, next(page_labels, None))
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
                            yield cls(document, objid, obj, next(page_labels, None))
                    except PDFObjectNotFound:
                        pass

    @classmethod
    def get_pages(
        cls,
        fp: BinaryIO,
        pagenos: Container[int] | None = None,
        maxpages: int = 0,
        password: str = "",
        caching: bool = True,
        check_extractable: bool = False,
    ) -> Iterator["PDFPage"]:
        """Parse ``fp`` as a PDF and yield its pages.

        fp: binary file object handed to PDFParser.
        pagenos: if given and non-empty, only pages whose zero-based index is
            contained in it are yielded.
        maxpages: if non-zero, iteration stops once a yielded page has an
            index of at least ``maxpages - 1``. The check only runs after a
            page is yielded, not for pages skipped through ``pagenos``.
        password: forwarded to PDFDocument.
        caching: forwarded to PDFDocument.
        check_extractable: when true, raise PDFTextExtractionNotAllowed if the
            document reports that it is not extractable; otherwise only a
            warning is logged and processing continues.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                error_msg = f"Text extraction is not allowed: {fp!r}"
                raise PDFTextExtractionNotAllowed(error_msg)
            warning_msg = (
                f"The PDF {fp!r} contains a metadata field "
                "indicating that it should not allow "
                "text extraction. Ignoring this field "
                "and proceeding. Use the check_extractable "
                "if you want to raise an error in this case"
            )
            log.warning(warning_msg)
        # Process each page contained in the document.
        for pageno, page in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno + 1:
                break

    def _parse_mediabox(self, value: Any) -> Rect:
        """Return the page's MediaBox, or US Letter when missing or invalid.

        value: the raw "MediaBox" entry of the page attributes (may be None).
        """
        us_letter = (0.0, 0.0, 612.0, 792.0)

        if value is None:
            log.warning(
                "MediaBox missing from /Page (and not inherited), "
                "defaulting to US Letter"
            )
            return us_letter

        try:
            return parse_rect(resolve1(val) for val in resolve1(value))

        # TypeError covers a box that resolves to something that cannot be
        # iterated (e.g. a number); without it a malformed page would abort
        # page construction instead of using the fallback.
        except (PDFValueError, TypeError):
            log.warning("Invalid MediaBox in /Page, defaulting to US Letter")
            return us_letter

    def _parse_cropbox(self, value: Any, mediabox: Rect) -> Rect:
        """Return the page's CropBox, or ``mediabox`` when missing or invalid.

        value: the raw "CropBox" entry of the page attributes (may be None).
        mediabox: the already parsed media box, used as the fallback.
        """
        if value is None:
            # CropBox is optional, and MediaBox is used if not specified.
            return mediabox

        try:
            return parse_rect(resolve1(val) for val in resolve1(value))

        # See _parse_mediabox: a non-iterable box raises TypeError, which is
        # treated the same as an unparsable rectangle.
        except (PDFValueError, TypeError):
            log.warning("Invalid CropBox in /Page, defaulting to MediaBox")
            return mediabox

    def _parse_contents(self, value: Any) -> list[Any]:
        """Return the page's content entries as a list.

        value: the raw "Contents" entry of the page attributes. ``None``
        gives an empty list; a resolved value that is not already a list is
        wrapped in a one-element list.
        """
        if value is None:
            return []
        contents = resolve1(value)
        if not isinstance(contents, list):
            contents = [contents]
        return contents

224 return contents