Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfpage.py: 91%
Shortcuts on this page:
  r, m, x: toggle line displays
  j, k: next/previous highlighted chunk
  0 (zero): top of page
  1 (one): first highlighted chunk
1import itertools
2import logging
3from collections.abc import Container, Iterator
4from typing import Any, BinaryIO, ClassVar
6from pdfminer import settings
7from pdfminer.pdfdocument import (
8 PDFDocument,
9 PDFNoPageLabels,
10 PDFTextExtractionNotAllowed,
11)
12from pdfminer.pdfexceptions import PDFObjectNotFound, PDFValueError
13from pdfminer.pdfparser import PDFParser
14from pdfminer.pdftypes import dict_value, int_value, list_value, resolve1
15from pdfminer.psparser import LIT
16from pdfminer.utils import Rect, parse_rect
# Logger for this module, named after the module itself.
log = logging.getLogger(__name__)

# Predefined literals for the two page-tree node kinds. They are compared
# by identity against a node's "Type" entry when the page tree is walked.
LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT("Pages")
class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes
    ----------
    doc: a PDFDocument object.
    pageid: any Python object that can uniquely identify the page.
    attrs: a dictionary of page attributes.
    contents: a list of the objects that make up the page content
        (always a list, even when the page has a single content object).
    lastmod: the last modified time of the page.
    resources: a dictionary of resources used by the page.
    mediabox: the physical size of the page; US Letter when the page
        has no usable MediaBox.
    cropbox: the crop rectangle of the page; the MediaBox when the page
        has no usable CropBox.
    rotate: the page rotation (in degree), normalised to 0 <= rotate < 360.
    annots: the page annotations.
    beads: a chain that represents natural reading order.
    label: the page's label (typically, the logical page number).

    """

    # Page attributes that a page inherits from its ancestors in the page
    # tree when it does not define them itself.
    INHERITABLE_ATTRS: ClassVar[set[str]] = {
        "Resources",
        "MediaBox",
        "CropBox",
        "Rotate",
    }

    def __init__(
        self,
        doc: PDFDocument,
        pageid: object,
        attrs: object,
        label: str | None,
    ) -> None:
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        label: page label string, or None when the page has no label.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.label = label
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        self.resources: dict[object, object] = resolve1(
            self.attrs.get("Resources", {}),
        )

        # The crop box falls back to the media box, so the media box
        # has to be parsed first.
        self.mediabox = self._parse_mediabox(self.attrs.get("MediaBox"))
        self.cropbox = self._parse_cropbox(self.attrs.get("CropBox"), self.mediabox)
        self.contents = self._parse_contents(self.attrs.get("Contents"))

        # Fold negative and >= 360 rotations into the range [0, 360).
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        self.annots = self.attrs.get("Annots")
        self.beads = self.attrs.get("B")

    def __repr__(self) -> str:
        return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

    @classmethod
    def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
        """Yield a PDFPage for every page object found in ``document``.

        The page tree rooted at the catalog's "Pages" entry is walked
        depth-first. If that yields no page at all (including when the
        catalog has no "Pages" entry), every object listed in the
        document's xrefs is scanned instead and each dictionary whose
        "Type" is the Page literal is yielded.

        Each page receives the next value from the document's page labels,
        or None when the document has no page labels or the labels run out.
        """

        def depth_first_search(
            obj: Any,
            parent: dict[str, Any],
            visited: set[Any] | None = None,
        ) -> Iterator[tuple[int, dict[Any, dict[Any, Any]]]]:
            """Yield ``(object id, attributes)`` for each page below ``obj``.

            ``parent`` supplies the inheritable attributes; ``visited``
            holds the ids of the nodes already seen in this walk.
            """
            if isinstance(obj, int):
                object_id = obj
                object_properties = dict_value(document.getobj(object_id)).copy()
            else:
                # NOTE(review): the original author flagged this branch as
                # suspect: having an ``objid`` means obj could be either a
                # PDFObjRef or a PDFStream, but neither is valid for
                # dict_value. Left unchanged.
                object_id = obj.objid  # type: ignore[attr-defined]
                object_properties = dict_value(obj).copy()

            # Avoid recursion errors by keeping track of visited nodes
            if visited is None:
                visited = set()
            if object_id in visited:
                return
            visited.add(object_id)

            # Copy down inheritable attributes the node does not override.
            for k, v in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in object_properties:
                    object_properties[k] = v

            object_type = object_properties.get("Type")
            if object_type is None and not settings.STRICT:  # See #64
                object_type = object_properties.get("type")

            if object_type is LITERAL_PAGES and "Kids" in object_properties:
                log.debug("Pages: Kids=%r", object_properties["Kids"])
                for child in list_value(object_properties["Kids"]):
                    yield from depth_first_search(child, object_properties, visited)

            elif object_type is LITERAL_PAGE:
                log.debug("Page: %r", object_properties)
                yield (object_id, object_properties)

        try:
            page_labels: Iterator[str | None] = document.get_page_labels()
        except PDFNoPageLabels:
            page_labels = itertools.repeat(None)

        # next() is always given a default below: inside a generator an
        # escaping StopIteration would be turned into a RuntimeError, so a
        # label iterator shorter than the page list must degrade to None.
        pages = False
        if "Pages" in document.catalog:
            objects = depth_first_search(document.catalog["Pages"], document.catalog)
            for objid, tree in objects:
                yield cls(document, objid, tree, next(page_labels, None))
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
                            yield cls(document, objid, obj, next(page_labels, None))
                    except PDFObjectNotFound:
                        # Best effort: ids that cannot be loaded are skipped.
                        pass

    @classmethod
    def get_pages(
        cls,
        fp: BinaryIO,
        pagenos: Container[int] | None = None,
        maxpages: int = 0,
        password: str = "",
        caching: bool = True,
        check_extractable: bool = False,
    ) -> Iterator["PDFPage"]:
        """Parse the PDF in ``fp`` and yield its pages.

        fp: binary file object containing the PDF.
        pagenos: zero-based page indices to yield; a falsy value (None or
            an empty container) selects every page.
        maxpages: stop once a yielded page's one-based position in the
            document reaches this value; 0 means no limit. The limit is
            compared against the position in the document, not against the
            number of pages yielded, and is only checked after a yield.
        password: password passed to PDFDocument.
        caching: caching flag passed to PDFDocument.
        check_extractable: when True, raise PDFTextExtractionNotAllowed if
            the document reports that it is not extractable; when False,
            only log a warning and proceed.

        This is a generator, so parsing (and any exception) happens when
        iteration starts, not when the method is called.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                error_msg = f"Text extraction is not allowed: {fp!r}"
                raise PDFTextExtractionNotAllowed(error_msg)
            warning_msg = (
                f"The PDF {fp!r} contains a metadata field "
                "indicating that it should not allow "
                "text extraction. Ignoring this field "
                "and proceeding. Use the check_extractable "
                "if you want to raise an error in this case"
            )
            log.warning(warning_msg)
        # Process each page contained in the document.
        for pageno, page in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno + 1:
                break

    @staticmethod
    def _resolve_rect(value: Any) -> Rect | None:
        """Resolve ``value`` and parse it as a rectangle; None if unusable.

        PDFValueError is the error the original code handled for an
        unparsable rectangle. TypeError is caught as well because a value
        that does not resolve to something iterable (a number, or a
        reference that resolves to None) fails while the generator
        expression is being created, before parse_rect runs.
        """
        try:
            return parse_rect(resolve1(val) for val in resolve1(value))
        except (PDFValueError, TypeError):
            return None

    def _parse_mediabox(self, value: Any) -> Rect:
        """Return the page's MediaBox, or US Letter if missing or invalid."""
        us_letter = (0.0, 0.0, 612.0, 792.0)

        if value is None:
            log.warning(
                "MediaBox missing from /Page (and not inherited), "
                "defaulting to US Letter"
            )
            return us_letter

        rect = self._resolve_rect(value)
        if rect is None:
            log.warning("Invalid MediaBox in /Page, defaulting to US Letter")
            return us_letter
        return rect

    def _parse_cropbox(self, value: Any, mediabox: Rect) -> Rect:
        """Return the page's CropBox, or ``mediabox`` if missing or invalid."""
        if value is None:
            # CropBox is optional, and MediaBox is used if not specified.
            return mediabox

        rect = self._resolve_rect(value)
        if rect is None:
            log.warning("Invalid CropBox in /Page, defaulting to MediaBox")
            return mediabox
        return rect

    def _parse_contents(self, value: Any) -> list[Any]:
        """Return the page's resolved "Contents" entry as a list.

        A missing entry gives an empty list; a single resolved object is
        wrapped in a one-element list.
        """
        if value is None:
            return []
        contents = resolve1(value)
        if not isinstance(contents, list):
            contents = [contents]
        return contents