Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfpage.py: 97%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import itertools
2import logging
3from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple
5from pdfminer import settings
6from pdfminer.pdfdocument import (
7 PDFDocument,
8 PDFNoPageLabels,
9 PDFTextExtractionNotAllowed,
10)
11from pdfminer.pdfexceptions import PDFObjectNotFound, PDFValueError
12from pdfminer.pdfparser import PDFParser
13from pdfminer.pdftypes import dict_value, int_value, list_value, resolve1
14from pdfminer.psparser import LIT
15from pdfminer.utils import Rect, parse_rect
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# some predefined literals and keywords.
# PDFPage.create_pages compares the "Type" entry of page-tree nodes
# against these objects by identity (``is``), not by equality.
LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT("Pages")
class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes
    ----------
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: a dictionary of resources used by the page.
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page.
      rotate: the page rotation (in degree).
      annots: the page annotations.
      beads: a chain that represents natural reading order.
      label: the page's label (typically, the logical page number).

    """

    def __init__(
        self,
        doc: PDFDocument,
        pageid: object,
        attrs: object,
        label: Optional[str],
    ) -> None:
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        label: page label string.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.label = label
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        self.resources: Dict[object, object] = resolve1(
            self.attrs.get("Resources", dict()),
        )

        # The crop box falls back to the media box, so the media box
        # has to be parsed first.
        self.mediabox = self._parse_mediabox(self.attrs.get("MediaBox"))
        self.cropbox = self._parse_cropbox(self.attrs.get("CropBox"), self.mediabox)
        self.contents = self._parse_contents(self.attrs.get("Contents"))

        # Reduce the rotation modulo 360 so negative values wrap around.
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        # Annotations and beads are stored as found, without resolving.
        self.annots = self.attrs.get("Annots")
        self.beads = self.attrs.get("B")

    def __repr__(self) -> str:
        return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

    # Attributes a page takes from its ancestors in the page tree when it
    # does not define them itself (see create_pages).
    INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}

    @classmethod
    def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
        """Yield a PDFPage for every page found in ``document``.

        The page tree rooted at the catalog's "Pages" entry is walked
        depth first.  If that yields no page at all (including when the
        entry is missing), every object listed in the document's xrefs is
        scanned instead and each dictionary whose "Type" is the Page
        literal is yielded.

        Each page receives the next value from
        ``document.get_page_labels()``, or ``None`` when the document has
        no page labels or the label iterator has run out.
        """

        def depth_first_search(
            obj: Any,
            parent: Dict[str, Any],
            visited: Optional[Set[Any]] = None,
        ) -> Iterator[Tuple[int, Dict[Any, Dict[Any, Any]]]]:
            """Yield ``(object id, attributes)`` for each page below ``obj``.

            ``parent`` supplies the inheritable attributes; ``visited``
            collects the object ids seen so far in this walk.
            """
            if isinstance(obj, int):
                object_id = obj
                object_properties = dict_value(document.getobj(object_id)).copy()
            else:
                # This looks broken. obj.objid means obj could be either
                # PDFObjRef or PDFStream, but neither is valid for dict_value.
                object_id = obj.objid  # type: ignore[attr-defined]
                object_properties = dict_value(obj).copy()

            # Avoid recursion errors by keeping track of visited nodes
            if visited is None:
                visited = set()
            if object_id in visited:
                return
            visited.add(object_id)

            # Copy down inheritable attributes the node does not override.
            # object_properties is a copy, so the stored object is untouched.
            for k, v in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in object_properties:
                    object_properties[k] = v

            object_type = object_properties.get("Type")
            if object_type is None and not settings.STRICT:  # See #64
                object_type = object_properties.get("type")

            if object_type is LITERAL_PAGES and "Kids" in object_properties:
                log.debug("Pages: Kids=%r", object_properties["Kids"])
                for child in list_value(object_properties["Kids"]):
                    yield from depth_first_search(child, object_properties, visited)

            elif object_type is LITERAL_PAGE:
                log.debug("Page: %r", object_properties)
                yield (object_id, object_properties)

        try:
            page_labels: Iterator[Optional[str]] = document.get_page_labels()
        except PDFNoPageLabels:
            page_labels = itertools.repeat(None)

        pages = False
        if "Pages" in document.catalog:
            objects = depth_first_search(document.catalog["Pages"], document.catalog)
            for objid, tree in objects:
                # next() with a default: a bare next() raising StopIteration
                # inside a generator is turned into RuntimeError (PEP 479),
                # which would abort the enumeration if labels ran out.
                yield cls(document, objid, tree, next(page_labels, None))
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
                            yield cls(document, objid, obj, next(page_labels, None))
                    except PDFObjectNotFound:
                        pass

    @classmethod
    def get_pages(
        cls,
        fp: BinaryIO,
        pagenos: Optional[Container[int]] = None,
        maxpages: int = 0,
        password: str = "",
        caching: bool = True,
        check_extractable: bool = False,
    ) -> Iterator["PDFPage"]:
        """Parse the PDF in ``fp`` and yield its pages.

        fp: binary file object handed to PDFParser.
        pagenos: zero-based page indices to yield; a falsy value (None or
            an empty container) selects every page.
        maxpages: when non-zero, stop after yielding a page whose
            one-based position in the document is at least this value.
            The check only runs after a page is yielded, so pages skipped
            through ``pagenos`` never end the iteration.
        password: passed to PDFDocument.
        caching: passed to PDFDocument.
        check_extractable: if true, raise PDFTextExtractionNotAllowed
            when the document reports that it is not extractable;
            otherwise only log a warning and carry on.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                error_msg = "Text extraction is not allowed: %r" % fp
                raise PDFTextExtractionNotAllowed(error_msg)
            else:
                warning_msg = (
                    "The PDF %r contains a metadata field "
                    "indicating that it should not allow "
                    "text extraction. Ignoring this field "
                    "and proceeding. Use the check_extractable "
                    "if you want to raise an error in this case" % fp
                )
                log.warning(warning_msg)
        # Process each page contained in the document.
        for pageno, page in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno + 1:
                break

    def _parse_mediabox(self, value: Any) -> Rect:
        """Return the page's media box, or US Letter if it is unusable.

        value: the raw "MediaBox" entry, or None when it is absent.
        """
        us_letter = (0.0, 0.0, 612.0, 792.0)

        if value is None:
            log.warning(
                "MediaBox missing from /Page (and not inherited), "
                "defaulting to US Letter"
            )
            return us_letter

        try:
            return parse_rect(resolve1(val) for val in resolve1(value))

        # TypeError: an entry that resolves to a non-iterable (e.g. a bare
        # number) fails while the generator expression is being built;
        # that is a builtin error, not a PDFValueError.
        except (PDFValueError, TypeError):
            log.warning("Invalid MediaBox in /Page, defaulting to US Letter")
            return us_letter

    def _parse_cropbox(self, value: Any, mediabox: Rect) -> Rect:
        """Return the page's crop box, or ``mediabox`` if it is unusable.

        value: the raw "CropBox" entry, or None when it is absent.
        mediabox: the already parsed media box, used as the default.
        """
        if value is None:
            # CropBox is optional, and MediaBox is used if not specified.
            return mediabox

        try:
            return parse_rect(resolve1(val) for val in resolve1(value))

        # TypeError: same non-iterable case as for the media box.
        except (PDFValueError, TypeError):
            log.warning("Invalid CropBox in /Page, defaulting to MediaBox")
            return mediabox

    def _parse_contents(self, value: Any) -> List[Any]:
        """Return the page's content entries as a list.

        value: the raw "Contents" entry, or None when it is absent.  A
        resolved value that is not already a list is wrapped in a
        one-element list; None gives an empty list.
        """
        contents: List[Any] = []
        if value is not None:
            contents = resolve1(value)
            if not isinstance(contents, list):
                contents = [contents]
        return contents