Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/pdf.py: 72%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import itertools
2import logging
3import pathlib
4from io import BufferedReader, BytesIO
5from types import TracebackType
6from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, Type, Union
8from pdfminer.layout import LAParams
9from pdfminer.pdfdocument import PDFDocument
10from pdfminer.pdfinterp import PDFResourceManager
11from pdfminer.pdfpage import PDFPage
12from pdfminer.pdfparser import PDFParser
14from ._typing import T_num, T_obj_list
15from .container import Container
16from .page import Page
17from .repair import T_repair_setting, _repair
18from .structure import PDFStructTree, StructTreeMissing
19from .utils import resolve_and_decode
20from .utils.exceptions import PdfminerException
22logger = logging.getLogger(__name__)
class PDF(Container):
    """A PDF document read through pdfminer.

    Wraps a binary stream in a pdfminer ``PDFDocument`` and exposes the
    document metadata, its pages (built lazily as ``Page`` objects) and
    document-wide aggregates of the per-page objects, annotations and
    hyperlinks. Instances are context managers: leaving the ``with`` block
    calls :meth:`close`.
    """

    # Inherited list of cached attribute names, extended with "_pages".
    # NOTE(review): presumably consumed by Container.flush_cache -- confirm.
    cached_properties: List[str] = Container.cached_properties + ["_pages"]

    def __init__(
        self,
        stream: Union[BufferedReader, BytesIO],
        stream_is_external: bool = False,
        path: Optional[pathlib.Path] = None,
        pages: Optional[Union[List[int], Tuple[int, ...]]] = None,
        laparams: Optional[Dict[str, Any]] = None,
        password: Optional[str] = None,
        strict_metadata: bool = False,
        unicode_norm: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
        raise_unicode_errors: bool = True,
    ):
        """Parse ``stream`` as a PDF and load its metadata.

        Args:
            stream: Binary stream containing the PDF data.
            stream_is_external: When true, :meth:`close` leaves ``stream``
                open (the caller owns it).
            path: Filesystem path the stream was opened from, if any.
            pages: 1-based page numbers to keep when building ``pages``;
                ``None`` keeps every page.
            laparams: Keyword arguments used to build a pdfminer
                ``LAParams``; ``None`` stores ``None`` instead.
            password: Password handed to ``PDFDocument`` (``None`` is passed
                as the empty string).
            strict_metadata: When true, re-raise any error hit while
                resolving a metadata value instead of logging a warning.
            unicode_norm: Stored on the instance as ``unicode_norm``.
            raise_unicode_errors: Stored on the instance as
                ``raise_unicode_errors``.

        Raises:
            PdfminerException: If building the parser/document fails.
        """
        self.stream = stream
        self.stream_is_external = stream_is_external
        self.path = path
        self.pages_to_parse = pages
        self.laparams = None if laparams is None else LAParams(**laparams)
        self.password = password
        self.unicode_norm = unicode_norm
        self.raise_unicode_errors = raise_unicode_errors

        try:
            self.doc = PDFDocument(PDFParser(stream), password=password or "")
        except Exception as e:
            # Surface every parser failure as the package's own exception
            # type, keeping the original error as the explicit cause.
            raise PdfminerException(e) from e
        self.rsrcmgr = PDFResourceManager()
        self.metadata: Dict[str, Any] = {}

        # Merge all info dictionaries; later entries override earlier ones.
        for info in self.doc.info:
            self.metadata.update(info)
        # Only values of existing keys are replaced below, so mutating the
        # dict while iterating over its items is safe.
        for k, v in self.metadata.items():
            try:
                self.metadata[k] = resolve_and_decode(v)
            except Exception as e:  # pragma: nocover
                if strict_metadata:
                    # Raise an exception since unable to resolve the metadata value.
                    raise
                # This metadata value could not be parsed. Instead of failing the PDF
                # read, treat it as a warning only if `strict_metadata=False`.
                logger.warning(
                    f'[WARNING] Metadata key "{k}" could not be parsed due to '
                    f"exception: {str(e)}"
                )

    @classmethod
    def open(
        cls,
        path_or_fp: Union[str, pathlib.Path, BufferedReader, BytesIO],
        pages: Optional[Union[List[int], Tuple[int, ...]]] = None,
        laparams: Optional[Dict[str, Any]] = None,
        password: Optional[str] = None,
        strict_metadata: bool = False,
        unicode_norm: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
        repair: bool = False,
        gs_path: Optional[Union[str, pathlib.Path]] = None,
        repair_setting: T_repair_setting = "default",
        raise_unicode_errors: bool = True,
    ) -> "PDF":
        """Open a PDF from a path or an already-open binary stream.

        Args:
            path_or_fp: Path to a PDF file, or a binary stream holding one.
            pages, laparams, password, strict_metadata, unicode_norm,
            raise_unicode_errors: Forwarded unchanged to the constructor.
            repair: When true, the stream is obtained from ``_repair``
                instead of reading ``path_or_fp`` directly.
            gs_path: Passed to ``_repair`` as ``gs_path``.
            repair_setting: Passed to ``_repair`` as ``setting``.

        Returns:
            The constructed ``PDF`` instance.

        Raises:
            PdfminerException: If the document cannot be parsed. Any other
                exception raised by the constructor propagates as well; in
                every failure case a stream opened here is closed first.
        """
        stream: Union[BufferedReader, BytesIO]

        if repair:
            stream = _repair(
                path_or_fp, password=password, gs_path=gs_path, setting=repair_setting
            )
            stream_is_external = False
            # Although the original file has a path,
            # the repaired version does not
            path = None
        elif isinstance(path_or_fp, (str, pathlib.Path)):
            stream = open(path_or_fp, "rb")
            stream_is_external = False
            path = pathlib.Path(path_or_fp)
        else:
            stream = path_or_fp
            stream_is_external = True
            path = None

        try:
            return cls(
                stream,
                path=path,
                pages=pages,
                laparams=laparams,
                password=password,
                strict_metadata=strict_metadata,
                unicode_norm=unicode_norm,
                stream_is_external=stream_is_external,
                raise_unicode_errors=raise_unicode_errors,
            )
        except Exception:
            # The constructor can fail with more than PdfminerException
            # (bad `laparams` keys, strict metadata errors, ...). Whatever
            # the cause, do not leak a stream that was opened here.
            if not stream_is_external:
                stream.close()
            raise

    def close(self) -> None:
        """Flush caches, close every page and release the stream.

        The stream is closed only when it is not external, and it is closed
        even if flushing the cache or closing a page raises.
        """
        try:
            self.flush_cache()

            for page in self.pages:
                page.close()
        finally:
            if not self.stream_is_external:
                self.stream.close()

    def __enter__(self) -> "PDF":
        """Return the instance itself for use in a ``with`` statement."""
        return self

    def __exit__(
        self,
        t: Optional[Type[BaseException]],
        value: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> None:
        """Close the PDF on leaving the ``with`` block; never suppresses."""
        self.close()

    @property
    def pages(self) -> List[Page]:
        """Return the selected pages, building and caching them on first use.

        Pages are numbered from 1. When ``pages_to_parse`` is set, only the
        page numbers it contains are kept. Each ``Page`` receives as
        ``initial_doctop`` the summed heights of the kept pages before it.

        Raises:
            PdfminerException: If pdfminer fails while producing a page. In
                that case nothing is cached, so a later access retries
                instead of returning a truncated list.
        """
        if hasattr(self, "_pages"):
            return self._pages

        doctop: T_num = 0
        pp = self.pages_to_parse
        self._pages: List[Page] = []

        def iter_pages() -> Generator[PDFPage, None, None]:
            # Re-yield pdfminer's pages, converting any failure raised while
            # advancing the underlying generator into PdfminerException.
            gen = PDFPage.create_pages(self.doc)
            while True:
                try:
                    yield next(gen)
                except StopIteration:
                    break
                except Exception as e:
                    raise PdfminerException(e) from e

        try:
            for page_number, page in enumerate(iter_pages(), start=1):
                if pp is not None and page_number not in pp:
                    continue
                p = Page(self, page, page_number=page_number, initial_doctop=doctop)
                self._pages.append(p)
                doctop += p.height
        except Exception:
            # Drop the half-built list so it is not mistaken for a complete
            # cached result on the next access.
            self.__dict__.pop("_pages", None)
            raise
        return self._pages

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """Return all page objects grouped by kind, in page order.

        The result is cached on the instance as ``_objects``.
        """
        if hasattr(self, "_objects"):
            return self._objects
        all_objects: Dict[str, T_obj_list] = {}
        for p in self.pages:
            for kind, objs in p.objects.items():
                # Extend a list owned by this dict in place: no per-page
                # re-copying, and the pages' own lists are never aliased.
                all_objects.setdefault(kind, []).extend(objs)
        self._objects: Dict[str, T_obj_list] = all_objects
        return self._objects

    @property
    def annots(self) -> List[Dict[str, Any]]:
        """Return the annotations of every page as one flat list."""
        return list(itertools.chain.from_iterable(p.annots for p in self.pages))

    @property
    def hyperlinks(self) -> List[Dict[str, Any]]:
        """Return the hyperlinks of every page as one flat list."""
        return list(itertools.chain.from_iterable(p.hyperlinks for p in self.pages))

    @property
    def structure_tree(self) -> List[Dict[str, Any]]:
        """Return the structure tree for the document.

        Returns an empty list when ``PDFStructTree`` raises
        ``StructTreeMissing``.
        """
        try:
            return [elem.to_dict() for elem in PDFStructTree(self)]
        except StructTreeMissing:
            return []

    def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]:
        """Return the document as a dict with "metadata" and "pages" keys.

        ``object_types`` is forwarded to each page's ``to_dict``.
        """
        return {
            "metadata": self.metadata,
            "pages": [page.to_dict(object_types) for page in self.pages],
        }