Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/structure.py: 15%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import itertools
2import logging
3import re
4from collections import deque
5from dataclasses import asdict, dataclass, field
6from typing import (
7 TYPE_CHECKING,
8 Any,
9 Callable,
10 Dict,
11 Iterable,
12 Iterator,
13 List,
14 Optional,
15 Pattern,
16 Tuple,
17 Union,
18)
20from pdfminer.data_structures import NumberTree
21from pdfminer.pdfparser import PDFParser
22from pdfminer.pdftypes import PDFObjRef, resolve1
23from pdfminer.psparser import PSLiteral
25from ._typing import T_bbox, T_obj
26from .utils import decode_text, geometry
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# `Page` and `PDF` are referenced only inside string annotations, so they
# are imported for type checkers alone and never at runtime.
if TYPE_CHECKING:  # pragma: nocover
    from .page import Page
    from .pdf import PDF


# A custom matcher: called with a structure element, returns whether that
# element should be selected.
MatchFunc = Callable[["PDFStructElement"], bool]
def _find_all(
    elements: Iterable["PDFStructElement"],
    matcher: Union[str, Pattern[str], MatchFunc],
) -> Iterator["PDFStructElement"]:
    """
    Common code for `find_all()` in trees and elements.

    Walks `elements` and all of their descendants depth-first (each
    element before its children, siblings in their listed order) and
    yields every element accepted by `matcher`.

    `matcher` may be:

    - a string, compared for equality with the element's `type`;
    - a compiled regular expression, applied to the element's `type`
      with `Pattern.match` (i.e. anchored at the start of the string);
    - any other callable, which receives the element and whose result is
      interpreted as a truth value.
    """
    # Each branch binds the narrowed matcher to its own local name so that
    # the predicate closes over a precisely-typed value.
    if isinstance(matcher, str):
        wanted_tag = matcher

        def has_tag(el: "PDFStructElement") -> bool:
            """Match an element name."""
            return el.type == wanted_tag

        accepts = has_tag
    elif isinstance(matcher, re.Pattern):
        pattern = matcher

        def tag_matches(el: "PDFStructElement") -> bool:
            """Match an element name by regular expression."""
            # Convert the Match-or-None result into a real boolean.
            return pattern.match(el.type) is not None

        accepts = tag_matches
    else:
        accepts = matcher

    pending = deque(elements)
    while pending:
        el = pending.popleft()
        if accepts(el):
            yield el
        # Put the children at the front, in their original order, so that
        # they are visited before the remaining siblings.
        pending.extendleft(reversed(el.children))
class Findable:
    """Mixin supplying `find()` and `find_all()` to anything that exposes
    a `children` list of structure elements."""

    children: List["PDFStructElement"]

    def find_all(
        self, matcher: Union[str, Pattern[str], MatchFunc]
    ) -> Iterator["PDFStructElement"]:
        """Lazily yield, in depth-first order, every matching element
        below this node.

        `matcher` can be an element name, a compiled regular expression,
        or a function that takes a `PDFStructElement` and returns `True`
        for elements that should be selected.
        """
        return _find_all(self.children, matcher)

    def find(
        self, matcher: Union[str, Pattern[str], MatchFunc]
    ) -> Optional["PDFStructElement"]:
        """Return the first matching element below this node, or `None`
        when nothing matches.

        `matcher` can be an element name, a compiled regular expression,
        or a function that takes a `PDFStructElement` and returns `True`
        for elements that should be selected.
        """
        return next(_find_all(self.children, matcher), None)
@dataclass
class PDFStructElement(Findable):
    """One node of the parsed structure tree."""

    # Structure type (tag) of the element.
    type: str
    # Value of the element's "R" entry, if it has one.
    revision: Optional[int]
    # Decoded "ID" entry, if present.
    id: Optional[str]
    # Decoded "Lang" entry, if present.
    lang: Optional[str]
    # Decoded "Alt" entry, if present.
    alt_text: Optional[str]
    # Decoded "ActualText" entry, if present.
    actual_text: Optional[str]
    # Decoded "T" entry, if present.
    title: Optional[str]
    # Page number of the element, when one was determined.
    page_number: Optional[int]
    # Merged attribute dictionary for the element.
    attributes: Dict[str, Any] = field(default_factory=dict)
    # Marked-content identifiers attached directly to this element.
    mcids: List[int] = field(default_factory=list)
    # Child structure elements.
    children: List["PDFStructElement"] = field(default_factory=list)

    def __iter__(self) -> Iterator["PDFStructElement"]:
        """Iterate over the direct children."""
        return iter(self.children)

    def all_mcids(self) -> Iterator[Tuple[Optional[int], int]]:
        """Yield `(page_number, mcid)` for every MCID held by this element
        and by all of its descendants.

        The page number is that of the element owning the MCID and may be
        `None`.
        """
        # Depth-first, starting with this element itself, so that MCIDs
        # come out in tree order.
        pending = deque([self])
        while pending:
            node = pending.popleft()
            for mcid in node.mcids:
                yield node.page_number, mcid
            pending.extendleft(reversed(node.children))

    def to_dict(self) -> Dict[str, Any]:
        """Return a nested dict for this subtree with empty fields removed.

        Keys whose value is `None`, `[]` or `{}` are dropped at every
        level of the `children` hierarchy.
        """
        result = asdict(self)
        # The visiting order is irrelevant here, so a plain queue is used.
        queue = deque([result])
        while queue:
            node = queue.popleft()
            for key in list(node):
                value = node[key]
                if value is None or value == [] or value == {}:
                    del node[key]
            # "children" is gone by now if it was empty.
            queue.extend(node.get("children", ()))
        return result
class StructTreeMissing(ValueError):
    """Raised when a structure tree is requested for a PDF whose catalog
    has no `StructTreeRoot` entry."""
class PDFStructTree(Findable):
    """Parse the structure tree of a PDF.

    The constructor takes a `pdfplumber.PDF` and optionally a
    `pdfplumber.Page`.  To avoid creating the entire tree for a large
    document it is recommended to provide a page.

    This class creates a representation of the portion of the
    structure tree that reaches marked content sections, either for a
    single page, or for the whole document.  Note that this is slightly
    different from the behaviour of other PDF libraries which will
    also include structure elements with no content.

    If the PDF has no structure, the constructor will raise
    `StructTreeMissing`.

    """

    page: Optional["Page"]

    def __init__(self, doc: "PDF", page: Optional["Page"] = None):
        """Build the tree for `page`, or for every page of `doc` when no
        page is given.

        Raises `StructTreeMissing` if the document catalog has no
        `StructTreeRoot` entry.
        """
        self.doc = doc.doc
        if "StructTreeRoot" not in self.doc.catalog:
            raise StructTreeMissing("PDF has no structure")
        self.root = resolve1(self.doc.catalog["StructTreeRoot"])
        self.role_map = resolve1(self.root.get("RoleMap", {}))
        self.class_map = resolve1(self.root.get("ClassMap", {}))
        self.children: List[PDFStructElement] = []

        # If we have a specific page then we will work backwards from
        # its ParentTree - this is because structure elements could
        # span multiple pages, and the "Pg" attribute is *optional*,
        # so this is the approved way to get a page's structure...
        if page is not None:
            self.page = page
            self.pages = {page.page_number: page}
            self.page_dict = None
            # ...EXCEPT that the ParentTree is sometimes missing, in which
            # case we fall back to the non-approved way.
            parent_tree_obj = self.root.get("ParentTree")
            if parent_tree_obj is None:
                self._parse_struct_tree()
            else:
                parent_tree = NumberTree(parent_tree_obj)
                # If there is no marked content in the structure tree for
                # this page (which can happen even when there is a
                # structure tree) then there is no `StructParents`.
                # Note however that if there are XObjects in a page,
                # *they* may have `StructParent` (not `StructParents`)
                if "StructParents" not in self.page.page_obj.attrs:
                    return
                parent_id = self.page.page_obj.attrs["StructParents"]
                # NumberTree should have a `get` method like it does in pdf.js...
                parent_ref = next(
                    (array for num, array in parent_tree.values if num == parent_id),
                    None,
                )
                # The page names a ParentTree key that the tree does not
                # contain: treat it like a page without marked content
                # instead of letting a bare StopIteration escape.
                if parent_ref is None:
                    logger.warning(
                        "No ParentTree entry for StructParents %s", parent_id
                    )
                    return
                parent_array = resolve1(parent_ref)
                self._parse_parent_tree(parent_array)
        else:
            self.page = None
            # Overhead of creating pages shouldn't be too bad we hope!
            self.pages = {page.page_number: page for page in doc.pages}
            self.page_dict = {
                page.page_obj.pageid: page.page_number for page in self.pages.values()
            }
            self._parse_struct_tree()

    def _root_kids(self) -> List[Any]:
        """Return the `K` entry of the structure tree root as a list.

        A single (dictionary) kid is wrapped in a one-element list; a
        missing `K`, or one that resolves to neither a dictionary nor a
        list, gives an empty list.
        """
        if "K" not in self.root:
            return []
        kids = resolve1(self.root["K"])
        # It could just be a single object ... it's in the spec (argh)
        if isinstance(kids, dict):
            # Keep the entry exactly as stored (not the resolved
            # dictionary), because parsed elements are looked up by the
            # repr() of the entry they were reached through.
            return [self.root["K"]]
        if isinstance(kids, list):
            return kids
        return []

    def _make_attributes(
        self, obj: Dict[str, Any], revision: Optional[int]
    ) -> Dict[str, Any]:
        """Merge the attribute objects found under `obj["C"]` and
        `obj["A"]` (in that order) into a single dictionary.

        Name-valued attributes are decoded to strings.  Attribute class
        names are looked up in the class map; unknown classes are logged
        and skipped.
        """
        attr_obj_list: List[Any] = []
        for key in "C", "A":
            if key not in obj:
                continue
            attr_obj = resolve1(obj[key])
            # It could be a list of attribute objects (why?)
            if isinstance(attr_obj, list):
                attr_obj_list.extend(attr_obj)
            else:
                attr_obj_list.append(attr_obj)
        attr_objs = []
        prev_obj = None
        for aref in attr_obj_list:
            # If we find a revision number, which might "follow the
            # revision object" (the spec is not clear about what this
            # should look like but it implies they are simply adjacent
            # in a flat array), then use it to decide whether to take
            # the previous object...
            if isinstance(aref, int):
                if aref == revision and prev_obj is not None:
                    attr_objs.append(prev_obj)
                prev_obj = None
            else:
                if prev_obj is not None:
                    attr_objs.append(prev_obj)
                prev_obj = resolve1(aref)
        if prev_obj is not None:
            attr_objs.append(prev_obj)
        # Now merge all the attribute objects in the collected to a
        # single set (again, the spec doesn't really explain this but
        # does say that attributes in /A supersede those in /C)
        attr: Dict[str, Any] = {}
        for attr_obj in attr_objs:
            if isinstance(attr_obj, PSLiteral):
                key = decode_text(attr_obj.name)
                if key not in self.class_map:
                    logger.warning("Unknown attribute class %s", key)
                    continue
                # A class-map entry may be an indirect reference, and may
                # be either one attribute object or a list of them.
                mapped = resolve1(self.class_map[key])
                sources = mapped if isinstance(mapped, list) else [mapped]
            else:
                sources = [attr_obj]
            for source in sources:
                source = resolve1(source)
                if not isinstance(source, dict):
                    logger.warning(
                        "Skipping non-dictionary attribute object %r", source
                    )
                    continue
                for k, v in source.items():
                    if isinstance(v, PSLiteral):
                        attr[k] = decode_text(v.name)
                    else:
                        attr[k] = v
        return attr

    def _make_element(self, obj: Any) -> Tuple[Optional[PDFStructElement], List[Any]]:
        """Create a `PDFStructElement` from a structure element dictionary.

        Returns the element (without `mcids` or `children` filled in)
        together with the list of its raw, still unprocessed kids.
        """
        # We hopefully caught these earlier
        assert "MCID" not in obj, "Uncaught MCR: %s" % obj
        assert "Obj" not in obj, "Uncaught OBJR: %s" % obj
        # Get page number if necessary
        page_number = None
        if self.page_dict is not None and "Pg" in obj:
            page_objid = obj["Pg"].objid
            assert page_objid in self.page_dict, "Object on unparsed page: %s" % obj
            page_number = self.page_dict[page_objid]
        obj_tag = ""
        if "S" in obj:
            obj_tag = decode_text(obj["S"].name)
            # Map the tag through the role map, when it has an entry.
            if obj_tag in self.role_map:
                obj_tag = decode_text(self.role_map[obj_tag].name)
        children = resolve1(obj["K"]) if "K" in obj else []
        if isinstance(children, int):  # ugh... isinstance...
            children = [children]
        elif isinstance(children, dict):  # a single object.. ugh...
            children = [obj["K"]]
        revision = obj.get("R")
        attributes = self._make_attributes(obj, revision)

        def text_entry(key: str) -> Optional[str]:
            # Decoded text of an optional entry, or None when absent.
            return decode_text(resolve1(obj[key])) if key in obj else None

        element_id = text_entry("ID")
        title = text_entry("T")
        lang = text_entry("Lang")
        alt_text = text_entry("Alt")
        actual_text = text_entry("ActualText")
        element = PDFStructElement(
            type=obj_tag,
            id=element_id,
            page_number=page_number,
            revision=revision,
            lang=lang,
            title=title,
            alt_text=alt_text,
            actual_text=actual_text,
            attributes=attributes,
        )
        return element, children

    def _parse_parent_tree(self, parent_array: List[Any]) -> None:
        """Populate the structure tree using the leaves of the parent tree for
        a given page."""
        # First walk backwards from the leaves to the root, tracking references
        d = deque(parent_array)
        s = {}
        found_root = False
        while d:
            ref = d.popleft()
            # In the case where an MCID is not associated with any
            # structure, there will be a "null" in the parent tree.
            if ref == PDFParser.KEYWORD_NULL:
                continue
            if repr(ref) in s:
                continue
            obj = resolve1(ref)
            # This is required!  It's in the spec!
            if "Type" in obj and decode_text(obj["Type"].name) == "StructTreeRoot":
                found_root = True
            else:
                # We hope that these are actual elements and not
                # references or marked-content sections...
                element, children = self._make_element(obj)
                # We have no page tree so we assume this page was parsed
                assert element is not None
                s[repr(ref)] = element, children
                # Continue upwards through the element's parent.
                d.append(obj["P"])
        # If we didn't reach the root something is quite wrong!
        assert found_root
        self._resolve_children(s)

    def on_parsed_page(self, obj: Dict[str, Any]) -> bool:
        """Tell whether `obj` belongs to a page handled by this tree.

        Objects without a "Pg" entry are always accepted.
        """
        if "Pg" not in obj:
            return True
        page_objid = obj["Pg"].objid
        if self.page_dict is not None:
            return page_objid in self.page_dict
        if self.page is not None:
            # We have to do this to satisfy mypy
            if page_objid != self.page.page_obj.pageid:
                return False
        return True

    def _parse_struct_tree(self) -> None:
        """Populate the structure tree starting from the root, skipping
        unparsed pages and empty elements."""
        root = self._root_kids()
        d = deque(root)
        s = {}
        while d:
            ref = d.popleft()
            # In case the tree is actually a DAG and not a tree...
            if repr(ref) in s:  # pragma: nocover (shouldn't happen)
                continue
            obj = resolve1(ref)
            # Deref top-level OBJR skipping refs to unparsed pages
            if isinstance(obj, dict) and "Obj" in obj:
                if not self.on_parsed_page(obj):
                    continue
                ref = obj["Obj"]
                obj = resolve1(ref)
            element, children = self._make_element(obj)
            # Similar to above, delay resolving the children to avoid
            # tree-recursion.
            s[repr(ref)] = element, children
            for child in children:
                obj = resolve1(child)
                if isinstance(obj, dict):
                    if not self.on_parsed_page(obj):
                        continue
                    if "Obj" in obj:
                        child = obj["Obj"]
                    elif "MCID" in obj:
                        continue
                if isinstance(child, PDFObjRef):
                    d.append(child)

        # Traverse depth-first, removing empty elements (unsure how to
        # do this non-recursively)
        def prune(elements: List[Any]) -> List[Any]:
            next_elements = []
            for ref in elements:
                obj = resolve1(ref)
                if isinstance(ref, int):
                    next_elements.append(ref)
                    continue
                elif isinstance(obj, dict):
                    if not self.on_parsed_page(obj):
                        continue
                    if "MCID" in obj:
                        next_elements.append(obj["MCID"])
                        continue
                    elif "Obj" in obj:
                        ref = obj["Obj"]
                # Only kids that were collected by the loop above have an
                # entry; anything else (a kid that was never queued, or
                # one already pruned through another parent) is dropped.
                entry = s.get(repr(ref))
                if entry is None:
                    continue
                element, children = entry
                children = prune(children)
                # See assertions below
                if element is None or not children:
                    del s[repr(ref)]
                else:
                    s[repr(ref)] = element, children
                    next_elements.append(ref)
            return next_elements

        prune(root)
        self._resolve_children(s)

    def _resolve_children(self, seen: Dict[str, Any]) -> None:
        """Resolve children starting from the tree root based on references we
        saw when traversing the structure tree.
        """
        root = self._root_kids()
        self.children = []
        # Create top-level self.children
        parsed_root = []
        for ref in root:
            obj = resolve1(ref)
            if isinstance(obj, dict) and "Obj" in obj:
                if not self.on_parsed_page(obj):
                    continue
                ref = obj["Obj"]
            if repr(ref) in seen:
                parsed_root.append(ref)
        d = deque(parsed_root)
        while d:
            ref = d.popleft()
            element, children = seen[repr(ref)]
            assert element is not None, "Unparsed element"
            for child in children:
                obj = resolve1(child)
                if isinstance(obj, int):
                    element.mcids.append(obj)
                elif isinstance(obj, dict):
                    # Skip out-of-page MCIDS and OBJRs
                    if not self.on_parsed_page(obj):
                        continue
                    if "MCID" in obj:
                        element.mcids.append(obj["MCID"])
                    elif "Obj" in obj:
                        child = obj["Obj"]
                # NOTE: if, not elif, in case of OBJR above
                if isinstance(child, PDFObjRef):
                    child_element, _ = seen.get(repr(child), (None, None))
                    if child_element is not None:
                        element.children.append(child_element)
                        d.append(child)
        self.children = [seen[repr(ref)][0] for ref in parsed_root]

    def __iter__(self) -> Iterator[PDFStructElement]:
        """Iterate over the top-level elements."""
        return iter(self.children)

    def element_bbox(self, el: PDFStructElement) -> T_bbox:
        """Get the bounding box for an element for visual debugging.

        Uses the element's "BBox" attribute when both it and a page are
        available; otherwise the box is computed from the page objects
        whose "mcid" matches one of the element's MCIDs.

        Raises `IndexError` if cropping leaves nothing of the element's
        box, or if no matching objects are found.
        """
        page = None
        if self.page is not None:
            page = self.page
        elif el.page_number is not None:
            page = self.pages[el.page_number]
        bbox = el.attributes.get("BBox", None)
        if page is not None and bbox is not None:
            from .page import CroppedPage, _invert_box, _normalize_box

            # Use secret knowledge of CroppedPage (cannot use
            # page.height because it is the *cropped* dimension, but
            # cropping does not actually translate coordinates)
            bbox = _invert_box(
                _normalize_box(bbox), page.mediabox[3] - page.mediabox[1]
            )
            # Use more secret knowledge of CroppedPage
            if isinstance(page, CroppedPage):
                rect = geometry.bbox_to_rect(bbox)
                rects = page._crop_fn([rect])
                if not rects:
                    raise IndexError("Element no longer on page")
                return geometry.obj_to_bbox(rects[0])
            else:
                # Not sure why mypy complains here
                return bbox  # type: ignore
        else:
            mcid_objs = []
            for page_number, mcid in el.all_mcids():
                objects: Iterable[T_obj]
                if page_number is None:
                    if page is not None:
                        objects = itertools.chain.from_iterable(page.objects.values())
                    else:
                        objects = []  # pragma: nocover
                else:
                    objects = itertools.chain.from_iterable(
                        self.pages[page_number].objects.values()
                    )
                for c in objects:
                    if c["mcid"] == mcid:
                        mcid_objs.append(c)
            if not mcid_objs:
                raise IndexError("No objects found")  # pragma: nocover
            return geometry.objects_to_bbox(mcid_objs)