Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/page.py: 25%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import numbers
2import re
3from functools import lru_cache
4from typing import (
5 TYPE_CHECKING,
6 Any,
7 Callable,
8 Dict,
9 Generator,
10 List,
11 Optional,
12 Pattern,
13 Tuple,
14 Union,
15)
16from unicodedata import normalize as normalize_unicode
17from warnings import warn
19from pdfminer.converter import PDFPageAggregator
20from pdfminer.layout import (
21 LTChar,
22 LTComponent,
23 LTContainer,
24 LTCurve,
25 LTItem,
26 LTPage,
27 LTTextContainer,
28)
29from pdfminer.pdfinterp import PDFPageInterpreter, PDFStackT
30from pdfminer.pdfpage import PDFPage
31from pdfminer.psparser import PSLiteral
33from . import utils
34from ._typing import T_bbox, T_num, T_obj, T_obj_list
35from .container import Container
36from .structure import PDFStructTree, StructTreeMissing
37from .table import T_table_settings, Table, TableFinder, TableSettings
38from .utils import decode_text, resolve_all, resolve_and_decode
39from .utils.exceptions import MalformedPDFException, PdfminerException
40from .utils.text import TextMap
# Matches the "LT" prefix at the start of a string; used to strip it from
# pdfminer layout class names (e.g. "LTChar" -> "Char").
lt_pat = re.compile(r"^LT")

# Names of layout-object attributes that are copied into the parsed
# object dictionaries; attributes not listed here are dropped.
ALL_ATTRS = {
    # Geometry and sizing
    "adv",
    "height",
    "linewidth",
    "pts",
    "size",
    "srcsize",
    "width",
    "x0",
    "x1",
    "y0",
    "y1",
    # Text and image properties
    "bits",
    "matrix",
    "upright",
    "fontname",
    "text",
    "imagemask",
    "colorspace",
    # Painting properties
    "evenodd",
    "fill",
    "non_stroking_color",
    "stroke",
    "stroking_color",
    # Miscellaneous, including marked-content attributes
    "stream",
    "name",
    "mcid",
    "tag",
}
77if TYPE_CHECKING: # pragma: nocover
78 from .display import PageImage
79 from .pdf import PDF
# via https://git.ghostscript.com/?p=mupdf.git;a=blob;f=source/pdf/pdf-font.c;h=6322cedf2c26cfb312c0c0878d7aff97b4c7470e;hb=HEAD#l774 # noqa

# Maps byte-encoded font names to readable "Family,Style" names.
CP936_FONTNAMES = {
    b"\xcb\xce\xcc\xe5": "SimSun,Regular",
    b"\xba\xda\xcc\xe5": "SimHei,Regular",
    b"\xbf\xac\xcc\xe5_GB2312": "SimKai,Regular",
    b"\xb7\xc2\xcb\xce_GB2312": "SimFang,Regular",
    b"\xc1\xa5\xca\xe9": "SimLi,Regular",
}


def fix_fontname_bytes(fontname: bytes) -> str:
    """Convert a byte-encoded font name into a string.

    The name is split after its first ``+`` (if any). The part after the
    split is looked up in ``CP936_FONTNAMES``; when it is not found there,
    it is rendered as the inside of its bytes literal (the ``b'...'``
    representation without the ``b'`` prefix and closing quote), as is the
    part before the split.
    """
    head, plus, tail = fontname.partition(b"+")
    if plus:
        prefix, suffix = head + plus, tail
    else:
        prefix, suffix = b"", fontname

    # Strip the leading "b'" and the trailing quote of the bytes literal.
    prefix_text = str(prefix)[2:-1]
    if suffix in CP936_FONTNAMES:
        suffix_text = CP936_FONTNAMES[suffix]
    else:
        suffix_text = str(suffix)[2:-1]
    return prefix_text + suffix_text
def tuplify_list_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Return a new dict in which every list value of `kwargs` is a tuple.

    Non-list values are passed through unchanged; the input dict is not
    modified.
    """
    converted: Dict[str, Any] = {}
    for name, arg in kwargs.items():
        converted[name] = tuple(arg) if isinstance(arg, list) else arg
    return converted
class PDFPageAggregatorWithMarkedContent(PDFPageAggregator):
    """Page aggregator that also records marked-content information.

    The tag name and MCID of the tag most recently begun are remembered
    and copied onto layout objects as they are rendered.
    """

    # Tag name and marked-content ID currently in effect (None when no
    # tag is open or the tag carries no MCID).
    cur_tag: Optional[str] = None
    cur_mcid: Optional[int] = None

    def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
        """Handle the beginning of a tag, recording its name and MCID (if any)."""
        self.cur_tag = decode_text(tag.name)
        self.cur_mcid = (
            props["MCID"] if isinstance(props, dict) and "MCID" in props else None
        )

    def end_tag(self) -> None:
        """Handle the end of a tag, clearing the current tag name and MCID."""
        self.cur_mcid = None
        self.cur_tag = None

    def tag_cur_item(self) -> None:
        """Copy the current MCID and tag onto what we hope is the most
        recently created pdfminer.six object."""
        # This is somewhat hacky and would not be necessary if
        # pdfminer.six supported MCIDs. In reading the code it's
        # clear that the `render_*` methods will only ever create one
        # object, but that is far from being guaranteed. Even if
        # pdfminer.six's API would just return the objects it creates,
        # we wouldn't have to do this.
        created = self.cur_item._objs
        if not created:
            return
        latest = created[-1]
        latest.mcid = self.cur_mcid  # type: ignore
        latest.tag = self.cur_tag  # type: ignore

    def render_char(self, *args, **kwargs) -> float:  # type: ignore
        """Render a character via the parent class, then tag the result."""
        advance = super().render_char(*args, **kwargs)
        self.tag_cur_item()
        return advance

    def render_image(self, *args, **kwargs) -> None:  # type: ignore
        """Render an image via the parent class, then tag the result."""
        super().render_image(*args, **kwargs)
        self.tag_cur_item()

    def paint_path(self, *args, **kwargs) -> None:  # type: ignore
        """Paint a path (lines and curves) via the parent class, then tag
        the result."""
        super().paint_path(*args, **kwargs)
        self.tag_cur_item()
def _normalize_box(box_raw: T_bbox, rotation: T_num = 0) -> T_bbox:
    """Return `box_raw` with its corners ordered as (min, min, max, max).

    When `rotation` is 90 or 270, the x and y axes are swapped in the
    returned tuple.

    Raises:
        MalformedPDFException: if any coordinate is not a number.
    """
    # Per PDF Reference 3.8.4: "Note: Although rectangles are
    # conventionally specified by their lower-left and upperright
    # corners, it is acceptable to specify any two diagonally opposite
    # corners."
    coords_are_numeric = all(isinstance(c, numbers.Number) for c in box_raw)
    if not coords_are_numeric:  # pragma: nocover
        raise MalformedPDFException(
            f"Bounding box contains non-number coordinate(s): {box_raw}"
        )

    x_lo, x_hi = sorted((box_raw[0], box_raw[2]))
    y_lo, y_hi = sorted((box_raw[1], box_raw[3]))

    if rotation in [90, 270]:
        return (y_lo, x_lo, y_hi, x_hi)
    return (x_lo, y_lo, x_hi, y_hi)
# PDF coordinate spaces refer to an origin in the bottom-left of the
# page; pdfplumber flips this vertically, so that the origin is in the
# top-left.
def _invert_box(box_raw: T_bbox, mb_height: T_num) -> T_bbox:
    """Flip a box vertically within a space of height `mb_height`.

    The x-coordinates are returned unchanged; each y-coordinate is
    replaced by its distance from `mb_height`, with the two swapped so
    that the smaller value still comes first.
    """
    left, lower, right, upper = box_raw
    flipped_top = mb_height - upper
    flipped_bottom = mb_height - lower
    return (left, flipped_top, right, flipped_bottom)
class Page(Container):
    """A single page of a PDF, wrapping a pdfminer `PDFPage`.

    Exposes the page's boxes (flipped so the origin is at the top-left),
    its layout objects and annotations as dictionaries, and helpers for
    text extraction, table finding, cropping, filtering and rendering.
    """

    cached_properties: List[str] = Container.cached_properties + ["_layout"]
    is_original: bool = True
    pages = None

    def __init__(
        self,
        pdf: "PDF",
        page_obj: PDFPage,
        page_number: int,
        initial_doctop: T_num = 0,
    ):
        """Read rotation and page boxes from `page_obj`.

        Args:
            pdf: The owning PDF object.
            page_obj: The pdfminer page this object wraps.
            page_number: Stored as-is and copied into every parsed object.
            initial_doctop: Offset added to each object's `top` to compute
                its `doctop`.
        """
        self.pdf = pdf
        self.root_page = self
        self.page_obj = page_obj
        self.page_number = page_number
        self.initial_doctop = initial_doctop

        def get_attr(key: str, default: Any = None) -> Any:
            # Resolve the page attribute, falling back to `default` when
            # it is absent or resolves to None.
            value = resolve_all(page_obj.attrs.get(key))
            return default if value is None else value

        # Per PDF Reference Table 3.27: "The number of degrees by which the
        # page should be rotated clockwise when displayed or printed. The value
        # must be a multiple of 90. Default value: 0"
        _rotation = get_attr("Rotate", 0)
        self.rotation = _rotation % 360

        mb_raw = _normalize_box(get_attr("MediaBox"), self.rotation)
        mb_height = mb_raw[3] - mb_raw[1]

        self.mediabox = _invert_box(mb_raw, mb_height)

        # Optional boxes become lowercase attributes (e.g. `self.trimbox`)
        # only when present in the page's attributes.
        for box_name in ["CropBox", "TrimBox", "BleedBox", "ArtBox"]:
            if box_name in page_obj.attrs:
                box_normalized = _invert_box(
                    _normalize_box(get_attr(box_name), self.rotation), mb_height
                )
                setattr(self, box_name.lower(), box_normalized)

        if "CropBox" not in page_obj.attrs:
            self.cropbox = self.mediabox

        # Page.bbox defaults to self.mediabox, but can be altered by Page.crop(...)
        self.bbox = self.mediabox

        # See https://rednafi.com/python/lru_cache_on_methods/
        self.get_textmap = lru_cache()(self._get_textmap)

    def close(self) -> None:
        """Flush cached properties and clear the per-page text-map cache."""
        self.flush_cache()
        self.get_textmap.cache_clear()

    @property
    def width(self) -> T_num:
        """Horizontal extent of `self.bbox`."""
        return self.bbox[2] - self.bbox[0]

    @property
    def height(self) -> T_num:
        """Vertical extent of `self.bbox`."""
        return self.bbox[3] - self.bbox[1]

    @property
    def structure_tree(self) -> List[Dict[str, Any]]:
        """Return the structure tree for a page, if any."""
        try:
            return [elem.to_dict() for elem in PDFStructTree(self.pdf, self)]
        except StructTreeMissing:
            return []

    @property
    def layout(self) -> LTPage:
        """The pdfminer layout for this page, computed once and cached.

        Raises:
            PdfminerException: wrapping any exception raised while
                pdfminer processes the page.
        """
        if hasattr(self, "_layout"):
            return self._layout
        device = PDFPageAggregatorWithMarkedContent(
            self.pdf.rsrcmgr,
            pageno=self.page_number,
            laparams=self.pdf.laparams,
        )
        interpreter = PDFPageInterpreter(self.pdf.rsrcmgr, device)
        try:
            interpreter.process_page(self.page_obj)
        except Exception as e:
            # Chain explicitly so the original error is recorded as the
            # direct cause of the wrapper exception.
            raise PdfminerException(e) from e
        self._layout: LTPage = device.get_result()
        return self._layout

    @property
    def annots(self) -> T_obj_list:
        """The page's annotations, parsed into dictionaries.

        Each dictionary holds the annotation's position, its decoded
        `uri`, `title` and `contents` (None when absent or undecodable),
        and the resolved annotation under `data`. On a `CroppedPage` the
        result is passed through the page's crop function.
        """

        def rotate_point(pt: Tuple[float, float], r: int) -> Tuple[float, float]:
            # Apply one (x, y) -> (y, comp - x) step per quarter turn,
            # where `comp` is either the page width or the page height.
            turns = r // 90
            for i in range(turns):
                x, y = pt
                comp = self.width if i == turns % 2 else self.height
                pt = (y, (comp - x))
            return pt

        def parse(annot: T_obj) -> T_obj:
            _a, _b, _c, _d = annot["Rect"]
            pt0 = rotate_point((_a, _b), self.rotation)
            pt1 = rotate_point((_c, _d), self.rotation)
            rh = self.root_page.height
            x0, top, x1, bottom = _invert_box(_normalize_box((*pt0, *pt1)), rh)

            a = annot.get("A", {})
            extras = {
                "uri": a.get("URI"),
                "title": annot.get("T"),
                "contents": annot.get("Contents"),
            }
            for k, v in extras.items():
                if v is None:
                    continue
                try:
                    extras[k] = v.decode("utf-8")
                except UnicodeDecodeError:
                    try:
                        extras[k] = v.decode("utf-16")
                    except UnicodeDecodeError:
                        if self.pdf.raise_unicode_errors:
                            raise
                        warn(
                            f"Could not decode {k} of annotation."
                            f" {k} will be missing."
                        )
                        # Honor the warning: drop the undecodable bytes
                        # instead of leaving them in the parsed result.
                        # The raw value stays available via `data`.
                        extras[k] = None

            parsed = {
                "page_number": self.page_number,
                "object_type": "annot",
                "x0": x0,
                "y0": rh - bottom,
                "x1": x1,
                "y1": rh - top,
                "doctop": self.initial_doctop + top,
                "top": top,
                "bottom": bottom,
                "width": x1 - x0,
                "height": bottom - top,
            }
            parsed.update(extras)
            # Replace the indirect reference to the page dictionary
            # with a pointer to our actual page
            if "P" in annot:
                annot["P"] = self
            parsed["data"] = annot
            return parsed

        raw = resolve_all(self.page_obj.annots) or []
        parsed = list(map(parse, raw))
        if isinstance(self, CroppedPage):
            return self._crop_fn(parsed)
        else:
            return parsed

    @property
    def hyperlinks(self) -> T_obj_list:
        """The annotations whose `uri` is not None."""
        return [a for a in self.annots if a["uri"] is not None]

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """Parsed layout objects grouped by object type, cached in `_objects`."""
        if hasattr(self, "_objects"):
            return self._objects
        self._objects: Dict[str, T_obj_list] = self.parse_objects()
        return self._objects

    def point2coord(self, pt: Tuple[T_num, T_num]) -> Tuple[T_num, T_num]:
        """Convert a pdfminer point to top-left-origin page coordinates."""
        # See the note in process_object re. #1181 and mediabox adjustment
        return (self.mediabox[0] + pt[0], self.mediabox[1] + self.height - pt[1])

    def process_object(self, obj: LTItem) -> T_obj:
        """Convert one pdfminer layout object into a dictionary.

        Only attributes named in `ALL_ATTRS` are copied; `object_type`,
        `page_number` and, when the object has a `y0`, the derived `top`,
        `bottom` and `doctop` values are added.
        """
        # "LTChar" -> "char", "LTRect" -> "rect", etc.
        kind = lt_pat.sub("", obj.__class__.__name__).lower()

        attr: Dict[str, Any] = {
            k: resolve_all(v) for k, v in obj.__dict__.items() if k in ALL_ATTRS
        }

        attr["object_type"] = kind
        attr["page_number"] = self.page_number

        for cs in ["ncs", "scs"]:
            # Note: As of pdfminer.six v20221105, that library only
            # exposes ncs for LTChars, and neither attribute for
            # other objects. Keeping this code here, though,
            # for ease of addition if color spaces become
            # more available via pdfminer.six
            if hasattr(obj, cs):
                attr[cs] = resolve_and_decode(getattr(obj, cs).name)

        if isinstance(obj, (LTChar, LTTextContainer)):
            text = obj.get_text()
            attr["text"] = (
                normalize_unicode(self.pdf.unicode_norm, text)
                if self.pdf.unicode_norm is not None
                else text
            )

        if isinstance(obj, LTChar):
            # pdfminer.six (at least as of v20221105) does not
            # directly expose .stroking_color and .non_stroking_color
            # for LTChar objects (unlike, e.g., LTRect objects).
            gs = obj.graphicstate
            attr["stroking_color"] = (
                gs.scolor if isinstance(gs.scolor, tuple) else (gs.scolor,)
            )
            attr["non_stroking_color"] = (
                gs.ncolor if isinstance(gs.ncolor, tuple) else (gs.ncolor,)
            )

            # Handle (rare) byte-encoded fontnames
            if isinstance(attr["fontname"], bytes):  # pragma: nocover
                attr["fontname"] = fix_fontname_bytes(attr["fontname"])

        elif isinstance(obj, (LTCurve,)):
            attr["pts"] = list(map(self.point2coord, attr["pts"]))

            # Ignoring typing because type signature for obj.original_path
            # appears to be incorrect
            attr["path"] = [(cmd, *map(self.point2coord, pts)) for cmd, *pts in obj.original_path]  # type: ignore  # noqa: E501

            attr["dash"] = obj.dashing_style

        # As noted in #1181, `pdfminer.six` adjusts objects'
        # coordinates relative to the MediaBox:
        # https://github.com/pdfminer/pdfminer.six/blob/1a8bd2f730295b31d6165e4d95fcb5a03793c978/pdfminer/converter.py#L79-L84
        mb_x0, mb_top = self.mediabox[:2]

        if "y0" in attr:
            attr["top"] = (self.height - attr["y1"]) + mb_top
            attr["bottom"] = (self.height - attr["y0"]) + mb_top
            attr["doctop"] = self.initial_doctop + attr["top"]

        if "x0" in attr and mb_x0 != 0:
            attr["x0"] = attr["x0"] + mb_x0
            attr["x1"] = attr["x1"] + mb_x0

        return attr

    def iter_layout_objects(
        self, layout_objects: List[LTComponent]
    ) -> Generator[T_obj, None, None]:
        """Yield processed objects, recursing into layout containers."""
        for obj in layout_objects:
            # If object is, like LTFigure, a higher-level object ...
            if isinstance(obj, LTContainer):
                # and LAParams is passed, process the object itself.
                if self.pdf.laparams is not None:
                    yield self.process_object(obj)
                # Regardless, iterate through its children
                yield from self.iter_layout_objects(obj._objs)
            else:
                yield self.process_object(obj)

    def parse_objects(self) -> Dict[str, T_obj_list]:
        """Group the page's processed layout objects by `object_type`.

        Objects of type "anno" are skipped.
        """
        objects: Dict[str, T_obj_list] = {}
        for obj in self.iter_layout_objects(self.layout._objs):
            kind = obj["object_type"]
            if kind == "anno":
                continue
            objects.setdefault(kind, []).append(obj)
        return objects

    def debug_tablefinder(
        self, table_settings: Optional[T_table_settings] = None
    ) -> TableFinder:
        """Return the `TableFinder` built for this page and settings."""
        tset = TableSettings.resolve(table_settings)
        return TableFinder(self, tset)

    def find_tables(
        self, table_settings: Optional[T_table_settings] = None
    ) -> List[Table]:
        """Return all tables found on this page with the given settings."""
        tset = TableSettings.resolve(table_settings)
        return TableFinder(self, tset).tables

    def find_table(
        self, table_settings: Optional[T_table_settings] = None
    ) -> Optional[Table]:
        """Return the table with the most cells, or None if none is found.

        Ties are broken by the smaller `bbox[1]`, then the smaller
        `bbox[0]`, then by the order in which the tables were found.
        """
        tset = TableSettings.resolve(table_settings)
        tables = self.find_tables(tset)

        if not tables:
            return None

        # Return the largest table, as measured by number of cells.
        def sorter(x: Table) -> Tuple[int, T_num, T_num]:
            return (-len(x.cells), x.bbox[1], x.bbox[0])

        # min() returns the first minimal element, matching what a stable
        # sort followed by [0] would select.
        return min(tables, key=sorter)

    def extract_tables(
        self, table_settings: Optional[T_table_settings] = None
    ) -> List[List[List[Optional[str]]]]:
        """Extract the contents of every table found on this page."""
        tset = TableSettings.resolve(table_settings)
        tables = self.find_tables(tset)
        return [table.extract(**(tset.text_settings or {})) for table in tables]

    def extract_table(
        self, table_settings: Optional[T_table_settings] = None
    ) -> Optional[List[List[Optional[str]]]]:
        """Extract the contents of the table chosen by `find_table`.

        Returns None when the page has no table.
        """
        tset = TableSettings.resolve(table_settings)
        table = self.find_table(tset)
        if table is None:
            return None
        return table.extract(**(tset.text_settings or {}))

    def _get_textmap(self, **kwargs: Any) -> TextMap:
        """Build a `TextMap` from this page's chars.

        `layout_bbox` defaults to the page bbox; `layout_width` and
        `layout_height` default to the page's dimensions unless the
        corresponding `*_chars` keyword is supplied. Explicit keyword
        arguments override these defaults.
        """
        defaults: Dict[str, Any] = dict(
            layout_bbox=self.bbox,
        )
        if "layout_width_chars" not in kwargs:
            defaults.update({"layout_width": self.width})
        if "layout_height_chars" not in kwargs:
            defaults.update({"layout_height": self.height})
        full_kwargs: Dict[str, Any] = {**defaults, **kwargs}
        return utils.chars_to_textmap(self.chars, **full_kwargs)

    def search(
        self,
        pattern: Union[str, Pattern[str]],
        regex: bool = True,
        case: bool = True,
        main_group: int = 0,
        return_chars: bool = True,
        return_groups: bool = True,
        **kwargs: Any,
    ) -> List[Dict[str, Any]]:
        """Search the page's text map for `pattern`.

        Extra keyword arguments configure the text map; list values are
        converted to tuples before being passed to the cached getter.
        """
        textmap = self.get_textmap(**tuplify_list_kwargs(kwargs))
        return textmap.search(
            pattern,
            regex=regex,
            case=case,
            main_group=main_group,
            return_chars=return_chars,
            return_groups=return_groups,
        )

    def extract_text(self, **kwargs: Any) -> str:
        """Return the page's text map rendered as a string."""
        return self.get_textmap(**tuplify_list_kwargs(kwargs)).as_string

    def extract_text_simple(self, **kwargs: Any) -> str:
        """Return text via `utils.extract_text_simple` on the page's chars."""
        return utils.extract_text_simple(self.chars, **kwargs)

    def extract_words(self, **kwargs: Any) -> T_obj_list:
        """Return words via `utils.extract_words` on the page's chars."""
        return utils.extract_words(self.chars, **kwargs)

    def extract_text_lines(
        self, strip: bool = True, return_chars: bool = True, **kwargs: Any
    ) -> T_obj_list:
        """Return the text lines of the page's text map."""
        return self.get_textmap(**tuplify_list_kwargs(kwargs)).extract_text_lines(
            strip=strip, return_chars=return_chars
        )

    def crop(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """Return a `CroppedPage` for `bbox`, using the default crop function."""
        return CroppedPage(self, bbox, relative=relative, strict=strict)

    def within_bbox(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """
        Same as .crop, except only includes objects fully within the bbox
        """
        return CroppedPage(
            self, bbox, relative=relative, strict=strict, crop_fn=utils.within_bbox
        )

    def outside_bbox(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """
        Same as .crop, except only includes objects fully outside the bbox
        """
        return CroppedPage(
            self, bbox, relative=relative, strict=strict, crop_fn=utils.outside_bbox
        )

    def filter(self, test_function: Callable[[T_obj], bool]) -> "FilteredPage":
        """Return a `FilteredPage` keeping objects accepted by `test_function`."""
        return FilteredPage(self, test_function)

    def dedupe_chars(self, **kwargs: Any) -> "FilteredPage":
        """
        Removes duplicate chars — those sharing the same text and positioning
        (within `tolerance`) as other characters in the set. Adjust extra_args
        to be more/less restrictive with the properties checked.
        """
        p = FilteredPage(self, lambda x: True)
        # Shallow-copy the mapping so replacing "char" does not alter
        # this page's own objects dict.
        p._objects = dict(self.objects)
        p._objects["char"] = utils.dedupe_chars(self.chars, **kwargs)
        return p

    def to_image(
        self,
        resolution: Optional[Union[int, float]] = None,
        width: Optional[Union[int, float]] = None,
        height: Optional[Union[int, float]] = None,
        antialias: bool = False,
        force_mediabox: bool = False,
    ) -> "PageImage":
        """
        You can pass a maximum of 1 of the following:
        - resolution: The desired number pixels per inch. Defaults to 72.
        - width: The desired image width in pixels.
        - height: The desired image height in pixels.
        """
        from .display import DEFAULT_RESOLUTION, PageImage

        num_specs = sum(x is not None for x in [resolution, width, height])
        if num_specs > 1:
            raise ValueError(
                f"Only one of these arguments can be provided: resolution, width, height. You provided {num_specs}"  # noqa: E501
            )
        elif width is not None:
            resolution = 72 * width / self.width
        elif height is not None:
            resolution = 72 * height / self.height

        return PageImage(
            self,
            resolution=resolution or DEFAULT_RESOLUTION,
            antialias=antialias,
            force_mediabox=force_mediabox,
        )

    def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]:
        """Return page metadata plus one `<type>s` entry per object type.

        When `object_types` is None, every parsed object type plus
        "annot" is included.
        """
        if object_types is None:
            _object_types = list(self.objects.keys()) + ["annot"]
        else:
            _object_types = object_types
        d = {
            "page_number": self.page_number,
            "initial_doctop": self.initial_doctop,
            "rotation": self.rotation,
            "cropbox": self.cropbox,
            "mediabox": self.mediabox,
            "bbox": self.bbox,
            "width": self.width,
            "height": self.height,
        }
        for t in _object_types:
            # Each type is exposed as a pluralized attribute, e.g. "chars".
            d[t + "s"] = getattr(self, t + "s")
        return d

    def __repr__(self) -> str:
        return f"<Page:{self.page_number}>"
class DerivedPage(Page):
    """Base class for pages derived from another page.

    Copies the parent's identifying attributes and boxes instead of
    re-reading them from the underlying page object. `bbox` is not set
    here; the subclasses in this module assign it themselves.
    """

    is_original: bool = False

    def __init__(self, parent_page: Page):
        # Lineage: the immediate parent and the original page.
        self.parent_page = parent_page
        self.root_page = parent_page.root_page

        # Document-level references and position.
        self.pdf = parent_page.pdf
        self.page_obj = parent_page.page_obj
        self.page_number = parent_page.page_number
        self.initial_doctop = parent_page.initial_doctop

        # Geometry shared with the parent.
        self.rotation = parent_page.rotation
        self.mediabox = parent_page.mediabox
        self.cropbox = parent_page.cropbox

        # Flush only the Container-level cached properties, then give
        # this page its own text-map cache.
        self.flush_cache(Container.cached_properties)
        self.get_textmap = lru_cache()(self._get_textmap)
def test_proposed_bbox(bbox: T_bbox, parent_bbox: T_bbox) -> None:
    """Validate that `bbox` is non-empty and lies fully inside `parent_bbox`.

    Raises:
        ValueError: if `bbox` has zero area, does not overlap
            `parent_bbox` at all, or overlaps it only partially.
    """
    proposed_area = utils.calculate_area(bbox)
    if proposed_area == 0:
        raise ValueError(f"Bounding box {bbox} has an area of zero.")

    shared = utils.get_bbox_overlap(bbox, parent_bbox)
    if shared is None:
        message = (
            f"Bounding box {bbox} is entirely outside "
            f"parent page bounding box {parent_bbox}"
        )
        raise ValueError(message)

    # A partial overlap covers less area than the proposed box itself.
    if utils.calculate_area(shared) < proposed_area:
        message = (
            f"Bounding box {bbox} is not fully within "
            f"parent page bounding box {parent_bbox}"
        )
        raise ValueError(message)
class CroppedPage(DerivedPage):
    """A derived page whose objects are the parent's objects passed
    through a crop function for a given bounding box."""

    def __init__(
        self,
        parent_page: Page,
        crop_bbox: T_bbox,
        crop_fn: Callable[[T_obj_list, T_bbox], T_obj_list] = utils.crop_to_bbox,
        relative: bool = False,
        strict: bool = True,
    ):
        """Set up the crop.

        Args:
            parent_page: The page being cropped.
            crop_bbox: The bounding box to crop to.
            crop_fn: Function applied to each object list with the
                (absolute) bounding box.
            relative: When true, `crop_bbox` is offset by the parent
                bbox's top-left corner before use.
            strict: When true, the bounding box is validated against the
                parent's bbox with `test_proposed_bbox`.
        """
        if relative:
            # Shift the box by the parent's origin to get absolute coordinates.
            dx, dy = parent_page.bbox[0], parent_page.bbox[1]
            rel_x0, rel_top, rel_x1, rel_bottom = crop_bbox
            crop_bbox = (rel_x0 + dx, rel_top + dy, rel_x1 + dx, rel_bottom + dy)

        if strict:
            test_proposed_bbox(crop_bbox, parent_page.bbox)

        def _crop_fn(objs: T_obj_list) -> T_obj_list:
            return crop_fn(objs, crop_bbox)

        super().__init__(parent_page)

        self._crop_fn = _crop_fn

        # Note: testing for original function passed, not _crop_fn
        keeps_parent_bbox = crop_fn is utils.outside_bbox
        self.bbox = parent_page.bbox if keeps_parent_bbox else crop_bbox

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """The parent's objects after cropping, computed once and cached."""
        if not hasattr(self, "_objects"):
            cropped: Dict[str, T_obj_list] = {}
            for kind, objs in self.parent_page.objects.items():
                cropped[kind] = self._crop_fn(objs)
            self._objects: Dict[str, T_obj_list] = cropped
        return self._objects
class FilteredPage(DerivedPage):
    """A derived page exposing only the parent's objects that pass a
    filter function; the bounding box is inherited from the parent."""

    def __init__(self, parent_page: Page, filter_fn: Callable[[T_obj], bool]):
        self.bbox = parent_page.bbox
        self.filter_fn = filter_fn
        super().__init__(parent_page)

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """The parent's objects accepted by `filter_fn`, computed once
        and cached."""
        if not hasattr(self, "_objects"):
            kept: Dict[str, T_obj_list] = {}
            for kind, objs in self.parent_page.objects.items():
                kept[kind] = [obj for obj in objs if self.filter_fn(obj)]
            self._objects: Dict[str, T_obj_list] = kept
        return self._objects