Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/page.py: 25%

1import numbers

2import re

3from functools import lru_cache

4from typing import (

5 TYPE_CHECKING,

6 Any,

7 Callable,

8 Dict,

9 Generator,

10 List,

11 Optional,

12 Pattern,

13 Tuple,

14 Union,

15)

16from unicodedata import normalize as normalize_unicode

17from warnings import warn

19from pdfminer.converter import PDFPageAggregator

20from pdfminer.layout import (

21 LTChar,

22 LTComponent,

23 LTContainer,

24 LTCurve,

25 LTItem,

26 LTPage,

27 LTTextContainer,

28)

29from pdfminer.pdfinterp import PDFPageInterpreter, PDFStackT

30from pdfminer.pdfpage import PDFPage

31from pdfminer.psparser import PSLiteral

33from . import utils

34from ._typing import T_bbox, T_num, T_obj, T_obj_list

35from .container import Container

36from .structure import PDFStructTree, StructTreeMissing

37from .table import T_table_settings, Table, TableFinder, TableSettings

38from .utils import decode_text, resolve_all, resolve_and_decode

39from .utils.exceptions import MalformedPDFException, PdfminerException

40from .utils.text import TextMap

# Matches the leading "LT" of a layout class name; Page.process_object
# strips it to derive an object's "object_type".
lt_pat = re.compile(r"^LT")

# Instance attributes of layout objects that Page.process_object copies
# into the dictionaries it returns. Attributes not named here are dropped.
ALL_ATTRS = {
    # geometry and sizing
    "adv",
    "height",
    "linewidth",
    "pts",
    "size",
    "srcsize",
    "width",
    "x0",
    "x1",
    "y0",
    "y1",
    # character / image details
    "bits",
    "matrix",
    "upright",
    "fontname",
    "text",
    "imagemask",
    "colorspace",
    # painting state
    "evenodd",
    "fill",
    "non_stroking_color",
    "stroke",
    "stroking_color",
    # stream, naming and marked-content information
    "stream",
    "name",
    "mcid",
    "tag",
}

77if TYPE_CHECKING: # pragma: nocover

78 from .display import PageImage

79 from .pdf import PDF

# via https://git.ghostscript.com/?p=mupdf.git;a=blob;f=source/pdf/pdf-font.c;h=6322cedf2c26cfb312c0c0878d7aff97b4c7470e;hb=HEAD#l774  # noqa

# Byte-encoded font names (CP936-encoded, going by the constant's name)
# mapped to the ASCII names they are reported under.
CP936_FONTNAMES = {
    b"\xcb\xce\xcc\xe5": "SimSun,Regular",
    b"\xba\xda\xcc\xe5": "SimHei,Regular",
    b"\xbf\xac\xcc\xe5_GB2312": "SimKai,Regular",
    b"\xb7\xc2\xcb\xce_GB2312": "SimFang,Regular",
    b"\xc1\xa5\xca\xe9": "SimLi,Regular",
}


def fix_fontname_bytes(fontname: bytes) -> str:
    """Convert a byte-string font name into a printable ``str``.

    Everything up to and including the first ``+`` is treated as a prefix
    and kept; the remainder is looked up in ``CP936_FONTNAMES``. Bytes that
    are not in that table are rendered the way ``str(bytes)`` renders them
    (non-ASCII bytes become ``\\xNN`` escapes), minus the ``b'...'`` wrapper.
    """
    head, plus, tail = fontname.partition(b"+")
    if plus:
        prefix, suffix = head + plus, tail
    else:
        prefix, suffix = b"", fontname

    # str(b"abc") == "b'abc'"; the [2:-1] slice removes the b'...' wrapper.
    fallback = str(suffix)[2:-1]
    return str(prefix)[2:-1] + CP936_FONTNAMES.get(suffix, fallback)

101

102

def tuplify_list_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``kwargs`` with every list value turned into a tuple.

    Values of any other type are passed through unchanged. Callers in this
    file apply it before invoking the ``lru_cache``-wrapped ``get_textmap``,
    whose arguments must be hashable.
    """
    converted: Dict[str, Any] = {}
    for name, arg in kwargs.items():
        converted[name] = tuple(arg) if isinstance(arg, list) else arg
    return converted

108

109

class PDFPageAggregatorWithMarkedContent(PDFPageAggregator):
    """Page aggregator that stamps the current marked-content tag and MCID
    onto each layout object as it is created."""

    # Marked-content state; None whenever no tag (or no MCID) is active.
    cur_mcid: Optional[int] = None
    cur_tag: Optional[str] = None

    def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
        """Handle the beginning of a tag: record its name and its MCID, if any."""
        self.cur_tag = decode_text(tag.name)
        self.cur_mcid = (
            props["MCID"] if isinstance(props, dict) and "MCID" in props else None
        )

    def end_tag(self) -> None:
        """Handle the end of a tag, clearing the current tag and MCID."""
        self.cur_mcid = None
        self.cur_tag = None

    def tag_cur_item(self) -> None:
        """Copy the current MCID and tag onto what we hope is the object most
        recently created by pdfminer.six."""
        # This is somewhat hacky and would not be necessary if
        # pdfminer.six supported MCIDs. In reading the code it's
        # clear that the `render_*` methods will only ever create
        # one object, but that is far from being guaranteed.
        # Even if pdfminer.six's API would just return the objects it
        # creates, we wouldn't have to do this.
        created = self.cur_item._objs
        if not created:
            return
        latest = created[-1]
        latest.mcid = self.cur_mcid  # type: ignore
        latest.tag = self.cur_tag  # type: ignore

    def render_char(self, *args, **kwargs) -> float:  # type: ignore
        """Render a character via the parent class, then tag the new object."""
        advance = super().render_char(*args, **kwargs)
        self.tag_cur_item()
        return advance

    def render_image(self, *args, **kwargs) -> None:  # type: ignore
        """Render an image via the parent class, then tag the new object."""
        super().render_image(*args, **kwargs)
        self.tag_cur_item()

    def paint_path(self, *args, **kwargs) -> None:  # type: ignore
        """Paint a line or curve via the parent class, then tag the new object."""
        super().paint_path(*args, **kwargs)
        self.tag_cur_item()

159

160

def _normalize_box(box_raw: T_bbox, rotation: T_num = 0) -> T_bbox:
    """Return ``box_raw`` with its corners ordered as (min, min, max, max).

    When ``rotation`` is 90 or 270 the x and y axes are swapped in the
    result, i.e. ``(y0, x0, y1, x1)`` is returned instead.

    Raises:
        MalformedPDFException: if any coordinate is not a number.
    """
    # Per PDF Reference 3.8.4: "Note: Although rectangles are
    # conventionally specified by their lower-left and upperright
    # corners, it is acceptable to specify any two diagonally opposite
    # corners."
    for coord in box_raw:
        if not isinstance(coord, numbers.Number):  # pragma: nocover
            raise MalformedPDFException(
                f"Bounding box contains non-number coordinate(s): {box_raw}"
            )

    left, right = sorted((box_raw[0], box_raw[2]))
    low, high = sorted((box_raw[1], box_raw[3]))

    if rotation not in [90, 270]:
        return (left, low, right, high)
    return (low, left, high, right)

176

177

# PDFs coordinate spaces refer to an origin in the bottom-left of the
# page; pdfplumber flips this vertically, so that the origin is in the
# top-left.
def _invert_box(box_raw: T_bbox, mb_height: T_num) -> T_bbox:
    """Flip a box vertically within a space ``mb_height`` tall.

    The x coordinates are returned unchanged; each y coordinate becomes its
    distance from ``mb_height``, and the two swap places so that the
    smaller value still comes first.
    """
    left, low, right, high = box_raw
    top = mb_height - high
    bottom = mb_height - low
    return (left, top, right, bottom)

184

185

class Page(Container):
    """A single page of a PDF.

    Boxes and object positions exposed by this class use a top-left origin
    (see ``_invert_box``). The pdfminer layout and the parsed objects are
    cached on the instance (``_layout`` / ``_objects``), and text maps are
    memoized per instance through ``get_textmap``.
    """

    cached_properties: List[str] = Container.cached_properties + ["_layout"]
    is_original: bool = True
    pages = None

    def __init__(
        self,
        pdf: "PDF",
        page_obj: PDFPage,
        page_number: int,
        initial_doctop: T_num = 0,
    ):
        """Read rotation and page boxes from ``page_obj``.

        Args:
            pdf: The PDF this page belongs to.
            page_obj: The pdfminer page whose ``attrs`` are read.
            page_number: Stored as-is and copied into every parsed object.
            initial_doctop: Offset added to each object's ``top`` to
                produce its ``doctop``.
        """
        self.pdf = pdf
        self.root_page = self
        self.page_obj = page_obj
        self.page_number = page_number
        self.initial_doctop = initial_doctop

        def get_attr(key: str, default: Any = None) -> Any:
            value = resolve_all(page_obj.attrs.get(key))
            return default if value is None else value

        # Per PDF Reference Table 3.27: "The number of degrees by which the
        # page should be rotated clockwise when displayed or printed. The value
        # must be a multiple of 90. Default value: 0"
        _rotation = get_attr("Rotate", 0)
        self.rotation = _rotation % 360

        mb_raw = _normalize_box(get_attr("MediaBox"), self.rotation)
        mb_height = mb_raw[3] - mb_raw[1]

        self.mediabox = _invert_box(mb_raw, mb_height)

        # Optional boxes become lowercase attributes (cropbox, trimbox, ...)
        # only when the page dictionary actually declares them.
        for box_name in ["CropBox", "TrimBox", "BleedBox", "ArtBox"]:
            if box_name in page_obj.attrs:
                box_normalized = _invert_box(
                    _normalize_box(get_attr(box_name), self.rotation), mb_height
                )
                setattr(self, box_name.lower(), box_normalized)

        if "CropBox" not in page_obj.attrs:
            self.cropbox = self.mediabox

        # Page.bbox defaults to self.mediabox, but can be altered by Page.crop(...)
        self.bbox = self.mediabox

        # See https://rednafi.com/python/lru_cache_on_methods/
        self.get_textmap = lru_cache()(self._get_textmap)

    def close(self) -> None:
        """Flush cached properties and clear the memoized text maps."""
        self.flush_cache()
        self.get_textmap.cache_clear()

    @property
    def width(self) -> T_num:
        """Width of ``self.bbox``."""
        return self.bbox[2] - self.bbox[0]

    @property
    def height(self) -> T_num:
        """Height of ``self.bbox``."""
        return self.bbox[3] - self.bbox[1]

    @property
    def structure_tree(self) -> List[Dict[str, Any]]:
        """Return the structure tree for a page, if any."""
        try:
            return [elem.to_dict() for elem in PDFStructTree(self.pdf, self)]
        except StructTreeMissing:
            return []

    @property
    def layout(self) -> LTPage:
        """Return the pdfminer layout of this page, computing it on first use.

        Raises:
            PdfminerException: wrapping any exception raised while the
                page is being interpreted.
        """
        if hasattr(self, "_layout"):
            return self._layout
        device = PDFPageAggregatorWithMarkedContent(
            self.pdf.rsrcmgr,
            pageno=self.page_number,
            laparams=self.pdf.laparams,
        )
        interpreter = PDFPageInterpreter(self.pdf.rsrcmgr, device)
        try:
            interpreter.process_page(self.page_obj)
        except Exception as e:
            # Chain explicitly so the original error is recorded as the cause.
            raise PdfminerException(e) from e
        self._layout: LTPage = device.get_result()
        return self._layout

    @property
    def annots(self) -> T_obj_list:
        """Return the page's annotations as a list of dictionaries.

        Each annotation's ``Rect`` is rotated according to the page
        rotation, normalized and flipped to top-left-origin coordinates.
        The ``uri``, ``title`` and ``contents`` entries are decoded as
        UTF-8, falling back to UTF-16; an entry that cannot be decoded is
        set to ``None`` after a warning, unless
        ``self.pdf.raise_unicode_errors`` is set, in which case the
        ``UnicodeDecodeError`` propagates. The resolved annotation itself
        is available under ``data``. On a ``CroppedPage`` the result is
        passed through the page's crop function.
        """

        def rotate_point(pt: Tuple[float, float], r: int) -> Tuple[float, float]:
            turns = r // 90
            for i in range(turns):
                x, y = pt
                comp = self.width if i == turns % 2 else self.height
                pt = (y, (comp - x))
            return pt

        def parse(annot: T_obj) -> T_obj:
            _a, _b, _c, _d = annot["Rect"]
            pt0 = rotate_point((_a, _b), self.rotation)
            pt1 = rotate_point((_c, _d), self.rotation)
            rh = self.root_page.height
            x0, top, x1, bottom = _invert_box(_normalize_box((*pt0, *pt1)), rh)

            a = annot.get("A", {})
            extras = {
                "uri": a.get("URI"),
                "title": annot.get("T"),
                "contents": annot.get("Contents"),
            }
            # Only values of existing keys are reassigned below, so
            # iterating over items() while updating is safe.
            for k, v in extras.items():
                if v is not None:
                    try:
                        extras[k] = v.decode("utf-8")
                    except UnicodeDecodeError:
                        try:
                            extras[k] = v.decode("utf-16")
                        except UnicodeDecodeError:
                            if self.pdf.raise_unicode_errors:
                                raise
                            warn(
                                f"Could not decode {k} of annotation."
                                f" {k} will be missing."
                            )
                            # Make the value actually "missing", as the
                            # warning promises, rather than leaving the
                            # undecoded bytes in place.
                            extras[k] = None

            parsed = {
                "page_number": self.page_number,
                "object_type": "annot",
                "x0": x0,
                "y0": rh - bottom,
                "x1": x1,
                "y1": rh - top,
                "doctop": self.initial_doctop + top,
                "top": top,
                "bottom": bottom,
                "width": x1 - x0,
                "height": bottom - top,
            }
            parsed.update(extras)
            # Replace the indirect reference to the page dictionary
            # with a pointer to our actual page
            if "P" in annot:
                annot["P"] = self
            parsed["data"] = annot
            return parsed

        raw = resolve_all(self.page_obj.annots) or []
        parsed = list(map(parse, raw))
        if isinstance(self, CroppedPage):
            return self._crop_fn(parsed)
        else:
            return parsed

    @property
    def hyperlinks(self) -> T_obj_list:
        """Return the annotations whose ``uri`` is not ``None``."""
        return [a for a in self.annots if a["uri"] is not None]

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """Return parsed objects grouped by object type, parsing on first use."""
        if hasattr(self, "_objects"):
            return self._objects
        self._objects: Dict[str, T_obj_list] = self.parse_objects()
        return self._objects

    def point2coord(self, pt: Tuple[T_num, T_num]) -> Tuple[T_num, T_num]:
        """Convert a layout point to top-left-origin coordinates, offset by
        the mediabox origin."""
        # See note below re. #1181 and mediabox-adjustment reversions
        return (self.mediabox[0] + pt[0], self.mediabox[1] + self.height - pt[1])

    def process_object(self, obj: LTItem) -> T_obj:
        """Convert one layout object into a plain dictionary.

        Attributes listed in ``ALL_ATTRS`` are copied (after resolving),
        ``object_type`` and ``page_number`` are added, and type-specific
        entries are filled in: text and colors for characters, points,
        path and dash style for curves. When ``y0`` is present, ``top``,
        ``bottom`` and ``doctop`` are derived from it.
        """
        kind = lt_pat.sub("", obj.__class__.__name__).lower()

        attr: Dict[str, Any] = {
            k: resolve_all(v) for k, v in obj.__dict__.items() if k in ALL_ATTRS
        }

        attr["object_type"] = kind
        attr["page_number"] = self.page_number

        for cs in ["ncs", "scs"]:
            # Note: As of pdfminer.six v20221105, that library only
            # exposes ncs for LTChars, and neither attribute for
            # other objects. Keeping this code here, though,
            # for ease of addition if color spaces become
            # more available via pdfminer.six
            if hasattr(obj, cs):
                attr[cs] = resolve_and_decode(getattr(obj, cs).name)

        if isinstance(obj, (LTChar, LTTextContainer)):
            text = obj.get_text()
            attr["text"] = (
                normalize_unicode(self.pdf.unicode_norm, text)
                if self.pdf.unicode_norm is not None
                else text
            )

        if isinstance(obj, LTChar):
            # pdfminer.six (at least as of v20221105) does not
            # directly expose .stroking_color and .non_stroking_color
            # for LTChar objects (unlike, e.g., LTRect objects).
            gs = obj.graphicstate
            attr["stroking_color"] = (
                gs.scolor if isinstance(gs.scolor, tuple) else (gs.scolor,)
            )
            attr["non_stroking_color"] = (
                gs.ncolor if isinstance(gs.ncolor, tuple) else (gs.ncolor,)
            )

            # Handle (rare) byte-encoded fontnames
            if isinstance(attr["fontname"], bytes):  # pragma: nocover
                attr["fontname"] = fix_fontname_bytes(attr["fontname"])

        elif isinstance(obj, (LTCurve,)):
            attr["pts"] = list(map(self.point2coord, attr["pts"]))

            # Ignoring typing because type signature for obj.original_path
            # appears to be incorrect
            attr["path"] = [
                (cmd, *map(self.point2coord, pts))
                for cmd, *pts in obj.original_path  # type: ignore
            ]

            attr["dash"] = obj.dashing_style

        # As noted in #1181, `pdfminer.six` adjusts objects'
        # coordinates relative to the MediaBox:
        # https://github.com/pdfminer/pdfminer.six/blob/1a8bd2f730295b31d6165e4d95fcb5a03793c978/pdfminer/converter.py#L79-L84
        mb_x0, mb_top = self.mediabox[:2]

        if "y0" in attr:
            attr["top"] = (self.height - attr["y1"]) + mb_top
            attr["bottom"] = (self.height - attr["y0"]) + mb_top
            attr["doctop"] = self.initial_doctop + attr["top"]

        if "x0" in attr and mb_x0 != 0:
            attr["x0"] = attr["x0"] + mb_x0
            attr["x1"] = attr["x1"] + mb_x0

        return attr

    def iter_layout_objects(
        self, layout_objects: List[LTComponent]
    ) -> Generator[T_obj, None, None]:
        """Yield processed objects for ``layout_objects``, recursing into
        containers."""
        for obj in layout_objects:
            # If object is, like LTFigure, a higher-level object ...
            if isinstance(obj, LTContainer):
                # and LAParams is passed, process the object itself.
                if self.pdf.laparams is not None:
                    yield self.process_object(obj)
                # Regardless, iterate through its children
                yield from self.iter_layout_objects(obj._objs)
            else:
                yield self.process_object(obj)

    def parse_objects(self) -> Dict[str, T_obj_list]:
        """Group the page's processed layout objects by ``object_type``,
        skipping objects of type ``"anno"``."""
        objects: Dict[str, T_obj_list] = {}
        for obj in self.iter_layout_objects(self.layout._objs):
            kind = obj["object_type"]
            if kind in ["anno"]:
                continue
            objects.setdefault(kind, []).append(obj)
        return objects

    def debug_tablefinder(
        self, table_settings: Optional[T_table_settings] = None
    ) -> TableFinder:
        """Return the ``TableFinder`` built for this page and settings."""
        tset = TableSettings.resolve(table_settings)
        return TableFinder(self, tset)

    def find_tables(
        self, table_settings: Optional[T_table_settings] = None
    ) -> List[Table]:
        """Return all tables found on the page with the given settings."""
        tset = TableSettings.resolve(table_settings)
        return TableFinder(self, tset).tables

    def find_table(
        self, table_settings: Optional[T_table_settings] = None
    ) -> Optional[Table]:
        """Return the largest table on the page, or ``None`` if there is none.

        "Largest" means most cells; ties go to the table whose bbox has
        the smaller second coordinate, then the smaller first coordinate.
        """
        tset = TableSettings.resolve(table_settings)
        tables = self.find_tables(tset)

        if not tables:
            return None

        # Return the largest table, as measured by number of cells.
        def sorter(x: Table) -> Tuple[int, T_num, T_num]:
            return (-len(x.cells), x.bbox[1], x.bbox[0])

        # min() returns the first minimal item, which is the same element
        # a stable sort would put first.
        return min(tables, key=sorter)

    def extract_tables(
        self, table_settings: Optional[T_table_settings] = None
    ) -> List[List[List[Optional[str]]]]:
        """Return the extracted content of every table found on the page."""
        tset = TableSettings.resolve(table_settings)
        tables = self.find_tables(tset)
        return [table.extract(**(tset.text_settings or {})) for table in tables]

    def extract_table(
        self, table_settings: Optional[T_table_settings] = None
    ) -> Optional[List[List[Optional[str]]]]:
        """Return the extracted content of the largest table, or ``None``
        when the page has no table."""
        tset = TableSettings.resolve(table_settings)
        table = self.find_table(tset)
        if table is None:
            return None
        else:
            return table.extract(**(tset.text_settings or {}))

    def _get_textmap(self, **kwargs: Any) -> TextMap:
        """Build a ``TextMap`` from the page's chars.

        ``layout_bbox`` defaults to ``self.bbox``. ``layout_width`` and
        ``layout_height`` default to the page dimensions unless the
        matching ``*_chars`` option was supplied. Explicit ``kwargs`` win
        over these defaults.
        """
        defaults: Dict[str, Any] = dict(
            layout_bbox=self.bbox,
        )
        if "layout_width_chars" not in kwargs:
            defaults.update({"layout_width": self.width})
        if "layout_height_chars" not in kwargs:
            defaults.update({"layout_height": self.height})
        full_kwargs: Dict[str, Any] = {**defaults, **kwargs}
        return utils.chars_to_textmap(self.chars, **full_kwargs)

    def search(
        self,
        pattern: Union[str, Pattern[str]],
        regex: bool = True,
        case: bool = True,
        main_group: int = 0,
        return_chars: bool = True,
        return_groups: bool = True,
        **kwargs: Any,
    ) -> List[Dict[str, Any]]:
        """Search the page's text map for ``pattern``.

        The named options are forwarded to ``TextMap.search``; any extra
        ``kwargs`` select the text map (list values are converted to
        tuples so the cached lookup can hash them).
        """
        textmap = self.get_textmap(**tuplify_list_kwargs(kwargs))
        return textmap.search(
            pattern,
            regex=regex,
            case=case,
            main_group=main_group,
            return_chars=return_chars,
            return_groups=return_groups,
        )

    def extract_text(self, **kwargs: Any) -> str:
        """Return the page text as a string, via the cached text map."""
        return self.get_textmap(**tuplify_list_kwargs(kwargs)).as_string

    def extract_text_simple(self, **kwargs: Any) -> str:
        """Return the page text using ``utils.extract_text_simple``."""
        return utils.extract_text_simple(self.chars, **kwargs)

    def extract_words(self, **kwargs: Any) -> T_obj_list:
        """Return the page's words using ``utils.extract_words``."""
        return utils.extract_words(self.chars, **kwargs)

    def extract_text_lines(
        self, strip: bool = True, return_chars: bool = True, **kwargs: Any
    ) -> T_obj_list:
        """Return the page's text lines, via the cached text map."""
        return self.get_textmap(**tuplify_list_kwargs(kwargs)).extract_text_lines(
            strip=strip, return_chars=return_chars
        )

    def crop(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """Return a ``CroppedPage`` for ``bbox`` using the default crop
        function."""
        return CroppedPage(self, bbox, relative=relative, strict=strict)

    def within_bbox(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """
        Same as .crop, except only includes objects fully within the bbox
        """
        return CroppedPage(
            self, bbox, relative=relative, strict=strict, crop_fn=utils.within_bbox
        )

    def outside_bbox(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """
        Same as .crop, except only includes objects outside the bbox
        """
        return CroppedPage(
            self, bbox, relative=relative, strict=strict, crop_fn=utils.outside_bbox
        )

    def filter(self, test_function: Callable[[T_obj], bool]) -> "FilteredPage":
        """Return a ``FilteredPage`` keeping the objects for which
        ``test_function`` is truthy."""
        return FilteredPage(self, test_function)

    def dedupe_chars(self, **kwargs: Any) -> "FilteredPage":
        """
        Removes duplicate chars — those sharing the same text and positioning
        (within `tolerance`) as other characters in the set. Adjust extra_args
        to be more/less restrictive with the properties checked.
        """
        p = FilteredPage(self, lambda x: True)
        # Shallow copy, so replacing "char" does not touch this page's dict.
        p._objects = dict(self.objects)
        p._objects["char"] = utils.dedupe_chars(self.chars, **kwargs)
        return p

    def to_image(
        self,
        resolution: Optional[Union[int, float]] = None,
        width: Optional[Union[int, float]] = None,
        height: Optional[Union[int, float]] = None,
        antialias: bool = False,
        force_mediabox: bool = False,
    ) -> "PageImage":
        """
        You can pass a maximum of 1 of the following:
        - resolution: The desired number pixels per inch. Defaults to 72.
        - width: The desired image width in pixels.
        - height: The desired image height in pixels.

        Raises ValueError if more than one of them is provided.
        """
        from .display import DEFAULT_RESOLUTION, PageImage

        num_specs = sum(x is not None for x in [resolution, width, height])
        if num_specs > 1:
            raise ValueError(
                f"Only one of these arguments can be provided: resolution, width, height. You provided {num_specs}"  # noqa: E501
            )
        elif width is not None:
            resolution = 72 * width / self.width
        elif height is not None:
            resolution = 72 * height / self.height

        return PageImage(
            self,
            resolution=resolution or DEFAULT_RESOLUTION,
            antialias=antialias,
            force_mediabox=force_mediabox,
        )

    def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]:
        """Return page metadata plus one ``<type>s`` entry per object type.

        When ``object_types`` is ``None``, every parsed object type plus
        ``"annot"`` is included.
        """
        if object_types is None:
            _object_types = list(self.objects.keys()) + ["annot"]
        else:
            _object_types = object_types
        d = {
            "page_number": self.page_number,
            "initial_doctop": self.initial_doctop,
            "rotation": self.rotation,
            "cropbox": self.cropbox,
            "mediabox": self.mediabox,
            "bbox": self.bbox,
            "width": self.width,
            "height": self.height,
        }
        for t in _object_types:
            d[t + "s"] = getattr(self, t + "s")
        return d

    def __repr__(self) -> str:
        return f"<Page:{self.page_number}>"

638

639

class DerivedPage(Page):
    """Base class for pages built from another page rather than directly
    from a PDF page object."""

    is_original: bool = False

    # Attributes copied verbatim from the parent page, in this order.
    _INHERITED_ATTRS = (
        "pdf",
        "page_obj",
        "page_number",
        "initial_doctop",
        "rotation",
        "mediabox",
        "cropbox",
    )

    def __init__(self, parent_page: Page):
        """Link to ``parent_page`` and copy its identifying attributes.

        ``bbox`` is not set here; subclasses in this file assign it
        themselves.
        """
        self.parent_page = parent_page
        self.root_page = parent_page.root_page
        for attr_name in self._INHERITED_ATTRS:
            setattr(self, attr_name, getattr(parent_page, attr_name))
        # Note: flushes Container.cached_properties, not the longer
        # Page.cached_properties list (which also names "_layout").
        self.flush_cache(Container.cached_properties)
        # Each derived page gets its own memoized text-map lookup.
        self.get_textmap = lru_cache()(self._get_textmap)

655

656

def test_proposed_bbox(bbox: T_bbox, parent_bbox: T_bbox) -> None:
    """Validate that ``bbox`` is a usable crop box for ``parent_bbox``.

    Raises:
        ValueError: if ``bbox`` has zero area, does not overlap
            ``parent_bbox`` at all, or overlaps it only partially.
    """
    proposed_area = utils.calculate_area(bbox)
    if proposed_area == 0:
        raise ValueError(f"Bounding box {bbox} has an area of zero.")

    shared = utils.get_bbox_overlap(bbox, parent_bbox)
    if shared is None:
        raise ValueError(
            f"Bounding box {bbox} is entirely outside "
            f"parent page bounding box {parent_bbox}"
        )

    # An overlap smaller than the box itself means part of the box
    # sticks out of the parent.
    if utils.calculate_area(shared) < proposed_area:
        raise ValueError(
            f"Bounding box {bbox} is not fully within "
            f"parent page bounding box {parent_bbox}"
        )

675

676

class CroppedPage(DerivedPage):
    """A page whose objects are passed through a bbox-based crop function."""

    def __init__(
        self,
        parent_page: Page,
        crop_bbox: T_bbox,
        crop_fn: Callable[[T_obj_list, T_bbox], T_obj_list] = utils.crop_to_bbox,
        relative: bool = False,
        strict: bool = True,
    ):
        """Create a cropped view of ``parent_page``.

        Args:
            parent_page: The page to derive from.
            crop_bbox: The crop box. With ``relative=True`` it is offset
                by the top-left corner of ``parent_page.bbox``.
            crop_fn: Called as ``crop_fn(objects, crop_bbox)`` to select
                or trim objects.
            relative: Whether ``crop_bbox`` is relative to the parent's bbox.
            strict: When true, ``crop_bbox`` is validated with
                ``test_proposed_bbox`` and a ``ValueError`` is raised if
                it is not acceptable.
        """
        if relative:
            origin_x, origin_top = parent_page.bbox[0], parent_page.bbox[1]
            rel_x0, rel_top, rel_x1, rel_bottom = crop_bbox
            crop_bbox = (
                rel_x0 + origin_x,
                rel_top + origin_top,
                rel_x1 + origin_x,
                rel_bottom + origin_top,
            )

        if strict:
            test_proposed_bbox(crop_bbox, parent_page.bbox)

        super().__init__(parent_page)

        # Bind the final (possibly offset) crop box to the crop function.
        self._crop_fn = lambda objs: crop_fn(objs, crop_bbox)

        # Note: testing for original function passed, not _crop_fn
        self.bbox = parent_page.bbox if crop_fn is utils.outside_bbox else crop_bbox

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """Return the parent's objects after cropping, computed on first use."""
        if not hasattr(self, "_objects"):
            cropped: Dict[str, T_obj_list] = {}
            for kind, objs in self.parent_page.objects.items():
                cropped[kind] = self._crop_fn(objs)
            self._objects: Dict[str, T_obj_list] = cropped
        return self._objects

715

716

class FilteredPage(DerivedPage):
    """A page exposing only the parent's objects that pass ``filter_fn``."""

    def __init__(self, parent_page: Page, filter_fn: Callable[[T_obj], bool]):
        """Keep the parent's bbox and remember the predicate to apply."""
        self.bbox = parent_page.bbox
        self.filter_fn = filter_fn
        super().__init__(parent_page)

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """Return the parent's objects after filtering, computed on first use."""
        if not hasattr(self, "_objects"):
            kept: Dict[str, T_obj_list] = {}
            for kind, objs in self.parent_page.objects.items():
                kept[kind] = list(filter(self.filter_fn, objs))
            self._objects: Dict[str, T_obj_list] = kept
        return self._objects