Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/page.py: 25%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

353 statements  

1import numbers 

2import re 

3from functools import lru_cache 

4from typing import ( 

5 TYPE_CHECKING, 

6 Any, 

7 Callable, 

8 Dict, 

9 Generator, 

10 List, 

11 Optional, 

12 Pattern, 

13 Tuple, 

14 Union, 

15) 

16from unicodedata import normalize as normalize_unicode 

17from warnings import warn 

18 

19from pdfminer.converter import PDFPageAggregator 

20from pdfminer.layout import ( 

21 LTChar, 

22 LTComponent, 

23 LTContainer, 

24 LTCurve, 

25 LTItem, 

26 LTPage, 

27 LTTextContainer, 

28) 

29from pdfminer.pdfinterp import PDFPageInterpreter, PDFStackT 

30from pdfminer.pdfpage import PDFPage 

31from pdfminer.psparser import PSLiteral 

32 

33from . import utils 

34from ._typing import T_bbox, T_num, T_obj, T_obj_list 

35from .container import Container 

36from .structure import PDFStructTree, StructTreeMissing 

37from .table import T_table_settings, Table, TableFinder, TableSettings 

38from .utils import decode_text, resolve_all, resolve_and_decode 

39from .utils.exceptions import MalformedPDFException, PdfminerException 

40from .utils.text import TextMap 

41 

# Matches the "LT" prefix that pdfminer layout class names carry
# (e.g. "LTChar"); stripped to derive pdfplumber's object_type names.
lt_pat = re.compile(r"^LT")

# Whitelist of pdfminer object attributes that are copied (after
# resolution) into pdfplumber's object dictionaries.
ALL_ATTRS = {
    # geometry / metrics
    "adv",
    "height",
    "linewidth",
    "pts",
    "size",
    "srcsize",
    "width",
    "x0",
    "x1",
    "y0",
    "y1",
    # text / image / font details
    "bits",
    "matrix",
    "upright",
    "fontname",
    "text",
    "imagemask",
    "colorspace",
    # painting state
    "evenodd",
    "fill",
    "non_stroking_color",
    "stroke",
    "stroking_color",
    # misc
    "stream",
    "name",
    # marked-content info
    "mcid",
    "tag",
}

75 

76 

77if TYPE_CHECKING: # pragma: nocover 

78 from .display import PageImage 

79 from .pdf import PDF 

80 

# via https://git.ghostscript.com/?p=mupdf.git;a=blob;f=source/pdf/pdf-font.c;h=6322cedf2c26cfb312c0c0878d7aff97b4c7470e;hb=HEAD#l774 # noqa

# Byte-encoded font names mapped to their ASCII equivalents.
CP936_FONTNAMES = {
    b"\xcb\xce\xcc\xe5": "SimSun,Regular",
    b"\xba\xda\xcc\xe5": "SimHei,Regular",
    b"\xbf\xac\xcc\xe5_GB2312": "SimKai,Regular",
    b"\xb7\xc2\xcb\xce_GB2312": "SimFang,Regular",
    b"\xc1\xa5\xca\xe9": "SimLi,Regular",
}


def fix_fontname_bytes(fontname: bytes) -> str:
    """Convert a byte-encoded font name into a printable string.

    The name is split after its first ``+`` (if any). The part after the
    ``+`` is looked up in ``CP936_FONTNAMES``; when it is not a known
    name, it is rendered as the escaped text of its bytes literal
    (e.g. ``b"\\xff"`` becomes the four characters ``\\xff``). The part up
    to and including the ``+`` is always rendered that way.

    Args:
        fontname: Raw font name bytes.

    Returns:
        The prefix (including the ``+``) followed by the translated or
        escaped suffix.
    """

    def escaped(raw: bytes) -> str:
        # repr() yields "b'...'" (or b"..."); drop the leading b-quote and
        # the trailing quote. repr() is used rather than str() because
        # str(bytes) triggers BytesWarning under `python -b` / `-bb`.
        return repr(raw)[2:-1]

    head, plus, tail = fontname.partition(b"+")
    if plus:
        prefix, suffix = head + plus, tail
    else:
        prefix, suffix = b"", fontname

    known = CP936_FONTNAMES.get(suffix)
    suffix_new = escaped(suffix) if known is None else known
    return escaped(prefix) + suffix_new

101 

102 

def tuplify_list_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``kwargs`` in which every list value is a tuple.

    Values that are not ``list`` instances are passed through unchanged.
    The input mapping is not modified.
    """
    converted: Dict[str, Any] = {}
    for name, val in kwargs.items():
        if isinstance(val, list):
            val = tuple(val)
        converted[name] = val
    return converted

108 

109 

class PDFPageAggregatorWithMarkedContent(PDFPageAggregator):
    """Page aggregator that stamps marked-content info onto layout objects.

    While a marked-content tag is open, each object produced by the
    overridden render hooks receives ``mcid`` and ``tag`` attributes.
    """

    # Marked-content ID and tag name currently in effect (None when no
    # tag is open, or when the open tag carries no MCID).
    cur_mcid: Optional[int] = None
    cur_tag: Optional[str] = None

    def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
        """Handle the start of a tag, recording its name and MCID (if any)."""
        self.cur_tag = decode_text(tag.name)
        # Only dictionary-style properties can carry an MCID.
        self.cur_mcid = props.get("MCID") if isinstance(props, dict) else None

    def end_tag(self) -> None:
        """Handle the end of a tag, clearing the current tag and MCID."""
        self.cur_mcid = None
        self.cur_tag = None

    def tag_cur_item(self) -> None:
        """Attach the current MCID and tag to the newest object of the
        current item, which we hope is the one pdfminer.six just created."""
        # This is somewhat hacky and would not be necessary if
        # pdfminer.six supported MCIDs. In reading the code it's
        # clear that the `render_*` methods will only ever create one
        # object, but that is far from being guaranteed. Even if
        # pdfminer.six's API would just return the objects it creates,
        # we wouldn't have to do this.
        objs = self.cur_item._objs
        if not objs:
            return
        newest = objs[-1]
        newest.mcid = self.cur_mcid  # type: ignore
        newest.tag = self.cur_tag  # type: ignore

    def render_char(self, *args, **kwargs) -> float:  # type: ignore
        """Render a character via the parent class, then tag the result."""
        advance = super().render_char(*args, **kwargs)
        self.tag_cur_item()
        return advance

    def render_image(self, *args, **kwargs) -> None:  # type: ignore
        """Render an image via the parent class, then tag the result."""
        super().render_image(*args, **kwargs)
        self.tag_cur_item()

    def paint_path(self, *args, **kwargs) -> None:  # type: ignore
        """Paint a path (lines, curves) via the parent class, then tag it."""
        super().paint_path(*args, **kwargs)
        self.tag_cur_item()

159 

160 

def _normalize_box(box_raw: T_bbox, rotation: T_num = 0) -> T_bbox:
    """Return ``box_raw`` with its corners ordered as (min, min, max, max).

    Per PDF Reference 3.8.4: "Note: Although rectangles are
    conventionally specified by their lower-left and upperright
    corners, it is acceptable to specify any two diagonally opposite
    corners."

    When ``rotation`` is 90 or 270, the x and y components are swapped in
    the returned tuple.

    Raises:
        MalformedPDFException: if any coordinate is not a number.
    """
    has_non_number = any(not isinstance(coord, numbers.Number) for coord in box_raw)
    if has_non_number:  # pragma: nocover
        raise MalformedPDFException(
            f"Bounding box contains non-number coordinate(s): {box_raw}"
        )

    x_lo, x_hi = sorted((box_raw[0], box_raw[2]))
    y_lo, y_hi = sorted((box_raw[1], box_raw[3]))

    swap_axes = rotation in (90, 270)
    return (y_lo, x_lo, y_hi, x_hi) if swap_axes else (x_lo, y_lo, x_hi, y_hi)

176 

177 

# PDF coordinate spaces place their origin at the bottom-left of the
# page; pdfplumber flips this vertically, so that the origin is in the
# top-left.
def _invert_box(box_raw: T_bbox, mb_height: T_num) -> T_bbox:
    """Flip a box vertically within a space that is ``mb_height`` tall.

    The x-coordinates are returned unchanged; each y-coordinate becomes
    ``mb_height`` minus the opposite edge, so the result is still ordered
    (x0, smaller-y, x1, larger-y).
    """
    left, lower, right, upper = box_raw
    flipped_top = mb_height - upper
    flipped_bottom = mb_height - lower
    return (left, flipped_top, right, flipped_bottom)

184 

185 

class Page(Container):
    """One page of a PDF.

    Wraps a pdfminer ``PDFPage``, exposes its page boxes in a coordinate
    space whose origin is the top-left corner, and provides helpers for
    extracting objects, annotations, text and tables.
    """

    cached_properties: List[str] = Container.cached_properties + ["_layout"]
    is_original: bool = True
    pages = None

    def __init__(
        self,
        pdf: "PDF",
        page_obj: PDFPage,
        page_number: int,
        initial_doctop: T_num = 0,
    ):
        """Read rotation and page boxes from ``page_obj``.

        Args:
            pdf: The owning PDF object.
            page_obj: The underlying pdfminer page.
            page_number: Number stored on the page and on every object
                produced from it.
            initial_doctop: Offset added to each object's ``top`` to
                compute its ``doctop``.
        """
        self.pdf = pdf
        self.root_page = self
        self.page_obj = page_obj
        self.page_number = page_number
        self.initial_doctop = initial_doctop

        def get_attr(key: str, default: Any = None) -> Any:
            value = resolve_all(page_obj.attrs.get(key))
            return default if value is None else value

        # Per PDF Reference Table 3.27: "The number of degrees by which the
        # page should be rotated clockwise when displayed or printed. The value
        # must be a multiple of 90. Default value: 0"
        _rotation = get_attr("Rotate", 0)
        self.rotation = _rotation % 360

        mb_raw = _normalize_box(get_attr("MediaBox"), self.rotation)
        mb_height = mb_raw[3] - mb_raw[1]

        self.mediabox = _invert_box(mb_raw, mb_height)

        # Optional boxes become lower-cased attributes (cropbox, trimbox,
        # bleedbox, artbox) only when present in the page dictionary.
        for box_name in ["CropBox", "TrimBox", "BleedBox", "ArtBox"]:
            if box_name in page_obj.attrs:
                box_normalized = _invert_box(
                    _normalize_box(get_attr(box_name), self.rotation), mb_height
                )
                setattr(self, box_name.lower(), box_normalized)

        if "CropBox" not in page_obj.attrs:
            self.cropbox = self.mediabox

        # Page.bbox defaults to self.mediabox, but can be altered by Page.crop(...)
        self.bbox = self.mediabox

        # Per-instance cache around the bound method.
        # See https://rednafi.com/python/lru_cache_on_methods/
        self.get_textmap = lru_cache()(self._get_textmap)

    def close(self) -> None:
        """Flush cached properties and clear the textmap cache."""
        self.flush_cache()
        self.get_textmap.cache_clear()

    @property
    def width(self) -> T_num:
        """Horizontal extent of ``self.bbox``."""
        return self.bbox[2] - self.bbox[0]

    @property
    def height(self) -> T_num:
        """Vertical extent of ``self.bbox``."""
        return self.bbox[3] - self.bbox[1]

    @property
    def structure_tree(self) -> List[Dict[str, Any]]:
        """Return the structure tree for a page, if any.

        Returns an empty list when ``StructTreeMissing`` is raised.
        """
        try:
            return [elem.to_dict() for elem in PDFStructTree(self.pdf, self)]
        except StructTreeMissing:
            return []

    @property
    def layout(self) -> LTPage:
        """The pdfminer layout for this page, computed once and cached.

        Raises:
            PdfminerException: wrapping any exception raised while
                pdfminer processes the page.
        """
        if hasattr(self, "_layout"):
            return self._layout
        device = PDFPageAggregatorWithMarkedContent(
            self.pdf.rsrcmgr,
            pageno=self.page_number,
            laparams=self.pdf.laparams,
        )
        interpreter = PDFPageInterpreter(self.pdf.rsrcmgr, device)
        try:
            interpreter.process_page(self.page_obj)
        except Exception as e:
            # Chain explicitly so the original failure is reported as the cause.
            raise PdfminerException(e) from e
        self._layout: LTPage = device.get_result()
        return self._layout

    @property
    def annots(self) -> T_obj_list:
        """Parsed annotations for this page, as a list of dictionaries.

        Each dictionary holds the annotation's position (``x0``, ``x1``,
        ``y0``, ``y1``, ``top``, ``bottom``, ``doctop``, ``width``,
        ``height``), its decoded ``uri``, ``title`` and ``contents`` (each
        ``None`` when absent or undecodable), and the raw annotation under
        ``data``. On a ``CroppedPage`` the list is passed through the
        page's crop function.
        """

        def rotate_point(pt: Tuple[float, float], r: int) -> Tuple[float, float]:
            # Applies one (x, y) -> (y, comp - x) step per 90 degrees of `r`.
            turns = r // 90
            for i in range(turns):
                x, y = pt
                comp = self.width if i == turns % 2 else self.height
                pt = (y, (comp - x))
            return pt

        def parse(annot: T_obj) -> T_obj:
            _a, _b, _c, _d = annot["Rect"]
            pt0 = rotate_point((_a, _b), self.rotation)
            pt1 = rotate_point((_c, _d), self.rotation)
            rh = self.root_page.height
            x0, top, x1, bottom = _invert_box(_normalize_box((*pt0, *pt1)), rh)

            a = annot.get("A", {})
            extras = {
                "uri": a.get("URI"),
                "title": annot.get("T"),
                "contents": annot.get("Contents"),
            }
            # Only values of existing keys are reassigned below, which is
            # safe while iterating over the dictionary.
            for k, v in extras.items():
                if v is None:
                    continue
                try:
                    extras[k] = v.decode("utf-8")
                except UnicodeDecodeError:
                    try:
                        extras[k] = v.decode("utf-16")
                    except UnicodeDecodeError:
                        if self.pdf.raise_unicode_errors:
                            raise
                        warn(
                            f"Could not decode {k} of annotation."
                            f" {k} will be missing."
                        )
                        # Honor the warning: drop the undecodable bytes
                        # instead of leaking them into the parsed result.
                        extras[k] = None

            parsed = {
                "page_number": self.page_number,
                "object_type": "annot",
                "x0": x0,
                "y0": rh - bottom,
                "x1": x1,
                "y1": rh - top,
                "doctop": self.initial_doctop + top,
                "top": top,
                "bottom": bottom,
                "width": x1 - x0,
                "height": bottom - top,
            }
            parsed.update(extras)
            # Replace the indirect reference to the page dictionary
            # with a pointer to our actual page
            if "P" in annot:
                annot["P"] = self
            parsed["data"] = annot
            return parsed

        raw = resolve_all(self.page_obj.annots) or []
        parsed = list(map(parse, raw))
        if isinstance(self, CroppedPage):
            return self._crop_fn(parsed)
        else:
            return parsed

    @property
    def hyperlinks(self) -> T_obj_list:
        """Annotations whose ``uri`` is not ``None``."""
        return [a for a in self.annots if a["uri"] is not None]

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """Objects on the page grouped by object type, parsed once and cached."""
        if hasattr(self, "_objects"):
            return self._objects
        self._objects: Dict[str, T_obj_list] = self.parse_objects()
        return self._objects

    def point2coord(self, pt: Tuple[T_num, T_num]) -> Tuple[T_num, T_num]:
        """Convert a pdfminer point into this page's top-left-origin space."""
        # See the note in process_object re. #1181 and mediabox-adjustment
        # reversions
        return (self.mediabox[0] + pt[0], self.mediabox[1] + self.height - pt[1])

    def process_object(self, obj: LTItem) -> T_obj:
        """Convert one pdfminer layout object into a pdfplumber object dict.

        Copies the whitelisted attributes (``ALL_ATTRS``), adds
        ``object_type`` and ``page_number``, type-specific extras for
        characters and curves, and ``top`` / ``bottom`` / ``doctop``
        whenever the object has a ``y0``.
        """
        # e.g. "LTChar" -> "char"
        kind = lt_pat.sub("", obj.__class__.__name__).lower()

        attr: Dict[str, Any] = {
            k: resolve_all(v) for k, v in obj.__dict__.items() if k in ALL_ATTRS
        }

        attr["object_type"] = kind
        attr["page_number"] = self.page_number

        for cs in ["ncs", "scs"]:
            # Note: As of pdfminer.six v20221105, that library only
            # exposes ncs for LTChars, and neither attribute for
            # other objects. Keeping this code here, though,
            # for ease of addition if color spaces become
            # more available via pdfminer.six
            if hasattr(obj, cs):
                attr[cs] = resolve_and_decode(getattr(obj, cs).name)

        if isinstance(obj, (LTChar, LTTextContainer)):
            text = obj.get_text()
            attr["text"] = (
                normalize_unicode(self.pdf.unicode_norm, text)
                if self.pdf.unicode_norm is not None
                else text
            )

        if isinstance(obj, LTChar):
            # pdfminer.six (at least as of v20221105) does not
            # directly expose .stroking_color and .non_stroking_color
            # for LTChar objects (unlike, e.g., LTRect objects).
            gs = obj.graphicstate
            attr["stroking_color"] = (
                gs.scolor if isinstance(gs.scolor, tuple) else (gs.scolor,)
            )
            attr["non_stroking_color"] = (
                gs.ncolor if isinstance(gs.ncolor, tuple) else (gs.ncolor,)
            )

            # Handle (rare) byte-encoded fontnames
            if isinstance(attr["fontname"], bytes):  # pragma: nocover
                attr["fontname"] = fix_fontname_bytes(attr["fontname"])

        elif isinstance(obj, (LTCurve,)):
            attr["pts"] = list(map(self.point2coord, attr["pts"]))

            # Ignoring typing because type signature for obj.original_path
            # appears to be incorrect
            attr["path"] = [
                (cmd, *map(self.point2coord, pts))
                for cmd, *pts in obj.original_path  # type: ignore
            ]

            attr["dash"] = obj.dashing_style

        # As noted in #1181, `pdfminer.six` adjusts objects'
        # coordinates relative to the MediaBox:
        # https://github.com/pdfminer/pdfminer.six/blob/1a8bd2f730295b31d6165e4d95fcb5a03793c978/pdfminer/converter.py#L79-L84
        mb_x0, mb_top = self.mediabox[:2]

        if "y0" in attr:
            attr["top"] = (self.height - attr["y1"]) + mb_top
            attr["bottom"] = (self.height - attr["y0"]) + mb_top
            attr["doctop"] = self.initial_doctop + attr["top"]

        if "x0" in attr and mb_x0 != 0:
            attr["x0"] = attr["x0"] + mb_x0
            attr["x1"] = attr["x1"] + mb_x0

        return attr

    def iter_layout_objects(
        self, layout_objects: List[LTComponent]
    ) -> Generator[T_obj, None, None]:
        """Yield processed objects for ``layout_objects``, recursing into
        containers."""
        for obj in layout_objects:
            # If object is, like LTFigure, a higher-level object ...
            if isinstance(obj, LTContainer):
                # and LAParams is passed, process the object itself.
                if self.pdf.laparams is not None:
                    yield self.process_object(obj)
                # Regardless, iterate through its children
                yield from self.iter_layout_objects(obj._objs)
            else:
                yield self.process_object(obj)

    def parse_objects(self) -> Dict[str, T_obj_list]:
        """Group the page's processed layout objects by ``object_type``.

        Objects of type ``"anno"`` are skipped.
        """
        objects: Dict[str, T_obj_list] = {}
        for obj in self.iter_layout_objects(self.layout._objs):
            kind = obj["object_type"]
            if kind == "anno":
                continue
            objects.setdefault(kind, []).append(obj)
        return objects

    def debug_tablefinder(
        self, table_settings: Optional[T_table_settings] = None
    ) -> TableFinder:
        """Return the ``TableFinder`` built for this page and settings."""
        tset = TableSettings.resolve(table_settings)
        return TableFinder(self, tset)

    def find_tables(
        self, table_settings: Optional[T_table_settings] = None
    ) -> List[Table]:
        """Return all tables found on the page with the given settings."""
        tset = TableSettings.resolve(table_settings)
        return TableFinder(self, tset).tables

    def find_table(
        self, table_settings: Optional[T_table_settings] = None
    ) -> Optional[Table]:
        """Return the largest table on the page, or ``None`` if there is none.

        "Largest" means most cells; ties are broken by the smaller
        ``bbox[1]``, then the smaller ``bbox[0]``.
        """
        tset = TableSettings.resolve(table_settings)
        tables = self.find_tables(tset)

        if not tables:
            return None

        # Return the largest table, as measured by number of cells.
        def sorter(x: Table) -> Tuple[int, T_num, T_num]:
            return (-len(x.cells), x.bbox[1], x.bbox[0])

        # min() returns the first minimal item, matching what a stable
        # sort followed by [0] would select.
        return min(tables, key=sorter)

    def extract_tables(
        self, table_settings: Optional[T_table_settings] = None
    ) -> List[List[List[Optional[str]]]]:
        """Extract the contents of every table found on the page."""
        tset = TableSettings.resolve(table_settings)
        tables = self.find_tables(tset)
        return [table.extract(**(tset.text_settings or {})) for table in tables]

    def extract_table(
        self, table_settings: Optional[T_table_settings] = None
    ) -> Optional[List[List[Optional[str]]]]:
        """Extract the contents of the largest table, or ``None`` if there
        is no table."""
        tset = TableSettings.resolve(table_settings)
        table = self.find_table(tset)
        if table is None:
            return None
        return table.extract(**(tset.text_settings or {}))

    def _get_textmap(self, **kwargs: Any) -> TextMap:
        """Build a ``TextMap`` from the page's chars.

        ``layout_bbox`` defaults to ``self.bbox``. ``layout_width`` and
        ``layout_height`` default to the page's width and height, unless
        the corresponding ``layout_*_chars`` keyword is supplied.
        Explicit keyword arguments override all defaults.
        """
        defaults: Dict[str, Any] = dict(
            layout_bbox=self.bbox,
        )
        if "layout_width_chars" not in kwargs:
            defaults.update({"layout_width": self.width})
        if "layout_height_chars" not in kwargs:
            defaults.update({"layout_height": self.height})
        full_kwargs: Dict[str, Any] = {**defaults, **kwargs}
        return utils.chars_to_textmap(self.chars, **full_kwargs)

    def search(
        self,
        pattern: Union[str, Pattern[str]],
        regex: bool = True,
        case: bool = True,
        main_group: int = 0,
        return_chars: bool = True,
        return_groups: bool = True,
        **kwargs: Any,
    ) -> List[Dict[str, Any]]:
        """Search the page's textmap for ``pattern``.

        Extra keyword arguments select/configure the textmap; list values
        are converted to tuples so they can be used as cache keys.
        """
        textmap = self.get_textmap(**tuplify_list_kwargs(kwargs))
        return textmap.search(
            pattern,
            regex=regex,
            case=case,
            main_group=main_group,
            return_chars=return_chars,
            return_groups=return_groups,
        )

    def extract_text(self, **kwargs: Any) -> str:
        """Return the page's text as a single string (via the cached textmap)."""
        return self.get_textmap(**tuplify_list_kwargs(kwargs)).as_string

    def extract_text_simple(self, **kwargs: Any) -> str:
        """Return the page's text using ``utils.extract_text_simple``."""
        return utils.extract_text_simple(self.chars, **kwargs)

    def extract_words(self, **kwargs: Any) -> T_obj_list:
        """Return the page's words using ``utils.extract_words``."""
        return utils.extract_words(self.chars, **kwargs)

    def extract_text_lines(
        self, strip: bool = True, return_chars: bool = True, **kwargs: Any
    ) -> T_obj_list:
        """Return the page's text lines (via the cached textmap)."""
        return self.get_textmap(**tuplify_list_kwargs(kwargs)).extract_text_lines(
            strip=strip, return_chars=return_chars
        )

    def crop(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """Return a version of the page cropped to ``bbox``."""
        return CroppedPage(self, bbox, relative=relative, strict=strict)

    def within_bbox(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """
        Same as .crop, except only includes objects fully within the bbox
        """
        return CroppedPage(
            self, bbox, relative=relative, strict=strict, crop_fn=utils.within_bbox
        )

    def outside_bbox(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """
        Same as .crop, except only includes objects fully outside the bbox
        """
        return CroppedPage(
            self, bbox, relative=relative, strict=strict, crop_fn=utils.outside_bbox
        )

    def filter(self, test_function: Callable[[T_obj], bool]) -> "FilteredPage":
        """Return a version of the page keeping only objects for which
        ``test_function`` is truthy."""
        return FilteredPage(self, test_function)

    def dedupe_chars(self, **kwargs: Any) -> "FilteredPage":
        """
        Removes duplicate chars — those sharing the same text and positioning
        (within `tolerance`) as other characters in the set. Adjust extra_args
        to be more/less restrictive with the properties checked.
        """
        p = FilteredPage(self, lambda x: True)
        # Shallow copy, so replacing the "char" entry does not touch this
        # page's own objects dictionary.
        p._objects = dict(self.objects)
        p._objects["char"] = utils.dedupe_chars(self.chars, **kwargs)
        return p

    def to_image(
        self,
        resolution: Optional[Union[int, float]] = None,
        width: Optional[Union[int, float]] = None,
        height: Optional[Union[int, float]] = None,
        antialias: bool = False,
        force_mediabox: bool = False,
    ) -> "PageImage":
        """
        You can pass a maximum of 1 of the following:
        - resolution: The desired number pixels per inch. Defaults to
          `DEFAULT_RESOLUTION` from `.display`.
        - width: The desired image width in pixels.
        - height: The desired image height in pixels.

        Raises ValueError if more than one of them is provided.
        """
        from .display import DEFAULT_RESOLUTION, PageImage

        num_specs = sum(x is not None for x in [resolution, width, height])
        if num_specs > 1:
            raise ValueError(
                f"Only one of these arguments can be provided: resolution, width, height. You provided {num_specs}"  # noqa: E501
            )
        elif width is not None:
            resolution = 72 * width / self.width
        elif height is not None:
            resolution = 72 * height / self.height

        return PageImage(
            self,
            resolution=resolution or DEFAULT_RESOLUTION,
            antialias=antialias,
            force_mediabox=force_mediabox,
        )

    def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]:
        """Return the page's metadata and objects as a dictionary.

        Args:
            object_types: Object types to include; each is read from the
                attribute named ``<type> + "s"``. Defaults to every type
                in ``self.objects`` plus ``"annot"``.
        """
        if object_types is None:
            _object_types = list(self.objects.keys()) + ["annot"]
        else:
            _object_types = object_types
        d = {
            "page_number": self.page_number,
            "initial_doctop": self.initial_doctop,
            "rotation": self.rotation,
            "cropbox": self.cropbox,
            "mediabox": self.mediabox,
            "bbox": self.bbox,
            "width": self.width,
            "height": self.height,
        }
        for t in _object_types:
            d[t + "s"] = getattr(self, t + "s")
        return d

    def __repr__(self) -> str:
        return f"<Page:{self.page_number}>"

638 

639 

class DerivedPage(Page):
    """Base class for pages derived from another page (cropped, filtered).

    Shares the parent's PDF, page object, rotation and page boxes. It
    does not set ``bbox``; subclasses assign that themselves.
    """

    is_original: bool = False

    def __init__(self, parent_page: Page):
        """Copy identifying state from ``parent_page`` and reset caches."""
        self.parent_page = parent_page

        # Attributes taken verbatim from the parent page.
        inherited = (
            "root_page",
            "pdf",
            "page_obj",
            "page_number",
            "initial_doctop",
            "rotation",
            "mediabox",
            "cropbox",
        )
        for attr_name in inherited:
            setattr(self, attr_name, getattr(parent_page, attr_name))

        self.flush_cache(Container.cached_properties)
        # Each derived page gets its own textmap cache.
        self.get_textmap = lru_cache()(self._get_textmap)

655 

656 

def test_proposed_bbox(bbox: T_bbox, parent_bbox: T_bbox) -> None:
    """Validate that ``bbox`` is a usable crop area inside ``parent_bbox``.

    Raises:
        ValueError: if ``bbox`` has zero area, does not overlap
            ``parent_bbox`` at all, or overlaps it only partially.
    """
    requested_area = utils.calculate_area(bbox)
    if requested_area == 0:
        raise ValueError(f"Bounding box {bbox} has an area of zero.")

    shared = utils.get_bbox_overlap(bbox, parent_bbox)
    if shared is None:
        raise ValueError(
            f"Bounding box {bbox} is entirely outside "
            f"parent page bounding box {parent_bbox}"
        )

    # A shared region smaller than the request means part of the
    # requested box lies outside the parent.
    if utils.calculate_area(shared) < requested_area:
        raise ValueError(
            f"Bounding box {bbox} is not fully within "
            f"parent page bounding box {parent_bbox}"
        )

675 

676 

class CroppedPage(DerivedPage):
    """A page whose objects are restricted by a bounding box and a crop
    function."""

    def __init__(
        self,
        parent_page: Page,
        crop_bbox: T_bbox,
        crop_fn: Callable[[T_obj_list, T_bbox], T_obj_list] = utils.crop_to_bbox,
        relative: bool = False,
        strict: bool = True,
    ):
        """Create a cropped view of ``parent_page``.

        Args:
            parent_page: The page being cropped.
            crop_bbox: The crop area, as (x0, top, x1, bottom).
            crop_fn: Function applied to each object list together with
                the crop area.
            relative: If true, ``crop_bbox`` is offset by the top-left
                corner of the parent's bbox.
            strict: If true, the (offset) crop area is validated against
                the parent's bbox, raising ValueError when invalid.
        """
        if relative:
            parent_x0, parent_top, _, _ = parent_page.bbox
            rel_x0, rel_top, rel_x1, rel_bottom = crop_bbox
            crop_bbox = (
                rel_x0 + parent_x0,
                rel_top + parent_top,
                rel_x1 + parent_x0,
                rel_bottom + parent_top,
            )

        if strict:
            test_proposed_bbox(crop_bbox, parent_page.bbox)

        super().__init__(parent_page)

        # Bind the final crop area so callers only pass the object list.
        self._crop_fn = lambda objs: crop_fn(objs, crop_bbox)

        # Note: testing for original function passed, not _crop_fn
        keeps_parent_bbox = crop_fn is utils.outside_bbox
        self.bbox = parent_page.bbox if keeps_parent_bbox else crop_bbox

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """The parent's objects with the crop function applied per type;
        computed once and cached."""
        if not hasattr(self, "_objects"):
            cropped: Dict[str, T_obj_list] = {}
            for kind, objs in self.parent_page.objects.items():
                cropped[kind] = self._crop_fn(objs)
            self._objects: Dict[str, T_obj_list] = cropped
        return self._objects

715 

716 

class FilteredPage(DerivedPage):
    """A page exposing only the parent's objects that pass a test function."""

    def __init__(self, parent_page: Page, filter_fn: Callable[[T_obj], bool]):
        """Keep the parent's bbox and remember ``filter_fn`` for later use."""
        self.bbox = parent_page.bbox
        self.filter_fn = filter_fn
        super().__init__(parent_page)

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """The parent's objects, filtered per type; computed once and cached."""
        if not hasattr(self, "_objects"):
            keep = self.filter_fn
            self._objects: Dict[str, T_obj_list] = {
                kind: [obj for obj in objs if keep(obj)]
                for kind, objs in self.parent_page.objects.items()
            }
        return self._objects

731 return self._objects