Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/layout.py: 89%

404 return f"<{self.__class__.__name__} {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)} font={self.fontname!r} adv={self.adv} text={self.get_text()!r}>"

405

406 def get_text(self) -> str:

407 return self._text

408

409

# Type variable for the kind of item a generic container holds; any
# substitution must be an LTItem (or a subclass of it).
LTItemT = TypeVar("LTItemT", bound=LTItem)

411

412

class LTContainer(LTComponent, Generic[LTItemT]):
    """Layout component that holds a list of child items.

    The children can be appended, iterated over, counted, and analyzed as a
    whole.
    """

    def __init__(self, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        # Child items; add() appends to the end of this list.
        self._objs: List[LTItemT] = []

    def __iter__(self) -> Iterator[LTItemT]:
        """Iterate over the children in their current list order."""
        return iter(self._objs)

    def __len__(self) -> int:
        """Return the number of children."""
        return len(self._objs)

    def add(self, obj: LTItemT) -> None:
        """Append a single child."""
        self._objs.append(obj)

    def extend(self, objs: Iterable[LTItemT]) -> None:
        """Add every item of ``objs``, one at a time, through ``add``.

        Routing through ``add`` (instead of extending the list directly)
        keeps any subclass override of ``add`` in effect.
        """
        for item in objs:
            self.add(item)

    def analyze(self, laparams: LAParams) -> None:
        """Forward ``analyze(laparams)`` to every child."""
        for child in self._objs:
            child.analyze(laparams)

436

437

class LTExpandableContainer(LTContainer[LTItemT]):
    """Container whose bounding box grows to enclose each added component."""

    def __init__(self) -> None:
        # Start from an inverted, infinite box so that the first min()/max()
        # performed in add() collapses it onto the first child's box.
        LTContainer.__init__(self, (+INF, +INF, -INF, -INF))

    # Incompatible override: we take an LTComponent (with bounding box), but
    # super() LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Store ``obj`` and widen the bounding box so that it covers it."""
        LTContainer.add(self, cast(LTItemT, obj))
        x0 = min(self.x0, obj.x0)
        y0 = min(self.y0, obj.y0)
        x1 = max(self.x1, obj.x1)
        y1 = max(self.y1, obj.y1)
        self.set_bbox((x0, y0, x1, y1))

454

455

class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
    """Expandable container that exposes the combined text of its children."""

    def __init__(self) -> None:
        LTText.__init__(self)
        LTExpandableContainer.__init__(self)

    def get_text(self) -> str:
        """Concatenate the text of every child that is an ``LTText``.

        Children that are not ``LTText`` instances are skipped.
        """
        pieces = [child.get_text() for child in self if isinstance(child, LTText)]
        return "".join(pieces)

465

466

# Element types of a text line: characters (LTChar) and annotations (LTAnno).
TextLineElement = Union[LTChar, LTAnno]

468

469

class LTTextLine(LTTextContainer[TextLineElement]):
    """Contains a list of LTChar objects that represent a single text line.

    The characters are aligned either horizontally or vertically, depending on
    the text's writing mode.
    """

    def __init__(self, word_margin: float) -> None:
        super().__init__()
        # Consulted by the subclasses' add() when deciding whether to insert
        # a space annotation between two characters.
        self.word_margin = word_margin

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} {bbox2str(self.bbox)} {self.get_text()!r}>"

    def analyze(self, laparams: LAParams) -> None:
        """Analyze every element, then terminate the line with a newline."""
        LTContainer.analyze(self, laparams)
        # Appended through LTContainer.add directly, so the bounding-box
        # update performed by the expandable container's add() is skipped.
        LTContainer.add(self, LTAnno("\n"))

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List["LTTextLine"]:
        """Return nearby lines from ``plane``; provided by the subclasses."""
        raise NotImplementedError

    def is_empty(self) -> bool:
        """Whether the line is empty per the base class or is all whitespace."""
        if super().is_empty():
            return True
        return self.get_text().isspace()

498

499

class LTTextLineHorizontal(LTTextLine):
    """Text line whose neighbours are searched above and below it."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # x1 of the most recently added LTChar. Starting at +INF means the
        # first character can never be preceded by an inserted space.
        self._x1: float = +INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Add ``obj``, inserting a space first if it starts a new word.

        A space annotation is inserted when ``obj`` is an ``LTChar`` whose
        left edge lies more than ``word_margin * max(width, height)`` to the
        right of the previous character's right edge. A falsy ``word_margin``
        disables the check.
        """
        if self.word_margin and isinstance(obj, LTChar):
            gap_limit = self.word_margin * max(obj.width, obj.height)
            if obj.x0 - gap_limit > self._x1:
                # Bypass the bbox update: the annotation has no geometry.
                LTContainer.add(self, LTAnno(" "))
            self._x1 = obj.x1
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List[LTTextLine]:
        """Finds neighboring LTTextLineHorizontals in the plane.

        Returns the LTTextLineHorizontals found in the plane which are close
        to self. "Close" is controlled by ``ratio``: the search box is the
        line's own box extended vertically by ``ratio * self.height``, and
        the same distance is used as tolerance for the height and alignment
        checks. The returned objects have the same height as self (within
        tolerance) and are either left-, right-, or centrally-aligned with it.
        """
        d = ratio * self.height
        search_box = (self.x0, self.y0 - d, self.x1, self.y1 + d)
        neighbors: List[LTTextLine] = []
        for candidate in plane.find(search_box):
            if not isinstance(candidate, LTTextLineHorizontal):
                continue
            if not self._is_same_height_as(candidate, tolerance=d):
                continue
            if (
                self._is_left_aligned_with(candidate, tolerance=d)
                or self._is_right_aligned_with(candidate, tolerance=d)
                or self._is_centrally_aligned_with(candidate, tolerance=d)
            ):
                neighbors.append(candidate)
        return neighbors

    def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the left-hand edge of `other` is within `tolerance`."""
        offset = other.x0 - self.x0
        return abs(offset) <= tolerance

    def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the right-hand edge of `other` is within `tolerance`."""
        offset = other.x1 - self.x1
        return abs(offset) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the horizontal center of `other` is within `tolerance`."""
        other_center = (other.x0 + other.x1) / 2
        own_center = (self.x0 + self.x1) / 2
        return abs(other_center - own_center) <= tolerance

    def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the height of `other` differs by at most `tolerance`."""
        difference = other.height - self.height
        return abs(difference) <= tolerance

561

562

class LTTextLineVertical(LTTextLine):
    """Text line whose neighbours are searched to its left and right."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # y0 of the most recently added LTChar. Starting at -INF means the
        # first character can never be preceded by an inserted space.
        self._y0: float = -INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Add ``obj``, inserting a space first if it starts a new word.

        A space annotation is inserted when ``obj`` is an ``LTChar`` whose
        upper edge lies more than ``word_margin * max(width, height)`` below
        the previous character's lower edge. A falsy ``word_margin`` disables
        the check.
        """
        if isinstance(obj, LTChar) and self.word_margin:
            margin = self.word_margin * max(obj.width, obj.height)
            if obj.y1 + margin < self._y0:
                # Bypass the bbox update: the annotation has no geometry.
                LTContainer.add(self, LTAnno(" "))
            self._y0 = obj.y0
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List[LTTextLine]:
        """Finds neighboring LTTextLineVerticals in the plane.

        Returns a list of other LTTextLineVerticals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned objects
        will be the same width as self, and also either upper-, lower-, or
        centrally-aligned.
        """
        # The same distance widens the search box horizontally and serves as
        # the tolerance for the width and alignment checks.
        d = ratio * self.width
        objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
        return [
            obj
            for obj in objs
            if (
                isinstance(obj, LTTextLineVertical)
                and self._is_same_width_as(obj, tolerance=d)
                and (
                    self._is_lower_aligned_with(obj, tolerance=d)
                    or self._is_upper_aligned_with(obj, tolerance=d)
                    or self._is_centrally_aligned_with(obj, tolerance=d)
                )
            )
        ]

    def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the lower edge of `other` is within `tolerance`."""
        return abs(other.y0 - self.y0) <= tolerance

    def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the upper edge of `other` is within `tolerance`."""
        return abs(other.y1 - self.y1) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the vertical center of `other` is within `tolerance`."""
        return abs((other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance

    def _is_same_width_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the width of `other` differs by at most `tolerance`.

        `tolerance` defaults to 0, matching the other alignment helpers.
        """
        return abs(other.width - self.width) <= tolerance

624

625

class LTTextBox(LTTextContainer[LTTextLine]):
    """Represents a group of text chunks in a rectangular area.

    Note that this box is created by geometric analysis and does not
    necessarily represent a logical boundary of the text. It contains a list
    of LTTextLine objects.
    """

    def __init__(self) -> None:
        LTTextContainer.__init__(self)
        # -1 until an index is assigned from outside this class.
        self.index: int = -1

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}({self.index}) {bbox2str(self.bbox)} {self.get_text()!r}>"

    def get_writing_mode(self) -> str:
        """Return the writing-mode string; provided by the subclasses."""
        raise NotImplementedError

643

644

class LTTextBoxHorizontal(LTTextBox):
    """Text box whose writing mode is ``"lr-tb"``."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them by descending ``y1``."""
        super().analyze(laparams)

        def descending_y1(line: LTTextLine) -> float:
            return -line.y1

        self._objs.sort(key=descending_y1)

    def get_writing_mode(self) -> str:
        """Return ``"lr-tb"``."""
        return "lr-tb"

652

653

class LTTextBoxVertical(LTTextBox):
    """Text box whose writing mode is ``"tb-rl"``."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them by descending ``x1``."""
        super().analyze(laparams)

        def descending_x1(line: LTTextLine) -> float:
            return -line.x1

        self._objs.sort(key=descending_x1)

    def get_writing_mode(self) -> str:
        """Return ``"tb-rl"``."""
        return "tb-rl"

661

662

# Element types of a text group: text boxes and nested groups. "LTTextGroup"
# is a string forward reference because the class is defined further down.
TextGroupElement = Union[LTTextBox, "LTTextGroup"]

664

665

class LTTextGroup(LTTextContainer[TextGroupElement]):
    """Text container built from an iterable of text boxes and/or groups."""

    def __init__(self, objs: Iterable[TextGroupElement]) -> None:
        super().__init__()
        # extend() feeds each element through add(), so the bounding box of
        # the group grows to cover all of them.
        self.extend(objs)

670

671

class LTTextGroupLRTB(LTTextGroup):
    """Text group whose members are ordered from top-left to bottom-right."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them using ``laparams.boxes_flow``.

        ``laparams.boxes_flow`` must not be None.
        """
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        flow = laparams.boxes_flow

        def sort_key(member: TextGroupElement) -> float:
            # boxes_flow weights the x0 term against the (y0 + y1) term.
            return (1 - flow) * member.x0 - (1 + flow) * (member.y0 + member.y1)

        # reorder the objects from top-left to bottom-right.
        self._objs.sort(key=sort_key)

682

683

class LTTextGroupTBRL(LTTextGroup):
    """Text group whose members are ordered from top-right to bottom-left."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them using ``laparams.boxes_flow``.

        ``laparams.boxes_flow`` must not be None.
        """
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        flow = laparams.boxes_flow

        def sort_key(member: TextGroupElement) -> float:
            # boxes_flow weights the (x0 + x1) term against the y1 term.
            return -(1 + flow) * (member.x0 + member.x1) - (1 - flow) * member.y1

        # reorder the objects from top-right to bottom-left.
        self._objs.sort(key=sort_key)

694

695

class LTLayoutContainer(LTContainer[LTComponent]):
    """Container that performs layout analysis on the characters it holds.

    ``analyze`` groups the LTChar children into text lines, the lines into
    text boxes and, unless ``laparams.boxes_flow`` is None, the boxes into a
    hierarchy of text groups stored in ``self.groups``.
    """

    def __init__(self, bbox: Rect) -> None:
        LTContainer.__init__(self, bbox)
        # Set by analyze() when laparams.boxes_flow is not None.
        self.groups: Optional[List[LTTextGroup]] = None

    # group_objects: group text object to textlines.
    def group_objects(
        self,
        laparams: LAParams,
        objs: Iterable[LTComponent],
    ) -> Iterator[LTTextLine]:
        """Group consecutive objects into text lines.

        The objects are visited in order and each one is compared with its
        predecessor only. Yields nothing when ``objs`` is empty.

        :param laparams: supplies ``line_overlap``, ``char_margin``,
            ``word_margin`` and ``detect_vertical``.
        :param objs: the objects to group, in reading order.
        :return: an iterator over the resulting text lines.
        """
        obj0 = None
        line = None
        for obj1 in objs:
            if obj0 is not None:
                # halign: obj0 and obj1 is horizontally aligned.
                #
                #   +------+ - - -
                #   | obj0 | - - +------+   -
                #   |      |     | obj1 |   | (line_overlap)
                #   +------+ - - |      |   -
                #          - - - +------+
                #
                #          |<--->|
                #        (char_margin)
                halign = (
                    obj0.is_voverlap(obj1)
                    and min(obj0.height, obj1.height) * laparams.line_overlap
                    < obj0.voverlap(obj1)
                    and obj0.hdistance(obj1)
                    < max(obj0.width, obj1.width) * laparams.char_margin
                )

                # valign: obj0 and obj1 is vertically aligned.
                #
                #   +------+
                #   | obj0 |
                #   |      |
                #   +------+ - - -
                #     |    |     | (char_margin)
                #     +------+ - -
                #     | obj1 |
                #     |      |
                #     +------+
                #
                #     |<-->|
                #   (line_overlap)
                valign = (
                    laparams.detect_vertical
                    and obj0.is_hoverlap(obj1)
                    and min(obj0.width, obj1.width) * laparams.line_overlap
                    < obj0.hoverlap(obj1)
                    and obj0.vdistance(obj1)
                    < max(obj0.height, obj1.height) * laparams.char_margin
                )

                if (halign and isinstance(line, LTTextLineHorizontal)) or (
                    valign and isinstance(line, LTTextLineVertical)
                ):
                    # obj1 continues the line that is currently open.
                    line.add(obj1)
                elif line is not None:
                    # obj1 does not fit the open line: emit it. obj1 is
                    # reconsidered as obj0 on the next iteration.
                    yield line
                    line = None
                elif valign and not halign:
                    line = LTTextLineVertical(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                elif halign and not valign:
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                else:
                    # No usable alignment (neither, or both at once): obj0
                    # becomes a one-object horizontal line of its own.
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    yield line
                    line = None
            obj0 = obj1
        if obj0 is None:
            # Empty input: there is no trailing object to flush.
            return
        if line is None:
            # The last object was not placed in any line yet.
            line = LTTextLineHorizontal(laparams.word_margin)
            line.add(obj0)
        yield line

    def group_textlines(
        self,
        laparams: LAParams,
        lines: Iterable[LTTextLine],
    ) -> Iterator[LTTextBox]:
        """Group neighboring lines to textboxes.

        :param laparams: supplies ``line_margin``, passed to
            ``find_neighbors`` as the closeness ratio.
        :param lines: the text lines to group; any iterable is accepted.
        :return: an iterator over the non-empty text boxes, each yielded once.
        """
        # `lines` is traversed several times below; materialize it so that
        # one-shot iterables (e.g. generators) are handled correctly.
        lines = list(lines)
        plane: Plane[LTTextLine] = Plane(self.bbox)
        plane.extend(lines)
        # Maps every line handled so far to the box that currently owns it.
        boxes: Dict[LTTextLine, LTTextBox] = {}
        for line in lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            members = [line]
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    # Absorb the box the neighbor already belonged to.
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box: LTTextBox = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box
        done = set()
        for line in lines:
            if line not in boxes:
                continue
            box = boxes[line]
            if box in done:
                continue
            done.add(box)
            if not box.is_empty():
                yield box

    def group_textboxes(
        self,
        laparams: LAParams,
        boxes: Sequence[LTTextBox],
    ) -> List[LTTextGroup]:
        """Group textboxes hierarchically.

        Get pair-wise distances, via dist func defined below, and then merge
        from the closest textbox pair. Once obj1 and obj2 are merged /
        grouped, the resulting group is considered as a new object, and its
        distances to other objects & groups are added to the process queue.

        For performance reason, pair-wise distances and object pair info are
        maintained in a heap of
        (skip_isany, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures
        quick access to the smallest element. Note that since comparison
        operators, e.g., __lt__, are disabled for LTComponent, id(obj) has to
        appear before obj in element tuples.

        :param laparams: LAParams object.
        :param boxes: All textbox objects to be grouped.
        :return: the objects left in the plane after merging. When at least
            two boxes are given, merging continues until the queue is empty.
        """
        ElementT = Union[LTTextBox, LTTextGroup]
        plane: Plane[ElementT] = Plane(self.bbox)

        def dist(obj1: LTComponent, obj2: LTComponent) -> float:
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            return (
                (x1 - x0) * (y1 - y0)
                - obj1.width * obj1.height
                - obj2.width * obj2.height
            )

        def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]:
            """Check if there's any other object between obj1 and obj2."""
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            objs = set(plane.find((x0, y0, x1, y1)))
            return objs.difference((obj1, obj2))

        dists: List[Tuple[bool, float, int, int, ElementT, ElementT]] = []
        for i, box1 in enumerate(boxes):
            for j in range(i + 1, len(boxes)):
                box2 = boxes[j]
                dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
        heapq.heapify(dists)

        plane.extend(boxes)
        # ids of objects that have already been merged into a group.
        done = set()
        while len(dists) > 0:
            (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
            # Skip objects that are already merged
            if (id1 not in done) and (id2 not in done):
                if not skip_isany and isany(obj1, obj2):
                    # Something lies between the pair: requeue it with the
                    # flag set. True sorts after False, so every unflagged
                    # pair is handled first and the check is not repeated.
                    heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                    continue
                if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
                    obj2,
                    (LTTextBoxVertical, LTTextGroupTBRL),
                ):
                    group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
                else:
                    group = LTTextGroupLRTB([obj1, obj2])
                plane.remove(obj1)
                plane.remove(obj2)
                done.update([id1, id2])

                for other in plane:
                    heapq.heappush(
                        dists,
                        (False, dist(group, other), id(group), id(other), group, other),
                    )
                plane.add(group)
        # By now only groups are in the plane
        return [cast(LTTextGroup, g) for g in plane]

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis and replace the children with the result.

        Non-character children are analyzed in place. If there are no
        characters, nothing else happens. Otherwise the characters are grouped
        into lines and boxes, and ``self._objs`` becomes the sorted text
        boxes, followed by the non-character children and the empty lines.
        """
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
        for obj in otherobjs:
            obj.analyze(laparams)
        if not textobjs:
            return
        textlines = list(self.group_objects(laparams, textobjs))
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        for obj in empties:
            obj.analyze(laparams)
        textboxes = list(self.group_textlines(laparams, textlines))
        if laparams.boxes_flow is None:
            for textbox in textboxes:
                textbox.analyze(laparams)

            def getkey(box: LTTextBox) -> Tuple[int, float, float]:
                # Vertical boxes sort before all others; within each kind
                # the boxes are ordered by position.
                if isinstance(box, LTTextBoxVertical):
                    return (0, -box.x1, -box.y0)
                else:
                    return (1, -box.y0, box.x0)

            textboxes.sort(key=getkey)
        else:
            self.groups = self.group_textboxes(laparams, textboxes)
            assigner = IndexAssigner()
            for group in self.groups:
                group.analyze(laparams)
                assigner.run(group)
            textboxes.sort(key=lambda box: box.index)
        self._objs = (
            cast(List[LTComponent], textboxes)
            + otherobjs
            + cast(List[LTComponent], empties)
        )

942

943

class LTFigure(LTLayoutContainer):
    """Represents an area used by PDF Form objects.

    PDF Forms can be used to present figures or pictures by embedding yet
    another PDF document within a page. Note that LTFigure objects can appear
    recursively.
    """

    def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        self.name = name
        self.matrix = matrix
        # `bbox` arrives as (x, y, width, height); turn it into corner
        # coordinates before applying the matrix.
        x, y, width, height = bbox
        corners = (x, y, x + width, y + height)
        LTLayoutContainer.__init__(self, apply_matrix_rect(matrix, corners))

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}({self.name}) {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)}>"

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis only when ``laparams.all_texts`` is truthy."""
        if laparams.all_texts:
            LTLayoutContainer.analyze(self, laparams)

967

968

class LTPage(LTLayoutContainer):
    """Represents an entire page.

    Like any other LTLayoutContainer, an LTPage can be iterated to obtain child
    objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and LTLine.
    """

    def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None:
        LTLayoutContainer.__init__(self, bbox)
        # Both values are stored as given and shown by __repr__.
        self.pageid = pageid
        self.rotate = rotate

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}({self.pageid!r}) {bbox2str(self.bbox)} rotate={self.rotate!r}>"