Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/layout.py: 94%

407 return f"<{self.__class__.__name__} {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)} font={self.fontname!r} adv={self.adv} text={self.get_text()!r}>"

408

def get_text(self) -> str:
    """Return the text stored in this object's ``_text`` attribute."""
    text = self._text
    return text

411

412

# Type variable for the element type of the generic containers; any
# substitution must be LTItem or a subclass of it.
LTItemT = TypeVar("LTItemT", bound=LTItem)

414

415

class LTContainer(LTComponent, Generic[LTItemT]):
    """Object that can be extended and analyzed.

    Children are kept in the ``_objs`` list; the container supports
    iteration and ``len()`` over that list.
    """

    def __init__(self, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        self._objs: List[LTItemT] = []

    def __iter__(self) -> Iterator[LTItemT]:
        """Return an iterator over the children in their current list order."""
        children = self._objs
        return iter(children)

    def __len__(self) -> int:
        """Return the number of children."""
        children = self._objs
        return len(children)

    def add(self, obj: LTItemT) -> None:
        """Append a single child."""
        self._objs.append(obj)

    def extend(self, objs: Iterable[LTItemT]) -> None:
        """Add every item of ``objs``, one at a time.

        Each item goes through ``self.add`` so that a subclass override of
        ``add`` is applied to every element.
        """
        for item in objs:
            self.add(item)

    def analyze(self, laparams: LAParams) -> None:
        """Call ``analyze(laparams)`` on every child."""
        for child in self._objs:
            child.analyze(laparams)

439

440

class LTExpandableContainer(LTContainer[LTItemT]):
    """Container whose bounding box is widened to enclose each added object."""

    def __init__(self) -> None:
        # Start from the inverted sentinel box (+INF, +INF, -INF, -INF) so the
        # min()/max() updates in add() are driven by the added objects.
        initial_bbox = (+INF, +INF, -INF, -INF)
        LTContainer.__init__(self, initial_bbox)

    # Incompatible override: we take an LTComponent (with bounding box), but
    # super() LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append ``obj`` and grow the bounding box to include it."""
        LTContainer.add(self, cast(LTItemT, obj))
        new_x0 = min(self.x0, obj.x0)
        new_y0 = min(self.y0, obj.y0)
        new_x1 = max(self.x1, obj.x1)
        new_y1 = max(self.y1, obj.y1)
        self.set_bbox((new_x0, new_y0, new_x1, new_y1))

457

458

class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
    """Expandable container that exposes the text of its LTText children."""

    def __init__(self) -> None:
        LTText.__init__(self)
        LTExpandableContainer.__init__(self)

    def get_text(self) -> str:
        """Return the concatenated text of all children that are LTText.

        Children that are not LTText instances are skipped.
        """
        pieces: List[str] = []
        for child in self:
            if isinstance(child, LTText):
                pieces.append(cast(LTText, child).get_text())
        return "".join(pieces)

468

469

# Element type stored in an LTTextLine: either an LTChar or an LTAnno.
TextLineElement = Union[LTChar, LTAnno]

471

472

class LTTextLine(LTTextContainer[TextLineElement]):
    """Contains a list of LTChar objects that represent a single text line.

    The characters are aligned either horizontally or vertically, depending on
    the text's writing mode.
    """

    def __init__(self, word_margin: float) -> None:
        super().__init__()
        self.word_margin = word_margin

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} {bbox2str(self.bbox)} {self.get_text()!r}>"

    def analyze(self, laparams: LAParams) -> None:
        """Analyze every child, then append a newline annotation.

        The ``LTAnno("\\n")`` is appended through ``LTContainer.add`` directly,
        bypassing the ``add`` overrides of the subclasses.
        """
        LTContainer.analyze(self, laparams)
        LTContainer.add(self, LTAnno("\n"))

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List["LTTextLine"]:
        """Return the lines in ``plane`` close to this one; subclasses implement it."""
        raise NotImplementedError

    def is_empty(self) -> bool:
        """Return True if the base class reports empty or the text is only whitespace."""
        if super().is_empty():
            return True
        return self.get_text().isspace()

501

502

class LTTextLineHorizontal(LTTextLine):
    """Text line that tracks the ``x1`` of the last added object in ``_x1``."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # x1 of the most recently added object; +INF until something is added.
        self._x1: float = +INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Add ``obj``, first inserting a space annotation if there is a gap.

        A ``LTAnno(" ")`` is inserted when ``obj`` is an LTChar, ``word_margin``
        is truthy and the previous ``x1`` lies more than
        ``word_margin * max(obj.width, obj.height)`` before ``obj.x0``.
        """
        if isinstance(obj, LTChar) and self.word_margin:
            margin = self.word_margin * max(obj.width, obj.height)
            gap_threshold = obj.x0 - margin
            if self._x1 < gap_threshold:
                LTContainer.add(self, LTAnno(" "))
        self._x1 = obj.x1
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List[LTTextLine]:
        """Finds neighboring LTTextLineHorizontals in the plane.

        Returns a list of other LTTextLineHorizontals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned objects
        will be the same height as self, and also either left-, right-, or
        centrally-aligned.
        """
        d = ratio * self.height
        search_box = (self.x0, self.y0 - d, self.x1, self.y1 + d)
        neighbors: List[LTTextLine] = []
        for candidate in plane.find(search_box):
            if not isinstance(candidate, LTTextLineHorizontal):
                continue
            if not self._is_same_height_as(candidate, tolerance=d):
                continue
            if (
                self._is_left_aligned_with(candidate, tolerance=d)
                or self._is_right_aligned_with(candidate, tolerance=d)
                or self._is_centrally_aligned_with(candidate, tolerance=d)
            ):
                neighbors.append(candidate)
        return neighbors

    def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the left-hand edge of `other` is within `tolerance`."""
        offset = other.x0 - self.x0
        return abs(offset) <= tolerance

    def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the right-hand edge of `other` is within `tolerance`."""
        offset = other.x1 - self.x1
        return abs(offset) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the horizontal center of `other` is within `tolerance`."""
        other_center = (other.x0 + other.x1) / 2
        own_center = (self.x0 + self.x1) / 2
        return abs(other_center - own_center) <= tolerance

    def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the height of `other` differs by at most `tolerance`."""
        difference = other.height - self.height
        return abs(difference) <= tolerance

564

565

class LTTextLineVertical(LTTextLine):
    """Text line that tracks the ``y0`` of the last added object in ``_y0``."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # y0 of the most recently added object; -INF until something is added.
        self._y0: float = -INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Add ``obj``, first inserting a space annotation if there is a gap.

        A ``LTAnno(" ")`` is inserted when ``obj`` is an LTChar, ``word_margin``
        is truthy and ``obj.y1`` plus
        ``word_margin * max(obj.width, obj.height)`` is still below the
        previous ``y0``.
        """
        if isinstance(obj, LTChar) and self.word_margin:
            margin = self.word_margin * max(obj.width, obj.height)
            if obj.y1 + margin < self._y0:
                LTContainer.add(self, LTAnno(" "))
        self._y0 = obj.y0
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List[LTTextLine]:
        """Finds neighboring LTTextLineVerticals in the plane.

        Returns a list of other LTTextLineVerticals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned objects
        will be the same width as self, and also either upper-, lower-, or
        centrally-aligned.
        """
        d = ratio * self.width
        objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
        return [
            obj
            for obj in objs
            if (
                isinstance(obj, LTTextLineVertical)
                and self._is_same_width_as(obj, tolerance=d)
                and (
                    self._is_lower_aligned_with(obj, tolerance=d)
                    or self._is_upper_aligned_with(obj, tolerance=d)
                    or self._is_centrally_aligned_with(obj, tolerance=d)
                )
            )
        ]

    def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the lower edge of `other` is within `tolerance`."""
        return abs(other.y0 - self.y0) <= tolerance

    def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the upper edge of `other` is within `tolerance`."""
        return abs(other.y1 - self.y1) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the vertical center of `other` is within `tolerance`."""
        return abs((other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance

    # ``tolerance`` defaults to 0 like every other alignment predicate here
    # and like LTTextLineHorizontal._is_same_height_as.
    def _is_same_width_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the width of `other` differs by at most `tolerance`."""
        return abs(other.width - self.width) <= tolerance

627

628

class LTTextBox(LTTextContainer[LTTextLine]):
    """Represents a group of text chunks in a rectangular area.

    Note that this box is created by geometric analysis and does not
    necessarily represents a logical boundary of the text. It contains a list
    of LTTextLine objects.
    """

    def __init__(self) -> None:
        LTTextContainer.__init__(self)
        # -1 until an index is assigned from outside this class.
        self.index: int = -1

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}({self.index}) {bbox2str(self.bbox)} {self.get_text()!r}>"

    def get_writing_mode(self) -> str:
        """Return the writing-mode string of the box; subclasses implement it."""
        raise NotImplementedError

646

647

class LTTextBoxHorizontal(LTTextBox):
    """Text box whose lines are ordered by descending ``y1`` after analysis."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the children, then sort them by ``-y1``."""
        super().analyze(laparams)

        def negated_y1(line: LTTextLine) -> float:
            return -line.y1

        self._objs.sort(key=negated_y1)

    def get_writing_mode(self) -> str:
        """Return ``"lr-tb"``."""
        return "lr-tb"

655

656

class LTTextBoxVertical(LTTextBox):
    """Text box whose lines are ordered by descending ``x1`` after analysis."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the children, then sort them by ``-x1``."""
        super().analyze(laparams)

        def negated_x1(line: LTTextLine) -> float:
            return -line.x1

        self._objs.sort(key=negated_x1)

    def get_writing_mode(self) -> str:
        """Return ``"tb-rl"``."""
        return "tb-rl"

664

665

# Element type stored in an LTTextGroup: a text box or a nested group
# (forward reference, since LTTextGroup is defined below).
TextGroupElement = Union[LTTextBox, "LTTextGroup"]

667

668

class LTTextGroup(LTTextContainer[TextGroupElement]):
    """Text container built from an iterable of text boxes and/or groups."""

    def __init__(self, objs: Iterable[TextGroupElement]) -> None:
        """Create the group and add every element of ``objs`` to it."""
        super().__init__()
        self.extend(objs)

673

674

class LTTextGroupLRTB(LTTextGroup):
    """Text group whose children are reordered from top-left to bottom-right."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the children, then sort them using ``laparams.boxes_flow``."""
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        boxes_flow = laparams.boxes_flow

        def reading_order(obj: TextGroupElement) -> float:
            # Weighted mix of the x position and the (negated) y extent.
            return (1 - boxes_flow) * obj.x0 - (1 + boxes_flow) * (obj.y0 + obj.y1)

        # reorder the objects from top-left to bottom-right.
        self._objs.sort(key=reading_order)

685

686

class LTTextGroupTBRL(LTTextGroup):
    """Text group whose children are reordered from top-right to bottom-left."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the children, then sort them using ``laparams.boxes_flow``."""
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        boxes_flow = laparams.boxes_flow

        def reading_order(obj: TextGroupElement) -> float:
            # Weighted mix of the (negated) x extent and the (negated) top edge.
            return -(1 + boxes_flow) * (obj.x0 + obj.x1) - (1 - boxes_flow) * obj.y1

        # reorder the objects from top-right to bottom-left.
        self._objs.sort(key=reading_order)

697

698

class LTLayoutContainer(LTContainer[LTComponent]):
    """Container that performs layout analysis on its children.

    ``analyze`` groups the contained LTChar objects into text lines, the lines
    into text boxes and, when ``laparams.boxes_flow`` is not None, the boxes
    into text groups stored in ``self.groups``.
    """

    def __init__(self, bbox: Rect) -> None:
        LTContainer.__init__(self, bbox)
        # Set by analyze() when laparams.boxes_flow is not None.
        self.groups: Optional[List[LTTextGroup]] = None

    def group_objects(
        self,
        laparams: LAParams,
        objs: Iterable[LTComponent],
    ) -> Iterator[LTTextLine]:
        """Group text objects into text lines.

        Consecutive objects are compared pairwise; a pair that is horizontally
        (or, with ``laparams.detect_vertical``, vertically) aligned extends or
        starts a line of the matching orientation. An object that joins no
        line is yielded as a horizontal line of its own.

        :param laparams: LAParams object supplying ``line_overlap``,
            ``char_margin``, ``word_margin`` and ``detect_vertical``.
        :param objs: the objects to group, in the order they are compared.
        :return: an iterator over the text lines; empty if ``objs`` is empty.
        """
        obj0 = None
        line = None
        for obj1 in objs:
            if obj0 is not None:
                # halign: obj0 and obj1 is horizontally aligned.
                #
                #   +------+ - - -
                #   | obj0 | - - +------+   -
                #   |      |     | obj1 |   | (line_overlap)
                #   +------+ - - |      |   -
                #          - - - +------+
                #
                #          |<--->|
                #        (char_margin)
                halign = (
                    obj0.is_voverlap(obj1)
                    and min(obj0.height, obj1.height) * laparams.line_overlap
                    < obj0.voverlap(obj1)
                    and obj0.hdistance(obj1)
                    < max(obj0.width, obj1.width) * laparams.char_margin
                )

                # valign: obj0 and obj1 is vertically aligned.
                #
                #   +------+
                #   | obj0 |
                #   |      |
                #   +------+ - - -
                #     |    |    | (char_margin)
                #   +------+ - -
                #   | obj1 |
                #   |      |
                #   +------+
                #
                #     |<-->|
                #   (line_overlap)
                valign = (
                    laparams.detect_vertical
                    and obj0.is_hoverlap(obj1)
                    and min(obj0.width, obj1.width) * laparams.line_overlap
                    < obj0.hoverlap(obj1)
                    and obj0.vdistance(obj1)
                    < max(obj0.height, obj1.height) * laparams.char_margin
                )

                if (halign and isinstance(line, LTTextLineHorizontal)) or (
                    valign and isinstance(line, LTTextLineVertical)
                ):
                    # obj1 continues the current line.
                    line.add(obj1)
                elif line is not None:
                    # obj1 does not fit the current line: emit it. obj1 is
                    # picked up as obj0 on the next iteration.
                    yield line
                    line = None
                elif valign and not halign:
                    line = LTTextLineVertical(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                elif halign and not valign:
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                else:
                    # obj0 joins nothing: emit it as a single-object line.
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    yield line
                    line = None
            obj0 = obj1
        if line is None:
            if obj0 is None:
                # Empty input: nothing to group. An explicit check is used
                # instead of an assert, which is stripped under ``python -O``.
                return
            line = LTTextLineHorizontal(laparams.word_margin)
            line.add(obj0)
        yield line

    def group_textlines(
        self,
        laparams: LAParams,
        lines: Iterable[LTTextLine],
    ) -> Iterator[LTTextBox]:
        """Group neighboring lines to textboxes.

        :param laparams: LAParams object supplying ``line_margin``.
        :param lines: the text lines to group; any iterable is accepted.
        :return: an iterator over the non-empty text boxes, each yielded once.
        """
        # ``lines`` is traversed three times below; materialize it so that a
        # one-shot iterator is not exhausted by the first pass.
        all_lines = list(lines)
        plane: Plane[LTTextLine] = Plane(self.bbox)
        plane.extend(all_lines)
        boxes: Dict[LTTextLine, LTTextBox] = {}
        for line in all_lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            members = [line]
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    # Absorb the lines of the box the neighbor belonged to.
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box: LTTextBox = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box
        done = set()
        for line in all_lines:
            if line not in boxes:
                continue
            box = boxes[line]
            if box in done:
                continue
            done.add(box)
            if not box.is_empty():
                yield box

    def group_textboxes(
        self,
        laparams: LAParams,
        boxes: Sequence[LTTextBox],
    ) -> List[LTTextGroup]:
        """Group textboxes hierarchically.

        Get pair-wise distances, via dist func defined below, and then merge
        from the closest textbox pair. Once obj1 and obj2 are merged /
        grouped, the resulting group is considered as a new object, and its
        distances to other objects & groups are added to the process queue.

        For performance reason, pair-wise distances and object pair info are
        maintained in a heap of
        (skip_isany, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures
        quick access to the smallest element. Note that since comparison
        operators, e.g., __lt__, are disabled for LTComponent, id(obj) has to
        appear before obj in element tuples.

        :param laparams: LAParams object.
        :param boxes: All textbox objects to be grouped.
        :return: the objects left in the plane once no pair remains to merge.
            With fewer than two boxes no merge takes place, so the list is
            empty or holds the single input box unchanged.
        """
        ElementT = Union[LTTextBox, LTTextGroup]
        plane: Plane[ElementT] = Plane(self.bbox)

        def dist(obj1: LTComponent, obj2: LTComponent) -> float:
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            return (
                (x1 - x0) * (y1 - y0)
                - obj1.width * obj1.height
                - obj2.width * obj2.height
            )

        def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]:
            """Check if there's any other object between obj1 and obj2."""
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            objs = set(plane.find((x0, y0, x1, y1)))
            return objs.difference((obj1, obj2))

        dists: List[Tuple[bool, float, int, int, ElementT, ElementT]] = []
        for i in range(len(boxes)):
            box1 = boxes[i]
            for j in range(i + 1, len(boxes)):
                box2 = boxes[j]
                dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
        heapq.heapify(dists)

        plane.extend(boxes)
        done = set()
        while dists:
            (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
            # Skip objects that are already merged
            if (id1 not in done) and (id2 not in done):
                if not skip_isany and isany(obj1, obj2):
                    # Something lies between the pair: requeue it once with
                    # skip_isany=True, which sorts after every False entry.
                    heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                    continue
                if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
                    obj2,
                    (LTTextBoxVertical, LTTextGroupTBRL),
                ):
                    group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
                else:
                    group = LTTextGroupLRTB([obj1, obj2])
                plane.remove(obj1)
                plane.remove(obj2)
                done.update([id1, id2])

                for other in plane:
                    heapq.heappush(
                        dists,
                        (False, dist(group, other), id(group), id(other), group, other),
                    )
                plane.add(group)
        # After at least one merge only groups are left in the plane; a lone
        # input box is returned as-is (the cast does not convert it).
        return [cast(LTTextGroup, g) for g in plane]

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis and replace the children with the result.

        Non-LTChar children are analyzed in place. If there are LTChar
        children they are grouped into lines and boxes, and ``_objs`` is
        rebuilt as text boxes, then the other objects, then the empty lines.
        Without any LTChar child the method returns after the first step and
        leaves ``_objs`` unchanged.
        """
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
        for obj in otherobjs:
            obj.analyze(laparams)
        if not textobjs:
            return
        textlines = list(self.group_objects(laparams, textobjs))
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        for obj in empties:
            obj.analyze(laparams)
        textboxes = list(self.group_textlines(laparams, textlines))
        if laparams.boxes_flow is None:
            for textbox in textboxes:
                textbox.analyze(laparams)

            def getkey(box: LTTextBox) -> Tuple[int, float, float]:
                # Vertical boxes sort before all others (leading 0 vs 1).
                if isinstance(box, LTTextBoxVertical):
                    return (0, -box.x1, -box.y0)
                else:
                    return (1, -box.y0, box.x0)

            textboxes.sort(key=getkey)
        else:
            self.groups = self.group_textboxes(laparams, textboxes)
            assigner = IndexAssigner()
            for group in self.groups:
                group.analyze(laparams)
                assigner.run(group)
            textboxes.sort(key=lambda box: box.index)
        self._objs = (
            cast(List[LTComponent], textboxes)
            + otherobjs
            + cast(List[LTComponent], empties)
        )

945

946

class LTFigure(LTLayoutContainer):
    """Represents an area used by PDF Form objects.

    PDF Forms can be used to present figures or pictures by embedding yet
    another PDF document within a page. Note that LTFigure objects can appear
    recursively.
    """

    def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Store ``name`` and ``matrix`` and derive the bounding box.

        ``bbox`` is unpacked as ``(x, y, w, h)``; its four corners are mapped
        through ``matrix`` and the bound of the mapped points becomes the
        bounding box of the figure.
        """
        self.name = name
        self.matrix = matrix
        (x, y, w, h) = bbox
        corners = ((x, y), (x + w, y), (x, y + h), (x + w, y + h))
        mapped_bbox = get_bound(apply_matrix_pt(matrix, (p, q)) for (p, q) in corners)
        LTLayoutContainer.__init__(self, mapped_bbox)

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}({self.name}) {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)}>"

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis only when ``laparams.all_texts`` is truthy."""
        if laparams.all_texts:
            LTLayoutContainer.analyze(self, laparams)

970

971

class LTPage(LTLayoutContainer):
    """Represents an entire page.

    Like any other LTLayoutContainer, an LTPage can be iterated to obtain child
    objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and LTLine.
    """

    def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None:
        """Create a page with the given id, bounding box and rotation value."""
        LTLayoutContainer.__init__(self, bbox)
        self.pageid, self.rotate = pageid, rotate

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}({self.pageid!r}) {bbox2str(self.bbox)} rotate={self.rotate!r}>"