Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/layout.py: 89%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
import heapq
import itertools
import logging
from typing import (
    Dict,
    Generic,
    Iterable,
    Iterator,
    List,
    Optional,
    Sequence,
    Set,
    Tuple,
    TypeVar,
    Union,
    cast,
)

from pdfminer.pdfcolor import PDFColorSpace
from pdfminer.pdfexceptions import PDFTypeError, PDFValueError
from pdfminer.pdffont import PDFFont
from pdfminer.pdfinterp import Color, PDFGraphicState
from pdfminer.pdftypes import PDFStream
from pdfminer.utils import (
    INF,
    LTComponentT,
    Matrix,
    PathSegment,
    Plane,
    Point,
    Rect,
    apply_matrix_rect,
    bbox2str,
    fsplit,
    get_bound,
    matrix2str,
    uniq,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class IndexAssigner:
    """Hands out consecutive indices to text boxes in traversal order."""

    def __init__(self, index: int = 0) -> None:
        # The index that the next visited text box will receive.
        self.index = index

    def run(self, obj: "LTItem") -> None:
        """Number `obj` if it is a text box, or recurse if it is a text group.

        Items that are neither an LTTextBox nor an LTTextGroup are ignored.
        """
        if isinstance(obj, LTTextBox):
            obj.index = self.index
            self.index += 1
            return
        if isinstance(obj, LTTextGroup):
            for child in obj:
                self.run(child)
class LAParams:
    """Parameters for layout analysis

    :param line_overlap: If two characters have more overlap than this they
        are considered to be on the same line. The overlap is specified
        relative to the minimum height of both characters.
    :param char_margin: If two characters are closer together than this
        margin they are considered part of the same line. The margin is
        specified relative to the width of the character.
    :param word_margin: If two characters on the same line are further apart
        than this margin then they are considered to be two separate words, and
        an intermediate space will be added for readability. The margin is
        specified relative to the width of the character.
    :param line_margin: If two lines are close together they are
        considered to be part of the same paragraph. The margin is
        specified relative to the height of a line.
    :param boxes_flow: Specifies how much a horizontal and vertical position
        of a text matters when determining the order of text boxes. The value
        should be within the range of -1.0 (only horizontal position
        matters) to +1.0 (only vertical position matters). You can also pass
        `None` to disable advanced layout analysis, and instead return text
        based on the position of the bottom left corner of the text box.
    :param detect_vertical: If vertical text should be considered during
        layout analysis
    :param all_texts: If layout analysis should be performed on text in
        figures.
    :raises PDFTypeError: if `boxes_flow` is neither `None` nor an int/float.
    :raises PDFValueError: if `boxes_flow` is a number outside -1..+1.
    """

    def __init__(
        self,
        line_overlap: float = 0.5,
        char_margin: float = 2.0,
        line_margin: float = 0.5,
        word_margin: float = 0.1,
        boxes_flow: Optional[float] = 0.5,
        detect_vertical: bool = False,
        all_texts: bool = False,
    ) -> None:
        self.line_overlap = line_overlap
        self.char_margin = char_margin
        self.line_margin = line_margin
        self.word_margin = word_margin
        self.boxes_flow = boxes_flow
        self.detect_vertical = detect_vertical
        self.all_texts = all_texts

        # Reject unusable settings right away, at construction time.
        self._validate()

    def _validate(self) -> None:
        """Check that `boxes_flow` is None or a number between -1 and +1."""
        flow = self.boxes_flow
        if flow is None:
            return

        boxes_flow_err_msg = (
            "LAParam boxes_flow should be None, or a number between -1 and +1"
        )
        if not isinstance(flow, (int, float)):
            raise PDFTypeError(boxes_flow_err_msg)
        if not -1 <= flow <= 1:
            raise PDFValueError(boxes_flow_err_msg)

    def __repr__(self) -> str:
        shown = (self.char_margin, self.line_margin, self.word_margin, self.all_texts)
        template = (
            "<LAParams: char_margin=%.1f, line_margin=%.1f, "
            "word_margin=%.1f all_texts=%r>"
        )
        return template % shown
class LTItem:
    """Interface for things that can be analyzed"""

    def analyze(self, laparams: LAParams) -> None:
        """Perform the layout analysis.

        The base implementation does nothing; subclasses override it.
        """
        return None
class LTText:
    """Interface for things that have text"""

    def __repr__(self) -> str:
        return "<{} {!r}>".format(self.__class__.__name__, self.get_text())

    def get_text(self) -> str:
        """Text contained in this object

        :raises NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError
class LTComponent(LTItem):
    """Object with a bounding box"""

    def __init__(self, bbox: Rect) -> None:
        LTItem.__init__(self)
        self.set_bbox(bbox)

    def __repr__(self) -> str:
        return "<{} {}>".format(self.__class__.__name__, bbox2str(self.bbox))

    # Ordering comparisons are deliberately unsupported: each one raises.
    def __lt__(self, _: object) -> bool:
        raise PDFValueError

    def __le__(self, _: object) -> bool:
        raise PDFValueError

    def __gt__(self, _: object) -> bool:
        raise PDFValueError

    def __ge__(self, _: object) -> bool:
        raise PDFValueError

    def set_bbox(self, bbox: Rect) -> None:
        """Store `bbox` as (x0, y0, x1, y1) and derive width and height."""
        left, bottom, right, top = bbox
        self.x0 = left
        self.y0 = bottom
        self.x1 = right
        self.y1 = top
        self.width = right - left
        self.height = top - bottom
        self.bbox = bbox

    def is_empty(self) -> bool:
        """Whether the box has a non-positive width or height."""
        return self.width <= 0 or self.height <= 0

    def is_hoverlap(self, obj: "LTComponent") -> bool:
        """Whether the x-ranges of `self` and `obj` touch or intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return self.x0 <= obj.x1 and obj.x0 <= self.x1

    def hdistance(self, obj: "LTComponent") -> float:
        """Horizontal gap to `obj`; 0 when the x-ranges overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_hoverlap(obj):
            return 0
        gap_a = abs(self.x0 - obj.x1)
        gap_b = abs(self.x1 - obj.x0)
        return min(gap_a, gap_b)

    def hoverlap(self, obj: "LTComponent") -> float:
        """Horizontal overlap measure with `obj`; 0 when there is none."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_hoverlap(obj):
            return 0
        span_a = abs(self.x0 - obj.x1)
        span_b = abs(self.x1 - obj.x0)
        return min(span_a, span_b)

    def is_voverlap(self, obj: "LTComponent") -> bool:
        """Whether the y-ranges of `self` and `obj` touch or intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return self.y0 <= obj.y1 and obj.y0 <= self.y1

    def vdistance(self, obj: "LTComponent") -> float:
        """Vertical gap to `obj`; 0 when the y-ranges overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_voverlap(obj):
            return 0
        gap_a = abs(self.y0 - obj.y1)
        gap_b = abs(self.y1 - obj.y0)
        return min(gap_a, gap_b)

    def voverlap(self, obj: "LTComponent") -> float:
        """Vertical overlap measure with `obj`; 0 when there is none."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_voverlap(obj):
            return 0
        span_a = abs(self.y0 - obj.y1)
        span_b = abs(self.y1 - obj.y0)
        return min(span_a, span_b)
class LTCurve(LTComponent):
    """A generic Bezier curve

    The parameter `original_path` contains the original
    pathing information from the pdf (e.g. for reconstructing Bezier Curves).

    `dashing_style` contains the Dashing information if any.

    The bounding box is computed from `pts` via `get_bound`.
    """

    def __init__(
        self,
        linewidth: float,
        pts: List[Point],
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Optional[Color] = None,
        non_stroking_color: Optional[Color] = None,
        original_path: Optional[List[PathSegment]] = None,
        dashing_style: Optional[Tuple[object, object]] = None,
    ) -> None:
        LTComponent.__init__(self, get_bound(pts))
        # Geometry
        self.pts = pts
        self.original_path = original_path
        # Painting attributes
        self.linewidth = linewidth
        self.stroke = stroke
        self.fill = fill
        self.evenodd = evenodd
        self.stroking_color = stroking_color
        self.non_stroking_color = non_stroking_color
        self.dashing_style = dashing_style

    def get_pts(self) -> str:
        """Return the points as comma-separated "x,y" pairs with 3 decimals."""
        formatted = ["%.3f,%.3f" % point for point in self.pts]
        return ",".join(formatted)
class LTLine(LTCurve):
    """A single straight line.

    Could be used for separating text or figures.
    """

    def __init__(
        self,
        linewidth: float,
        p0: Point,
        p1: Point,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Optional[Color] = None,
        non_stroking_color: Optional[Color] = None,
        original_path: Optional[List[PathSegment]] = None,
        dashing_style: Optional[Tuple[object, object]] = None,
    ) -> None:
        # A line is a curve made of exactly its two end points.
        endpoints = [p0, p1]
        LTCurve.__init__(
            self,
            linewidth,
            endpoints,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )
class LTRect(LTCurve):
    """A rectangle.

    Could be used for framing another pictures or figures.
    """

    def __init__(
        self,
        linewidth: float,
        bbox: Rect,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Optional[Color] = None,
        non_stroking_color: Optional[Color] = None,
        original_path: Optional[List[PathSegment]] = None,
        dashing_style: Optional[Tuple[object, object]] = None,
    ) -> None:
        left, bottom, right, top = bbox
        # Corners in the order: (x0, y0), (x1, y0), (x1, y1), (x0, y1).
        corners = [(left, bottom), (right, bottom), (right, top), (left, top)]
        LTCurve.__init__(
            self,
            linewidth,
            corners,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )
class LTImage(LTComponent):
    """An image object.

    Embedded images can be in JPEG, Bitmap or JBIG2.
    """

    def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        self.name = name
        self.stream = stream

        # Each property is looked up under its short and its long key.
        width = stream.get_any(("W", "Width"))
        height = stream.get_any(("H", "Height"))
        self.srcsize = (width, height)
        self.imagemask = stream.get_any(("IM", "ImageMask"))
        self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1)

        # The colorspace is always stored as a list, even for a single entry.
        colorspace = stream.get_any(("CS", "ColorSpace"))
        self.colorspace = colorspace if isinstance(colorspace, list) else [colorspace]

    def __repr__(self) -> str:
        return "<{}({}) {} {!r}>".format(
            self.__class__.__name__,
            self.name,
            bbox2str(self.bbox),
            self.srcsize,
        )
class LTAnno(LTItem, LTText):
    """Actual letter in the text as a Unicode string.

    Unlike LTChar, an LTAnno has no boundaries of its own: it is a "virtual"
    character inserted by a layout analyzer according to the relationship
    between two characters (e.g. a space).
    """

    def __init__(self, text: str) -> None:
        # Note: neither base class initializer is invoked here.
        self._text = text

    def get_text(self) -> str:
        """Return the text given at construction."""
        return self._text
class LTChar(LTComponent, LTText):
    """Actual letter in the text as a Unicode string."""

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: Union[float, Tuple[Optional[float], float]],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> None:
        """Compute the advance and the bounding box of a rendered character.

        The box is first built in text space (differently for vertical and
        horizontal fonts), then mapped through `matrix` and normalized so
        that x0 <= x1 and y0 <= y1.
        """
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        self.adv = textwidth * fontsize * scaling

        # Compute the boundary rectangle in text space.
        if font.is_vertical():
            # Vertical writing: textdisp must be a (vx, vy) pair.
            assert isinstance(textdisp, tuple)
            disp_x, disp_y = textdisp
            if disp_x is None:
                disp_x = fontsize * 0.5
            else:
                disp_x = disp_x * fontsize * 0.001
            disp_y = (1000 - disp_y) * fontsize * 0.001
            edge_y = disp_y + rise
            bbox = (-disp_x, edge_y + self.adv, -disp_x + fontsize, edge_y)
        else:
            # Horizontal writing: the box rests on the font descent.
            descent = font.get_descent() * fontsize
            base_y = descent + rise
            bbox = (0, base_y, self.adv, base_y + fontsize)

        a, b, c, d, _e, _f = self.matrix
        self.upright = a * d * scaling > 0 and b * c <= 0

        # Map to device space, then order the corners.
        left, bottom, right, top = apply_matrix_rect(self.matrix, bbox)
        if right < left:
            left, right = right, left
        if top < bottom:
            bottom, top = top, bottom
        LTComponent.__init__(self, (left, bottom, right, top))

        # `size` is the box extent across the writing direction.
        if font.is_vertical():
            self.size = self.width
        else:
            self.size = self.height

    def __repr__(self) -> str:
        return (
            f"<{self.__class__.__name__} {bbox2str(self.bbox)} "
            f"matrix={matrix2str(self.matrix)} font={self.fontname!r} "
            f"adv={self.adv} text={self.get_text()!r}>"
        )

    def get_text(self) -> str:
        """Return the character text given at construction."""
        return self._text
# Type of the items held by an LTContainer.
LTItemT = TypeVar("LTItemT", bound=LTItem)


class LTContainer(LTComponent, Generic[LTItemT]):
    """Object that can be extended and analyzed"""

    def __init__(self, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        # Children, in insertion order.
        self._objs: List[LTItemT] = []

    def __iter__(self) -> Iterator[LTItemT]:
        return iter(self._objs)

    def __len__(self) -> int:
        return len(self._objs)

    def add(self, obj: LTItemT) -> None:
        """Append a single child."""
        self._objs.append(obj)

    def extend(self, objs: Iterable[LTItemT]) -> None:
        """Add every item of `objs` through `add`, so overrides are honored."""
        for item in objs:
            self.add(item)

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis on each child in turn."""
        for child in self._objs:
            child.analyze(laparams)
class LTExpandableContainer(LTContainer[LTItemT]):
    """Container whose bounding box grows to enclose every added child."""

    def __init__(self) -> None:
        # Start from an inverted infinite box so the first child defines it.
        LTContainer.__init__(self, (+INF, +INF, -INF, -INF))

    # Incompatible override: we take an LTComponent (with bounding box), but
    # super() LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj` and enlarge the bounding box to include it."""
        LTContainer.add(self, cast(LTItemT, obj))
        left = min(self.x0, obj.x0)
        bottom = min(self.y0, obj.y0)
        right = max(self.x1, obj.x1)
        top = max(self.y1, obj.y1)
        self.set_bbox((left, bottom, right, top))
class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
    """Expandable container whose text is the concatenation of its children's."""

    def __init__(self) -> None:
        LTText.__init__(self)
        LTExpandableContainer.__init__(self)

    def get_text(self) -> str:
        """Join the text of every child that is an LTText, in order."""
        pieces = []
        for child in self:
            if isinstance(child, LTText):
                pieces.append(cast(LTText, child).get_text())
        return "".join(pieces)
# What a text line may contain: real characters and virtual ones.
TextLineElement = Union[LTChar, LTAnno]


class LTTextLine(LTTextContainer[TextLineElement]):
    """Contains a list of LTChar objects that represent a single text line.

    The characters are aligned either horizontally or vertically, depending on
    the text's writing mode.
    """

    def __init__(self, word_margin: float) -> None:
        super().__init__()
        self.word_margin = word_margin

    def __repr__(self) -> str:
        return "<{} {} {!r}>".format(
            self.__class__.__name__,
            bbox2str(self.bbox),
            self.get_text(),
        )

    def analyze(self, laparams: LAParams) -> None:
        """Analyze every element, then terminate the line with a newline."""
        for element in self._objs:
            element.analyze(laparams)
        # Added through LTContainer.add so the bounding box is not touched.
        LTContainer.add(self, LTAnno("\n"))

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List["LTTextLine"]:
        """Return nearby lines in `plane`; must be provided by subclasses."""
        raise NotImplementedError

    def is_empty(self) -> bool:
        """Whether the box is degenerate or the text is only whitespace."""
        if super().is_empty():
            return True
        return self.get_text().isspace()
class LTTextLineHorizontal(LTTextLine):
    """A text line whose characters run horizontally."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Right edge of the most recently added object.
        self._x1: float = +INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj`, preceded by a space if it starts a new word.

        A space is inserted when `obj` is an LTChar, `word_margin` is truthy
        and the gap from the previous right edge exceeds the word margin.
        """
        if isinstance(obj, LTChar) and self.word_margin:
            margin = self.word_margin * max(obj.width, obj.height)
            gap_threshold = obj.x0 - margin
            if self._x1 < gap_threshold:
                LTContainer.add(self, LTAnno(" "))
        self._x1 = obj.x1
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List[LTTextLine]:
        """Finds neighboring LTTextLineHorizontals in the plane.

        Returns a list of other LTTextLineHorizontals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned objects
        will be the same height as self, and also either left-, right-, or
        centrally-aligned.
        """
        d = ratio * self.height
        search_area = (self.x0, self.y0 - d, self.x1, self.y1 + d)
        neighbors: List[LTTextLine] = []
        for candidate in plane.find(search_area):
            if not isinstance(candidate, LTTextLineHorizontal):
                continue
            if not self._is_same_height_as(candidate, tolerance=d):
                continue
            if (
                self._is_left_aligned_with(candidate, tolerance=d)
                or self._is_right_aligned_with(candidate, tolerance=d)
                or self._is_centrally_aligned_with(candidate, tolerance=d)
            ):
                neighbors.append(candidate)
        return neighbors

    def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the left-hand edge of `other` is within `tolerance`."""
        offset = abs(other.x0 - self.x0)
        return offset <= tolerance

    def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the right-hand edge of `other` is within `tolerance`."""
        offset = abs(other.x1 - self.x1)
        return offset <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the horizontal center of `other` is within `tolerance`."""
        other_center = (other.x0 + other.x1) / 2
        own_center = (self.x0 + self.x1) / 2
        return abs(other_center - own_center) <= tolerance

    def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the height of `other` is within `tolerance` of ours."""
        difference = abs(other.height - self.height)
        return difference <= tolerance
class LTTextLineVertical(LTTextLine):
    """A text line whose characters run vertically."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Lower edge of the most recently added object.
        self._y0: float = -INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj`, preceded by a space if it starts a new word.

        A space is inserted when `obj` is an LTChar, `word_margin` is truthy
        and the top of `obj` lies more than the word margin below the
        previous lower edge.
        """
        if isinstance(obj, LTChar) and self.word_margin:
            margin = self.word_margin * max(obj.width, obj.height)
            if obj.y1 + margin < self._y0:
                LTContainer.add(self, LTAnno(" "))
        self._y0 = obj.y0
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List[LTTextLine]:
        """Finds neighboring LTTextLineVerticals in the plane.

        Returns a list of other LTTextLineVerticals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned objects
        will be the same width as self, and also either upper-, lower-, or
        centrally-aligned.
        """
        d = ratio * self.width
        objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
        return [
            obj
            for obj in objs
            if (
                isinstance(obj, LTTextLineVertical)
                and self._is_same_width_as(obj, tolerance=d)
                and (
                    self._is_lower_aligned_with(obj, tolerance=d)
                    or self._is_upper_aligned_with(obj, tolerance=d)
                    or self._is_centrally_aligned_with(obj, tolerance=d)
                )
            )
        ]

    def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the lower edge of `other` is within `tolerance`."""
        return abs(other.y0 - self.y0) <= tolerance

    def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the upper edge of `other` is within `tolerance`."""
        return abs(other.y1 - self.y1) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the vertical center of `other` is within `tolerance`."""
        return abs((other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance

    # `tolerance` defaults to 0 like every other alignment helper here and
    # like `LTTextLineHorizontal._is_same_height_as`.
    def _is_same_width_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the width of `other` is within `tolerance` of ours."""
        return abs(other.width - self.width) <= tolerance
class LTTextBox(LTTextContainer[LTTextLine]):
    """Represents a group of text chunks in a rectangular area.

    Note that this box is created by geometric analysis and does not
    necessarily represents a logical boundary of the text. It contains a list
    of LTTextLine objects.
    """

    def __init__(self) -> None:
        LTTextContainer.__init__(self)
        # -1 until an index has been assigned.
        self.index: int = -1

    def __repr__(self) -> str:
        return "<{}({}) {} {!r}>".format(
            self.__class__.__name__,
            self.index,
            bbox2str(self.bbox),
            self.get_text(),
        )

    def get_writing_mode(self) -> str:
        """Return the writing mode; must be provided by subclasses."""
        raise NotImplementedError
class LTTextBoxHorizontal(LTTextBox):
    """Text box made of horizontal lines."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them by descending top edge."""
        super().analyze(laparams)

        def negated_top(line: LTTextLine) -> float:
            return -line.y1

        self._objs.sort(key=negated_top)

    def get_writing_mode(self) -> str:
        return "lr-tb"
class LTTextBoxVertical(LTTextBox):
    """Text box made of vertical lines."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them by descending right edge."""
        super().analyze(laparams)

        def negated_right(line: LTTextLine) -> float:
            return -line.x1

        self._objs.sort(key=negated_right)

    def get_writing_mode(self) -> str:
        return "tb-rl"
# A text group holds text boxes and/or nested text groups.
TextGroupElement = Union[LTTextBox, "LTTextGroup"]


class LTTextGroup(LTTextContainer[TextGroupElement]):
    """Container grouping text boxes and other text groups."""

    def __init__(self, objs: Iterable[TextGroupElement]) -> None:
        super().__init__()
        # Members are added one by one through `add` via `extend`.
        self.extend(objs)
class LTTextGroupLRTB(LTTextGroup):
    """Text group whose members are ordered from top-left to bottom-right."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them weighted by `boxes_flow`."""
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        boxes_flow = laparams.boxes_flow

        def reading_order(obj: TextGroupElement) -> float:
            horizontal = (1 - boxes_flow) * obj.x0
            vertical = (1 + boxes_flow) * (obj.y0 + obj.y1)
            return horizontal - vertical

        # reorder the objects from top-left to bottom-right.
        self._objs.sort(key=reading_order)
class LTTextGroupTBRL(LTTextGroup):
    """Text group whose members are ordered from top-right to bottom-left."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them weighted by `boxes_flow`."""
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        boxes_flow = laparams.boxes_flow

        def reading_order(obj: TextGroupElement) -> float:
            horizontal = -(1 + boxes_flow) * (obj.x0 + obj.x1)
            vertical = (1 - boxes_flow) * obj.y1
            return horizontal - vertical

        # reorder the objects from top-right to bottom-left.
        self._objs.sort(key=reading_order)
class LTLayoutContainer(LTContainer[LTComponent]):
    """Container that performs layout analysis on the characters it holds.

    `analyze` groups LTChar children into text lines, lines into text boxes
    and, unless `boxes_flow` is None, boxes into a hierarchy of text groups
    stored in `groups`.
    """

    def __init__(self, bbox: Rect) -> None:
        LTContainer.__init__(self, bbox)
        # Set by `analyze` only when `laparams.boxes_flow` is not None.
        self.groups: Optional[List[LTTextGroup]] = None

    # group_objects: group text object to textlines.
    def group_objects(
        self,
        laparams: LAParams,
        objs: Iterable[LTComponent],
    ) -> Iterator[LTTextLine]:
        """Group consecutive objects into text lines.

        Each object is compared with its predecessor only. An object that is
        aligned with the predecessor extends the current line; otherwise the
        current line is yielded and a new one is started.

        :param laparams: supplies line_overlap, char_margin, word_margin and
            detect_vertical.
        :param objs: the objects to group, in order.
        :return: an iterator over the resulting lines; empty when `objs` is
            empty.
        """
        obj0 = None
        line = None
        for obj1 in objs:
            if obj0 is not None:
                # halign: obj0 and obj1 is horizontally aligned.
                #
                #   +------+ - - -
                #   | obj0 | - - +------+   -
                #   |      |     | obj1 |   | (line_overlap)
                #   +------+ - - |      |   -
                #          - - - +------+
                #
                #          |<--->|
                #        (char_margin)
                halign = (
                    obj0.is_voverlap(obj1)
                    and min(obj0.height, obj1.height) * laparams.line_overlap
                    < obj0.voverlap(obj1)
                    and obj0.hdistance(obj1)
                    < max(obj0.width, obj1.width) * laparams.char_margin
                )

                # valign: obj0 and obj1 is vertically aligned.
                #
                #   +------+
                #   | obj0 |
                #   |      |
                #   +------+ - - -
                #     |    |     | (char_margin)
                #     +------+ - -
                #     | obj1 |
                #     |      |
                #     +------+
                #
                #     |<-->|
                #   (line_overlap)
                valign = (
                    laparams.detect_vertical
                    and obj0.is_hoverlap(obj1)
                    and min(obj0.width, obj1.width) * laparams.line_overlap
                    < obj0.hoverlap(obj1)
                    and obj0.vdistance(obj1)
                    < max(obj0.height, obj1.height) * laparams.char_margin
                )

                if (halign and isinstance(line, LTTextLineHorizontal)) or (
                    valign and isinstance(line, LTTextLineVertical)
                ):
                    # obj1 continues the current line.
                    line.add(obj1)
                elif line is not None:
                    # obj1 breaks the current line; it starts a new one on
                    # the next iteration, once it has become obj0.
                    yield line
                    line = None
                elif valign and not halign:
                    line = LTTextLineVertical(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                elif halign and not valign:
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                else:
                    # obj0 stands alone on its own line.
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    yield line
                    line = None
            obj0 = obj1
        if obj0 is None:
            # No objects at all: there is no line to yield.
            return
        if line is None:
            # The last object did not join a line; give it its own.
            line = LTTextLineHorizontal(laparams.word_margin)
            line.add(obj0)
        yield line

    def group_textlines(
        self,
        laparams: LAParams,
        lines: Iterable[LTTextLine],
    ) -> Iterator[LTTextBox]:
        """Group neighboring lines to textboxes

        :param laparams: supplies line_margin, passed to `find_neighbors`.
        :param lines: the lines to group; any iterable, including a one-shot
            iterator.
        :return: an iterator over the non-empty boxes, each yielded once.
        """
        # `lines` is traversed three times below, so materialize it first.
        all_lines: List[LTTextLine] = list(lines)
        plane: Plane[LTTextLine] = Plane(self.bbox)
        plane.extend(all_lines)
        # Maps each line to the box currently containing it.
        boxes: Dict[LTTextLine, LTTextBox] = {}
        for line in all_lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            members = [line]
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    # Absorb the lines of the box obj1 already belonged to.
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box: LTTextBox = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box
        done = set()
        for line in all_lines:
            if line not in boxes:
                continue
            box = boxes[line]
            if box in done:
                continue
            done.add(box)
            if not box.is_empty():
                yield box

    def group_textboxes(
        self,
        laparams: LAParams,
        boxes: Sequence[LTTextBox],
    ) -> List[LTTextGroup]:
        """Group textboxes hierarchically.

        Get pair-wise distances, via dist func defined below, and then merge
        from the closest textbox pair. Once obj1 and obj2 are merged /
        grouped, the resulting group is considered as a new object, and its
        distances to other objects & groups are added to the process queue.

        For performance reason, pair-wise distances and object pair info are
        maintained in a heap of
        (skip_isany, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures
        quick access to the smallest element. Note that since comparison
        operators, e.g., __lt__, are disabled for LTComponent, id(obj) has to
        appear before obj in element tuples.

        :param laparams: LAParams object.
        :param boxes: All textbox objects to be grouped.
        :return: whatever remains in the plane once no pair is left to merge.
        """
        ElementT = Union[LTTextBox, LTTextGroup]
        plane: Plane[ElementT] = Plane(self.bbox)

        def dist(obj1: LTComponent, obj2: LTComponent) -> float:
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            return (
                (x1 - x0) * (y1 - y0)
                - obj1.width * obj1.height
                - obj2.width * obj2.height
            )

        def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]:
            """Check if there's any other object between obj1 and obj2."""
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            objs = set(plane.find((x0, y0, x1, y1)))
            return objs.difference((obj1, obj2))

        # One heap entry per unordered pair of boxes.
        dists: List[Tuple[bool, float, int, int, ElementT, ElementT]] = [
            (False, dist(box1, box2), id(box1), id(box2), box1, box2)
            for box1, box2 in itertools.combinations(boxes, 2)
        ]
        heapq.heapify(dists)

        plane.extend(boxes)
        done = set()
        while dists:
            (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
            # Skip objects that are already merged
            if id1 in done or id2 in done:
                continue
            if not skip_isany and isany(obj1, obj2):
                # Something lies between the two: push the pair back with
                # the flag set, so it sorts after all unflagged pairs and is
                # merged without this check next time.
                heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                continue
            if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
                obj2,
                (LTTextBoxVertical, LTTextGroupTBRL),
            ):
                group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
            else:
                group = LTTextGroupLRTB([obj1, obj2])
            plane.remove(obj1)
            plane.remove(obj2)
            done.update([id1, id2])

            # Queue the distances from the new group to everything left.
            for other in plane:
                heapq.heappush(
                    dists,
                    (False, dist(group, other), id(group), id(other), group, other),
                )
            plane.add(group)
        # Every merge replaced two members by one group. A lone input box is
        # never merged, so it is returned as-is.
        return [cast(LTTextGroup, g) for g in plane]

    def analyze(self, laparams: LAParams) -> None:
        """Run the layout analysis and replace the children with its result.

        Afterwards the children are, in order: the text boxes, the non-text
        objects, and the empty text lines. Nothing is regrouped when the
        container holds no LTChar.
        """
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
        for obj in otherobjs:
            obj.analyze(laparams)
        if not textobjs:
            return
        textlines = list(self.group_objects(laparams, textobjs))
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        for obj in empties:
            obj.analyze(laparams)
        textboxes = list(self.group_textlines(laparams, textlines))
        if laparams.boxes_flow is None:
            for textbox in textboxes:
                textbox.analyze(laparams)

            def getkey(box: LTTextBox) -> Tuple[int, float, float]:
                # Vertical boxes sort first, by descending x1 then
                # descending y0; the others by descending y0 then x0.
                if isinstance(box, LTTextBoxVertical):
                    return (0, -box.x1, -box.y0)
                else:
                    return (1, -box.y0, box.x0)

            textboxes.sort(key=getkey)
        else:
            self.groups = self.group_textboxes(laparams, textboxes)
            assigner = IndexAssigner()
            for group in self.groups:
                group.analyze(laparams)
                assigner.run(group)
            textboxes.sort(key=lambda box: box.index)
        self._objs = (
            cast(List[LTComponent], textboxes)
            + otherobjs
            + cast(List[LTComponent], empties)
        )
class LTFigure(LTLayoutContainer):
    """Represents an area used by PDF Form objects.

    PDF Forms can be used to present figures or pictures by embedding yet
    another PDF document within a page. Note that LTFigure objects can appear
    recursively.
    """

    def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        self.name = name
        self.matrix = matrix
        # `bbox` arrives as (x, y, width, height); turn it into corner form
        # before mapping it through the matrix.
        left, bottom, width, height = bbox
        corners = (left, bottom, left + width, bottom + height)
        LTLayoutContainer.__init__(self, apply_matrix_rect(matrix, corners))

    def __repr__(self) -> str:
        return "<{}({}) {} matrix={}>".format(
            self.__class__.__name__,
            self.name,
            bbox2str(self.bbox),
            matrix2str(self.matrix),
        )

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the figure only when `laparams.all_texts` is set."""
        if laparams.all_texts:
            LTLayoutContainer.analyze(self, laparams)
class LTPage(LTLayoutContainer):
    """Represents an entire page.

    Like any other LTLayoutContainer, an LTPage can be iterated to obtain child
    objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and LTLine.
    """

    def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None:
        LTLayoutContainer.__init__(self, bbox)
        self.rotate = rotate
        self.pageid = pageid

    def __repr__(self) -> str:
        return "<{}({!r}) {} rotate={!r}>".format(
            self.__class__.__name__,
            self.pageid,
            bbox2str(self.bbox),
            self.rotate,
        )