Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/layout.py: 87%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import heapq
2import logging
3from collections.abc import Iterable, Iterator, Sequence
4from typing import (
5 Generic,
6 TypeVar,
7 Union,
8 cast,
9)
11from pdfminer.pdfcolor import PDFColorSpace
12from pdfminer.pdfexceptions import PDFTypeError, PDFValueError
13from pdfminer.pdffont import PDFFont
14from pdfminer.pdfinterp import Color, PDFGraphicState
15from pdfminer.pdftypes import PDFStream
16from pdfminer.utils import (
17 INF,
18 LTComponentT,
19 Matrix,
20 PathSegment,
21 Plane,
22 Point,
23 Rect,
24 apply_matrix_rect,
25 bbox2str,
26 fsplit,
27 get_bound,
28 matrix2str,
29 uniq,
30)
32logger = logging.getLogger(__name__)
class IndexAssigner:
    """Hands out consecutive index numbers to the text boxes of a layout tree."""

    def __init__(self, index: int = 0) -> None:
        # The number that the next text box encountered will receive.
        self.index = index

    def run(self, obj: "LTItem") -> None:
        """Number `obj` if it is a text box, or descend into it if it is a group.

        Objects that are neither an LTTextBox nor an LTTextGroup are ignored.
        """
        if isinstance(obj, LTTextBox):
            obj.index = self.index
            self.index += 1
            return
        if isinstance(obj, LTTextGroup):
            for child in obj:
                self.run(child)
48class LAParams:
49 """Parameters for layout analysis
51 :param line_overlap: If two characters have more overlap than this they
52 are considered to be on the same line. The overlap is specified
53 relative to the minimum height of both characters.
54 :param char_margin: If two characters are closer together than this
55 margin they are considered part of the same line. The margin is
56 specified relative to the width of the character.
57 :param word_margin: If two characters on the same line are further apart
58 than this margin then they are considered to be two separate words, and
59 an intermediate space will be added for readability. The margin is
60 specified relative to the width of the character.
61 :param line_margin: If two lines are are close together they are
62 considered to be part of the same paragraph. The margin is
63 specified relative to the height of a line.
64 :param boxes_flow: Specifies how much a horizontal and vertical position
65 of a text matters when determining the order of text boxes. The value
66 should be within the range of -1.0 (only horizontal position
67 matters) to +1.0 (only vertical position matters). You can also pass
68 `None` to disable advanced layout analysis, and instead return text
69 based on the position of the bottom left corner of the text box.
70 :param detect_vertical: If vertical text should be considered during
71 layout analysis
72 :param all_texts: If layout analysis should be performed on text in
73 figures.
74 """
76 def __init__(
77 self,
78 line_overlap: float = 0.5,
79 char_margin: float = 2.0,
80 line_margin: float = 0.5,
81 word_margin: float = 0.1,
82 boxes_flow: float | None = 0.5,
83 detect_vertical: bool = False,
84 all_texts: bool = False,
85 ) -> None:
86 self.line_overlap = line_overlap
87 self.char_margin = char_margin
88 self.line_margin = line_margin
89 self.word_margin = word_margin
90 self.boxes_flow = boxes_flow
91 self.detect_vertical = detect_vertical
92 self.all_texts = all_texts
94 self._validate()
96 def _validate(self) -> None:
97 if self.boxes_flow is not None:
98 boxes_flow_err_msg = (
99 "LAParam boxes_flow should be None, or a number between -1 and +1"
100 )
101 if not (isinstance(self.boxes_flow, (int, float))):
102 raise PDFTypeError(boxes_flow_err_msg)
103 if not -1 <= self.boxes_flow <= 1:
104 raise PDFValueError(boxes_flow_err_msg)
106 def __repr__(self) -> str:
107 return (
108 f"<LAParams: char_margin={self.char_margin:.1f}, "
109 f"line_margin={self.line_margin:.1f}, "
110 f"word_margin={self.word_margin:.1f} "
111 f"all_texts={self.all_texts!r}>"
112 )
class LTItem:
    """Interface for things that can be analyzed"""

    def analyze(self, laparams: LAParams) -> None:
        """Perform the layout analysis.

        The base implementation does nothing; subclasses override it.
        """
        return None
class LTText:
    """Interface for things that have text"""

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        text = self.get_text()
        return f"<{class_name} {text!r}>"

    def get_text(self) -> str:
        """Text contained in this object

        Must be provided by subclasses; the base version always raises.
        """
        raise NotImplementedError
class LTComponent(LTItem):
    """Object with a bounding box"""

    def __init__(self, bbox: Rect) -> None:
        LTItem.__init__(self)
        self.set_bbox(bbox)

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f"<{class_name} {bbox2str(self.bbox)}>"

    # Ordering comparisons are deliberately disabled: each one raises.
    def __lt__(self, _: object) -> bool:
        raise PDFValueError

    def __le__(self, _: object) -> bool:
        raise PDFValueError

    def __gt__(self, _: object) -> bool:
        raise PDFValueError

    def __ge__(self, _: object) -> bool:
        raise PDFValueError

    def set_bbox(self, bbox: Rect) -> None:
        """Store `bbox` along with its corner coordinates, width and height."""
        (left, bottom, right, top) = bbox
        self.x0 = left
        self.y0 = bottom
        self.x1 = right
        self.y1 = top
        self.width = right - left
        self.height = top - bottom
        self.bbox = bbox

    def is_empty(self) -> bool:
        """True when the width or the height is zero or negative."""
        return self.width <= 0 or self.height <= 0

    def is_hoverlap(self, obj: "LTComponent") -> bool:
        """True when the x ranges of self and `obj` intersect or touch."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return self.x1 >= obj.x0 and obj.x1 >= self.x0

    def hdistance(self, obj: "LTComponent") -> float:
        """Horizontal gap to `obj`; 0 when the x ranges overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def hoverlap(self, obj: "LTComponent") -> float:
        """Horizontal overlap measure with `obj`; 0 when the x ranges are apart."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def is_voverlap(self, obj: "LTComponent") -> bool:
        """True when the y ranges of self and `obj` intersect or touch."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return self.y1 >= obj.y0 and obj.y1 >= self.y0

    def vdistance(self, obj: "LTComponent") -> float:
        """Vertical gap to `obj`; 0 when the y ranges overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_voverlap(obj):
            return 0
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))

    def voverlap(self, obj: "LTComponent") -> float:
        """Vertical overlap measure with `obj`; 0 when the y ranges are apart."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_voverlap(obj):
            return 0
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))
class LTCurve(LTComponent):
    """A generic Bezier curve

    The parameter `original_path` contains the original
    pathing information from the pdf (e.g. for reconstructing Bezier Curves).

    `dashing_style` contains the Dashing information if any.
    """

    def __init__(
        self,
        linewidth: float,
        pts: list[Point],
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Color | None = None,
        non_stroking_color: Color | None = None,
        original_path: list[PathSegment] | None = None,
        dashing_style: tuple[object, object] | None = None,
    ) -> None:
        # The bounding box is whatever get_bound() reports for the points.
        bounds = get_bound(pts)
        LTComponent.__init__(self, bounds)
        self.pts = pts
        self.linewidth = linewidth
        self.stroke = stroke
        self.fill = fill
        self.evenodd = evenodd
        self.stroking_color = stroking_color
        self.non_stroking_color = non_stroking_color
        self.original_path = original_path
        self.dashing_style = dashing_style

    def get_pts(self) -> str:
        """Return the points as one comma-separated string, 3 decimals each."""
        formatted = ["{:.3f},{:.3f}".format(*point) for point in self.pts]
        return ",".join(formatted)
class LTLine(LTCurve):
    """A single straight line.

    Could be used for separating text or figures.
    """

    def __init__(
        self,
        linewidth: float,
        p0: Point,
        p1: Point,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Color | None = None,
        non_stroking_color: Color | None = None,
        original_path: list[PathSegment] | None = None,
        dashing_style: tuple[object, object] | None = None,
    ) -> None:
        # A line is a curve made of exactly its two end points.
        end_points = [p0, p1]
        LTCurve.__init__(
            self,
            linewidth,
            end_points,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )
class LTRect(LTCurve):
    """A rectangle.

    Could be used for framing another pictures or figures.
    """

    def __init__(
        self,
        linewidth: float,
        bbox: Rect,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Color | None = None,
        non_stroking_color: Color | None = None,
        original_path: list[PathSegment] | None = None,
        dashing_style: tuple[object, object] | None = None,
    ) -> None:
        (left, bottom, right, top) = bbox
        # Corner order: (x0, y0), (x1, y0), (x1, y1), (x0, y1).
        corners = [(left, bottom), (right, bottom), (right, top), (left, top)]
        LTCurve.__init__(
            self,
            linewidth,
            corners,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )
class LTImage(LTComponent):
    """An image object.

    Embedded images can be in JPEG, Bitmap or JBIG2.
    """

    def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        self.name = name
        self.stream = stream

        # Each stream entry is looked up under its short and long key.
        src_width = stream.get_any(("W", "Width"))
        src_height = stream.get_any(("H", "Height"))
        self.srcsize = (src_width, src_height)
        self.imagemask = stream.get_any(("IM", "ImageMask"))
        self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1)

        # colorspace is always stored as a list; a non-list value is wrapped.
        colorspace = stream.get_any(("CS", "ColorSpace"))
        if isinstance(colorspace, list):
            self.colorspace = colorspace
        else:
            self.colorspace = [colorspace]

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f"<{class_name}({self.name}) {bbox2str(self.bbox)} {self.srcsize!r}>"
class LTAnno(LTItem, LTText):
    """Actual letter in the text as a Unicode string.

    Note that, while a LTChar object has actual boundaries, LTAnno objects do
    not, as these are "virtual" characters, inserted by a layout analyzer
    according to the relationship between two characters (e.g. a space).
    """

    def __init__(self, text: str) -> None:
        # Only the text is stored; no bounding box is set up.
        self._text = text

    def get_text(self) -> str:
        """Return the text given at construction."""
        return self._text
class LTChar(LTComponent, LTText):
    """Actual letter in the text as a Unicode string."""

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: float | tuple[float | None, float],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> None:
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        self.adv = textwidth * fontsize * scaling

        # Build the glyph rectangle before the matrix is applied.
        if font.is_vertical():
            # Vertical writing: textdisp must be a (vx, vy) pair.
            assert isinstance(textdisp, tuple)
            (disp_x, disp_y) = textdisp
            if disp_x is None:
                disp_x = fontsize * 0.5
            else:
                disp_x = disp_x * fontsize * 0.001
            disp_y = (1000 - disp_y) * fontsize * 0.001
            local_box = (
                -disp_x,
                disp_y + rise + self.adv,
                -disp_x + fontsize,
                disp_y + rise,
            )
        else:
            # Horizontal writing: the box starts at the font descent.
            descent = font.get_descent() * fontsize
            local_box = (0, descent + rise, self.adv, descent + rise + fontsize)

        (a, b, c, d, _e, _f) = self.matrix
        self.upright = a * d * scaling > 0 and b * c <= 0

        # Transform the rectangle and normalise it so that x0 <= x1, y0 <= y1.
        (left, bottom, right, top) = apply_matrix_rect(self.matrix, local_box)
        if right < left:
            left, right = right, left
        if top < bottom:
            bottom, top = top, bottom
        LTComponent.__init__(self, (left, bottom, right, top))

        # size is the box extent across the writing direction.
        self.size = self.width if font.is_vertical() else self.height

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return (
            f"<{class_name} {bbox2str(self.bbox)} "
            f"matrix={matrix2str(self.matrix)} "
            f"font={self.fontname!r} "
            f"adv={self.adv} "
            f"text={self.get_text()!r}>"
        )

    def get_text(self) -> str:
        """Return the text given at construction."""
        return self._text
# Type of the items held by an LTContainer.
LTItemT = TypeVar("LTItemT", bound=LTItem)


class LTContainer(LTComponent, Generic[LTItemT]):
    """Object that can be extended and analyzed"""

    def __init__(self, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        # Child items, kept in insertion order.
        self._objs: list[LTItemT] = []

    def __iter__(self) -> Iterator[LTItemT]:
        return iter(self._objs)

    def __len__(self) -> int:
        return len(self._objs)

    def add(self, obj: LTItemT) -> None:
        """Append a single item."""
        self._objs.append(obj)

    def extend(self, objs: Iterable[LTItemT]) -> None:
        """Add every item of `objs` through add(), so overrides of add() apply."""
        for item in objs:
            self.add(item)

    def analyze(self, laparams: LAParams) -> None:
        """Run the layout analysis on each contained item in turn."""
        for child in self._objs:
            child.analyze(laparams)
class LTExpandableContainer(LTContainer[LTItemT]):
    """Container whose bounding box grows to enclose every added object."""

    def __init__(self) -> None:
        # Start with an inverted infinite box so the first add() replaces it.
        LTContainer.__init__(self, (+INF, +INF, -INF, -INF))

    # Incompatible override: we take an LTComponent (with bounding box), but
    # super() LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj` and enlarge the bounding box to cover it."""
        LTContainer.add(self, cast(LTItemT, obj))
        left = min(self.x0, obj.x0)
        bottom = min(self.y0, obj.y0)
        right = max(self.x1, obj.x1)
        top = max(self.y1, obj.y1)
        self.set_bbox((left, bottom, right, top))
class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
    """Expandable container whose text is the concatenation of its children's."""

    def __init__(self) -> None:
        LTText.__init__(self)
        LTExpandableContainer.__init__(self)

    def get_text(self) -> str:
        """Join the text of every child that is an LTText, in order."""
        pieces = [
            cast(LTText, child).get_text()
            for child in self
            if isinstance(child, LTText)
        ]
        return "".join(pieces)
# A text line holds real characters and inserted "virtual" ones.
TextLineElement = Union[LTChar, LTAnno]


class LTTextLine(LTTextContainer[TextLineElement]):
    """Contains a list of LTChar objects that represent a single text line.

    The characters are aligned either horizontally or vertically, depending on
    the text's writing mode.
    """

    def __init__(self, word_margin: float) -> None:
        super().__init__()
        self.word_margin = word_margin

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f"<{class_name} {bbox2str(self.bbox)} {self.get_text()!r}>"

    def analyze(self, laparams: LAParams) -> None:
        """Analyze every child, then terminate the line with a newline LTAnno."""
        for child in self._objs:
            child.analyze(laparams)
        # LTContainer.add is called directly, so the bounding box is untouched.
        LTContainer.add(self, LTAnno("\n"))

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> list["LTTextLine"]:
        """Must be provided by subclasses; the base version always raises."""
        raise NotImplementedError

    def is_empty(self) -> bool:
        """True for an empty bounding box or whitespace-only text."""
        if super().is_empty():
            return True
        return self.get_text().isspace()
class LTTextLineHorizontal(LTTextLine):
    """Text line whose characters are laid out horizontally."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Right edge of the most recently added object.
        self._x1: float = +INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj`, first inserting a space if it starts a new word."""
        if isinstance(obj, LTChar) and self.word_margin:
            gap_limit = self.word_margin * max(obj.width, obj.height)
            starts_new_word = self._x1 < obj.x0 - gap_limit
            if starts_new_word:
                # Added through LTContainer so the bounding box is untouched.
                LTContainer.add(self, LTAnno(" "))
        self._x1 = obj.x1
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> list[LTTextLine]:
        """Finds neighboring LTTextLineHorizontals in the plane.

        Searches the plane within `ratio * self.height` above and below self.
        Keeps the LTTextLineHorizontals whose height is within that same
        tolerance of self's and which are left-, right-, or centrally-aligned
        with self.
        """
        tolerance = ratio * self.height
        search_area = (self.x0, self.y0 - tolerance, self.x1, self.y1 + tolerance)
        neighbors: list[LTTextLine] = []
        for candidate in plane.find(search_area):
            if not isinstance(candidate, LTTextLineHorizontal):
                continue
            if not self._is_same_height_as(candidate, tolerance=tolerance):
                continue
            if (
                self._is_left_aligned_with(candidate, tolerance=tolerance)
                or self._is_right_aligned_with(candidate, tolerance=tolerance)
                or self._is_centrally_aligned_with(candidate, tolerance=tolerance)
            ):
                neighbors.append(candidate)
        return neighbors

    def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the left-hand edge of `other` is within `tolerance`."""
        offset = abs(other.x0 - self.x0)
        return offset <= tolerance

    def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the right-hand edge of `other` is within `tolerance`."""
        offset = abs(other.x1 - self.x1)
        return offset <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the horizontal center of `other` is within `tolerance`."""
        offset = abs((other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2)
        return offset <= tolerance

    def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the height of `other` is within `tolerance` of self's."""
        difference = abs(other.height - self.height)
        return difference <= tolerance
class LTTextLineVertical(LTTextLine):
    """Text line whose characters are laid out vertically."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Lower edge of the most recently added object.
        self._y0: float = -INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj`, first inserting a space if it starts a new word."""
        if isinstance(obj, LTChar) and self.word_margin:
            margin = self.word_margin * max(obj.width, obj.height)
            if obj.y1 + margin < self._y0:
                # Added through LTContainer so the bounding box is untouched.
                LTContainer.add(self, LTAnno(" "))
        self._y0 = obj.y0
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> list[LTTextLine]:
        """Finds neighboring LTTextLineVerticals in the plane.

        Searches the plane within `ratio * self.width` to the left and right
        of self. Keeps the LTTextLineVerticals whose width is within that same
        tolerance of self's and which are upper-, lower-, or centrally-aligned
        with self.
        """
        d = ratio * self.width
        objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
        return [
            obj
            for obj in objs
            if (
                isinstance(obj, LTTextLineVertical)
                and self._is_same_width_as(obj, tolerance=d)
                and (
                    self._is_lower_aligned_with(obj, tolerance=d)
                    or self._is_upper_aligned_with(obj, tolerance=d)
                    or self._is_centrally_aligned_with(obj, tolerance=d)
                )
            )
        ]

    def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the lower edge of `other` is within `tolerance`."""
        return abs(other.y0 - self.y0) <= tolerance

    def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the upper edge of `other` is within `tolerance`."""
        return abs(other.y1 - self.y1) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the vertical center of `other` is within `tolerance`."""
        return abs((other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance

    # tolerance defaults to 0 like the other alignment helpers here and like
    # LTTextLineHorizontal._is_same_height_as.
    def _is_same_width_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the width of `other` is within `tolerance` of self's."""
        return abs(other.width - self.width) <= tolerance
class LTTextBox(LTTextContainer[LTTextLine]):
    """Represents a group of text chunks in a rectangular area.

    Note that this box is created by geometric analysis and does not
    necessarily represents a logical boundary of the text. It contains a list
    of LTTextLine objects.
    """

    def __init__(self) -> None:
        LTTextContainer.__init__(self)
        # -1 until an index is assigned (see IndexAssigner.run).
        self.index: int = -1

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f"<{class_name}({self.index}) {bbox2str(self.bbox)} {self.get_text()!r}>"

    def get_writing_mode(self) -> str:
        """Must be provided by subclasses; the base version always raises."""
        raise NotImplementedError
class LTTextBoxHorizontal(LTTextBox):
    """Text box made of horizontal lines."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them by descending top edge."""
        super().analyze(laparams)

        def negated_top(line: LTTextLine) -> float:
            return -line.y1

        self._objs.sort(key=negated_top)

    def get_writing_mode(self) -> str:
        return "lr-tb"
class LTTextBoxVertical(LTTextBox):
    """Text box made of vertical lines."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them by descending right edge."""
        super().analyze(laparams)

        def negated_right(line: LTTextLine) -> float:
            return -line.x1

        self._objs.sort(key=negated_right)

    def get_writing_mode(self) -> str:
        return "tb-rl"
# A text group holds text boxes and, recursively, other text groups.
TextGroupElement = Union[LTTextBox, "LTTextGroup"]


class LTTextGroup(LTTextContainer[TextGroupElement]):
    """Text container built from an iterable of text boxes and/or groups."""

    def __init__(self, objs: Iterable[TextGroupElement]) -> None:
        super().__init__()
        # extend() adds each member through add(), which grows the bounding box.
        self.extend(objs)
class LTTextGroupLRTB(LTTextGroup):
    """Text group whose members are ordered left-to-right, top-to-bottom."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them using `laparams.boxes_flow`."""
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        flow = laparams.boxes_flow

        def reading_order(member: TextGroupElement) -> float:
            horizontal = (1 - flow) * member.x0
            vertical = (1 + flow) * (member.y0 + member.y1)
            return horizontal - vertical

        # reorder the objects from top-left to bottom-right.
        self._objs.sort(key=reading_order)
class LTTextGroupTBRL(LTTextGroup):
    """Text group whose members are ordered top-to-bottom, right-to-left."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them using `laparams.boxes_flow`."""
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        flow = laparams.boxes_flow

        def reading_order(member: TextGroupElement) -> float:
            horizontal = -(1 + flow) * (member.x0 + member.x1)
            vertical = (1 - flow) * member.y1
            return horizontal - vertical

        # reorder the objects from top-right to bottom-left.
        self._objs.sort(key=reading_order)
class LTLayoutContainer(LTContainer[LTComponent]):
    """Container that groups its characters into lines, boxes and groups."""

    def __init__(self, bbox: Rect) -> None:
        LTContainer.__init__(self, bbox)
        # Set by analyze() only when laparams.boxes_flow is not None.
        self.groups: list[LTTextGroup] | None = None

    # group_objects: group text object to textlines.
    def group_objects(
        self,
        laparams: LAParams,
        objs: Iterable[LTComponent],
    ) -> Iterator[LTTextLine]:
        """Group consecutive objects into text lines.

        Each object is compared with the one before it. While the pair stays
        aligned in the direction of the current line, the object is appended
        to that line; otherwise the line is yielded and a new one is started.

        :param laparams: supplies line_overlap, char_margin, word_margin and
            detect_vertical.
        :param objs: the objects to group, in order.
        :return: an iterator over the text lines; empty when `objs` is empty.
        """
        obj0 = None
        line: LTTextLine | None = None
        for obj1 in objs:
            if obj0 is not None:
                # halign: obj0 and obj1 is horizontally aligned.
                #
                #   +------+ - - -
                #   | obj0 | - - +------+   -
                #   |      |     | obj1 |   | (line_overlap)
                #   +------+ - - |      |   -
                #          - - - +------+
                #
                #          |<--->|
                #        (char_margin)
                halign = (
                    obj0.is_voverlap(obj1)
                    and min(obj0.height, obj1.height) * laparams.line_overlap
                    < obj0.voverlap(obj1)
                    and obj0.hdistance(obj1)
                    < max(obj0.width, obj1.width) * laparams.char_margin
                )

                # valign: obj0 and obj1 is vertically aligned.
                #
                #   +------+
                #   | obj0 |
                #   |      |
                #   +------+ - - -
                #     |    |     | (char_margin)
                #     +------+ - -
                #     | obj1 |
                #     |      |
                #     +------+
                #
                #     |<-->|
                #   (line_overlap)
                valign = (
                    laparams.detect_vertical
                    and obj0.is_hoverlap(obj1)
                    and min(obj0.width, obj1.width) * laparams.line_overlap
                    < obj0.hoverlap(obj1)
                    and obj0.vdistance(obj1)
                    < max(obj0.height, obj1.height) * laparams.char_margin
                )

                if (halign and isinstance(line, LTTextLineHorizontal)) or (
                    valign and isinstance(line, LTTextLineVertical)
                ):
                    # obj1 continues the current line.
                    line.add(obj1)
                elif line is not None:
                    # obj1 breaks the current line; the next iteration decides
                    # what obj1 belongs to.
                    yield line
                    line = None
                elif valign and not halign:
                    line = LTTextLineVertical(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                elif halign and not valign:
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                else:
                    # obj0 stands alone on its own line.
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    yield line
                    line = None
            obj0 = obj1
        if obj0 is None:
            # No objects were supplied, so there is no line to yield. This
            # replaces an assert that is stripped under `python -O`.
            return
        if line is None:
            # The last object did not end up in any line yet.
            line = LTTextLineHorizontal(laparams.word_margin)
            line.add(obj0)
        yield line

    def group_textlines(
        self,
        laparams: LAParams,
        lines: Iterable[LTTextLine],
    ) -> Iterator[LTTextBox]:
        """Group neighboring lines to textboxes

        :param laparams: supplies line_margin, the ratio passed to
            LTTextLine.find_neighbors.
        :param lines: the text lines to group; any iterable is accepted.
        :return: an iterator over the non-empty text boxes.
        """
        # `lines` is traversed three times below. Materialise it so a
        # one-shot iterable (e.g. a generator) is not exhausted by the first.
        lines = list(lines)
        plane: Plane[LTTextLine] = Plane(self.bbox)
        plane.extend(lines)
        boxes: dict[LTTextLine, LTTextBox] = {}
        for line in lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            members = [line]
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    # Absorb the lines of the box the neighbor was already in.
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box: LTTextBox = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box
        # Yield each distinct box once, in the order of its first line.
        done = set()
        for line in lines:
            if line not in boxes:
                continue
            box = boxes[line]
            if box in done:
                continue
            done.add(box)
            if not box.is_empty():
                yield box

    def group_textboxes(
        self,
        laparams: LAParams,
        boxes: Sequence[LTTextBox],
    ) -> list[LTTextGroup]:
        """Group textboxes hierarchically.

        Get pair-wise distances, via dist func defined below, and then merge
        from the closest textbox pair. Once obj1 and obj2 are merged /
        grouped, the resulting group is considered as a new object, and its
        distances to other objects & groups are added to the process queue.

        For performance reason, pair-wise distances and object pair info are
        maintained in a heap of
        (skip_isany, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures
        quick access to the smallest element. Note that since comparison
        operators, e.g., __lt__, are disabled for LTComponent, id(obj) has to
        appear before obj in element tuples.

        :param laparams: LAParams object.
        :param boxes: All textbox objects to be grouped.
        :return: the objects left in the plane once the queue is exhausted.
        """
        ElementT = Union[LTTextBox, LTTextGroup]
        plane: Plane[ElementT] = Plane(self.bbox)

        def dist(obj1: LTComponent, obj2: LTComponent) -> float:
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            return (
                (x1 - x0) * (y1 - y0)
                - obj1.width * obj1.height
                - obj2.width * obj2.height
            )

        def isany(obj1: ElementT, obj2: ElementT) -> set[ElementT]:
            """Check if there's any other object between obj1 and obj2."""
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            objs = set(plane.find((x0, y0, x1, y1)))
            return objs.difference((obj1, obj2))

        # One heap entry per unordered pair of boxes.
        dists: list[tuple[bool, float, int, int, ElementT, ElementT]] = []
        for i, box1 in enumerate(boxes):
            for j in range(i + 1, len(boxes)):
                box2 = boxes[j]
                dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
        heapq.heapify(dists)

        plane.extend(boxes)
        done = set()
        while len(dists) > 0:
            (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
            # Skip objects that are already merged
            if (id1 not in done) and (id2 not in done):
                if not skip_isany and isany(obj1, obj2):
                    # Something lies between the pair: push it back with the
                    # flag set, so it sorts after every unflagged entry and
                    # is merged without this check when it is popped again.
                    heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                    continue
                if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
                    obj2,
                    (LTTextBoxVertical, LTTextGroupTBRL),
                ):
                    group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
                else:
                    group = LTTextGroupLRTB([obj1, obj2])
                plane.remove(obj1)
                plane.remove(obj2)
                done.update([id1, id2])

                # Queue the new group against everything still in the plane.
                for other in plane:
                    heapq.heappush(
                        dists,
                        (False, dist(group, other), id(group), id(other), group, other),
                    )
                plane.add(group)
        # By now only groups are in the plane
        return [cast(LTTextGroup, g) for g in plane]

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the children and regroup the characters into text boxes.

        Afterwards the container holds the text boxes, then the non-character
        objects, then the empty text lines. Nothing is regrouped when the
        container holds no characters.
        """
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
        for obj in otherobjs:
            obj.analyze(laparams)
        if not textobjs:
            return
        textlines = list(self.group_objects(laparams, textobjs))
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        for obj in empties:
            obj.analyze(laparams)
        textboxes = list(self.group_textlines(laparams, textlines))
        if laparams.boxes_flow is None:
            # No hierarchical grouping: order the boxes by position only.
            for textbox in textboxes:
                textbox.analyze(laparams)

            def getkey(box: LTTextBox) -> tuple[int, float, float]:
                if isinstance(box, LTTextBoxVertical):
                    return (0, -box.x1, -box.y0)
                else:
                    return (1, -box.y0, box.x0)

            textboxes.sort(key=getkey)
        else:
            self.groups = self.group_textboxes(laparams, textboxes)
            assigner = IndexAssigner()
            for group in self.groups:
                group.analyze(laparams)
                assigner.run(group)
            textboxes.sort(key=lambda box: box.index)
        self._objs = (
            cast(list[LTComponent], textboxes)
            + otherobjs
            + cast(list[LTComponent], empties)
        )
class LTFigure(LTLayoutContainer):
    """Represents an area used by PDF Form objects.

    PDF Forms can be used to present figures or pictures by embedding yet
    another PDF document within a page. Note that LTFigure objects can appear
    recursively.
    """

    def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        self.name = name
        self.matrix = matrix
        # `bbox` arrives as (x, y, width, height); turn it into corner
        # coordinates before applying the matrix.
        (origin_x, origin_y, box_width, box_height) = bbox
        corners = (origin_x, origin_y, origin_x + box_width, origin_y + box_height)
        transformed = apply_matrix_rect(matrix, corners)
        LTLayoutContainer.__init__(self, transformed)

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return (
            f"<{class_name}({self.name}) "
            f"{bbox2str(self.bbox)} "
            f"matrix={matrix2str(self.matrix)}>"
        )

    def analyze(self, laparams: LAParams) -> None:
        """Run the layout analysis only when `laparams.all_texts` is set."""
        if laparams.all_texts:
            LTLayoutContainer.analyze(self, laparams)
class LTPage(LTLayoutContainer):
    """Represents an entire page.

    Like any other LTLayoutContainer, an LTPage can be iterated to obtain child
    objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and LTLine.
    """

    def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None:
        LTLayoutContainer.__init__(self, bbox)
        # Both values are stored as given.
        self.pageid = pageid
        self.rotate = rotate

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return (
            f"<{class_name}({self.pageid!r}) "
            f"{bbox2str(self.bbox)} "
            f"rotate={self.rotate!r}>"
        )