Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/layout.py: 87%

1import heapq

2import logging

3from collections.abc import Iterable, Iterator, Sequence

4from typing import (

5 Generic,

6 TypeVar,

7 Union,

8 cast,

11from pdfminer.pdfcolor import PDFColorSpace

12from pdfminer.pdfexceptions import PDFTypeError, PDFValueError

13from pdfminer.pdffont import PDFFont

14from pdfminer.pdfinterp import Color, PDFGraphicState

15from pdfminer.pdftypes import PDFStream

16from pdfminer.utils import (

17 INF,

18 LTComponentT,

19 Matrix,

20 PathSegment,

21 Plane,

22 Point,

23 Rect,

24 apply_matrix_rect,

25 bbox2str,

26 fsplit,

27 get_bound,

28 matrix2str,

29 uniq,

30)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

class IndexAssigner:
    """Hands out consecutive index numbers to the text boxes of a layout tree."""

    def __init__(self, index: int = 0) -> None:
        # Next index value that will be assigned.
        self.index = index

    def run(self, obj: "LTItem") -> None:
        """Number `obj` if it is a text box, or recurse into it if it is a group.

        Objects that are neither an LTTextBox nor an LTTextGroup are ignored.
        """
        if isinstance(obj, LTTextBox):
            obj.index = self.index
            self.index += 1
            return
        if isinstance(obj, LTTextGroup):
            for child in obj:
                self.run(child)

48class LAParams:

49 """Parameters for layout analysis

51 :param line_overlap: If two characters have more overlap than this they

52 are considered to be on the same line. The overlap is specified

53 relative to the minimum height of both characters.

54 :param char_margin: If two characters are closer together than this

55 margin they are considered part of the same line. The margin is

56 specified relative to the width of the character.

57 :param word_margin: If two characters on the same line are further apart

58 than this margin then they are considered to be two separate words, and

59 an intermediate space will be added for readability. The margin is

60 specified relative to the width of the character.

61 :param line_margin: If two lines are are close together they are

62 considered to be part of the same paragraph. The margin is

63 specified relative to the height of a line.

64 :param boxes_flow: Specifies how much a horizontal and vertical position

65 of a text matters when determining the order of text boxes. The value

66 should be within the range of -1.0 (only horizontal position

67 matters) to +1.0 (only vertical position matters). You can also pass

68 `None` to disable advanced layout analysis, and instead return text

69 based on the position of the bottom left corner of the text box.

70 :param detect_vertical: If vertical text should be considered during

71 layout analysis

72 :param all_texts: If layout analysis should be performed on text in

73 figures.

74 """

76 def __init__(

77 self,

78 line_overlap: float = 0.5,

79 char_margin: float = 2.0,

80 line_margin: float = 0.5,

81 word_margin: float = 0.1,

82 boxes_flow: float | None = 0.5,

83 detect_vertical: bool = False,

84 all_texts: bool = False,

85 ) -> None:

86 self.line_overlap = line_overlap

87 self.char_margin = char_margin

88 self.line_margin = line_margin

89 self.word_margin = word_margin

90 self.boxes_flow = boxes_flow

91 self.detect_vertical = detect_vertical

92 self.all_texts = all_texts

94 self._validate()

96 def _validate(self) -> None:

97 if self.boxes_flow is not None:

98 boxes_flow_err_msg = (

99 "LAParam boxes_flow should be None, or a number between -1 and +1"

100 )

101 if not (isinstance(self.boxes_flow, (int, float))):

102 raise PDFTypeError(boxes_flow_err_msg)

103 if not -1 <= self.boxes_flow <= 1:

104 raise PDFValueError(boxes_flow_err_msg)

105

106 def __repr__(self) -> str:

107 return (

108 f"<LAParams: char_margin={self.char_margin:.1f}, "

109 f"line_margin={self.line_margin:.1f}, "

110 f"word_margin={self.word_margin:.1f} "

111 f"all_texts={self.all_texts!r}>"

112 )

113

114

class LTItem:
    """Base interface for every object that takes part in layout analysis."""

    def analyze(self, laparams: LAParams) -> None:
        """Perform the layout analysis.

        The base implementation does nothing.
        """
        return None

120

121

class LTText:
    """Mixin interface for layout objects that carry text."""

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        text = self.get_text()
        return f"<{cls_name} {text!r}>"

    def get_text(self) -> str:
        """Return the text held by this object; subclasses must override."""
        raise NotImplementedError

131

132

class LTComponent(LTItem):
    """A layout item that occupies a rectangular bounding box.

    The box is exposed as `bbox` and as the individual attributes
    `x0`, `y0`, `x1`, `y1`, `width` and `height`.
    """

    def __init__(self, bbox: Rect) -> None:
        LTItem.__init__(self)
        self.set_bbox(bbox)

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name} {bbox2str(self.bbox)}>"

    # Ordering comparisons are deliberately unsupported: each one raises.
    def __lt__(self, _: object) -> bool:
        raise PDFValueError

    def __le__(self, _: object) -> bool:
        raise PDFValueError

    def __gt__(self, _: object) -> bool:
        raise PDFValueError

    def __ge__(self, _: object) -> bool:
        raise PDFValueError

    def set_bbox(self, bbox: Rect) -> None:
        """Store `bbox` and refresh the derived corner and size attributes."""
        x0, y0, x1, y1 = bbox
        self.x0, self.y0, self.x1, self.y1 = x0, y0, x1, y1
        self.width = x1 - x0
        self.height = y1 - y0
        self.bbox = bbox

    def is_empty(self) -> bool:
        """Return True when the box has no positive width or no positive height."""
        return self.width <= 0 or self.height <= 0

    def is_hoverlap(self, obj: "LTComponent") -> bool:
        """Return True when the x-ranges of `self` and `obj` touch or overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.x0 <= self.x1 and self.x0 <= obj.x1

    def hdistance(self, obj: "LTComponent") -> float:
        """Return the horizontal gap to `obj`, or 0 when the x-ranges overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_hoverlap(obj):
            return 0
        gap_a = abs(self.x0 - obj.x1)
        gap_b = abs(self.x1 - obj.x0)
        return min(gap_a, gap_b)

    def hoverlap(self, obj: "LTComponent") -> float:
        """Return the horizontal overlap measure with `obj`, or 0 when disjoint.

        The measure is the smaller of the two opposite-edge distances.
        """
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def is_voverlap(self, obj: "LTComponent") -> bool:
        """Return True when the y-ranges of `self` and `obj` touch or overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.y0 <= self.y1 and self.y0 <= obj.y1

    def vdistance(self, obj: "LTComponent") -> float:
        """Return the vertical gap to `obj`, or 0 when the y-ranges overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_voverlap(obj):
            return 0
        gap_a = abs(self.y0 - obj.y1)
        gap_b = abs(self.y1 - obj.y0)
        return min(gap_a, gap_b)

    def voverlap(self, obj: "LTComponent") -> float:
        """Return the vertical overlap measure with `obj`, or 0 when disjoint.

        The measure is the smaller of the two opposite-edge distances.
        """
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_voverlap(obj):
            return 0
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))

204

205

class LTCurve(LTComponent):
    """A generic Bezier curve

    `original_path` holds the original pathing information from the pdf
    (e.g. for reconstructing Bezier Curves), and `dashing_style` holds the
    dashing information, if any. The bounding box is derived from `pts`
    via `get_bound`.
    """

    def __init__(
        self,
        linewidth: float,
        pts: list[Point],
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Color | None = None,
        non_stroking_color: Color | None = None,
        original_path: list[PathSegment] | None = None,
        dashing_style: tuple[object, object] | None = None,
    ) -> None:
        LTComponent.__init__(self, get_bound(pts))
        # Geometry.
        self.pts = pts
        self.linewidth = linewidth
        # Painting flags and colors.
        self.stroke = stroke
        self.fill = fill
        self.evenodd = evenodd
        self.stroking_color = stroking_color
        self.non_stroking_color = non_stroking_color
        # Source path data.
        self.original_path = original_path
        self.dashing_style = dashing_style

    def get_pts(self) -> str:
        """Return the points as one comma-separated string of 3-decimal values."""
        render = "{:.3f},{:.3f}".format
        return ",".join(render(*pt) for pt in self.pts)

240

241

class LTLine(LTCurve):
    """A single straight line.

    Could be used for separating text or figures. It is stored as a curve
    whose points are the two end points `p0` and `p1`.
    """

    def __init__(
        self,
        linewidth: float,
        p0: Point,
        p1: Point,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Color | None = None,
        non_stroking_color: Color | None = None,
        original_path: list[PathSegment] | None = None,
        dashing_style: tuple[object, object] | None = None,
    ) -> None:
        end_points = [p0, p1]
        LTCurve.__init__(
            self,
            linewidth,
            end_points,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )

273

274

class LTRect(LTCurve):
    """A rectangle.

    Could be used for framing other pictures or figures. It is stored as a
    curve through the four corners of `bbox`.
    """

    def __init__(
        self,
        linewidth: float,
        bbox: Rect,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Color | None = None,
        non_stroking_color: Color | None = None,
        original_path: list[PathSegment] | None = None,
        dashing_style: tuple[object, object] | None = None,
    ) -> None:
        x0, y0, x1, y1 = bbox
        # Corner order: (x0, y0) -> (x1, y0) -> (x1, y1) -> (x0, y1).
        corners = [(x0, y0), (x1, y0), (x1, y1), (x0, y1)]
        LTCurve.__init__(
            self,
            linewidth,
            corners,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )

306

307

class LTImage(LTComponent):
    """An image object.

    Embedded images can be in JPEG, Bitmap or JBIG2. Image properties are
    read from the stream under either their abbreviated or full key names.
    """

    def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        self.name = name
        self.stream = stream
        src_width = stream.get_any(("W", "Width"))
        src_height = stream.get_any(("H", "Height"))
        self.srcsize = (src_width, src_height)
        self.imagemask = stream.get_any(("IM", "ImageMask"))
        self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1)
        colorspace = stream.get_any(("CS", "ColorSpace"))
        # `colorspace` is always stored as a list; a lone value is wrapped.
        self.colorspace = colorspace if isinstance(colorspace, list) else [colorspace]

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name}({self.name}) {bbox2str(self.bbox)} {self.srcsize!r}>"

330

331

class LTAnno(LTItem, LTText):
    """Actual letter in the text as a Unicode string.

    Unlike an LTChar, an LTAnno has no boundaries: it is a "virtual"
    character that a layout analyzer inserts according to the relationship
    between two characters (e.g. a space).
    """

    def __init__(self, text: str) -> None:
        self._text = text

    def get_text(self) -> str:
        """Return the text this annotation was created with."""
        return self._text

345

346

class LTChar(LTComponent, LTText):
    """Actual letter in the text as a Unicode string.

    The constructor derives the glyph's bounding box from the font metrics
    and the given matrix, and records the advance (`adv`), whether the glyph
    is `upright`, and its `size`.
    """

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: float | tuple[float | None, float],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> None:
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        self.adv = textwidth * fontsize * scaling

        # Bounding rectangle before the matrix is applied.
        if font.is_vertical():
            # Vertical writing: `textdisp` must be a (vx, vy) pair.
            assert isinstance(textdisp, tuple)
            vx, vy = textdisp
            if vx is None:
                vx = fontsize * 0.5
            else:
                vx = vx * fontsize * 0.001
            vy = (1000 - vy) * fontsize * 0.001
            bbox = (-vx, vy + rise + self.adv, -vx + fontsize, vy + rise)
        else:
            # Horizontal writing: the box starts at the font's descent.
            descent = font.get_descent() * fontsize
            bbox = (0, descent + rise, self.adv, descent + rise + fontsize)

        a, b, c, d, _e, _f = self.matrix
        self.upright = a * d * scaling > 0 and b * c <= 0

        x0, y0, x1, y1 = apply_matrix_rect(self.matrix, bbox)
        # Normalise so that (x0, y0) is the lower-left corner.
        if x1 < x0:
            x0, x1 = x1, x0
        if y1 < y0:
            y0, y1 = y1, y0
        LTComponent.__init__(self, (x0, y0, x1, y1))

        # Size is the box width for vertical fonts, the height otherwise.
        self.size = self.width if font.is_vertical() else self.height

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return (
            f"<{cls_name} {bbox2str(self.bbox)} "
            f"matrix={matrix2str(self.matrix)} "
            f"font={self.fontname!r} "
            f"adv={self.adv} "
            f"text={self.get_text()!r}>"
        )

    def get_text(self) -> str:
        """Return the character's text."""
        return self._text

406

407

# Type variable for the element type held by an LTContainer.
LTItemT = TypeVar("LTItemT", bound=LTItem)

409

410

class LTContainer(LTComponent, Generic[LTItemT]):
    """A bounding-boxed object that holds child items and analyzes them."""

    def __init__(self, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        # Children, in insertion order.
        self._objs: list[LTItemT] = []

    def __iter__(self) -> Iterator[LTItemT]:
        return iter(self._objs)

    def __len__(self) -> int:
        return len(self._objs)

    def add(self, obj: LTItemT) -> None:
        """Append one child."""
        self._objs.append(obj)

    def extend(self, objs: Iterable[LTItemT]) -> None:
        """Add every item of `objs` through `add`, so overrides are honoured."""
        for item in objs:
            self.add(item)

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis on every child."""
        for child in self._objs:
            child.analyze(laparams)

434

435

class LTExpandableContainer(LTContainer[LTItemT]):
    """A container whose bounding box grows to enclose each added child."""

    def __init__(self) -> None:
        # Start from an inverted infinite box so the first child defines it.
        LTContainer.__init__(self, (+INF, +INF, -INF, -INF))

    # Incompatible override: we take an LTComponent (with bounding box), but
    # super() LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj` and widen the bounding box to cover it."""
        LTContainer.add(self, cast(LTItemT, obj))
        grown = (
            min(self.x0, obj.x0),
            min(self.y0, obj.y0),
            max(self.x1, obj.x1),
            max(self.y1, obj.y1),
        )
        self.set_bbox(grown)

452

453

class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
    """An expandable container whose text is the concatenation of its children's."""

    def __init__(self) -> None:
        LTText.__init__(self)
        LTExpandableContainer.__init__(self)

    def get_text(self) -> str:
        """Join the text of every child that is an LTText, in iteration order."""
        pieces = []
        for obj in self:
            if isinstance(obj, LTText):
                pieces.append(cast(LTText, obj).get_text())
        return "".join(pieces)

463

464

# Element type of a text line: a real character or a virtual one.
TextLineElement = Union[LTChar, LTAnno]

466

467

class LTTextLine(LTTextContainer[TextLineElement]):
    """Contains a list of LTChar objects that represent a single text line.

    The characters are aligned either horizontally or vertically, depending on
    the text's writing mode.
    """

    def __init__(self, word_margin: float) -> None:
        super().__init__()
        self.word_margin = word_margin

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name} {bbox2str(self.bbox)} {self.get_text()!r}>"

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the children, then append a newline LTAnno to the line."""
        LTContainer.analyze(self, laparams)
        # Bypass the bbox-expanding `add`: an LTAnno has no bounding box.
        LTContainer.add(self, LTAnno("\n"))

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> list["LTTextLine"]:
        """Return the neighboring lines in `plane`; subclasses must override."""
        raise NotImplementedError

    def is_empty(self) -> bool:
        """Return True for a degenerate bounding box or whitespace-only text."""
        if super().is_empty():
            return True
        return self.get_text().isspace()

496

497

class LTTextLineHorizontal(LTTextLine):
    """A text line whose characters run horizontally."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Right edge of the most recently added object; +INF until one is added.
        self._x1: float = +INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj`, inserting a space LTAnno first when a word gap is seen.

        A gap is detected when `obj` is an LTChar, `word_margin` is non-zero,
        and the previous right edge lies more than the margin to the left of
        `obj.x0`.
        """
        if isinstance(obj, LTChar) and self.word_margin:
            gap_threshold = self.word_margin * max(obj.width, obj.height)
            if self._x1 < obj.x0 - gap_threshold:
                LTContainer.add(self, LTAnno(" "))
        self._x1 = obj.x1
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> list[LTTextLine]:
        """Finds neighboring LTTextLineHorizontals in the plane.

        Returns a list of other LTTextLineHorizontals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned objects
        will be the same height as self, and also either left-, right-, or
        centrally-aligned.
        """
        tol = ratio * self.height
        search_area = (self.x0, self.y0 - tol, self.x1, self.y1 + tol)
        found: list[LTTextLine] = []
        for candidate in plane.find(search_area):
            if not isinstance(candidate, LTTextLineHorizontal):
                continue
            if not self._is_same_height_as(candidate, tolerance=tol):
                continue
            if (
                self._is_left_aligned_with(candidate, tolerance=tol)
                or self._is_right_aligned_with(candidate, tolerance=tol)
                or self._is_centrally_aligned_with(candidate, tolerance=tol)
            ):
                found.append(candidate)
        return found

    def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the left-hand edge of `other` is within `tolerance`."""
        offset = abs(other.x0 - self.x0)
        return offset <= tolerance

    def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the right-hand edge of `other` is within `tolerance`."""
        offset = abs(other.x1 - self.x1)
        return offset <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the horizontal center of `other` is within `tolerance`."""
        other_center = (other.x0 + other.x1) / 2
        own_center = (self.x0 + self.x1) / 2
        return abs(other_center - own_center) <= tolerance

    def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the height of `other` differs by at most `tolerance`."""
        difference = abs(other.height - self.height)
        return difference <= tolerance

559

560

class LTTextLineVertical(LTTextLine):
    """A text line whose characters run vertically."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Bottom edge of the most recently added object; -INF until one is added.
        self._y0: float = -INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj`, inserting a space LTAnno first when a word gap is seen.

        A gap is detected when `obj` is an LTChar, `word_margin` is non-zero,
        and the top of `obj` plus the margin lies below the previous bottom
        edge.
        """
        if isinstance(obj, LTChar) and self.word_margin:
            gap_threshold = self.word_margin * max(obj.width, obj.height)
            if obj.y1 + gap_threshold < self._y0:
                LTContainer.add(self, LTAnno(" "))
        self._y0 = obj.y0
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> list[LTTextLine]:
        """Finds neighboring LTTextLineVerticals in the plane.

        Returns a list of other LTTextLineVerticals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned objects
        will be the same width as self, and also either upper-, lower-, or
        centrally-aligned.
        """
        tol = ratio * self.width
        search_area = (self.x0 - tol, self.y0, self.x1 + tol, self.y1)
        found: list[LTTextLine] = []
        for candidate in plane.find(search_area):
            if not isinstance(candidate, LTTextLineVertical):
                continue
            if not self._is_same_width_as(candidate, tolerance=tol):
                continue
            if (
                self._is_lower_aligned_with(candidate, tolerance=tol)
                or self._is_upper_aligned_with(candidate, tolerance=tol)
                or self._is_centrally_aligned_with(candidate, tolerance=tol)
            ):
                found.append(candidate)
        return found

    def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the lower edge of `other` is within `tolerance`."""
        offset = abs(other.y0 - self.y0)
        return offset <= tolerance

    def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the upper edge of `other` is within `tolerance`."""
        offset = abs(other.y1 - self.y1)
        return offset <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the vertical center of `other` is within `tolerance`."""
        other_center = (other.y0 + other.y1) / 2
        own_center = (self.y0 + self.y1) / 2
        return abs(other_center - own_center) <= tolerance

    def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool:
        """Whether the width of `other` differs by at most `tolerance`."""
        difference = abs(other.width - self.width)
        return difference <= tolerance

622

623

class LTTextBox(LTTextContainer[LTTextLine]):
    """Represents a group of text chunks in a rectangular area.

    Note that this box is created by geometric analysis and does not
    necessarily represent a logical boundary of the text. It contains a list
    of LTTextLine objects.
    """

    def __init__(self) -> None:
        LTTextContainer.__init__(self)
        # -1 until an index has been assigned.
        self.index: int = -1

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name}({self.index}) {bbox2str(self.bbox)} {self.get_text()!r}>"

    def get_writing_mode(self) -> str:
        """Return the writing-mode string; subclasses must override."""
        raise NotImplementedError

644

645

class LTTextBoxHorizontal(LTTextBox):
    """A text box made of horizontal lines."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them by descending top edge (`y1`)."""
        super().analyze(laparams)

        def descending_top(line: LTComponent) -> float:
            return -line.y1

        self._objs.sort(key=descending_top)

    def get_writing_mode(self) -> str:
        """Return "lr-tb"."""
        return "lr-tb"

653

654

class LTTextBoxVertical(LTTextBox):
    """A text box made of vertical lines."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them by descending right edge (`x1`)."""
        super().analyze(laparams)

        def descending_right(line: LTComponent) -> float:
            return -line.x1

        self._objs.sort(key=descending_right)

    def get_writing_mode(self) -> str:
        """Return "tb-rl"."""
        return "tb-rl"

662

663

# Element type of a text group: a text box or a nested text group.
TextGroupElement = Union[LTTextBox, "LTTextGroup"]

665

666

class LTTextGroup(LTTextContainer[TextGroupElement]):
    """A group of text boxes and/or nested text groups."""

    def __init__(self, objs: Iterable[TextGroupElement]) -> None:
        super().__init__()
        # `extend` adds each member through `add`, which grows the bounding box.
        self.extend(objs)

671

672

class LTTextGroupLRTB(LTTextGroup):
    """A text group whose members are ordered from top-left to bottom-right."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them using `laparams.boxes_flow`."""
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        x_weight = 1 - laparams.boxes_flow
        y_weight = 1 + laparams.boxes_flow

        def reading_order(member: LTComponent) -> float:
            # Smaller x0 and larger y0 + y1 sort earlier.
            return x_weight * member.x0 - y_weight * (member.y0 + member.y1)

        self._objs.sort(key=reading_order)

683

684

class LTTextGroupTBRL(LTTextGroup):
    """A text group whose members are ordered from top-right to bottom-left."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them using `laparams.boxes_flow`."""
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        x_weight = 1 + laparams.boxes_flow
        y_weight = 1 - laparams.boxes_flow

        def reading_order(member: LTComponent) -> float:
            # Larger x0 + x1 and larger y1 sort earlier.
            return -x_weight * (member.x0 + member.x1) - y_weight * member.y1

        self._objs.sort(key=reading_order)

695

696

class LTLayoutContainer(LTContainer[LTComponent]):
    """A container that performs layout analysis on the characters it holds.

    `analyze` groups characters into lines, lines into text boxes and,
    unless `boxes_flow` is `None`, text boxes into a hierarchy of groups
    that is stored in `groups`.
    """

    def __init__(self, bbox: Rect) -> None:
        LTContainer.__init__(self, bbox)
        # Set by `analyze` when hierarchical grouping is enabled.
        self.groups: list[LTTextGroup] | None = None

    # group_objects: group text object to textlines.
    def group_objects(
        self,
        laparams: LAParams,
        objs: Iterable[LTComponent],
    ) -> Iterator[LTTextLine]:
        """Group consecutive text objects into horizontal or vertical lines.

        Each object is compared only with the one immediately before it.

        :param laparams: supplies line_overlap, char_margin, word_margin and
            detect_vertical.
        :param objs: the text objects, in the order they should be scanned.
        :return: an iterator over the text lines. It is empty when `objs` is
            empty.
        """
        obj0: LTComponent | None = None
        line: LTTextLine | None = None
        for obj1 in objs:
            if obj0 is not None:
                # halign: obj0 and obj1 is horizontally aligned.
                #
                #   +------+ - - -
                #   | obj0 | - - +------+   -
                #   |      |     | obj1 |   | (line_overlap)
                #   +------+ - - |      |   -
                #          - - - +------+
                #
                #          |<--->|
                #        (char_margin)
                halign = (
                    obj0.is_voverlap(obj1)
                    and min(obj0.height, obj1.height) * laparams.line_overlap
                    < obj0.voverlap(obj1)
                    and obj0.hdistance(obj1)
                    < max(obj0.width, obj1.width) * laparams.char_margin
                )

                # valign: obj0 and obj1 is vertically aligned.
                #
                #   +------+
                #   | obj0 |
                #   |      |
                #   +------+ - - -
                #     |    |     | (char_margin)
                #   +------+ - -
                #   | obj1 |
                #   |      |
                #   +------+
                #
                #     |<-->|
                #   (line_overlap)
                valign = (
                    laparams.detect_vertical
                    and obj0.is_hoverlap(obj1)
                    and min(obj0.width, obj1.width) * laparams.line_overlap
                    < obj0.hoverlap(obj1)
                    and obj0.vdistance(obj1)
                    < max(obj0.height, obj1.height) * laparams.char_margin
                )

                if (halign and isinstance(line, LTTextLineHorizontal)) or (
                    valign and isinstance(line, LTTextLineVertical)
                ):
                    # obj1 continues the current line.
                    line.add(obj1)
                elif line is not None:
                    # obj1 breaks the current line; it is picked up as obj0
                    # on the next iteration.
                    yield line
                    line = None
                elif valign and not halign:
                    line = LTTextLineVertical(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                elif halign and not valign:
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                else:
                    # obj0 stands alone on its own line.
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    yield line
                    line = None
            obj0 = obj1
        if line is None:
            if obj0 is None:
                # `objs` was empty: there is nothing to group.
                return
            # The last object has not been placed on any line yet.
            line = LTTextLineHorizontal(laparams.word_margin)
            line.add(obj0)
        yield line

    def group_textlines(
        self,
        laparams: LAParams,
        lines: Iterable[LTTextLine],
    ) -> Iterator[LTTextBox]:
        """Group neighboring lines to textboxes

        :param laparams: supplies line_margin, the neighbor search ratio.
        :param lines: the text lines to group; any iterable is accepted.
        :return: an iterator over the non-empty text boxes.
        """
        # `lines` is traversed three times below, so a one-shot iterator
        # must be materialized first.
        lines = list(lines)
        plane: Plane[LTTextLine] = Plane(self.bbox)
        plane.extend(lines)
        boxes: dict[LTTextLine, LTTextBox] = {}
        for line in lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            members = [line]
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    # Absorb the box that the neighbor already belonged to.
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box: LTTextBox = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box
        done = set()
        for line in lines:
            if line not in boxes:
                continue
            box = boxes[line]
            if box in done:
                continue
            done.add(box)
            if not box.is_empty():
                yield box

    def group_textboxes(
        self,
        laparams: LAParams,
        boxes: Sequence[LTTextBox],
    ) -> list[LTTextGroup]:
        """Group textboxes hierarchically.

        Get pair-wise distances, via dist func defined below, and then merge
        from the closest textbox pair. Once obj1 and obj2 are merged /
        grouped, the resulting group is considered as a new object, and its
        distances to other objects & groups are added to the process queue.

        For performance reason, pair-wise distances and object pair info are
        maintained in a heap of
        (skip_isany, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures
        quick access to the smallest element. Note that since comparison
        operators, e.g., __lt__, are disabled for LTComponent, id(obj) has to
        appear before obj in element tuples.

        :param laparams: LAParams object.
        :param boxes: All textbox objects to be grouped.
        :return: the objects left in the plane once the queue is exhausted.
        """
        ElementT = Union[LTTextBox, LTTextGroup]
        plane: Plane[ElementT] = Plane(self.bbox)

        def dist(obj1: LTComponent, obj2: LTComponent) -> float:
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            return (
                (x1 - x0) * (y1 - y0)
                - obj1.width * obj1.height
                - obj2.width * obj2.height
            )

        def isany(obj1: ElementT, obj2: ElementT) -> set[ElementT]:
            """Check if there's any other object between obj1 and obj2."""
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            objs = set(plane.find((x0, y0, x1, y1)))
            return objs.difference((obj1, obj2))

        # Seed the queue with the distance of every unordered pair of boxes.
        dists: list[tuple[bool, float, int, int, ElementT, ElementT]] = []
        for i, box1 in enumerate(boxes):
            for box2 in boxes[i + 1 :]:
                dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
        heapq.heapify(dists)

        plane.extend(boxes)
        done: set[int] = set()
        while dists:
            (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
            # Skip objects that are already merged
            if (id1 not in done) and (id2 not in done):
                if not skip_isany and isany(obj1, obj2):
                    # Something lies between the pair: requeue it with
                    # skip_isany=True, which sorts after every entry whose
                    # flag is still False.
                    heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                    continue
                if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
                    obj2,
                    (LTTextBoxVertical, LTTextGroupTBRL),
                ):
                    group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
                else:
                    group = LTTextGroupLRTB([obj1, obj2])
                plane.remove(obj1)
                plane.remove(obj2)
                done.update([id1, id2])

                # Queue the new group's distance to every remaining object.
                for other in plane:
                    heapq.heappush(
                        dists,
                        (False, dist(group, other), id(group), id(other), group, other),
                    )
                plane.add(group)
        # By now only groups are in the plane
        return [cast(LTTextGroup, g) for g in plane]

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis and replace the children with the result.

        After analysis the children are the text boxes, then the
        non-character objects, then the empty text lines. Nothing is
        regrouped when the container holds no LTChar objects.
        """
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
        for obj in otherobjs:
            obj.analyze(laparams)
        if not textobjs:
            return
        textlines = list(self.group_objects(laparams, textobjs))
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        for obj in empties:
            obj.analyze(laparams)
        textboxes = list(self.group_textlines(laparams, textlines))
        if laparams.boxes_flow is None:
            # Simple mode: no hierarchy, order the boxes by position.
            for textbox in textboxes:
                textbox.analyze(laparams)

            def getkey(box: LTTextBox) -> tuple[int, float, float]:
                if isinstance(box, LTTextBoxVertical):
                    return (0, -box.x1, -box.y0)
                else:
                    return (1, -box.y0, box.x0)

            textboxes.sort(key=getkey)
        else:
            self.groups = self.group_textboxes(laparams, textboxes)
            assigner = IndexAssigner()
            for group in self.groups:
                group.analyze(laparams)
                assigner.run(group)
            textboxes.sort(key=lambda box: box.index)
        self._objs = (
            cast(list[LTComponent], textboxes)
            + otherobjs
            + cast(list[LTComponent], empties)
        )

943

944

class LTFigure(LTLayoutContainer):
    """Represents an area used by PDF Form objects.

    PDF Forms can be used to present figures or pictures by embedding yet
    another PDF document within a page. Note that LTFigure objects can appear
    recursively.
    """

    def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        self.name = name
        self.matrix = matrix
        # The incoming `bbox` is (x, y, width, height); convert it to corner
        # form before applying the matrix.
        x, y, w, h = bbox
        corners = (x, y, x + w, y + h)
        LTLayoutContainer.__init__(self, apply_matrix_rect(matrix, corners))

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return (
            f"<{cls_name}({self.name}) "
            f"{bbox2str(self.bbox)} "
            f"matrix={matrix2str(self.matrix)}>"
        )

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the figure's contents only when `laparams.all_texts` is set."""
        if laparams.all_texts:
            LTLayoutContainer.analyze(self, laparams)

972

973

class LTPage(LTLayoutContainer):
    """Represents an entire page.

    Like any other LTLayoutContainer, an LTPage can be iterated to obtain child
    objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and LTLine.
    """

    def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None:
        LTLayoutContainer.__init__(self, bbox)
        self.pageid = pageid
        self.rotate = rotate

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return (
            f"<{cls_name}({self.pageid!r}) "
            f"{bbox2str(self.bbox)} "
            f"rotate={self.rotate!r}>"
        )