Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/layout.py: 87%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

445 statements  

1import heapq 

2import logging 

3from collections.abc import Iterable, Iterator, Sequence 

4from typing import ( 

5 Generic, 

6 TypeVar, 

7 Union, 

8 cast, 

9) 

10 

11from pdfminer.pdfcolor import PDFColorSpace 

12from pdfminer.pdfexceptions import PDFTypeError, PDFValueError 

13from pdfminer.pdffont import PDFFont 

14from pdfminer.pdfinterp import Color, PDFGraphicState 

15from pdfminer.pdftypes import PDFStream 

16from pdfminer.utils import ( 

17 INF, 

18 LTComponentT, 

19 Matrix, 

20 PathSegment, 

21 Plane, 

22 Point, 

23 Rect, 

24 apply_matrix_rect, 

25 bbox2str, 

26 fsplit, 

27 get_bound, 

28 matrix2str, 

29 uniq, 

30) 

31 

# Module-level logger, named after this module via ``__name__``.
logger: logging.Logger = logging.getLogger(__name__)

33 

34 

class IndexAssigner:
    """Numbers the text boxes found in a tree of text groups.

    Every LTTextBox passed to ``run`` receives the current counter value as
    its ``index`` and the counter is then incremented. LTTextGroup objects
    are walked recursively; objects of any other type are ignored.
    """

    def __init__(self, index: int = 0) -> None:
        # Next index value that will be handed out.
        self.index = index

    def run(self, obj: "LTItem") -> None:
        """Assign an index to ``obj`` or to every text box nested inside it."""
        if isinstance(obj, LTTextBox):
            obj.index = self.index
            self.index += 1
            return
        if isinstance(obj, LTTextGroup):
            for child in obj:
                self.run(child)

46 

47 

48class LAParams: 

49 """Parameters for layout analysis 

50 

51 :param line_overlap: If two characters have more overlap than this they 

52 are considered to be on the same line. The overlap is specified 

53 relative to the minimum height of both characters. 

54 :param char_margin: If two characters are closer together than this 

55 margin they are considered part of the same line. The margin is 

56 specified relative to the width of the character. 

57 :param word_margin: If two characters on the same line are further apart 

58 than this margin then they are considered to be two separate words, and 

59 an intermediate space will be added for readability. The margin is 

60 specified relative to the width of the character. 

61 :param line_margin: If two lines are are close together they are 

62 considered to be part of the same paragraph. The margin is 

63 specified relative to the height of a line. 

64 :param boxes_flow: Specifies how much a horizontal and vertical position 

65 of a text matters when determining the order of text boxes. The value 

66 should be within the range of -1.0 (only horizontal position 

67 matters) to +1.0 (only vertical position matters). You can also pass 

68 `None` to disable advanced layout analysis, and instead return text 

69 based on the position of the bottom left corner of the text box. 

70 :param detect_vertical: If vertical text should be considered during 

71 layout analysis 

72 :param all_texts: If layout analysis should be performed on text in 

73 figures. 

74 """ 

75 

76 def __init__( 

77 self, 

78 line_overlap: float = 0.5, 

79 char_margin: float = 2.0, 

80 line_margin: float = 0.5, 

81 word_margin: float = 0.1, 

82 boxes_flow: float | None = 0.5, 

83 detect_vertical: bool = False, 

84 all_texts: bool = False, 

85 ) -> None: 

86 self.line_overlap = line_overlap 

87 self.char_margin = char_margin 

88 self.line_margin = line_margin 

89 self.word_margin = word_margin 

90 self.boxes_flow = boxes_flow 

91 self.detect_vertical = detect_vertical 

92 self.all_texts = all_texts 

93 

94 self._validate() 

95 

96 def _validate(self) -> None: 

97 if self.boxes_flow is not None: 

98 boxes_flow_err_msg = ( 

99 "LAParam boxes_flow should be None, or a number between -1 and +1" 

100 ) 

101 if not (isinstance(self.boxes_flow, (int, float))): 

102 raise PDFTypeError(boxes_flow_err_msg) 

103 if not -1 <= self.boxes_flow <= 1: 

104 raise PDFValueError(boxes_flow_err_msg) 

105 

106 def __repr__(self) -> str: 

107 return ( 

108 f"<LAParams: char_margin={self.char_margin:.1f}, " 

109 f"line_margin={self.line_margin:.1f}, " 

110 f"word_margin={self.word_margin:.1f} " 

111 f"all_texts={self.all_texts!r}>" 

112 ) 

113 

114 

class LTItem:
    """Base interface for every layout object that can be analyzed."""

    def analyze(self, laparams: LAParams) -> None:
        """Perform the layout analysis.

        The base implementation does nothing; subclasses override it.
        """
        return None

120 

121 

class LTText:
    """Interface for layout objects that carry text."""

    def __repr__(self) -> str:
        return "<{} {!r}>".format(self.__class__.__name__, self.get_text())

    def get_text(self) -> str:
        """Return the text contained in this object.

        :raises NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError

131 

132 

class LTComponent(LTItem):
    """Layout object that has a bounding box.

    The box is stored both as the ``bbox`` tuple and as the individual
    attributes ``x0``, ``y0``, ``x1``, ``y1``, ``width`` and ``height``.
    """

    def __init__(self, bbox: Rect) -> None:
        LTItem.__init__(self)
        self.set_bbox(bbox)

    def __repr__(self) -> str:
        return "<{} {}>".format(self.__class__.__name__, bbox2str(self.bbox))

    # Disable comparison.
    def __lt__(self, _: object) -> bool:
        raise PDFValueError

    def __le__(self, _: object) -> bool:
        raise PDFValueError

    def __gt__(self, _: object) -> bool:
        raise PDFValueError

    def __ge__(self, _: object) -> bool:
        raise PDFValueError

    def set_bbox(self, bbox: Rect) -> None:
        """Store ``bbox`` and refresh the derived coordinate attributes."""
        self.x0, self.y0, self.x1, self.y1 = bbox
        self.width = self.x1 - self.x0
        self.height = self.y1 - self.y0
        self.bbox = bbox

    def is_empty(self) -> bool:
        """Return True when the box has no positive width or height."""
        return self.width <= 0 or self.height <= 0

    def is_hoverlap(self, obj: "LTComponent") -> bool:
        """Return True when the x-ranges of ``self`` and ``obj`` touch or overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.x0 <= self.x1 and self.x0 <= obj.x1

    def hdistance(self, obj: "LTComponent") -> float:
        """Return the horizontal gap to ``obj``, or 0 when the x-ranges overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def hoverlap(self, obj: "LTComponent") -> float:
        """Return a horizontal overlap measure, or 0 when the x-ranges are disjoint.

        NOTE(review): the value is the smaller of the two opposite-edge
        distances. When one x-range fully contains the other this is not
        the length of the intersection -- confirm whether that is intended.
        """
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def is_voverlap(self, obj: "LTComponent") -> bool:
        """Return True when the y-ranges of ``self`` and ``obj`` touch or overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.y0 <= self.y1 and self.y0 <= obj.y1

    def vdistance(self, obj: "LTComponent") -> float:
        """Return the vertical gap to ``obj``, or 0 when the y-ranges overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_voverlap(obj):
            return 0
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))

    def voverlap(self, obj: "LTComponent") -> float:
        """Return a vertical overlap measure, or 0 when the y-ranges are disjoint.

        NOTE(review): as with ``hoverlap``, this is the smaller of the two
        opposite-edge distances, which differs from the intersection length
        when one y-range contains the other -- confirm whether intended.
        """
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_voverlap(obj):
            return 0
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))

204 

205 

class LTCurve(LTComponent):
    """A generic Bezier curve

    The bounding box is computed from ``pts`` with ``get_bound``.

    The parameter `original_path` contains the original
    pathing information from the pdf (e.g. for reconstructing Bezier Curves).

    `dashing_style` contains the Dashing information if any.
    """

    def __init__(
        self,
        linewidth: float,
        pts: list[Point],
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Color | None = None,
        non_stroking_color: Color | None = None,
        original_path: list[PathSegment] | None = None,
        dashing_style: tuple[object, object] | None = None,
    ) -> None:
        bounds = get_bound(pts)
        LTComponent.__init__(self, bounds)
        self.pts = pts
        self.linewidth = linewidth
        self.stroke = stroke
        self.fill = fill
        self.evenodd = evenodd
        self.stroking_color = stroking_color
        self.non_stroking_color = non_stroking_color
        self.original_path = original_path
        self.dashing_style = dashing_style

    def get_pts(self) -> str:
        """Return the points as one comma-separated string, 3 decimals each."""
        template = "{:.3f},{:.3f}"
        formatted = [template.format(*pt) for pt in self.pts]
        return ",".join(formatted)

240 

241 

class LTLine(LTCurve):
    """A single straight line.

    Stored as an LTCurve whose point list is ``[p0, p1]``.
    Could be used for separating text or figures.
    """

    def __init__(
        self,
        linewidth: float,
        p0: Point,
        p1: Point,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Color | None = None,
        non_stroking_color: Color | None = None,
        original_path: list[PathSegment] | None = None,
        dashing_style: tuple[object, object] | None = None,
    ) -> None:
        endpoints = [p0, p1]
        LTCurve.__init__(
            self,
            linewidth,
            endpoints,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )

273 

274 

class LTRect(LTCurve):
    """A rectangle.

    Stored as an LTCurve through the four corners of ``bbox``.
    Could be used for framing another pictures or figures.
    """

    def __init__(
        self,
        linewidth: float,
        bbox: Rect,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Color | None = None,
        non_stroking_color: Color | None = None,
        original_path: list[PathSegment] | None = None,
        dashing_style: tuple[object, object] | None = None,
    ) -> None:
        (left, bottom, right, top) = bbox
        # Corner order: bottom-left, bottom-right, top-right, top-left.
        corners = [(left, bottom), (right, bottom), (right, top), (left, top)]
        LTCurve.__init__(
            self,
            linewidth,
            corners,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )

306 

307 

class LTImage(LTComponent):
    """An image object.

    Embedded images can be in JPEG, Bitmap or JBIG2.

    Size, mask flag, bit depth and colour space are read from ``stream``
    using both the abbreviated and the full key names.
    """

    def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        self.name = name
        self.stream = stream
        img_width = stream.get_any(("W", "Width"))
        img_height = stream.get_any(("H", "Height"))
        self.srcsize = (img_width, img_height)
        self.imagemask = stream.get_any(("IM", "ImageMask"))
        # Bit depth falls back to 1 when the stream does not declare it.
        self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1)
        colorspace = stream.get_any(("CS", "ColorSpace"))
        # Always expose the colour space as a list.
        self.colorspace = colorspace if isinstance(colorspace, list) else [colorspace]

    def __repr__(self) -> str:
        return "<{}({}) {} {!r}>".format(
            self.__class__.__name__,
            self.name,
            bbox2str(self.bbox),
            self.srcsize,
        )

330 

331 

class LTAnno(LTItem, LTText):
    """Actual letter in the text as a Unicode string.

    Unlike LTChar, an LTAnno has no bounding box: it is a "virtual"
    character inserted by the layout analyzer according to the relationship
    between two characters (e.g. a space).
    """

    def __init__(self, text: str) -> None:
        # The literal text this annotation stands for.
        self._text = text

    def get_text(self) -> str:
        """Return the text supplied at construction."""
        return self._text

345 

346 

class LTChar(LTComponent, LTText):
    """Actual letter in the text as a Unicode string.

    The constructor derives the glyph's bounding box from the font metrics
    and maps it through ``matrix``.
    """

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: float | tuple[float | None, float],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> None:
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        self.adv = textwidth * fontsize * scaling
        # compute the boundary rectangle (in text space, before the matrix).
        if font.is_vertical():
            # vertical
            assert isinstance(textdisp, tuple)
            (vx, vy) = textdisp
            # A missing horizontal displacement defaults to half the font size.
            vx = fontsize * 0.5 if vx is None else vx * fontsize * 0.001
            vy = (1000 - vy) * fontsize * 0.001
            top = vy + rise
            bbox = (-vx, top + self.adv, -vx + fontsize, top)
        else:
            # horizontal
            descent = font.get_descent() * fontsize
            baseline = descent + rise
            bbox = (0, baseline, self.adv, baseline + fontsize)
        (a, b, c, d, _e, _f) = self.matrix
        self.upright = a * d * scaling > 0 and b * c <= 0
        (x0, y0, x1, y1) = apply_matrix_rect(self.matrix, bbox)
        # Normalise so that (x0, y0) is the lower-left corner.
        (x0, x1) = (x1, x0) if x1 < x0 else (x0, x1)
        (y0, y1) = (y1, y0) if y1 < y0 else (y0, y1)
        LTComponent.__init__(self, (x0, y0, x1, y1))
        # "size" is the box extent across the writing direction.
        self.size = self.width if font.is_vertical() else self.height

    def __repr__(self) -> str:
        return "<{} {} matrix={} font={!r} adv={} text={!r}>".format(
            self.__class__.__name__,
            bbox2str(self.bbox),
            matrix2str(self.matrix),
            self.fontname,
            self.adv,
            self.get_text(),
        )

    def get_text(self) -> str:
        """Return the character text supplied at construction."""
        return self._text

406 

407 

# Type of the items a container holds; always some kind of LTItem.
LTItemT = TypeVar("LTItemT", bound=LTItem)


class LTContainer(LTComponent, Generic[LTItemT]):
    """Bounding-box object that holds an ordered list of child items."""

    def __init__(self, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        # Children, in insertion order.
        self._objs: list[LTItemT] = []

    def __iter__(self) -> Iterator[LTItemT]:
        return iter(self._objs)

    def __len__(self) -> int:
        return len(self._objs)

    def add(self, obj: LTItemT) -> None:
        """Append one child."""
        self._objs.append(obj)

    def extend(self, objs: Iterable[LTItemT]) -> None:
        """Add every item of ``objs`` through ``add`` so overrides are honoured."""
        for item in objs:
            self.add(item)

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis on each child in turn."""
        for child in self._objs:
            child.analyze(laparams)

434 

435 

class LTExpandableContainer(LTContainer[LTItemT]):
    """Container whose bounding box grows to enclose everything added to it."""

    def __init__(self) -> None:
        # Start from an inverted infinite box so the first child defines it.
        LTContainer.__init__(self, (+INF, +INF, -INF, -INF))

    # Incompatible override: we take an LTComponent (with bounding box), but
    # super() LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append ``obj`` and enlarge the bounding box to include it."""
        LTContainer.add(self, cast(LTItemT, obj))
        enlarged = (
            min(self.x0, obj.x0),
            min(self.y0, obj.y0),
            max(self.x1, obj.x1),
            max(self.y1, obj.y1),
        )
        self.set_bbox(enlarged)

452 

453 

class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
    """Expandable container whose text is the concatenation of its children's."""

    def __init__(self) -> None:
        LTText.__init__(self)
        LTExpandableContainer.__init__(self)

    def get_text(self) -> str:
        """Join the text of every child that is an LTText, in order."""
        pieces = []
        for child in self:
            if isinstance(child, LTText):
                pieces.append(cast(LTText, child).get_text())
        return "".join(pieces)

463 

464 

# A text line holds real characters and virtual (annotation) characters.
TextLineElement = Union[LTChar, LTAnno]


class LTTextLine(LTTextContainer[TextLineElement]):
    """Contains a list of LTChar objects that represent a single text line.

    The characters are aligned either horizontally or vertically, depending on
    the text's writing mode.
    """

    def __init__(self, word_margin: float) -> None:
        super().__init__()
        self.word_margin = word_margin

    def __repr__(self) -> str:
        return "<{} {} {!r}>".format(
            self.__class__.__name__,
            bbox2str(self.bbox),
            self.get_text(),
        )

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the children, then terminate the line with a newline."""
        LTContainer.analyze(self, laparams)
        # Added through LTContainer.add because LTAnno has no bounding box.
        LTContainer.add(self, LTAnno("\n"))

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> list["LTTextLine"]:
        """Return nearby lines in ``plane``; implemented by subclasses.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def is_empty(self) -> bool:
        """Return True for a degenerate box or a whitespace-only line."""
        if super().is_empty():
            return True
        return self.get_text().isspace()

496 

497 

class LTTextLineHorizontal(LTTextLine):
    """A text line whose characters run horizontally."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Right edge of the last LTChar added; +INF until there is one.
        self._x1: float = +INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append ``obj``, inserting a space first if a word gap precedes it."""
        if isinstance(obj, LTChar) and self.word_margin:
            margin = self.word_margin * max(obj.width, obj.height)
            gap_starts_at = obj.x0 - margin
            if self._x1 < gap_starts_at:
                LTContainer.add(self, LTAnno(" "))
            self._x1 = obj.x1
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> list[LTTextLine]:
        """Finds neighboring LTTextLineHorizontals in the plane.

        Returns a list of other LTTextLineHorizontals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned
        objects have the same height as self (within the tolerance
        ``ratio * self.height``) and are left-, right-, or
        centrally-aligned with it within that same tolerance.
        """
        d = ratio * self.height
        search_area = (self.x0, self.y0 - d, self.x1, self.y1 + d)
        neighbors = []
        for candidate in plane.find(search_area):
            if not isinstance(candidate, LTTextLineHorizontal):
                continue
            if not self._is_same_height_as(candidate, tolerance=d):
                continue
            if (
                self._is_left_aligned_with(candidate, tolerance=d)
                or self._is_right_aligned_with(candidate, tolerance=d)
                or self._is_centrally_aligned_with(candidate, tolerance=d)
            ):
                neighbors.append(candidate)
        return neighbors

    def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the left-hand edge of `other` is within `tolerance`."""
        offset = other.x0 - self.x0
        return abs(offset) <= tolerance

    def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the right-hand edge of `other` is within `tolerance`."""
        offset = other.x1 - self.x1
        return abs(offset) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the horizontal center of `other` is within `tolerance`."""
        other_center = (other.x0 + other.x1) / 2
        own_center = (self.x0 + self.x1) / 2
        return abs(other_center - own_center) <= tolerance

    def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the height of `other` differs by at most `tolerance`."""
        difference = other.height - self.height
        return abs(difference) <= tolerance

559 

560 

class LTTextLineVertical(LTTextLine):
    """A text line whose characters run vertically."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Bottom edge of the last LTChar added; -INF until there is one.
        self._y0: float = -INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append ``obj``, inserting a space first if a word gap precedes it."""
        if isinstance(obj, LTChar) and self.word_margin:
            margin = self.word_margin * max(obj.width, obj.height)
            gap_ends_at = obj.y1 + margin
            if gap_ends_at < self._y0:
                LTContainer.add(self, LTAnno(" "))
            self._y0 = obj.y0
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> list[LTTextLine]:
        """Finds neighboring LTTextLineVerticals in the plane.

        Returns a list of other LTTextLineVerticals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned
        objects have the same width as self (within the tolerance
        ``ratio * self.width``) and are upper-, lower-, or
        centrally-aligned with it within that same tolerance.
        """
        d = ratio * self.width
        search_area = (self.x0 - d, self.y0, self.x1 + d, self.y1)
        neighbors = []
        for candidate in plane.find(search_area):
            if not isinstance(candidate, LTTextLineVertical):
                continue
            if not self._is_same_width_as(candidate, tolerance=d):
                continue
            if (
                self._is_lower_aligned_with(candidate, tolerance=d)
                or self._is_upper_aligned_with(candidate, tolerance=d)
                or self._is_centrally_aligned_with(candidate, tolerance=d)
            ):
                neighbors.append(candidate)
        return neighbors

    def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the lower edge of `other` is within `tolerance`."""
        offset = other.y0 - self.y0
        return abs(offset) <= tolerance

    def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the upper edge of `other` is within `tolerance`."""
        offset = other.y1 - self.y1
        return abs(offset) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the vertical center of `other` is within `tolerance`."""
        other_center = (other.y0 + other.y1) / 2
        own_center = (self.y0 + self.y1) / 2
        return abs(other_center - own_center) <= tolerance

    def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool:
        """Whether the width of `other` differs by at most `tolerance`."""
        difference = other.width - self.width
        return abs(difference) <= tolerance

622 

623 

class LTTextBox(LTTextContainer[LTTextLine]):
    """Represents a group of text chunks in a rectangular area.

    Note that this box is created by geometric analysis and does not
    necessarily represents a logical boundary of the text. It contains a list
    of LTTextLine objects.
    """

    def __init__(self) -> None:
        LTTextContainer.__init__(self)
        # Reading-order position; -1 until one is assigned.
        self.index: int = -1

    def __repr__(self) -> str:
        return "<{}({}) {} {!r}>".format(
            self.__class__.__name__,
            self.index,
            bbox2str(self.bbox),
            self.get_text(),
        )

    def get_writing_mode(self) -> str:
        """Return the writing-mode code; implemented by subclasses.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

644 

645 

class LTTextBoxHorizontal(LTTextBox):
    """Text box made of horizontal lines."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them from top to bottom."""
        super().analyze(laparams)

        def negated_top(line: LTTextLine) -> float:
            # Negating y1 puts the line with the largest y1 first.
            return -line.y1

        self._objs.sort(key=negated_top)

    def get_writing_mode(self) -> str:
        return "lr-tb"

653 

654 

class LTTextBoxVertical(LTTextBox):
    """Text box made of vertical lines."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them from right to left."""
        super().analyze(laparams)

        def negated_right(line: LTTextLine) -> float:
            # Negating x1 puts the line with the largest x1 first.
            return -line.x1

        self._objs.sort(key=negated_right)

    def get_writing_mode(self) -> str:
        return "tb-rl"

662 

663 

# A text group may contain text boxes as well as nested text groups.
TextGroupElement = Union[LTTextBox, "LTTextGroup"]


class LTTextGroup(LTTextContainer[TextGroupElement]):
    """A group of text boxes and/or nested groups."""

    def __init__(self, objs: Iterable[TextGroupElement]) -> None:
        super().__init__()
        # extend() adds through add(), so the bounding box grows per member.
        self.extend(objs)

671 

672 

class LTTextGroupLRTB(LTTextGroup):
    """Text group whose members are ordered left-to-right, top-to-bottom."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them using ``laparams.boxes_flow``."""
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        boxes_flow = laparams.boxes_flow

        def reading_order(member: TextGroupElement) -> float:
            horizontal = (1 - boxes_flow) * member.x0
            vertical = (1 + boxes_flow) * (member.y0 + member.y1)
            return horizontal - vertical

        # reorder the objects from top-left to bottom-right.
        self._objs.sort(key=reading_order)

683 

684 

class LTTextGroupTBRL(LTTextGroup):
    """Text group whose members are ordered top-to-bottom, right-to-left."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them using ``laparams.boxes_flow``."""
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        boxes_flow = laparams.boxes_flow

        def reading_order(member: TextGroupElement) -> float:
            horizontal = -(1 + boxes_flow) * (member.x0 + member.x1)
            vertical = (1 - boxes_flow) * member.y1
            return horizontal - vertical

        # reorder the objects from top-right to bottom-left.
        self._objs.sort(key=reading_order)

695 

696 

class LTLayoutContainer(LTContainer[LTComponent]):
    """Container that performs layout analysis on the objects it holds.

    ``analyze`` groups the LTChar children into text lines, the lines into
    text boxes and, unless ``boxes_flow`` is None, the boxes into a
    hierarchy of text groups that determines their order.
    """

    def __init__(self, bbox: Rect) -> None:
        LTContainer.__init__(self, bbox)
        # Set by analyze() only when boxes_flow is not None and the
        # container holds at least one LTChar.
        self.groups: list[LTTextGroup] | None = None

    # group_objects: group text object to textlines.
    def group_objects(
        self,
        laparams: LAParams,
        objs: Iterable[LTComponent],
    ) -> Iterator[LTTextLine]:
        """Group consecutive text objects into text lines.

        Each object is compared with its predecessor only. Yields nothing
        when ``objs`` is empty.

        :param laparams: supplies line_overlap, char_margin, word_margin and
            detect_vertical.
        :param objs: the objects to group, in the order they should be read.
        :return: an iterator over the resulting LTTextLine objects.
        """
        obj0 = None
        line: LTTextLine | None = None
        for obj1 in objs:
            if obj0 is not None:
                # halign: obj0 and obj1 is horizontally aligned.
                #
                #   +------+ - - -
                #   | obj0 | - - +------+   -
                #   |      |     | obj1 |   | (line_overlap)
                #   +------+ - - |      |   -
                #          - - - +------+
                #
                #          |<--->|
                #        (char_margin)
                halign = (
                    obj0.is_voverlap(obj1)
                    and min(obj0.height, obj1.height) * laparams.line_overlap
                    < obj0.voverlap(obj1)
                    and obj0.hdistance(obj1)
                    < max(obj0.width, obj1.width) * laparams.char_margin
                )

                # valign: obj0 and obj1 is vertically aligned.
                #
                #   +------+
                #   | obj0 |
                #   |      |
                #   +------+ - - -
                #     |    |     | (char_margin)
                #     +------+ - -
                #     | obj1 |
                #     |      |
                #     +------+
                #
                #     |<-->|
                #   (line_overlap)
                valign = (
                    laparams.detect_vertical
                    and obj0.is_hoverlap(obj1)
                    and min(obj0.width, obj1.width) * laparams.line_overlap
                    < obj0.hoverlap(obj1)
                    and obj0.vdistance(obj1)
                    < max(obj0.height, obj1.height) * laparams.char_margin
                )

                if (halign and isinstance(line, LTTextLineHorizontal)) or (
                    valign and isinstance(line, LTTextLineVertical)
                ):
                    # obj1 continues the line currently being built.
                    line.add(obj1)
                elif line is not None:
                    # obj1 does not fit the open line: close it.
                    yield line
                    line = None
                elif valign and not halign:
                    line = LTTextLineVertical(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                elif halign and not valign:
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                else:
                    # obj0 stands alone: emit it as a one-object line.
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    yield line
                    line = None
            obj0 = obj1
        if obj0 is None:
            # Empty input: there is no trailing object to flush.
            return
        if line is None:
            line = LTTextLineHorizontal(laparams.word_margin)
            line.add(obj0)
        yield line

    def group_textlines(
        self,
        laparams: LAParams,
        lines: Iterable[LTTextLine],
    ) -> Iterator[LTTextBox]:
        """Group neighboring lines to textboxes

        :param laparams: supplies line_margin, passed to ``find_neighbors``.
        :param lines: the text lines to group; any iterable is accepted.
        :return: an iterator over the non-empty text boxes, each yielded once.
        """
        # The lines are traversed three times below, so a one-shot iterator
        # must be materialised first.
        lines = list(lines)
        plane: Plane[LTTextLine] = Plane(self.bbox)
        plane.extend(lines)
        # Maps every line to the box that currently contains it.
        boxes: dict[LTTextLine, LTTextBox] = {}
        for line in lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            members = [line]
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    # Absorb the whole box the neighbor already belongs to.
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box: LTTextBox = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box
        done = set()
        for line in lines:
            if line not in boxes:
                continue
            box = boxes[line]
            if box in done:
                continue
            done.add(box)
            if not box.is_empty():
                yield box

    def group_textboxes(
        self,
        laparams: LAParams,
        boxes: Sequence[LTTextBox],
    ) -> list[LTTextGroup]:
        """Group textboxes hierarchically.

        Get pair-wise distances, via dist func defined below, and then merge
        from the closest textbox pair. Once obj1 and obj2 are merged /
        grouped, the resulting group is considered as a new object, and its
        distances to other objects & groups are added to the process queue.

        For performance reason, pair-wise distances and object pair info are
        maintained in a heap of
        (skip_isany, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures
        quick access to the smallest element. ``skip_isany`` is False for a
        new pair and True once the pair has been re-queued because another
        object was found between the two. Note that since comparison
        operators, e.g., __lt__, are disabled for LTComponent, id(obj) has
        to appear before obj in element tuples.

        :param laparams: LAParams object.
        :param boxes: All textbox objects to be grouped.
        :return: the objects left in the plane once the queue is exhausted.
        """
        ElementT = Union[LTTextBox, LTTextGroup]
        plane: Plane[ElementT] = Plane(self.bbox)

        def dist(obj1: LTComponent, obj2: LTComponent) -> float:
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            return (
                (x1 - x0) * (y1 - y0)
                - obj1.width * obj1.height
                - obj2.width * obj2.height
            )

        def isany(obj1: ElementT, obj2: ElementT) -> set[ElementT]:
            """Check if there's any other object between obj1 and obj2."""
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            objs = set(plane.find((x0, y0, x1, y1)))
            return objs.difference((obj1, obj2))

        # Seed the heap with every unordered pair of boxes.
        dists: list[tuple[bool, float, int, int, ElementT, ElementT]] = []
        for i, box1 in enumerate(boxes):
            for j in range(i + 1, len(boxes)):
                box2 = boxes[j]
                dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
        heapq.heapify(dists)

        plane.extend(boxes)
        # ids of objects that have already been merged into a group.
        done = set()
        while len(dists) > 0:
            (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
            # Skip objects that are already merged
            if (id1 not in done) and (id2 not in done):
                if not skip_isany and isany(obj1, obj2):
                    # Something lies between the pair: re-queue it behind
                    # the pairs not yet deferred, and do not test it again.
                    heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                    continue
                if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
                    obj2,
                    (LTTextBoxVertical, LTTextGroupTBRL),
                ):
                    group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
                else:
                    group = LTTextGroupLRTB([obj1, obj2])
                plane.remove(obj1)
                plane.remove(obj2)
                done.update([id1, id2])

                for other in plane:
                    heapq.heappush(
                        dists,
                        (False, dist(group, other), id(group), id(other), group, other),
                    )
                plane.add(group)
        # By now only groups are in the plane
        return [cast(LTTextGroup, g) for g in plane]

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis and replace the children with the result.

        Afterwards the children are the text boxes, followed by the non-text
        objects, followed by the empty text lines. Nothing is replaced when
        the container holds no LTChar.
        """
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
        for obj in otherobjs:
            obj.analyze(laparams)
        if not textobjs:
            return
        textlines = list(self.group_objects(laparams, textobjs))
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        for obj in empties:
            obj.analyze(laparams)
        textboxes = list(self.group_textlines(laparams, textlines))
        if laparams.boxes_flow is None:
            for textbox in textboxes:
                textbox.analyze(laparams)

            def getkey(box: LTTextBox) -> tuple[int, float, float]:
                # Vertical boxes sort before horizontal ones.
                if isinstance(box, LTTextBoxVertical):
                    return (0, -box.x1, -box.y0)
                else:
                    return (1, -box.y0, box.x0)

            textboxes.sort(key=getkey)
        else:
            self.groups = self.group_textboxes(laparams, textboxes)
            assigner = IndexAssigner()
            for group in self.groups:
                group.analyze(laparams)
                assigner.run(group)
            textboxes.sort(key=lambda box: box.index)
        self._objs = (
            cast(list[LTComponent], textboxes)
            + otherobjs
            + cast(list[LTComponent], empties)
        )

943 

944 

class LTFigure(LTLayoutContainer):
    """Represents an area used by PDF Form objects.

    PDF Forms can be used to present figures or pictures by embedding yet
    another PDF document within a page. Note that LTFigure objects can appear
    recursively.

    ``bbox`` is given as (x, y, width, height); the stored bounding box is
    that rectangle mapped through ``matrix``.
    """

    def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        self.name = name
        self.matrix = matrix
        (left, bottom, span_x, span_y) = bbox
        corners = (left, bottom, left + span_x, bottom + span_y)
        LTLayoutContainer.__init__(self, apply_matrix_rect(matrix, corners))

    def __repr__(self) -> str:
        return "<{}({}) {} matrix={}>".format(
            self.__class__.__name__,
            self.name,
            bbox2str(self.bbox),
            matrix2str(self.matrix),
        )

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the figure's contents only when ``all_texts`` is enabled."""
        if laparams.all_texts:
            LTLayoutContainer.analyze(self, laparams)

972 

973 

class LTPage(LTLayoutContainer):
    """Represents an entire page.

    Like any other LTLayoutContainer, an LTPage can be iterated to obtain child
    objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and LTLine.
    """

    def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None:
        LTLayoutContainer.__init__(self, bbox)
        self.pageid = pageid
        self.rotate = rotate

    def __repr__(self) -> str:
        return "<{}({!r}) {} rotate={!r}>".format(
            self.__class__.__name__,
            self.pageid,
            bbox2str(self.bbox),
            self.rotate,
        )