Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/layout.py: 89%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

446 statements  

1import heapq 

2import logging 

3from typing import ( 

4 Dict, 

5 Generic, 

6 Iterable, 

7 Iterator, 

8 List, 

9 Optional, 

10 Sequence, 

11 Set, 

12 Tuple, 

13 TypeVar, 

14 Union, 

15 cast, 

16) 

17 

18from pdfminer.pdfcolor import PDFColorSpace 

19from pdfminer.pdfexceptions import PDFTypeError, PDFValueError 

20from pdfminer.pdffont import PDFFont 

21from pdfminer.pdfinterp import Color, PDFGraphicState 

22from pdfminer.pdftypes import PDFStream 

23from pdfminer.utils import ( 

24 INF, 

25 LTComponentT, 

26 Matrix, 

27 PathSegment, 

28 Plane, 

29 Point, 

30 Rect, 

31 apply_matrix_rect, 

32 bbox2str, 

33 fsplit, 

34 get_bound, 

35 matrix2str, 

36 uniq, 

37) 

38 

39logger = logging.getLogger(__name__) 

40 

41 

class IndexAssigner:
    """Hands out consecutive index numbers to text boxes in a layout tree."""

    def __init__(self, index: int = 0) -> None:
        # The index that the next text box will receive.
        self.index = index

    def run(self, obj: "LTItem") -> None:
        """Assign indices to `obj`, descending into text groups.

        An `LTTextBox` gets the current counter value, after which the
        counter is advanced by one. An `LTTextGroup` is walked recursively
        in iteration order. Any other object is left untouched.
        """
        if isinstance(obj, LTTextBox):
            obj.index = self.index
            self.index += 1
            return
        if isinstance(obj, LTTextGroup):
            for child in obj:
                self.run(child)

53 

54 

class LAParams:
    """Parameters for layout analysis

    :param line_overlap: If two characters have more overlap than this they
        are considered to be on the same line. The overlap is specified
        relative to the minimum height of both characters.
    :param char_margin: If two characters are closer together than this
        margin they are considered part of the same line. The margin is
        specified relative to the width of the character.
    :param word_margin: If two characters on the same line are further apart
        than this margin then they are considered to be two separate words,
        and an intermediate space will be added for readability. The margin
        is specified relative to the width of the character.
    :param line_margin: If two lines are close together they are considered
        to be part of the same paragraph. The margin is specified relative
        to the height of a line.
    :param boxes_flow: Specifies how much a horizontal and vertical position
        of a text matters when determining the order of text boxes. The
        value should be within the range of -1.0 (only horizontal position
        matters) to +1.0 (only vertical position matters). You can also pass
        `None` to disable advanced layout analysis, and instead return text
        based on the position of the bottom left corner of the text box.
    :param detect_vertical: If vertical text should be considered during
        layout analysis
    :param all_texts: If layout analysis should be performed on text in
        figures.
    :raises PDFTypeError: if `boxes_flow` is neither `None` nor an int/float.
    :raises PDFValueError: if `boxes_flow` is a number outside [-1, +1].
    """

    def __init__(
        self,
        line_overlap: float = 0.5,
        char_margin: float = 2.0,
        line_margin: float = 0.5,
        word_margin: float = 0.1,
        boxes_flow: Optional[float] = 0.5,
        detect_vertical: bool = False,
        all_texts: bool = False,
    ) -> None:
        self.line_overlap = line_overlap
        self.char_margin = char_margin
        self.line_margin = line_margin
        self.word_margin = word_margin
        self.boxes_flow = boxes_flow
        self.detect_vertical = detect_vertical
        self.all_texts = all_texts

        self._validate()

    def _validate(self) -> None:
        """Check `boxes_flow`; the other parameters are not validated here."""
        flow = self.boxes_flow
        if flow is None:
            return
        boxes_flow_err_msg = (
            "LAParam boxes_flow should be None, or a number between -1 and +1"
        )
        if not isinstance(flow, (int, float)):
            raise PDFTypeError(boxes_flow_err_msg)
        if not -1 <= flow <= 1:
            raise PDFValueError(boxes_flow_err_msg)

    def __repr__(self) -> str:
        template = (
            "<LAParams: char_margin=%.1f, line_margin=%.1f, "
            "word_margin=%.1f all_texts=%r>"
        )
        shown = (self.char_margin, self.line_margin, self.word_margin, self.all_texts)
        return template % shown

121 

122 

class LTItem:
    """Interface for things that can be analyzed"""

    def analyze(self, laparams: "LAParams") -> None:
        """Perform the layout analysis.

        The base implementation does nothing; subclasses override it.
        """
        return None

128 

129 

class LTText:
    """Interface for things that have text"""

    def __repr__(self) -> str:
        return "<{} {!r}>".format(self.__class__.__name__, self.get_text())

    def get_text(self) -> str:
        """Text contained in this object

        Must be overridden; the base implementation raises
        `NotImplementedError`.
        """
        raise NotImplementedError

139 

140 

class LTComponent(LTItem):
    """Object with a bounding box"""

    def __init__(self, bbox: Rect) -> None:
        LTItem.__init__(self)
        self.set_bbox(bbox)

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} {bbox2str(self.bbox)}>"

    # Disable comparison. Ordering components is meaningless, so any attempt
    # (e.g. by a sort or a heap comparing tuples that contain components)
    # raises instead of silently falling back to an arbitrary order.
    def __lt__(self, _: object) -> bool:
        raise PDFValueError

    def __le__(self, _: object) -> bool:
        raise PDFValueError

    def __gt__(self, _: object) -> bool:
        raise PDFValueError

    def __ge__(self, _: object) -> bool:
        raise PDFValueError

    def set_bbox(self, bbox: Rect) -> None:
        """Store `bbox` as (x0, y0, x1, y1) and refresh the derived size."""
        (x0, y0, x1, y1) = bbox
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.width = x1 - x0
        self.height = y1 - y0
        self.bbox = bbox

    def is_empty(self) -> bool:
        """Whether the bounding box has no positive width or height."""
        return self.width <= 0 or self.height <= 0

    def is_hoverlap(self, obj: "LTComponent") -> bool:
        """Whether the x-ranges of self and `obj` intersect (touching counts)."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.x0 <= self.x1 and self.x0 <= obj.x1

    def hdistance(self, obj: "LTComponent") -> float:
        """Horizontal gap between self and `obj`; 0 when their x-ranges meet."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def hoverlap(self, obj: "LTComponent") -> float:
        """Length of the shared x-range of self and `obj`; 0 when disjoint."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_hoverlap(obj):
            return 0
        # Intersection length. The edge-to-edge formula
        # min(|x0 - obj.x1|, |x1 - obj.x0|) over-reports the overlap when one
        # range lies strictly inside the other. Clamped at 0 so an inverted
        # box can never produce a negative overlap.
        return max(0, min(self.x1, obj.x1) - max(self.x0, obj.x0))

    def is_voverlap(self, obj: "LTComponent") -> bool:
        """Whether the y-ranges of self and `obj` intersect (touching counts)."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.y0 <= self.y1 and self.y0 <= obj.y1

    def vdistance(self, obj: "LTComponent") -> float:
        """Vertical gap between self and `obj`; 0 when their y-ranges meet."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_voverlap(obj):
            return 0
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))

    def voverlap(self, obj: "LTComponent") -> float:
        """Length of the shared y-range of self and `obj`; 0 when disjoint."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_voverlap(obj):
            return 0
        # Intersection length; see hoverlap() for why the edge-to-edge
        # formula is not used.
        return max(0, min(self.y1, obj.y1) - max(self.y0, obj.y0))

212 

213 

class LTCurve(LTComponent):
    """A generic Bezier curve

    The parameter `original_path` contains the original
    pathing information from the pdf (e.g. for reconstructing Bezier Curves).

    `dashing_style` contains the Dashing information if any.

    The bounding box is derived from `pts` via `get_bound`.
    """

    def __init__(
        self,
        linewidth: float,
        pts: List[Point],
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Optional[Color] = None,
        non_stroking_color: Optional[Color] = None,
        original_path: Optional[List[PathSegment]] = None,
        dashing_style: Optional[Tuple[object, object]] = None,
    ) -> None:
        LTComponent.__init__(self, get_bound(pts))
        # Geometry.
        self.pts = pts
        self.linewidth = linewidth
        # Painting flags.
        self.stroke = stroke
        self.fill = fill
        self.evenodd = evenodd
        # Colours.
        self.stroking_color = stroking_color
        self.non_stroking_color = non_stroking_color
        # Raw path data carried along unchanged.
        self.original_path = original_path
        self.dashing_style = dashing_style

    def get_pts(self) -> str:
        """Return the points as one "x,y,x,y,..." string with 3 decimals."""
        formatted = ["%.3f,%.3f" % point for point in self.pts]
        return ",".join(formatted)

248 

249 

class LTLine(LTCurve):
    """A single straight line.

    Could be used for separating text or figures.

    The two end points `p0` and `p1` become the curve's point list.
    """

    def __init__(
        self,
        linewidth: float,
        p0: Point,
        p1: Point,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Optional[Color] = None,
        non_stroking_color: Optional[Color] = None,
        original_path: Optional[List[PathSegment]] = None,
        dashing_style: Optional[Tuple[object, object]] = None,
    ) -> None:
        endpoints = [p0, p1]
        LTCurve.__init__(
            self,
            linewidth=linewidth,
            pts=endpoints,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )

281 

282 

class LTRect(LTCurve):
    """A rectangle.

    Could be used for framing another pictures or figures.

    The four corners of `bbox` become the curve's point list, starting at
    (x0, y0) and proceeding via (x1, y0), (x1, y1) to (x0, y1).
    """

    def __init__(
        self,
        linewidth: float,
        bbox: Rect,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Optional[Color] = None,
        non_stroking_color: Optional[Color] = None,
        original_path: Optional[List[PathSegment]] = None,
        dashing_style: Optional[Tuple[object, object]] = None,
    ) -> None:
        (x0, y0, x1, y1) = bbox
        corners = [(x0, y0), (x1, y0), (x1, y1), (x0, y1)]
        LTCurve.__init__(
            self,
            linewidth=linewidth,
            pts=corners,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )

314 

315 

class LTImage(LTComponent):
    """An image object.

    Embedded images can be in JPEG, Bitmap or JBIG2.

    Image properties are read from `stream` with `get_any`, trying the
    abbreviated key first and the long key second.
    """

    def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        self.name = name
        self.stream = stream

        src_width = stream.get_any(("W", "Width"))
        src_height = stream.get_any(("H", "Height"))
        self.srcsize = (src_width, src_height)

        self.imagemask = stream.get_any(("IM", "ImageMask"))
        # Bits per component falls back to 1 when the stream does not say.
        self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1)

        # Always expose the colour space as a list.
        colorspace = stream.get_any(("CS", "ColorSpace"))
        self.colorspace = colorspace if isinstance(colorspace, list) else [colorspace]

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name}({self.name}) {bbox2str(self.bbox)} {self.srcsize!r}>"

335 

336 

class LTAnno(LTItem, LTText):
    """Actual letter in the text as a Unicode string.

    Note that, while an LTChar object has actual boundaries, LTAnno objects
    do not, as these are "virtual" characters, inserted by a layout analyzer
    according to the relationship between two characters (e.g. a space).
    """

    def __init__(self, text: str) -> None:
        # Only the text is stored; no base-class initialiser is invoked.
        self._text = text

    def get_text(self) -> str:
        """Return the text given at construction."""
        return self._text

350 

351 

class LTChar(LTComponent, LTText):
    """Actual letter in the text as a Unicode string."""

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: Union[float, Tuple[Optional[float], float]],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> None:
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        # Advance of the glyph, scaled by font size and scaling factor.
        self.adv = textwidth * fontsize * scaling

        # Build the glyph rectangle before `matrix` is applied to it.
        if font.is_vertical():
            # Vertical writing: `textdisp` must be a (vx, vy) pair.
            assert isinstance(textdisp, tuple)
            (disp_x, disp_y) = textdisp
            # A missing x displacement defaults to half the font size;
            # otherwise displacements are scaled by fontsize / 1000.
            vx = fontsize * 0.5 if disp_x is None else disp_x * fontsize * 0.001
            vy = (1000 - disp_y) * fontsize * 0.001
            local_box = (-vx, vy + rise + self.adv, -vx + fontsize, vy + rise)
        else:
            # Horizontal writing: box spans the advance, offset by the
            # font descent and the text rise.
            bottom = font.get_descent() * fontsize + rise
            local_box = (0, bottom, self.adv, bottom + fontsize)

        (a, b, c, d, e, f) = self.matrix
        self.upright = a * d * scaling > 0 and b * c <= 0

        # Transform the box and normalise it so that x0 <= x1 and y0 <= y1.
        (x0, y0, x1, y1) = apply_matrix_rect(self.matrix, local_box)
        if x0 > x1:
            x0, x1 = x1, x0
        if y0 > y1:
            y0, y1 = y1, y0
        LTComponent.__init__(self, (x0, y0, x1, y1))

        # The glyph size is the box extent across the writing direction.
        self.size = self.width if font.is_vertical() else self.height

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name} {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)} font={self.fontname!r} adv={self.adv} text={self.get_text()!r}>"

    def get_text(self) -> str:
        """Return the text given at construction."""
        return self._text

408 

409 

# Type of the items held by an LTContainer.
LTItemT = TypeVar("LTItemT", bound=LTItem)


class LTContainer(LTComponent, Generic[LTItemT]):
    """Object that can be extended and analyzed"""

    def __init__(self, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        # Children, in insertion order.
        self._objs: List[LTItemT] = []

    def __iter__(self) -> Iterator[LTItemT]:
        return iter(self._objs)

    def __len__(self) -> int:
        return len(self._objs)

    def add(self, obj: LTItemT) -> None:
        """Append `obj` to the children."""
        self._objs.append(obj)

    def extend(self, objs: Iterable[LTItemT]) -> None:
        """Add every item of `objs` through `add`, so overrides apply."""
        for item in objs:
            self.add(item)

    def analyze(self, laparams: LAParams) -> None:
        """Run `analyze` on each child in turn."""
        for child in self._objs:
            child.analyze(laparams)

436 

437 

class LTExpandableContainer(LTContainer[LTItemT]):
    """Container whose bounding box grows to enclose every added object."""

    def __init__(self) -> None:
        # Start from an inverted infinite box so that the first add() makes
        # the box equal to that object's box.
        LTContainer.__init__(self, (+INF, +INF, -INF, -INF))

    # Incompatible override: we take an LTComponent (with bounding box), but
    # super() LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj` and enlarge the bounding box to include it."""
        LTContainer.add(self, cast(LTItemT, obj))
        left = min(self.x0, obj.x0)
        bottom = min(self.y0, obj.y0)
        right = max(self.x1, obj.x1)
        top = max(self.y1, obj.y1)
        self.set_bbox((left, bottom, right, top))

454 

455 

class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
    """Expandable container whose text is the concatenation of its children."""

    def __init__(self) -> None:
        LTText.__init__(self)
        LTExpandableContainer.__init__(self)

    def get_text(self) -> str:
        """Join the text of every child that is an `LTText`, in order."""
        pieces = [
            cast(LTText, child).get_text()
            for child in self
            if isinstance(child, LTText)
        ]
        return "".join(pieces)

465 

466 

# Items that may appear inside a text line.
TextLineElement = Union[LTChar, LTAnno]


class LTTextLine(LTTextContainer[TextLineElement]):
    """Contains a list of LTChar objects that represent a single text line.

    The characters are aligned either horizontally or vertically, depending on
    the text's writing mode.
    """

    def __init__(self, word_margin: float) -> None:
        super().__init__()
        self.word_margin = word_margin

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name} {bbox2str(self.bbox)} {self.get_text()!r}>"

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the children, then terminate the line with a newline."""
        for child in self._objs:
            child.analyze(laparams)
        # LTContainer.add is used directly because LTAnno has no bounding
        # box to merge into this line's box.
        LTContainer.add(self, LTAnno("\n"))

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List["LTTextLine"]:
        """Return nearby lines in `plane`; implemented by subclasses."""
        raise NotImplementedError

    def is_empty(self) -> bool:
        """Whether the box is degenerate or the text is whitespace only."""
        if super().is_empty():
            return True
        return self.get_text().isspace()

498 

499 

class LTTextLineHorizontal(LTTextLine):
    """A text line whose characters run horizontally."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Right edge of the most recently added object. Starts at +INF so
        # that no space is inserted before the first character.
        self._x1: float = +INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj`, inserting a space first if it starts a new word.

        A space is inserted when `obj` is an LTChar, `word_margin` is
        non-zero, and the gap to the previous object exceeds
        `word_margin` times the larger of the character's width and height.
        """
        if isinstance(obj, LTChar) and self.word_margin:
            gap_limit = self.word_margin * max(obj.width, obj.height)
            if obj.x0 - gap_limit > self._x1:
                LTContainer.add(self, LTAnno(" "))
        self._x1 = obj.x1
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List[LTTextLine]:
        """Finds neighboring LTTextLineHorizontals in the plane.

        Returns a list of other LTTextLineHorizontals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned objects
        will be the same height as self, and also either left-, right-, or
        centrally-aligned.
        """
        d = ratio * self.height
        # Search a window extended by `d` above and below this line.
        candidates = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
        neighbors: List[LTTextLine] = []
        for candidate in candidates:
            if not isinstance(candidate, LTTextLineHorizontal):
                continue
            if not self._is_same_height_as(candidate, tolerance=d):
                continue
            if (
                self._is_left_aligned_with(candidate, tolerance=d)
                or self._is_right_aligned_with(candidate, tolerance=d)
                or self._is_centrally_aligned_with(candidate, tolerance=d)
            ):
                neighbors.append(candidate)
        return neighbors

    def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the left-hand edge of `other` is within `tolerance`."""
        offset = other.x0 - self.x0
        return abs(offset) <= tolerance

    def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the right-hand edge of `other` is within `tolerance`."""
        offset = other.x1 - self.x1
        return abs(offset) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the horizontal center of `other` is within `tolerance`."""
        other_center = (other.x0 + other.x1) / 2
        own_center = (self.x0 + self.x1) / 2
        return abs(other_center - own_center) <= tolerance

    def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the height of `other` differs by at most `tolerance`."""
        difference = other.height - self.height
        return abs(difference) <= tolerance

561 

562 

class LTTextLineVertical(LTTextLine):
    """A text line whose characters run vertically."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Lower edge of the most recently added object. Starts at -INF so
        # that no space is inserted before the first character.
        self._y0: float = -INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj`, inserting a space first if it starts a new word.

        A space is inserted when `obj` is an LTChar, `word_margin` is
        non-zero, and the top of `obj` lies more than `word_margin` times
        the larger of its width and height below the previous object.
        """
        if isinstance(obj, LTChar) and self.word_margin:
            margin = self.word_margin * max(obj.width, obj.height)
            if obj.y1 + margin < self._y0:
                LTContainer.add(self, LTAnno(" "))
        self._y0 = obj.y0
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List[LTTextLine]:
        """Finds neighboring LTTextLineVerticals in the plane.

        Returns a list of other LTTextLineVerticals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned objects
        will be the same width as self, and also either upper-, lower-, or
        centrally-aligned.
        """
        d = ratio * self.width
        # Search a window extended by `d` to the left and right of this line.
        objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
        return [
            obj
            for obj in objs
            if (
                isinstance(obj, LTTextLineVertical)
                and self._is_same_width_as(obj, tolerance=d)
                and (
                    self._is_lower_aligned_with(obj, tolerance=d)
                    or self._is_upper_aligned_with(obj, tolerance=d)
                    or self._is_centrally_aligned_with(obj, tolerance=d)
                )
            )
        ]

    def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the lower edge of `other` is within `tolerance`."""
        return abs(other.y0 - self.y0) <= tolerance

    def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the upper edge of `other` is within `tolerance`."""
        return abs(other.y1 - self.y1) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the vertical center of `other` is within `tolerance`."""
        return abs((other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance

    # `tolerance` defaults to 0 like every other alignment helper here and
    # like LTTextLineHorizontal._is_same_height_as.
    def _is_same_width_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the width of `other` differs by at most `tolerance`."""
        return abs(other.width - self.width) <= tolerance

624 

625 

class LTTextBox(LTTextContainer[LTTextLine]):
    """Represents a group of text chunks in a rectangular area.

    Note that this box is created by geometric analysis and does not
    necessarily represent a logical boundary of the text. It contains a list
    of LTTextLine objects.
    """

    def __init__(self) -> None:
        LTTextContainer.__init__(self)
        # -1 until an index is assigned from outside.
        self.index: int = -1

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name}({self.index}) {bbox2str(self.bbox)} {self.get_text()!r}>"

    def get_writing_mode(self) -> str:
        """Return the writing mode; implemented by subclasses."""
        raise NotImplementedError

643 

644 

class LTTextBoxHorizontal(LTTextBox):
    """Text box made of horizontal lines."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them by descending top edge."""
        super().analyze(laparams)

        def negated_top(line: LTTextLine) -> float:
            return -line.y1

        self._objs.sort(key=negated_top)

    def get_writing_mode(self) -> str:
        return "lr-tb"

652 

653 

class LTTextBoxVertical(LTTextBox):
    """Text box made of vertical lines."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them by descending right edge."""
        super().analyze(laparams)

        def negated_right(line: LTTextLine) -> float:
            return -line.x1

        self._objs.sort(key=negated_right)

    def get_writing_mode(self) -> str:
        return "tb-rl"

661 

662 

# Items that may appear inside a text group: boxes or nested groups.
TextGroupElement = Union[LTTextBox, "LTTextGroup"]


class LTTextGroup(LTTextContainer[TextGroupElement]):
    """A text container built from an initial collection of boxes/groups."""

    def __init__(self, objs: Iterable[TextGroupElement]) -> None:
        super().__init__()
        # extend() routes through add(), so the bounding box grows to cover
        # every member.
        self.extend(objs)

670 

671 

class LTTextGroupLRTB(LTTextGroup):
    """Text group whose members are ordered left-to-right, top-to-bottom."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them using `laparams.boxes_flow`."""
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        boxes_flow = laparams.boxes_flow

        def reading_order(member: TextGroupElement) -> float:
            # x0 is weighted by (1 - boxes_flow); the sum of the lower and
            # upper edges is weighted by (1 + boxes_flow) and subtracted.
            return (1 - boxes_flow) * member.x0 - (1 + boxes_flow) * (
                member.y0 + member.y1
            )

        # reorder the objects from top-left to bottom-right.
        self._objs.sort(key=reading_order)

682 

683 

class LTTextGroupTBRL(LTTextGroup):
    """Text group whose members are ordered top-to-bottom, right-to-left."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them using `laparams.boxes_flow`."""
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        boxes_flow = laparams.boxes_flow

        def reading_order(member: TextGroupElement) -> float:
            # The sum of the left and right edges is weighted by
            # (1 + boxes_flow) and negated; y1 is weighted by
            # (1 - boxes_flow) and subtracted.
            return -(1 + boxes_flow) * (member.x0 + member.x1) - (
                1 - boxes_flow
            ) * member.y1

        # reorder the objects from top-right to bottom-left.
        self._objs.sort(key=reading_order)

694 

695 

class LTLayoutContainer(LTContainer[LTComponent]):
    """Container that groups its characters into lines, boxes and groups."""

    def __init__(self, bbox: Rect) -> None:
        LTContainer.__init__(self, bbox)
        # Set by analyze() when laparams.boxes_flow is not None.
        self.groups: Optional[List[LTTextGroup]] = None

    # group_objects: group text object to textlines.
    def group_objects(
        self,
        laparams: LAParams,
        objs: Iterable[LTComponent],
    ) -> Iterator[LTTextLine]:
        """Group consecutive objects into text lines.

        Each object is compared with its predecessor only. Yields nothing
        when `objs` is empty.

        :param laparams: supplies line_overlap, char_margin, word_margin and
            detect_vertical.
        :param objs: the objects to group, in the order to compare them.
        :return: an iterator over the resulting text lines.
        """
        obj0 = None
        line = None
        for obj1 in objs:
            if obj0 is not None:
                # halign: obj0 and obj1 is horizontally aligned.
                #
                #   +------+ - - -
                #   | obj0 | - - +------+   -
                #   |      |     | obj1 |   | (line_overlap)
                #   +------+ - - |      |   -
                #          - - - +------+
                #
                #          |<--->|
                #        (char_margin)
                halign = (
                    obj0.is_voverlap(obj1)
                    and min(obj0.height, obj1.height) * laparams.line_overlap
                    < obj0.voverlap(obj1)
                    and obj0.hdistance(obj1)
                    < max(obj0.width, obj1.width) * laparams.char_margin
                )

                # valign: obj0 and obj1 is vertically aligned.
                #
                #   +------+
                #   | obj0 |
                #   |      |
                #   +------+ - - -
                #     |    |     | (char_margin)
                #     +------+ - -
                #     | obj1 |
                #     |      |
                #     +------+
                #
                #     |<-->|
                #   (line_overlap)
                valign = (
                    laparams.detect_vertical
                    and obj0.is_hoverlap(obj1)
                    and min(obj0.width, obj1.width) * laparams.line_overlap
                    < obj0.hoverlap(obj1)
                    and obj0.vdistance(obj1)
                    < max(obj0.height, obj1.height) * laparams.char_margin
                )

                if (halign and isinstance(line, LTTextLineHorizontal)) or (
                    valign and isinstance(line, LTTextLineVertical)
                ):
                    # obj1 continues the line currently being built.
                    line.add(obj1)
                elif line is not None:
                    # obj1 does not fit the open line: emit the line.
                    yield line
                    line = None
                elif valign and not halign:
                    line = LTTextLineVertical(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                elif halign and not valign:
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                else:
                    # obj0 stands alone: emit it as a one-object line.
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    yield line
                    line = None
            obj0 = obj1
        if line is None:
            if obj0 is None:
                # `objs` was empty, so there is no line to emit.
                return
            line = LTTextLineHorizontal(laparams.word_margin)
            line.add(obj0)
        yield line

    def group_textlines(
        self,
        laparams: LAParams,
        lines: Iterable[LTTextLine],
    ) -> Iterator[LTTextBox]:
        """Group neighboring lines to textboxes

        :param laparams: supplies line_margin.
        :param lines: the text lines to group; any iterable is accepted.
        :return: an iterator over the non-empty text boxes, each yielded once.
        """
        # `lines` is traversed three times below, so materialise it once;
        # otherwise a generator would be exhausted after the first pass.
        lines = list(lines)
        plane: Plane[LTTextLine] = Plane(self.bbox)
        plane.extend(lines)
        boxes: Dict[LTTextLine, LTTextBox] = {}
        for line in lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            members = [line]
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    # Absorb the lines of the box the neighbor belonged to.
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box: LTTextBox = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box
        done: Set[LTTextBox] = set()
        for line in lines:
            if line not in boxes:
                continue
            box = boxes[line]
            if box in done:
                continue
            done.add(box)
            if not box.is_empty():
                yield box

    def group_textboxes(
        self,
        laparams: LAParams,
        boxes: Sequence[LTTextBox],
    ) -> List[LTTextGroup]:
        """Group textboxes hierarchically.

        Get pair-wise distances, via dist func defined below, and then merge
        from the closest textbox pair. Once obj1 and obj2 are merged /
        grouped, the resulting group is considered as a new object, and its
        distances to other objects & groups are added to the process queue.

        For performance reason, pair-wise distances and object pair info are
        maintained in a heap of
        (skip_isany, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures
        quick access to the smallest element. Note that since comparison
        operators, e.g., __lt__, are disabled for LTComponent, id(obj) has to
        appear before obj in element tuples.

        :param laparams: LAParams object.
        :param boxes: All textbox objects to be grouped.
        :return: the objects left in the plane once the queue is exhausted.
        """
        ElementT = Union[LTTextBox, LTTextGroup]
        plane: Plane[ElementT] = Plane(self.bbox)

        def dist(obj1: LTComponent, obj2: LTComponent) -> float:
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            return (
                (x1 - x0) * (y1 - y0)
                - obj1.width * obj1.height
                - obj2.width * obj2.height
            )

        def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]:
            """Check if there's any other object between obj1 and obj2."""
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            objs = set(plane.find((x0, y0, x1, y1)))
            return objs.difference((obj1, obj2))

        # Seed the heap with every unordered pair of boxes.
        dists: List[Tuple[bool, float, int, int, ElementT, ElementT]] = []
        for i, box1 in enumerate(boxes):
            for j in range(i + 1, len(boxes)):
                box2 = boxes[j]
                dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
        heapq.heapify(dists)

        plane.extend(boxes)
        done: Set[int] = set()
        while dists:
            (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
            # Skip objects that are already merged
            if id1 in done or id2 in done:
                continue
            if not skip_isany and isany(obj1, obj2):
                # Something lies between the pair: requeue it with the flag
                # set. True sorts after False, so every unflagged pair is
                # tried first.
                heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                continue
            vertical_types = (LTTextBoxVertical, LTTextGroupTBRL)
            if isinstance(obj1, vertical_types) or isinstance(obj2, vertical_types):
                group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
            else:
                group = LTTextGroupLRTB([obj1, obj2])
            plane.remove(obj1)
            plane.remove(obj2)
            done.update([id1, id2])

            # Queue the new group against everything still in the plane.
            for other in plane:
                heapq.heappush(
                    dists,
                    (False, dist(group, other), id(group), id(other), group, other),
                )
            plane.add(group)
        # By now only groups are in the plane
        return [cast(LTTextGroup, g) for g in plane]

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the children and regroup the characters into text boxes.

        Afterwards the children are the text boxes, followed by the
        non-character objects, followed by the empty text lines.
        """
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
        for obj in otherobjs:
            obj.analyze(laparams)
        if not textobjs:
            return
        textlines = list(self.group_objects(laparams, textobjs))
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        for obj in empties:
            obj.analyze(laparams)
        textboxes = list(self.group_textlines(laparams, textlines))
        if laparams.boxes_flow is None:
            # No hierarchical grouping: order the boxes by position only.
            for textbox in textboxes:
                textbox.analyze(laparams)

            def getkey(box: LTTextBox) -> Tuple[int, float, float]:
                if isinstance(box, LTTextBoxVertical):
                    return (0, -box.x1, -box.y0)
                else:
                    return (1, -box.y0, box.x0)

            textboxes.sort(key=getkey)
        else:
            self.groups = self.group_textboxes(laparams, textboxes)
            assigner = IndexAssigner()
            for group in self.groups:
                group.analyze(laparams)
                assigner.run(group)
            textboxes.sort(key=lambda box: box.index)
        self._objs = (
            cast(List[LTComponent], textboxes)
            + otherobjs
            + cast(List[LTComponent], empties)
        )

942 

943 

class LTFigure(LTLayoutContainer):
    """Represents an area used by PDF Form objects.

    PDF Forms can be used to present figures or pictures by embedding yet
    another PDF document within a page. Note that LTFigure objects can appear
    recursively.
    """

    def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        self.name = name
        self.matrix = matrix
        # `bbox` arrives as (x, y, width, height); turn it into corner form
        # and transform it with `matrix` before storing it.
        (x, y, w, h) = bbox
        corners = (x, y, x + w, y + h)
        LTLayoutContainer.__init__(self, apply_matrix_rect(matrix, corners))

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name}({self.name}) {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)}>"

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis only when `laparams.all_texts` is set."""
        if laparams.all_texts:
            LTLayoutContainer.analyze(self, laparams)

967 

968 

969class LTPage(LTLayoutContainer): 

970 """Represents an entire page. 

971 

972 Like any other LTLayoutContainer, an LTPage can be iterated to obtain child 

973 objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and LTLine. 

974 """ 

975 

976 def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None: 

977 LTLayoutContainer.__init__(self, bbox) 

978 self.pageid = pageid 

979 self.rotate = rotate 

980 

981 def __repr__(self) -> str: 

982 return f"<{self.__class__.__name__}({self.pageid!r}) {bbox2str(self.bbox)} rotate={self.rotate!r}>"