Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/layout.py: 94%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

449 statements  

1import heapq 

2import logging 

3from typing import ( 

4 Dict, 

5 Generic, 

6 Iterable, 

7 Iterator, 

8 List, 

9 Optional, 

10 Sequence, 

11 Set, 

12 Tuple, 

13 TypeVar, 

14 Union, 

15 cast, 

16) 

17 

18from pdfminer.pdfcolor import PDFColorSpace 

19from pdfminer.pdfexceptions import PDFTypeError, PDFValueError 

20from pdfminer.pdffont import PDFFont 

21from pdfminer.pdfinterp import Color, PDFGraphicState 

22from pdfminer.pdftypes import PDFStream 

23from pdfminer.utils import ( 

24 INF, 

25 LTComponentT, 

26 Matrix, 

27 PathSegment, 

28 Plane, 

29 Point, 

30 Rect, 

31 apply_matrix_pt, 

32 bbox2str, 

33 fsplit, 

34 get_bound, 

35 matrix2str, 

36 uniq, 

37) 

38 

39logger = logging.getLogger(__name__) 

40 

41 

class IndexAssigner:
    """Hand out consecutive ``index`` values to the text boxes of a layout tree."""

    def __init__(self, index: int = 0) -> None:
        # Value that will be given to the next text box encountered.
        self.index = index

    def run(self, obj: "LTItem") -> None:
        """Number `obj`, or every text box nested below it.

        An ``LTTextBox`` receives the current counter value and the counter
        is then incremented.  An ``LTTextGroup`` is walked recursively in
        iteration order.  Any other kind of object is left untouched.
        """
        if isinstance(obj, LTTextBox):
            obj.index = self.index
            self.index += 1
            return
        if isinstance(obj, LTTextGroup):
            for child in obj:
                self.run(child)

53 

54 

class LAParams:
    """Tunable parameters that steer the layout analysis.

    :param line_overlap: Two characters are considered to be on the same
        line when they overlap by more than this amount.  The overlap is
        relative to the smaller height of the two characters.
    :param char_margin: Two characters closer together than this margin are
        considered part of the same line.  The margin is relative to the
        width of the character.
    :param line_margin: Two lines that are close together are considered to
        be part of the same paragraph.  The margin is relative to the height
        of a line.
    :param word_margin: Two characters on the same line that are further
        apart than this margin are treated as separate words, and an
        intermediate space is inserted for readability.  The margin is
        relative to the width of the character.
    :param boxes_flow: How much the horizontal and the vertical position of
        a text matter when ordering text boxes.  Expected range is -1.0
        (only horizontal position matters) to +1.0 (only vertical position
        matters).  Pass `None` to disable advanced layout analysis and
        instead order text by the bottom left corner of each text box.
    :param detect_vertical: Whether vertical text should be considered
        during layout analysis.
    :param all_texts: Whether layout analysis should also be performed on
        text inside figures.
    :raises PDFTypeError: if `boxes_flow` is neither `None` nor a number.
    :raises PDFValueError: if `boxes_flow` is a number outside [-1, +1].
    """

    def __init__(
        self,
        line_overlap: float = 0.5,
        char_margin: float = 2.0,
        line_margin: float = 0.5,
        word_margin: float = 0.1,
        boxes_flow: Optional[float] = 0.5,
        detect_vertical: bool = False,
        all_texts: bool = False,
    ) -> None:
        self.line_overlap = line_overlap
        self.char_margin = char_margin
        self.line_margin = line_margin
        self.word_margin = word_margin
        self.boxes_flow = boxes_flow
        self.detect_vertical = detect_vertical
        self.all_texts = all_texts

        self._validate()

    def _validate(self) -> None:
        """Check that ``boxes_flow`` is `None` or a number within [-1, +1]."""
        flow = self.boxes_flow
        if flow is None:
            return
        boxes_flow_err_msg = (
            "LAParam boxes_flow should be None, or a number between -1 and +1"
        )
        if not isinstance(flow, (int, float)):
            raise PDFTypeError(boxes_flow_err_msg)
        # Written as a negated chained comparison so that NaN is rejected too.
        if not (-1 <= flow <= 1):
            raise PDFValueError(boxes_flow_err_msg)

    def __repr__(self) -> str:
        # Only the margins and all_texts are part of the representation.
        template = (
            "<LAParams: char_margin=%.1f, line_margin=%.1f, "
            "word_margin=%.1f all_texts=%r>"
        )
        values = (
            self.char_margin,
            self.line_margin,
            self.word_margin,
            self.all_texts,
        )
        return template % values

121 

122 

class LTItem:
    """Base interface for layout objects that can be analyzed."""

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis on this item.

        The base implementation does nothing; subclasses override it.
        """

128 

129 

class LTText:
    """Mixin interface for layout objects that carry text."""

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        text = self.get_text()
        return f"<{cls_name} {text!r}>"

    def get_text(self) -> str:
        """Return the text held by this object; subclasses must override."""
        raise NotImplementedError

139 

140 

class LTComponent(LTItem):
    """Layout object that occupies a rectangular bounding box.

    The box is stored both as the tuple ``bbox`` and as the individual
    attributes ``x0``, ``y0``, ``x1``, ``y1``, ``width`` and ``height``.
    """

    def __init__(self, bbox: Rect) -> None:
        LTItem.__init__(self)
        self.set_bbox(bbox)

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name} {bbox2str(self.bbox)}>"

    # Ordering comparisons are deliberately disabled: every one of them
    # raises PDFValueError.
    def __lt__(self, _: object) -> bool:
        raise PDFValueError

    def __le__(self, _: object) -> bool:
        raise PDFValueError

    def __gt__(self, _: object) -> bool:
        raise PDFValueError

    def __ge__(self, _: object) -> bool:
        raise PDFValueError

    def set_bbox(self, bbox: Rect) -> None:
        """Store `bbox` and refresh the derived coordinate attributes."""
        left, bottom, right, top = bbox
        self.x0, self.y0 = left, bottom
        self.x1, self.y1 = right, top
        self.width = right - left
        self.height = top - bottom
        self.bbox = bbox

    def is_empty(self) -> bool:
        """Whether the box has no positive width or no positive height."""
        return self.width <= 0 or self.height <= 0

    def is_hoverlap(self, obj: "LTComponent") -> bool:
        """Whether the x-ranges of `self` and `obj` touch or intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.x0 <= self.x1 and self.x0 <= obj.x1

    def hdistance(self, obj: "LTComponent") -> float:
        """Horizontal gap to `obj`; 0 when the x-ranges overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def hoverlap(self, obj: "LTComponent") -> float:
        """Horizontal overlap measure with `obj`; 0 when they do not overlap.

        The value is the smaller of the two opposite-edge distances.  When
        one x-range lies strictly inside the other, this is larger than the
        length of the intersection.
        """
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def is_voverlap(self, obj: "LTComponent") -> bool:
        """Whether the y-ranges of `self` and `obj` touch or intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.y0 <= self.y1 and self.y0 <= obj.y1

    def vdistance(self, obj: "LTComponent") -> float:
        """Vertical gap to `obj`; 0 when the y-ranges overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_voverlap(obj):
            return 0
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))

    def voverlap(self, obj: "LTComponent") -> float:
        """Vertical overlap measure with `obj`; 0 when they do not overlap.

        The value is the smaller of the two opposite-edge distances.  When
        one y-range lies strictly inside the other, this is larger than the
        length of the intersection.
        """
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_voverlap(obj):
            return 0
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))

212 

213 

class LTCurve(LTComponent):
    """A generic Bezier curve.

    `original_path` holds the original path information from the PDF
    (e.g. for reconstructing Bezier curves).

    `dashing_style` holds the dashing information, if any.

    The bounding box is computed from `pts` with ``get_bound``.
    """

    def __init__(
        self,
        linewidth: float,
        pts: List[Point],
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Optional[Color] = None,
        non_stroking_color: Optional[Color] = None,
        original_path: Optional[List[PathSegment]] = None,
        dashing_style: Optional[Tuple[object, object]] = None,
    ) -> None:
        LTComponent.__init__(self, get_bound(pts))
        # Geometry.
        self.pts = pts
        self.original_path = original_path
        # Painting attributes.
        self.linewidth = linewidth
        self.stroke = stroke
        self.fill = fill
        self.evenodd = evenodd
        self.stroking_color = stroking_color
        self.non_stroking_color = non_stroking_color
        self.dashing_style = dashing_style

    def get_pts(self) -> str:
        """Return the points as one comma-separated string of x,y values."""
        formatted = ["%.3f,%.3f" % point for point in self.pts]
        return ",".join(formatted)

248 

249 

class LTLine(LTCurve):
    """A single straight line running from `p0` to `p1`.

    Could be used for separating text or figures.
    """

    def __init__(
        self,
        linewidth: float,
        p0: Point,
        p1: Point,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Optional[Color] = None,
        non_stroking_color: Optional[Color] = None,
        original_path: Optional[List[PathSegment]] = None,
        dashing_style: Optional[Tuple[object, object]] = None,
    ) -> None:
        # The line is a curve made of exactly its two end points.
        end_points = [p0, p1]
        LTCurve.__init__(
            self,
            linewidth,
            end_points,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )

281 

282 

class LTRect(LTCurve):
    """A rectangle described by its bounding box.

    Could be used for framing other pictures or figures.
    """

    def __init__(
        self,
        linewidth: float,
        bbox: Rect,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Optional[Color] = None,
        non_stroking_color: Optional[Color] = None,
        original_path: Optional[List[PathSegment]] = None,
        dashing_style: Optional[Tuple[object, object]] = None,
    ) -> None:
        (x0, y0, x1, y1) = bbox
        # Corner points, starting at (x0, y0) and walking around the box.
        corners = [(x0, y0), (x1, y0), (x1, y1), (x0, y1)]
        LTCurve.__init__(
            self,
            linewidth,
            corners,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )

314 

315 

class LTImage(LTComponent):
    """An image object.

    Embedded images can be in JPEG, Bitmap or JBIG2.

    The image properties are read from `stream` via ``get_any``, trying the
    abbreviated key first and the long key second.
    """

    def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        self.name = name
        self.stream = stream
        src_width = stream.get_any(("W", "Width"))
        src_height = stream.get_any(("H", "Height"))
        self.srcsize = (src_width, src_height)
        self.imagemask = stream.get_any(("IM", "ImageMask"))
        # Bits per component falls back to 1 when the stream has no entry.
        self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1)
        colorspace = stream.get_any(("CS", "ColorSpace"))
        # Always expose the colour space as a list.
        self.colorspace = colorspace
        if not isinstance(self.colorspace, list):
            self.colorspace = [self.colorspace]

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name}({self.name}) {bbox2str(self.bbox)} {self.srcsize!r}>"

335 

336 

class LTAnno(LTItem, LTText):
    """Actual letter in the text as a Unicode string.

    Unlike an LTChar object, an LTAnno object has no boundaries: it is a
    "virtual" character inserted by the layout analyzer according to the
    relationship between two characters (e.g. a space).
    """

    def __init__(self, text: str) -> None:
        # The virtual character(s) represented by this annotation.
        self._text = text

    def get_text(self) -> str:
        """Return the text given at construction."""
        return self._text

350 

351 

class LTChar(LTComponent, LTText):
    """Actual letter in the text as a Unicode string.

    The bounding box is built in text space from the font metrics and then
    mapped through `matrix`.  Derived attributes:

    * ``adv``: ``textwidth * fontsize * scaling``.
    * ``upright``: whether ``a * d * scaling > 0 and b * c <= 0`` holds for
      the matrix components.
    * ``size``: box width for vertical fonts, box height otherwise.
    """

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: Union[float, Tuple[Optional[float], float]],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> None:
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        self.adv = textwidth * fontsize * scaling

        # Two opposite corners of the glyph box, before the matrix is applied.
        if font.is_vertical():
            # Vertical writing: textdisp must be a (vx, vy) displacement pair.
            assert isinstance(textdisp, tuple)
            (vx, vy) = textdisp
            # A missing vx defaults to half the font size; otherwise both
            # displacements are scaled by fontsize / 1000.
            vx = fontsize * 0.5 if vx is None else vx * fontsize * 0.001
            vy = (1000 - vy) * fontsize * 0.001
            bbox_lower_left = (-vx, vy + rise + self.adv)
            bbox_upper_right = (-vx + fontsize, vy + rise)
        else:
            # Horizontal writing: the box spans the advance width and one
            # font size in height, shifted by descent and rise.
            descent = font.get_descent() * fontsize
            bbox_lower_left = (0, descent + rise)
            bbox_upper_right = (self.adv, descent + rise + fontsize)

        (a, b, c, d, e, f) = self.matrix
        self.upright = a * d * scaling > 0 and b * c <= 0

        (x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
        (x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
        # Normalise so that (x0, y0) is the minimum corner after the transform.
        x0, x1 = (x1, x0) if x1 < x0 else (x0, x1)
        y0, y1 = (y1, y0) if y1 < y0 else (y0, y1)
        LTComponent.__init__(self, (x0, y0, x1, y1))

        if font.is_vertical():
            self.size = self.width
        else:
            self.size = self.height

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name} {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)} font={self.fontname!r} adv={self.adv} text={self.get_text()!r}>"

    def get_text(self) -> str:
        """Return the character text given at construction."""
        return self._text

411 

412 

# Type of the children held by a container.
LTItemT = TypeVar("LTItemT", bound=LTItem)


class LTContainer(LTComponent, Generic[LTItemT]):
    """Object that can be extended and analyzed.

    Children are kept in insertion order in ``_objs``.
    """

    def __init__(self, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        self._objs: List[LTItemT] = []

    def __iter__(self) -> Iterator[LTItemT]:
        return iter(self._objs)

    def __len__(self) -> int:
        return len(self._objs)

    def add(self, obj: LTItemT) -> None:
        """Append a single child."""
        self._objs.append(obj)

    def extend(self, objs: Iterable[LTItemT]) -> None:
        """Add every object of `objs`, one at a time, through ``add``."""
        # Go through self.add so that overriding subclasses see each child.
        for item in objs:
            self.add(item)

    def analyze(self, laparams: LAParams) -> None:
        """Analyze every child in order."""
        for child in self._objs:
            child.analyze(laparams)

439 

440 

class LTExpandableContainer(LTContainer[LTItemT]):
    """Container whose bounding box grows to enclose every added child."""

    def __init__(self) -> None:
        # Start from an inverted, infinite box so that the first child added
        # defines the bounds through the min/max in add().
        LTContainer.__init__(self, (+INF, +INF, -INF, -INF))

    # Incompatible override: we take an LTComponent (with bounding box), but
    # super() LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj` and widen the bounding box to include it."""
        LTContainer.add(self, cast(LTItemT, obj))
        grown = (
            min(self.x0, obj.x0),
            min(self.y0, obj.y0),
            max(self.x1, obj.x1),
            max(self.y1, obj.y1),
        )
        self.set_bbox(grown)

457 

458 

class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
    """Expandable container whose text is the concatenation of its children's."""

    def __init__(self) -> None:
        LTText.__init__(self)
        LTExpandableContainer.__init__(self)

    def get_text(self) -> str:
        """Join the text of every child that is an ``LTText``, in order."""
        pieces = (
            cast(LTText, child).get_text()
            for child in self
            if isinstance(child, LTText)
        )
        return "".join(pieces)

468 

469 

# The kinds of children a text line can hold.
TextLineElement = Union[LTChar, LTAnno]


class LTTextLine(LTTextContainer[TextLineElement]):
    """Contains a list of LTChar objects that represent a single text line.

    The characters are aligned either horizontally or vertically, depending
    on the text's writing mode.
    """

    def __init__(self, word_margin: float) -> None:
        super().__init__()
        # Used by the subclasses to decide when to insert a space.
        self.word_margin = word_margin

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name} {bbox2str(self.bbox)} {self.get_text()!r}>"

    def analyze(self, laparams: LAParams) -> None:
        """Analyze every child, then terminate the line with a newline."""
        for child in self._objs:
            child.analyze(laparams)
        # Added through LTContainer.add directly, so the bounding box is not
        # touched (the LTAnno has none).
        newline = LTAnno("\n")
        LTContainer.add(self, newline)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List["LTTextLine"]:
        """Return the lines in `plane` close to this one; subclasses implement."""
        raise NotImplementedError

    def is_empty(self) -> bool:
        """Whether the box is degenerate or the text is whitespace only."""
        if super().is_empty():
            return True
        return self.get_text().isspace()

501 

502 

class LTTextLineHorizontal(LTTextLine):
    """A text line whose characters run horizontally."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Right edge of the most recently added object; +INF until the
        # first one, so no space is ever inserted before it.
        self._x1: float = +INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj`, first inserting a space if it starts a new word.

        A space is inserted when `obj` is an LTChar, ``word_margin`` is
        truthy, and the gap to the previous object exceeds ``word_margin``
        times the larger of the character's width and height.
        """
        if isinstance(obj, LTChar) and self.word_margin:
            gap_limit = self.word_margin * max(obj.width, obj.height)
            if self._x1 < obj.x0 - gap_limit:
                LTContainer.add(self, LTAnno(" "))
        self._x1 = obj.x1
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List[LTTextLine]:
        """Find neighboring LTTextLineHorizontals in the plane.

        Returns the LTTextLineHorizontals found in the plane that are close
        to self.  "Close" is controlled by `ratio`: the search area is the
        line's box extended up and down by ``ratio * height``, and the same
        distance is the tolerance of the checks.  The returned objects have
        the same height as self and are left-, right- or centrally aligned
        with it, each within that tolerance.
        """
        tolerance = ratio * self.height
        search_area = (
            self.x0,
            self.y0 - tolerance,
            self.x1,
            self.y1 + tolerance,
        )
        candidates = plane.find(search_area)
        return [
            other
            for other in candidates
            if (
                isinstance(other, LTTextLineHorizontal)
                and self._is_same_height_as(other, tolerance=tolerance)
                and (
                    self._is_left_aligned_with(other, tolerance=tolerance)
                    or self._is_right_aligned_with(other, tolerance=tolerance)
                    or self._is_centrally_aligned_with(other, tolerance=tolerance)
                )
            )
        ]

    def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the left-hand edge of `other` is within `tolerance`."""
        offset = other.x0 - self.x0
        return abs(offset) <= tolerance

    def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the right-hand edge of `other` is within `tolerance`."""
        offset = other.x1 - self.x1
        return abs(offset) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the horizontal center of `other` is within `tolerance`."""
        offset = (other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2
        return abs(offset) <= tolerance

    def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the height of `other` differs by at most `tolerance`."""
        difference = other.height - self.height
        return abs(difference) <= tolerance

564 

565 

class LTTextLineVertical(LTTextLine):
    """A text line whose characters run vertically."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Lower edge of the most recently added object; -INF until the
        # first one, so no space is ever inserted before it.
        self._y0: float = -INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj`, first inserting a space if it starts a new word.

        A space is inserted when `obj` is an LTChar, ``word_margin`` is
        truthy, and the top of `obj` lies below the previous object by more
        than ``word_margin`` times the larger of the character's width and
        height.
        """
        if isinstance(obj, LTChar) and self.word_margin:
            margin = self.word_margin * max(obj.width, obj.height)
            if obj.y1 + margin < self._y0:
                LTContainer.add(self, LTAnno(" "))
        self._y0 = obj.y0
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List[LTTextLine]:
        """Find neighboring LTTextLineVerticals in the plane.

        Returns the LTTextLineVerticals found in the plane that are close to
        self.  "Close" is controlled by `ratio`: the search area is the
        line's box extended left and right by ``ratio * width``, and the
        same distance is the tolerance of the checks.  The returned objects
        have the same width as self and are lower-, upper- or centrally
        aligned with it, each within that tolerance.
        """
        d = ratio * self.width
        objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
        return [
            obj
            for obj in objs
            if (
                isinstance(obj, LTTextLineVertical)
                and self._is_same_width_as(obj, tolerance=d)
                and (
                    self._is_lower_aligned_with(obj, tolerance=d)
                    or self._is_upper_aligned_with(obj, tolerance=d)
                    or self._is_centrally_aligned_with(obj, tolerance=d)
                )
            )
        ]

    def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the lower edge of `other` is within `tolerance`."""
        return abs(other.y0 - self.y0) <= tolerance

    def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the upper edge of `other` is within `tolerance`."""
        return abs(other.y1 - self.y1) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the vertical center of `other` is within `tolerance`."""
        return abs((other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance

    # `tolerance` defaults to 0, like every other alignment helper here and
    # like LTTextLineHorizontal._is_same_height_as.
    def _is_same_width_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the width of `other` differs by at most `tolerance`."""
        return abs(other.width - self.width) <= tolerance

627 

628 

class LTTextBox(LTTextContainer[LTTextLine]):
    """Represents a group of text chunks in a rectangular area.

    The box is the result of geometric analysis and does not necessarily
    represent a logical boundary of the text.  It contains a list of
    LTTextLine objects.
    """

    def __init__(self) -> None:
        LTTextContainer.__init__(self)
        # Position in reading order; -1 until an index has been assigned.
        self.index: int = -1

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name}({self.index}) {bbox2str(self.bbox)} {self.get_text()!r}>"

    def get_writing_mode(self) -> str:
        """Return the writing mode of the box; subclasses implement."""
        raise NotImplementedError

646 

647 

class LTTextBoxHorizontal(LTTextBox):
    """Text box made of horizontal lines."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them by descending top edge."""
        super().analyze(laparams)
        self._objs.sort(key=lambda line: -line.y1)

    def get_writing_mode(self) -> str:
        """Return ``"lr-tb"``."""
        return "lr-tb"

655 

656 

class LTTextBoxVertical(LTTextBox):
    """Text box made of vertical lines."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them by descending right edge."""
        super().analyze(laparams)
        self._objs.sort(key=lambda line: -line.x1)

    def get_writing_mode(self) -> str:
        """Return ``"tb-rl"``."""
        return "tb-rl"

664 

665 

# A text group holds text boxes and/or nested text groups.
TextGroupElement = Union[LTTextBox, "LTTextGroup"]


class LTTextGroup(LTTextContainer[TextGroupElement]):
    """Container grouping text boxes and other text groups."""

    def __init__(self, objs: Iterable[TextGroupElement]) -> None:
        super().__init__()
        # extend() adds one member at a time, growing the bounding box.
        self.extend(objs)

673 

674 

class LTTextGroupLRTB(LTTextGroup):
    """Text group whose members are ordered from top-left to bottom-right."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them using ``laparams.boxes_flow``.

        ``boxes_flow`` must not be `None` here.
        """
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        boxes_flow = laparams.boxes_flow

        def reading_order(member: TextGroupElement) -> float:
            # Smaller key sorts first: weighs the left edge against the
            # sum of the lower and upper edges.
            return (1 - boxes_flow) * member.x0 - (1 + boxes_flow) * (
                member.y0 + member.y1
            )

        # reorder the objects from top-left to bottom-right.
        self._objs.sort(key=reading_order)

685 

686 

class LTTextGroupTBRL(LTTextGroup):
    """Text group whose members are ordered from top-right to bottom-left."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them using ``laparams.boxes_flow``.

        ``boxes_flow`` must not be `None` here.
        """
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        boxes_flow = laparams.boxes_flow

        def reading_order(member: TextGroupElement) -> float:
            # Smaller key sorts first: weighs the sum of the left and right
            # edges against the upper edge.
            return (
                -(1 + boxes_flow) * (member.x0 + member.x1)
                - (1 - boxes_flow) * member.y1
            )

        # reorder the objects from top-right to bottom-left.
        self._objs.sort(key=reading_order)

697 

698 

class LTLayoutContainer(LTContainer[LTComponent]):
    """Container that performs layout analysis on the characters it holds.

    ``analyze`` groups the LTChar children into text lines, the lines into
    text boxes and, unless ``boxes_flow`` is `None`, the boxes into a
    hierarchy of text groups stored in ``groups``.
    """

    def __init__(self, bbox: Rect) -> None:
        LTContainer.__init__(self, bbox)
        # Set by analyze() when boxes_flow is not None.
        self.groups: Optional[List[LTTextGroup]] = None

    # group_objects: group text object to textlines.
    def group_objects(
        self,
        laparams: LAParams,
        objs: Iterable[LTComponent],
    ) -> Iterator[LTTextLine]:
        """Group consecutive objects into text lines.

        Each object is compared with its predecessor only.  Objects that are
        horizontally (or, with ``detect_vertical``, vertically) aligned with
        their predecessor are appended to the current line; otherwise the
        current line is yielded and a new one is started.

        :param laparams: supplies ``line_overlap``, ``char_margin``,
            ``word_margin`` and ``detect_vertical``.
        :param objs: the objects to group, in order.
        :return: iterator over the resulting lines; empty if `objs` is empty.
        """
        obj0 = None
        line = None
        for obj1 in objs:
            if obj0 is not None:
                # halign: obj0 and obj1 is horizontally aligned.
                #
                #   +------+ - - -
                #   | obj0 | - - +------+   -
                #   |      |     | obj1 |   | (line_overlap)
                #   +------+ - - |      |   -
                #          - - - +------+
                #
                #          |<--->|
                #        (char_margin)
                halign = (
                    obj0.is_voverlap(obj1)
                    and min(obj0.height, obj1.height) * laparams.line_overlap
                    < obj0.voverlap(obj1)
                    and obj0.hdistance(obj1)
                    < max(obj0.width, obj1.width) * laparams.char_margin
                )

                # valign: obj0 and obj1 is vertically aligned.
                #
                #   +------+
                #   | obj0 |
                #   |      |
                #   +------+ - - -
                #     |    |     | (char_margin)
                #     +------+ - -
                #     | obj1 |
                #     |      |
                #     +------+
                #
                #     |<-->|
                #   (line_overlap)
                valign = (
                    laparams.detect_vertical
                    and obj0.is_hoverlap(obj1)
                    and min(obj0.width, obj1.width) * laparams.line_overlap
                    < obj0.hoverlap(obj1)
                    and obj0.vdistance(obj1)
                    < max(obj0.height, obj1.height) * laparams.char_margin
                )

                if (halign and isinstance(line, LTTextLineHorizontal)) or (
                    valign and isinstance(line, LTTextLineVertical)
                ):
                    # obj1 continues the current line.
                    line.add(obj1)
                elif line is not None:
                    # obj1 does not fit the current line: finish that line.
                    yield line
                    line = None
                elif valign and not halign:
                    line = LTTextLineVertical(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                elif halign and not valign:
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                else:
                    # obj0 stands alone on a line of its own.
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    yield line
                    line = None
            obj0 = obj1
        if line is None:
            if obj0 is None:
                # Empty input: there is nothing to group.
                return
            # The last object was not attached to any line yet.
            line = LTTextLineHorizontal(laparams.word_margin)
            line.add(obj0)
        yield line

    def group_textlines(
        self,
        laparams: LAParams,
        lines: Iterable[LTTextLine],
    ) -> Iterator[LTTextBox]:
        """Group neighboring lines to textboxes.

        :param laparams: supplies ``line_margin``, passed to each line's
            ``find_neighbors`` as the ratio.
        :param lines: the lines to group; any iterable is accepted.
        :return: iterator over the non-empty text boxes.
        """
        # `lines` is traversed three times below, so a one-shot iterable
        # (e.g. a generator) has to be materialized first.
        lines = list(lines)
        plane: Plane[LTTextLine] = Plane(self.bbox)
        plane.extend(lines)
        # Maps every line seen so far to the box that currently holds it.
        boxes: Dict[LTTextLine, LTTextBox] = {}
        for line in lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            members = [line]
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    # Pull in the lines of the box the neighbor belonged to.
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box: LTTextBox = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box

        # Yield each distinct box once, in order of first appearance.
        done = set()
        for line in lines:
            if line not in boxes:
                continue
            box = boxes[line]
            if box in done:
                continue
            done.add(box)
            if not box.is_empty():
                yield box

    def group_textboxes(
        self,
        laparams: LAParams,
        boxes: Sequence[LTTextBox],
    ) -> List[LTTextGroup]:
        """Group textboxes hierarchically.

        Get pair-wise distances, via dist func defined below, and then merge
        from the closest textbox pair. Once obj1 and obj2 are merged /
        grouped, the resulting group is considered as a new object, and its
        distances to other objects & groups are added to the process queue.

        For performance reason, pair-wise distances and object pair info are
        maintained in a heap of
        (skip_isany, dist, id(obj1), id(obj2), obj1, obj2) tuples. It
        ensures quick access to the smallest element. Note that since
        comparison operators, e.g., __lt__, are disabled for LTComponent,
        id(obj) has to appear before obj in element tuples.

        A pair with another object inside its bounding rectangle is pushed
        back once with ``skip_isany`` set to True, so it is reconsidered
        after all pairs that have not been deferred.

        :param laparams: LAParams object. Not read by this method.
        :param boxes: All textbox objects to be grouped.
        :return: the objects left in the plane when merging ends. With two
            or more boxes this is a single element, the final top level
            group; with one box it is that box itself; with none it is
            empty.
        """
        ElementT = Union[LTTextBox, LTTextGroup]
        plane: Plane[ElementT] = Plane(self.bbox)

        def dist(obj1: LTComponent, obj2: LTComponent) -> float:
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            return (
                (x1 - x0) * (y1 - y0)
                - obj1.width * obj1.height
                - obj2.width * obj2.height
            )

        def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]:
            """Check if there's any other object between obj1 and obj2."""
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            objs = set(plane.find((x0, y0, x1, y1)))
            return objs.difference((obj1, obj2))

        dists: List[Tuple[bool, float, int, int, ElementT, ElementT]] = []
        for i in range(len(boxes)):
            box1 = boxes[i]
            for j in range(i + 1, len(boxes)):
                box2 = boxes[j]
                dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
        heapq.heapify(dists)

        plane.extend(boxes)
        # ids of objects that have already been merged into a group.
        done = set()
        while len(dists) > 0:
            (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
            # Skip objects that are already merged
            if (id1 not in done) and (id2 not in done):
                if not skip_isany and isany(obj1, obj2):
                    # Something lies between the pair: defer it once.
                    heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                    continue
                if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
                    obj2,
                    (LTTextBoxVertical, LTTextGroupTBRL),
                ):
                    group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
                else:
                    group = LTTextGroupLRTB([obj1, obj2])
                plane.remove(obj1)
                plane.remove(obj2)
                done.update([id1, id2])

                # Queue the distance from the new group to everything left.
                for other in plane:
                    heapq.heappush(
                        dists,
                        (False, dist(group, other), id(group), id(other), group, other),
                    )
                plane.add(group)
        # By now only groups are in the plane
        return [cast(LTTextGroup, g) for g in plane]

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the children and replace them by the resulting layout.

        Non-character children are analyzed individually.  When there are
        LTChar children, they are grouped into lines and boxes and the
        children become: the text boxes, then the other objects, then the
        empty lines.  Without LTChar children the child list is left as is.
        """
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
        for obj in otherobjs:
            obj.analyze(laparams)
        if not textobjs:
            return
        textlines = list(self.group_objects(laparams, textobjs))
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        for obj in empties:
            obj.analyze(laparams)
        textboxes = list(self.group_textlines(laparams, textlines))
        if laparams.boxes_flow is None:
            # Simple ordering: no grouping of boxes.
            for textbox in textboxes:
                textbox.analyze(laparams)

            def getkey(box: LTTextBox) -> Tuple[int, float, float]:
                # Vertical boxes sort before horizontal ones.
                if isinstance(box, LTTextBoxVertical):
                    return (0, -box.x1, -box.y0)
                else:
                    return (1, -box.y0, box.x0)

            textboxes.sort(key=getkey)
        else:
            self.groups = self.group_textboxes(laparams, textboxes)
            assigner = IndexAssigner()
            for group in self.groups:
                group.analyze(laparams)
                assigner.run(group)
            textboxes.sort(key=lambda box: box.index)
        self._objs = (
            cast(List[LTComponent], textboxes)
            + otherobjs
            + cast(List[LTComponent], empties)
        )

945 

946 

class LTFigure(LTLayoutContainer):
    """Represents an area used by PDF Form objects.

    PDF Forms can be used to present figures or pictures by embedding yet
    another PDF document within a page.  Note that LTFigure objects can
    appear recursively.

    `bbox` is given as ``(x, y, width, height)``; the stored bounding box is
    the bound of its four corners after `matrix` has been applied.
    """

    def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        self.name = name
        self.matrix = matrix
        (x, y, w, h) = bbox
        corners = ((x, y), (x + w, y), (x, y + h), (x + w, y + h))
        transformed = (apply_matrix_pt(matrix, (p, q)) for (p, q) in corners)
        LTLayoutContainer.__init__(self, get_bound(transformed))

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name}({self.name}) {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)}>"

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis only when ``laparams.all_texts`` is set."""
        if laparams.all_texts:
            LTLayoutContainer.analyze(self, laparams)

970 

971 

class LTPage(LTLayoutContainer):
    """Represents an entire page.

    Like any other LTLayoutContainer, an LTPage can be iterated to obtain
    child objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and
    LTLine.
    """

    def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None:
        LTLayoutContainer.__init__(self, bbox)
        # Identifier and rotation of the page, stored as given.
        self.pageid = pageid
        self.rotate = rotate

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name}({self.pageid!r}) {bbox2str(self.bbox)} rotate={self.rotate!r}>"