Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/converter.py: 30%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

472 statements  

1import io 

2import logging 

3import re 

4from collections.abc import Sequence 

5from typing import ( 

6 BinaryIO, 

7 ClassVar, 

8 Generic, 

9 TextIO, 

10 TypeVar, 

11 cast, 

12) 

13 

14from pdfminer import utils 

15from pdfminer.image import ImageWriter 

16from pdfminer.layout import ( 

17 LAParams, 

18 LTAnno, 

19 LTChar, 

20 LTComponent, 

21 LTContainer, 

22 LTCurve, 

23 LTFigure, 

24 LTImage, 

25 LTItem, 

26 LTLayoutContainer, 

27 LTLine, 

28 LTPage, 

29 LTRect, 

30 LTText, 

31 LTTextBox, 

32 LTTextBoxVertical, 

33 LTTextGroup, 

34 LTTextLine, 

35 TextGroupElement, 

36) 

37from pdfminer.pdfcolor import PDFColorSpace 

38from pdfminer.pdfdevice import PDFTextDevice 

39from pdfminer.pdfexceptions import PDFValueError 

40from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined 

41from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager 

42from pdfminer.pdfpage import PDFPage 

43from pdfminer.pdftypes import PDFStream 

44from pdfminer.utils import ( 

45 AnyIO, 

46 Matrix, 

47 PathSegment, 

48 Point, 

49 Rect, 

50 apply_matrix_pt, 

51 apply_matrix_rect, 

52 bbox2str, 

53 enc, 

54 make_compat_str, 

55 mult_matrix, 

56) 

57 

log = logging.getLogger(__name__)


class PDFLayoutAnalyzer(PDFTextDevice):
    """Text device that assembles rendered page content into layout objects.

    The rendering callbacks (``begin_page``, ``begin_figure``,
    ``render_image``, ``paint_path``, ``render_char``) add ``LT*`` items to
    ``cur_item``.  When a page ends, the finished ``LTPage`` is passed to
    :meth:`receive_layout`, which does nothing here and is meant to be
    overridden by subclasses.
    """

    # Container currently receiving items: the LTPage created by begin_page(),
    # or the innermost LTFigure between begin_figure() and end_figure().
    cur_item: LTLayoutContainer
    # Matrix applied to path points and figure matrices.  It is only read in
    # this class, never assigned here.
    ctm: Matrix

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: LAParams | None = None,
    ) -> None:
        """Create the analyzer.

        :param rsrcmgr: resource manager forwarded to ``PDFTextDevice``.
        :param pageno: id given to the first page; incremented by
            :meth:`end_page` after every page.
        :param laparams: when not ``None``, every finished page is analyzed
            with these parameters before being handed to
            :meth:`receive_layout`.
        """
        PDFTextDevice.__init__(self, rsrcmgr)
        self.pageno = pageno
        self.laparams = laparams
        # Containers suspended while a nested figure is being filled.
        self._stack: list[LTLayoutContainer] = []

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Start a new ``LTPage`` sized from the transformed media box.

        Only the width and height of the transformed box are kept; the page
        box itself always starts at (0, 0).
        """
        (x0, y0, x1, y1) = apply_matrix_rect(ctm, page.mediabox)
        mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
        self.cur_item = LTPage(self.pageno, mediabox)

    def end_page(self, page: PDFPage) -> None:
        """Finish the current page and pass it to :meth:`receive_layout`."""
        # Every begin_figure() must have been matched by an end_figure().
        assert not self._stack, str(len(self._stack))
        assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
        if self.laparams is not None:
            self.cur_item.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(self.cur_item)

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Open a nested ``LTFigure``; items are added to it until closed."""
        self._stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))

    def end_figure(self, _: str) -> None:
        """Close the current figure and add it to its parent container."""
        fig = self.cur_item
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        self.cur_item = self._stack.pop()
        self.cur_item.add(fig)

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Add an ``LTImage`` that takes its bbox from the enclosing figure."""
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        item = LTImage(
            name,
            stream,
            (self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1),
        )
        self.cur_item.add(item)

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Paint paths described in section 4.4 of the PDF reference manual.

        A path made of several subpaths is split at each ``m`` operator and
        every subpath is painted on its own.  A single subpath becomes an
        ``LTLine`` (shapes ``ml``/``mlh``), an ``LTRect`` (a closed,
        axis-aligned four-sided shape) or otherwise an ``LTCurve``.
        """
        shape = "".join(x[0] for x in path)

        if shape[:1] != "m":
            # Per PDF Reference Section 4.4.1, "path construction operators may
            # be invoked in any sequence, but the first one invoked must be m
            # or re to begin a new subpath." Since pdfminer.six already
            # converts all `re` (rectangle) operators to their equivalent
            # `mlllh` representation, paths ingested by `.paint_path(...)` that
            # do not begin with the `m` operator are invalid and are ignored.
            return

        if shape.count("m") > 1:
            # recurse if there are multiple m's in this shape
            for m in re.finditer(r"m[^m]+", shape):
                subpath = path[m.start(0) : m.end(0)]
                self.paint_path(gstate, stroke, fill, evenodd, subpath)
            return

        # Although the 'h' command does not literally provide a
        # point-position, its position is (by definition) equal to the
        # subpath's starting point.
        #
        # And, per Section 4.4's Table 4.9, all other path commands place
        # their point-position in their final two arguments. (Any preceding
        # arguments represent control points on Bézier curves.)
        raw_pts = [
            cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path
        ]
        pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]

        # Same segments as `path`, with every operand pair mapped through the
        # current matrix: (operator, (x, y), (x, y), ...).
        transformed_path = [
            cast(
                PathSegment,
                (
                    str(operation[0]),
                    *(
                        apply_matrix_pt(self.ctm, (float(operand1), float(operand2)))
                        for operand1, operand2 in zip(
                            operation[1::2], operation[2::2], strict=False
                        )
                    ),
                ),
            )
            for operation in path
        ]

        # Drop a redundant "l" on a path closed with "h"
        if len(shape) > 3 and shape[-2:] == "lh" and pts[-2] == pts[0]:
            shape = shape[:-2] + "h"
            pts.pop()

        is_rect = False
        if shape in {"mlllh", "mllll"}:
            (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts

            is_closed_loop = pts[0] == pts[4]
            has_square_coordinates = (
                x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
            ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
            is_rect = is_closed_loop and has_square_coordinates

        if shape in {"mlh", "ml"}:
            # single line segment
            #
            # Note: 'ml', in conditional above, is a frequent anomaly
            # that we want to support.
            self.cur_item.add(
                LTLine(
                    gstate.linewidth,
                    pts[0],
                    pts[1],
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    original_path=transformed_path,
                    dashing_style=gstate.dash,
                )
            )
        elif is_rect:
            self.cur_item.add(
                LTRect(
                    gstate.linewidth,
                    (*pts[0], *pts[2]),
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    transformed_path,
                    gstate.dash,
                )
            )
        else:
            # Everything else, including four-sided shapes that are not
            # closed axis-aligned rectangles, is kept as a generic curve.
            self.cur_item.add(
                LTCurve(
                    gstate.linewidth,
                    pts,
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    transformed_path,
                    gstate.dash,
                )
            )

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Add an ``LTChar`` for ``cid`` and return its ``adv`` attribute.

        If the font raises ``PDFUnicodeNotDefined`` for ``cid``, the text
        comes from :meth:`handle_undefined_char` instead.
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)
        item = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
        )
        self.cur_item.add(item)
        return item.adv

    def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
        """Return the ``(cid:N)`` placeholder for a character with no mapping."""
        # Logger arguments are formatted only if the debug record is emitted.
        log.debug("undefined: %r, %r", font, cid)
        return f"(cid:{cid})"

    def receive_layout(self, ltpage: LTPage) -> None:
        """Hook called by :meth:`end_page` with each finished page (no-op)."""

272 

273 

class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that keeps the most recently finished page.

    Each call to :meth:`receive_layout` replaces the stored page; fetch it
    with :meth:`get_result`.
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: LAParams | None = None,
    ) -> None:
        PDFLayoutAnalyzer.__init__(
            self,
            rsrcmgr,
            pageno=pageno,
            laparams=laparams,
        )
        # Stays None until the first page has been received.
        self.result: LTPage | None = None

    def receive_layout(self, ltpage: LTPage) -> None:
        """Remember ``ltpage`` as the current result."""
        self.result = ltpage

    def get_result(self) -> LTPage:
        """Return the last received page; asserts that one was received."""
        page = self.result
        assert page is not None
        return page

290 

291 

# Some PDFConverter children support only binary I/O
IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)


class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
    """Base class for layout analyzers that write their output to a stream.

    Stores the output stream, the codec name and whether the stream was
    detected as binary (``outfp_binary``).
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: IOType,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
    ) -> None:
        """Create the converter.

        :param rsrcmgr: resource manager forwarded to the layout analyzer.
        :param outfp: stream the converter writes to.
        :param codec: codec name kept in ``self.codec`` for subclasses.
        :param pageno: id given to the first page.
        :param laparams: layout-analysis parameters, or ``None``.
        """
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp: IOType = outfp
        self.codec = codec
        self.outfp_binary = self._is_binary_stream(self.outfp)

    @staticmethod
    def _is_binary_stream(outfp: AnyIO) -> bool:
        """Test if a stream is binary or not.

        A string ``mode`` attribute decides on its own: the stream is binary
        exactly when the mode contains ``"b"``.  Streams without a ``mode``
        are classified by type: text I/O objects (``io.StringIO`` and any
        other ``io.TextIOBase``) are text; everything else, including
        ``io.BytesIO``, is treated as binary.
        """
        mode = getattr(outfp, "mode", None)
        if isinstance(mode, str):
            return "b" in mode
        # No usable mode.  A ``mode`` that is not a string (for example an
        # integer constant) used to raise TypeError on the ``in`` test; such
        # streams are now classified by type like mode-less ones.
        return not isinstance(outfp, io.TextIOBase)

324 

325 

class TextConverter(PDFConverter[AnyIO]):
    """Converter that writes the plain text of every page to ``outfp``."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        showpageno: bool = False,
        imagewriter: ImageWriter | None = None,
    ) -> None:
        """Create the converter.

        :param showpageno: write a ``Page N`` line before each page.
        :param imagewriter: when given, images are recorded and exported
            through it; otherwise images are dropped.
        """
        super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.imagewriter = imagewriter
        self.showpageno = showpageno

    def write_text(self, text: str) -> None:
        """Write ``text`` to the output, as bytes when the stream is binary."""
        payload = utils.compatible_encode_method(text, self.codec, "ignore")
        if not self.outfp_binary:
            cast(TextIO, self.outfp).write(payload)
            return
        # NOTE(review): encode() is called without arguments, so self.codec
        # is not used for the byte conversion here - confirm this is intended.
        cast(BinaryIO, self.outfp).write(payload.encode())

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one page: optional header, its text, then a form feed."""

        def emit(node: LTItem) -> None:
            # Containers are walked recursively; leaf text items are written.
            if isinstance(node, LTContainer):
                for member in node:
                    emit(member)
            elif isinstance(node, LTText):
                self.write_text(node.get_text())

            # Independently of the branch above: a newline after every text
            # box, and an export for images when a writer is configured.
            if isinstance(node, LTTextBox):
                self.write_text("\n")
                return
            if self.imagewriter is not None and isinstance(node, LTImage):
                self.imagewriter.export_image(node)

        if self.showpageno:
            self.write_text(f"Page {ltpage.pageid}\n")
        emit(ltpage)
        self.write_text("\f")

    # Some dummy functions to save memory/CPU when all that is wanted
    # is text. This stops all the image and drawing output from being
    # recorded and taking up RAM.
    def render_image(self, name: str, stream: PDFStream) -> None:
        """Record the image only when an image writer is configured."""
        if self.imagewriter is None:
            return
        PDFConverter.render_image(self, name, stream)

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Ignore drawing operations; they carry no text."""
        return None

381 

382 

class HTMLConverter(PDFConverter[AnyIO]):
    """Converter that writes pages as absolutely positioned HTML.

    The header is written by the constructor and the footer by
    :meth:`close`.  Pages are stacked vertically; ``_yoffset`` tracks the
    vertical position of the page being written.
    """

    # Border colours added to ``rect_colors`` when ``debug`` is set.
    RECT_COLORS: ClassVar[dict[str, str]] = {
        "figure": "yellow",
        "textline": "magenta",
        "textbox": "cyan",
        "textgroup": "red",
        "curve": "black",
        "page": "gray",
    }

    # Text colours added to ``text_colors`` when ``debug`` is set.
    TEXT_COLORS: ClassVar[dict[str, str]] = {
        "textbox": "blue",
        "char": "black",
    }

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        scale: float = 1,
        fontscale: float = 1.0,
        layoutmode: str = "normal",
        showpageno: bool = True,
        pagemargin: int = 50,
        imagewriter: ImageWriter | None = None,
        debug: int = 0,
        rect_colors: dict[str, str] | None = None,
        text_colors: dict[str, str] | None = None,
    ) -> None:
        """Create the converter and write the HTML header.

        :param codec: codec used to encode output for a binary stream; must
            be empty for a text stream.
        :param scale: factor applied to every coordinate and size.
        :param fontscale: additional factor applied to font sizes.
        :param layoutmode: ``"exact"`` places every character absolutely;
            ``"loose"`` suppresses the line break after each text line; any
            other value writes flowing text with line breaks.
        :param showpageno: write a ``Page N`` anchor at the top of each page.
        :param pagemargin: vertical gap before the first page and between
            pages.
        :param imagewriter: when given, images are exported and referenced
            with ``<img>``; otherwise images are skipped.
        :param debug: when truthy, ``RECT_COLORS`` and ``TEXT_COLORS`` are
            merged into the colour tables.
        :param rect_colors: border colour per item kind; kinds without an
            entry are not drawn by :meth:`place_rect`.
        :param text_colors: text colour per item kind; kinds without an
            entry are not written by :meth:`place_text`.
        :raises PDFValueError: if the codec does not match the stream kind.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        if text_colors is None:
            text_colors = {"char": "black"}
        if rect_colors is None:
            rect_colors = {"curve": "black", "page": "gray"}

        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.imagewriter = imagewriter
        # Private copies: the debug merge below must not modify dictionaries
        # owned by the caller.
        self.rect_colors = dict(rect_colors)
        self.text_colors = dict(text_colors)
        if debug:
            self.rect_colors.update(self.RECT_COLORS)
            self.text_colors.update(self.TEXT_COLORS)
        self._yoffset: float = self.pagemargin
        # Font of the currently open <span>, or None when no span is open.
        self._font: tuple[str, float] | None = None
        # Fonts suspended by begin_div() until the matching end_div().
        self._fontstack: list[tuple[str, float] | None] = []
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``self.codec`` when one is set."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the document head, declaring the charset when known."""
        self.write("<html><head>\n")
        if self.codec:
            s = (
                '<meta http-equiv="Content-Type" content="text/html; '
                f'charset={self.codec}">\n'
            )
        else:
            s = '<meta http-equiv="Content-Type" content="text/html">\n'
        self.write(s)
        self.write("</head><body>\n")

    def write_footer(self) -> None:
        """Write the page index (links ``1 .. pageno - 1``) and close the body."""
        page_links = [f'<a href="#{i}">{i}</a>' for i in range(1, self.pageno)]
        s = (
            '<div style="position:absolute; top:0px;">'
            f"Page: {', '.join(page_links)}</div>\n"
        )
        self.write(s)
        self.write("</body></html>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` after passing it through ``enc``."""
        self.write(enc(text))

    def place_rect(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Draw an empty bordered box if ``color`` has a ``rect_colors`` entry.

        ``color`` is an item kind such as ``"page"``; the CSS colour comes
        from ``rect_colors``.
        """
        color2 = self.rect_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; '
                f"border: {color2} {borderwidth}px solid; "
                f"left:{x * self.scale}px; "
                f"top:{(self._yoffset - y) * self.scale}px; "
                f"width:{w * self.scale}px; "
                f'height:{h * self.scale}px;"></span>\n'
            )
            self.write(s)

    def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
        """Draw the bounding box of ``item`` via :meth:`place_rect`."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

    def place_image(
        self,
        item: LTImage,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Export ``item`` and write an ``<img>``; no-op without a writer."""
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            s = (
                f'<img src="{enc(name)}" border="{borderwidth}" '
                'style="position:absolute; '
                f"left:{x * self.scale}px; "
                f'top:{(self._yoffset - y) * self.scale}px;" '
                f'width="{w * self.scale}" '
                f'height="{h * self.scale}" />\n'
            )
            self.write(s)

    def place_text(
        self,
        color: str,
        text: str,
        x: float,
        y: float,
        size: float,
    ) -> None:
        """Write positioned text if ``color`` has a ``text_colors`` entry."""
        color2 = self.text_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; '
                f"color:{color2}; "
                f"left:{x * self.scale}px; "
                f"top:{(self._yoffset - y) * self.scale}px; "
                f'font-size:{size * self.scale * self.fontscale}px;">'
            )
            self.write(s)
            self.write_text(text)
            self.write("</span>\n")

    def begin_div(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
        writing_mode: str = "False",
    ) -> None:
        """Open a positioned ``<div>`` and suspend the current font span.

        NOTE(review): ``color`` is written verbatim into the border style
        (callers in this class pass ``"figure"`` / ``"textbox"``), unlike
        :meth:`place_rect`, which looks it up in ``rect_colors`` - confirm
        this is intended.
        """
        self._fontstack.append(self._font)
        self._font = None
        s = (
            '<div style="position:absolute; '
            f"border: {color} {borderwidth}px solid; "
            f"writing-mode:{writing_mode}; "
            f"left:{x * self.scale}px; "
            f"top:{(self._yoffset - y) * self.scale}px; "
            f"width:{w * self.scale}px; "
            f'height:{h * self.scale}px;">'
        )
        self.write(s)

    def end_div(self, color: str) -> None:
        """Close any open font span and the ``<div>``; restore the outer font."""
        if self._font is not None:
            self.write("</span>")
        self._font = self._fontstack.pop()
        self.write("</div>")

    def put_text(self, text: str, fontname: str, fontsize: float) -> None:
        """Write flowing text, opening a new font span when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write("</span>")
            # Remove subset tag from fontname, see PDF Reference 5.5.3
            fontname_without_subset_tag = fontname.split("+")[-1]
            # The font name comes from the document, so it goes through enc()
            # before being placed inside the style attribute (the XML
            # converter does the same for its font attribute).
            self.write(
                '<span style="'
                f"font-family: {enc(fontname_without_subset_tag)}; "
                f'font-size:{fontsize * self.scale * self.fontscale}px">'
            )
            self._font = font
        self.write_text(text)

    def put_newline(self) -> None:
        """Write a line break."""
        self.write("<br>")

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one finished page and advance the vertical offset."""

        def show_group(item: LTTextGroup | TextGroupElement) -> None:
            # Outline text groups recursively; other elements are skipped.
            if isinstance(item, LTTextGroup):
                self.place_border("textgroup", 1, item)
                for child in item:
                    show_group(child)

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                self._yoffset += item.y1
                self.place_border("page", 1, item)
                if self.showpageno:
                    # Whole-pixel offset of the page top.  The value must be
                    # part of the style attribute; it used to be written
                    # after a literal "%d" placeholder.
                    top = int((self._yoffset - item.y1) * self.scale)
                    self.write(f'<div style="position:absolute; top:{top}px;">')
                    self.write(
                        f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
                    )
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border("curve", 1, item)
            elif isinstance(item, LTFigure):
                self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
                self.end_div("figure")
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
            elif self.layoutmode == "exact":
                # Exact mode: every text item is placed absolutely.
                if isinstance(item, LTTextLine):
                    self.place_border("textline", 1, item)
                    for child in item:
                        render(child)
                elif isinstance(item, LTTextBox):
                    self.place_border("textbox", 1, item)
                    self.place_text(
                        "textbox",
                        str(item.index + 1),
                        item.x0,
                        item.y1,
                        20,
                    )
                    for child in item:
                        render(child)
                elif isinstance(item, LTChar):
                    self.place_border("char", 1, item)
                    self.place_text(
                        "char",
                        item.get_text(),
                        item.x0,
                        item.y1,
                        item.size,
                    )
            elif isinstance(item, LTTextLine):
                for child in item:
                    render(child)
                if self.layoutmode != "loose":
                    self.put_newline()
            elif isinstance(item, LTTextBox):
                self.begin_div(
                    "textbox",
                    1,
                    item.x0,
                    item.y1,
                    item.width,
                    item.height,
                    item.get_writing_mode(),
                )
                for child in item:
                    render(child)
                self.end_div("textbox")
            elif isinstance(item, LTChar):
                fontname = make_compat_str(item.fontname)
                self.put_text(item.get_text(), fontname, item.size)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())

        render(ltpage)
        self._yoffset += self.pagemargin

    def close(self) -> None:
        """Finish the document by writing the footer."""
        self.write_footer()

682 

683 

class XMLConverter(PDFConverter[AnyIO]):
    """Converter that writes the layout tree of every page as XML.

    The XML declaration and the opening ``<pages>`` tag are written by the
    constructor; :meth:`close` writes the closing tag.
    """

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        imagewriter: ImageWriter | None = None,
        stripcontrol: bool = False,
    ) -> None:
        """Create the converter and write the XML header.

        :param codec: codec used to encode output for a binary stream; must
            be empty for a text stream.
        :param imagewriter: when given, images are exported and their name
            is written as the ``src`` attribute.
        :param stripcontrol: remove characters matching ``CONTROL`` from
            text content.
        :raises PDFValueError: if the codec does not match the stream kind.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        # Each mismatch gets its own message, as in HTMLConverter; before,
        # both cases reported that a codec was required.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``self.codec`` when one is set."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the XML declaration and open the ``<pages>`` element."""
        if self.codec:
            self.write(f'<?xml version="1.0" encoding="{self.codec}" ?>\n')
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write("<pages>\n")

    def write_footer(self) -> None:
        """Close the ``<pages>`` element."""
        self.write("</pages>\n")

    def write_text(self, text: str) -> None:
        """Write text content: optional control stripping, then ``enc``."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(enc(text))

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one finished page as a ``<page>`` element."""

        def show_group(item: LTItem) -> None:
            # Text boxes are written as empty reference elements; text groups
            # nest their members.  Other item types are ignored.
            if isinstance(item, LTTextBox):
                self.write(
                    f'<textbox id="{item.index}" bbox="{bbox2str(item.bbox)}" />\n'
                )
            elif isinstance(item, LTTextGroup):
                self.write(f'<textgroup bbox="{bbox2str(item.bbox)}">\n')
                for child in item:
                    show_group(child)
                self.write("</textgroup>\n")

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                s = (
                    f'<page id="{item.pageid}" '
                    f'bbox="{bbox2str(item.bbox)}" '
                    f'rotate="{item.rotate}">\n'
                )
                self.write(s)
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.write("<layout>\n")
                    for group in item.groups:
                        show_group(group)
                    self.write("</layout>\n")
                self.write("</page>\n")
            elif isinstance(item, LTLine):
                s = (
                    f"<line "
                    f'linewidth="{item.linewidth}" '
                    f'bbox="{bbox2str(item.bbox)}" />\n'
                )
                self.write(s)
            elif isinstance(item, LTRect):
                s = (
                    f"<rect "
                    f'linewidth="{item.linewidth}" '
                    f'bbox="{bbox2str(item.bbox)}" />\n'
                )
                self.write(s)
            elif isinstance(item, LTCurve):
                s = (
                    f"<curve "
                    f'linewidth="{item.linewidth}" '
                    f'bbox="{bbox2str(item.bbox)}" '
                    f'pts="{item.get_pts()}"/>\n'
                )
                self.write(s)
            elif isinstance(item, LTFigure):
                # The figure name comes from the document, so it goes through
                # enc() like the other attribute values taken from it.
                s = f'<figure name="{enc(item.name)}" bbox="{bbox2str(item.bbox)}">\n'
                self.write(s)
                for child in item:
                    render(child)
                self.write("</figure>\n")
            elif isinstance(item, LTTextLine):
                self.write(f'<textline bbox="{bbox2str(item.bbox)}">\n')
                for child in item:
                    render(child)
                self.write("</textline>\n")
            elif isinstance(item, LTTextBox):
                wmode = ""
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                s = f'<textbox id="{item.index}" bbox="{bbox2str(item.bbox)}"{wmode}>\n'
                self.write(s)
                for child in item:
                    render(child)
                self.write("</textbox>\n")
            elif isinstance(item, LTChar):
                s = (
                    f"<text "
                    f'font="{enc(item.fontname)}" '
                    f'bbox="{bbox2str(item.bbox)}" '
                    f'colourspace="{item.ncs.name}" '
                    f'ncolour="{item.graphicstate.ncolor}" '
                    f'size="{item.size:.3f}">'
                )
                self.write(s)
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTText):
                # Use write_text() so this content gets the same escaping and
                # control stripping as character text above.
                self.write("<text>")
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    name = self.imagewriter.export_image(item)
                    self.write(
                        f"<image "
                        f'src="{enc(name)}" '
                        f'width="{item.width}" '
                        f'height="{item.height}" />\n'
                    )
                else:
                    self.write(
                        f'<image width="{item.width}" height="{item.height}" />\n'
                    )
            else:
                raise AssertionError(str(("Unhandled", item)))

        render(ltpage)

    def close(self) -> None:
        """Finish the document by writing the footer."""
        self.write_footer()

840 

841 

class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF."""

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        stripcontrol: bool = False,
    ) -> None:
        """Create the converter and write the hOCR header.

        :param codec: codec used to encode output; when empty, text is
            written to the stream unencoded.
        :param stripcontrol: remove characters matching ``CONTROL`` from the
            text that is written.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )
        self.stripcontrol = stripcontrol
        # True while characters are being collected into a pending word.
        self.within_chars = False
        # Bounding box of the page being written; set at the start of a page.
        self.page_bbox: Rect = (0, 0, 0, 0)
        # Pending word: its text, bounding box, font name and font size.
        self.working_text: str = ""
        self.working_bbox: Rect = (0, 0, 0, 0)
        self.working_font: str = ""
        self.working_size: float = 0
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Return ``bbox`` as an hOCR ``bbox x0 y0 x1 y1`` property.

        Values are truncated to integers and the y axis is flipped against
        the top of the current page.
        """
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``self.codec`` when one is set."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening ``<html>`` tag, the ``<head>`` and ``<body>``."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                f"xml:lang='en' lang='en' charset='{self.codec}'>\n",
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en' lang='en'>\n",
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        self.write(
            "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
        )
        self.write(
            "<meta name='ocr-system' content='pdfminer.six HOCR Converter' />\n",
        )
        self.write(
            "  <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Write the commented-out debug script tag and close the document."""
        self.write("<!-- comment in the following line to debug -->\n")
        self.write(
            "<!--script src='https://unpkg.com/hocrjs'></script--></body></html>\n",
        )

    def write_text(self, text: str) -> None:
        """Write ``text`` unescaped, after optional control stripping."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Write the pending word as an ``ocrx_word`` span and end the word.

        Nothing is written when no text is pending.  ``within_chars`` is
        cleared in either case.
        """
        if len(self.working_text) > 0:
            word = self.working_text
            if self.stripcontrol:
                # Make the ``stripcontrol`` option effective for word text.
                word = self.CONTROL.sub("", word)
            # Text and font name come from the document; pass them through
            # enc() - the helper the HTML and XML converters apply to text -
            # so markup characters cannot break the generated element.
            word = enc(word.strip())
            font = enc(self.working_font)
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            self.write(
                f'<span style=\'font:"{font}"; '
                f"font-size:{self.working_size}; "
                f"{bold_and_italic_styles}' "
                f"class='ocrx_word' "
                f"title='{self.bbox_repr(self.working_bbox)}; "
                f"x_font {font}; "
                f"x_fsize {self.working_size}'>"
                f"{word}</span>"
            )
        self.within_chars = False

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one finished page as an ``ocr_page`` element."""

        def render(item: LTItem) -> None:
            # An annotation item following characters ends the pending word.
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    f"<div "
                    f"class='ocr_page' "
                    f"id='{item.pageid}' "
                    f"title='{self.bbox_repr(item.bbox)}'>\n",
                )
                for child in item:
                    render(child)
                # Flush a word that was still open when the page content
                # ended, so it is neither lost nor carried to the next page.
                if self.within_chars:
                    self.write_word()
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    f"<span class='ocr_line' title='{self.bbox_repr(item.bbox)}'>",
                )
                for child_line in item:
                    render(child_line)
                # Keep a word that is still open inside its own line element.
                if self.within_chars:
                    self.write_word()
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    f"<div "
                    f"class='ocr_block' "
                    f"id='{item.index}' "
                    f"title='{self.bbox_repr(item.bbox)}'>\n"
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                if not self.within_chars:
                    # First character of a new word.
                    self.within_chars = True
                    self.working_text = item.get_text()
                    self.working_bbox = item.bbox
                    self.working_font = item.fontname
                    self.working_size = item.size
                elif len(item.get_text().strip()) == 0:
                    # Whitespace ends the word and is written between words.
                    self.write_word()
                    self.write_text(item.get_text())
                else:
                    if (
                        self.working_bbox[1] != item.bbox[1]
                        or self.working_font != item.fontname
                        or self.working_size != item.size
                    ):
                        # Baseline, font or size changed: finish the pending
                        # word and start a new one with this character.
                        # write_word() clears within_chars but not the text,
                        # so both are reset here; otherwise this character
                        # would be dropped or the old text repeated.
                        self.write_word()
                        self.within_chars = True
                        self.working_text = ""
                        self.working_bbox = item.bbox
                        self.working_font = item.fontname
                        self.working_size = item.size
                    self.working_text += item.get_text()
                    # Extend the word's box to the right edge of this char.
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Finish the document by writing the footer."""
        self.write_footer()