Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/converter.py: 38%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

471 statements  

1import io 

2import logging 

3import re 

4from typing import ( 

5 BinaryIO, 

6 Dict, 

7 Generic, 

8 List, 

9 Optional, 

10 Sequence, 

11 TextIO, 

12 Tuple, 

13 TypeVar, 

14 Union, 

15 cast, 

16) 

17 

18from pdfminer import utils 

19from pdfminer.image import ImageWriter 

20from pdfminer.layout import ( 

21 LAParams, 

22 LTAnno, 

23 LTChar, 

24 LTComponent, 

25 LTContainer, 

26 LTCurve, 

27 LTFigure, 

28 LTImage, 

29 LTItem, 

30 LTLayoutContainer, 

31 LTLine, 

32 LTPage, 

33 LTRect, 

34 LTText, 

35 LTTextBox, 

36 LTTextBoxVertical, 

37 LTTextGroup, 

38 LTTextLine, 

39 TextGroupElement, 

40) 

41from pdfminer.pdfcolor import PDFColorSpace 

42from pdfminer.pdfdevice import PDFTextDevice 

43from pdfminer.pdfexceptions import PDFValueError 

44from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined 

45from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager 

46from pdfminer.pdfpage import PDFPage 

47from pdfminer.pdftypes import PDFStream 

48from pdfminer.utils import ( 

49 AnyIO, 

50 Matrix, 

51 PathSegment, 

52 Point, 

53 Rect, 

54 apply_matrix_pt, 

55 bbox2str, 

56 enc, 

57 make_compat_str, 

58 mult_matrix, 

59) 

60 

# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)

62 

63 

class PDFLayoutAnalyzer(PDFTextDevice):
    """Text device that assembles rendered content into a layout tree.

    Every rendered object is added to ``cur_item``. Figures nest by pushing
    the enclosing container onto ``_stack`` and popping it again when the
    figure ends. A finished page is handed to :meth:`receive_layout`.
    """

    cur_item: LTLayoutContainer
    ctm: Matrix

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Store the page counter and the optional layout parameters."""
        super().__init__(rsrcmgr)
        self.pageno = pageno
        self.laparams = laparams
        self._stack: List[LTLayoutContainer] = []

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Start a new ``LTPage`` sized from the transformed media box."""
        (left, bottom, right, top) = page.mediabox
        (ax, ay) = apply_matrix_pt(ctm, (left, bottom))
        (bx, by) = apply_matrix_pt(ctm, (right, top))
        # The page always starts at the origin; only its extent is kept.
        page_box = (0, 0, abs(ax - bx), abs(ay - by))
        self.cur_item = LTPage(self.pageno, page_box)

    def end_page(self, page: PDFPage) -> None:
        """Analyze the page (if ``laparams`` is set) and deliver it."""
        assert not self._stack, str(len(self._stack))
        assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
        params = self.laparams
        if params is not None:
            self.cur_item.analyze(params)
        self.pageno += 1
        self.receive_layout(self.cur_item)

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Open a nested ``LTFigure`` and make it the current container."""
        self._stack.append(self.cur_item)
        figure_matrix = mult_matrix(matrix, self.ctm)
        self.cur_item = LTFigure(name, bbox, figure_matrix)

    def end_figure(self, _: str) -> None:
        """Close the current figure and attach it to its parent container."""
        finished = self.cur_item
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        parent = self._stack.pop()
        self.cur_item = parent
        parent.add(finished)

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Add an ``LTImage`` that fills the bounding box of the current figure."""
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        figure = self.cur_item
        figure_box = (figure.x0, figure.y0, figure.x1, figure.y1)
        figure.add(LTImage(name, stream, figure_box))

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Paint paths described in section 4.4 of the PDF reference manual"""
        shape = "".join(segment[0] for segment in path)

        # Per PDF Reference Section 4.4.1, "path construction operators may
        # be invoked in any sequence, but the first one invoked must be m
        # or re to begin a new subpath." Since pdfminer.six already
        # converts all `re` (rectangle) operators to their equivalent
        # `mlllh` representation, paths ingested by `.paint_path(...)` that
        # do not begin with the `m` operator are invalid and are ignored.
        if not shape.startswith("m"):
            return

        # Several subpaths: paint each `m...` run on its own.
        if shape.count("m") > 1:
            for match in re.finditer(r"m[^m]+", shape):
                piece = path[match.start(0) : match.end(0)]
                self.paint_path(gstate, stroke, fill, evenodd, piece)
            return

        # Although the 'h' command does not literally provide a
        # point-position, its position is (by definition) equal to the
        # subpath's starting point.
        #
        # And, per Section 4.4's Table 4.9, all other path commands place
        # their point-position in their final two arguments. (Any preceding
        # arguments represent control points on Bézier curves.)
        start_point = path[0][-2:]
        pts = [
            apply_matrix_pt(
                self.ctm,
                cast(Point, start_point if segment[0] == "h" else segment[-2:]),
            )
            for segment in path
        ]

        # Same path with every operand pair mapped through the current matrix.
        transformed_path = []
        for segment in path:
            operands = segment[1:]
            moved = [
                apply_matrix_pt(self.ctm, (float(first), float(second)))
                for first, second in zip(operands[0::2], operands[1::2])
            ]
            transformed_path.append(cast(PathSegment, (str(segment[0]), *moved)))

        if shape in {"mlh", "ml"}:
            # single line segment
            #
            # Note: 'ml' is a frequent anomaly that we want to support.
            self.cur_item.add(
                LTLine(
                    gstate.linewidth,
                    pts[0],
                    pts[1],
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    original_path=transformed_path,
                    dashing_style=gstate.dash,
                ),
            )
            return

        if shape in {"mlllh", "mllll"}:
            (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
            closes_on_start = pts[0] == pts[4]
            vertical_first = x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
            horizontal_first = y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0
            if closes_on_start and (vertical_first or horizontal_first):
                self.cur_item.add(
                    LTRect(
                        gstate.linewidth,
                        (*pts[0], *pts[2]),
                        stroke,
                        fill,
                        evenodd,
                        gstate.scolor,
                        gstate.ncolor,
                        transformed_path,
                        gstate.dash,
                    ),
                )
                return

        # Everything else, including non-rectangular quadrilaterals.
        self.cur_item.add(
            LTCurve(
                gstate.linewidth,
                pts,
                stroke,
                fill,
                evenodd,
                gstate.scolor,
                gstate.ncolor,
                transformed_path,
                gstate.dash,
            ),
        )

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Add one ``LTChar`` to the current container and return its ``adv``."""
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            # No Unicode mapping for this cid: use the placeholder text.
            text = self.handle_undefined_char(font, cid)
        width = font.char_width(cid)
        displacement = font.char_disp(cid)
        char = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            width,
            displacement,
            ncs,
            graphicstate,
        )
        self.cur_item.add(char)
        return char.adv

    def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
        """Log the unmapped character and return a ``(cid:N)`` placeholder."""
        log.debug("undefined: %r, %r", font, cid)
        placeholder = "(cid:%d)" % cid
        return placeholder

    def receive_layout(self, ltpage: LTPage) -> None:
        """Hook called with each finished page; does nothing in this class."""

270 

271 

class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that keeps the most recently received page."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Initialise the analyzer with no page stored yet."""
        super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
        self.result: Optional[LTPage] = None

    def receive_layout(self, ltpage: LTPage) -> None:
        """Remember ``ltpage`` as the latest result."""
        self.result = ltpage

    def get_result(self) -> LTPage:
        """Return the stored page; asserts that one has been received."""
        page = self.result
        assert page is not None
        return page

288 

289 

# Type variable for the output stream a converter writes to.
# Some PDFConverter children support only binary I/O
IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)

292 

293 

class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
    """Layout analyzer bound to an output stream and a codec."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: IOType,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Store the stream and codec and record whether the stream is binary."""
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp: IOType = outfp
        self.codec = codec
        self.outfp_binary = self._is_binary_stream(self.outfp)

    @staticmethod
    def _is_binary_stream(outfp: AnyIO) -> bool:
        """Test if an stream is binary or not"""
        # A "mode" attribute, when present, decides on its own.
        if hasattr(outfp, "mode"):
            return "b" in outfp.mode
        if isinstance(outfp, io.BytesIO):
            return True
        # Known text streams are not binary; anything else is assumed binary.
        return not isinstance(outfp, (io.StringIO, io.TextIOBase))

322 

323 

class TextConverter(PDFConverter[AnyIO]):
    """Converter that writes the text of each page to the output stream."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        showpageno: bool = False,
        imagewriter: Optional[ImageWriter] = None,
    ) -> None:
        """Set up the converter, optionally with page headings and an image writer."""
        super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        self.imagewriter = imagewriter

    def write_text(self, text: str) -> None:
        """Write ``text`` as bytes to a binary stream, or as ``str`` otherwise."""
        text = utils.compatible_encode_method(text, self.codec, "ignore")
        if not self.outfp_binary:
            cast(TextIO, self.outfp).write(text)
            return
        cast(BinaryIO, self.outfp).write(text.encode())

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the page text, followed by a form feed."""

        def render(item: LTItem) -> None:
            # Containers recurse into their children; plain text is written out.
            if isinstance(item, LTContainer):
                for child in item:
                    render(child)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())

            # Checked separately from the branches above: a text box gets a
            # trailing newline, an image is exported when a writer is set.
            if isinstance(item, LTTextBox):
                self.write_text("\n")
            elif isinstance(item, LTImage) and self.imagewriter is not None:
                self.imagewriter.export_image(item)

        if self.showpageno:
            self.write_text("Page %s\n" % ltpage.pageid)
        render(ltpage)
        self.write_text("\f")

    # Some dummy functions to save memory/CPU when all that is wanted
    # is text. This stops all the image and drawing output from being
    # recorded and taking up RAM.
    def render_image(self, name: str, stream: PDFStream) -> None:
        """Record the image only when an image writer is configured."""
        if self.imagewriter is None:
            return
        PDFConverter.render_image(self, name, stream)

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Ignore path painting; drawings are not recorded by this converter."""
        return

380 

381 

class HTMLConverter(PDFConverter[AnyIO]):
    """Converter that writes each page as absolutely positioned HTML.

    ``rect_colors`` / ``text_colors`` map an element kind (for example
    ``"curve"`` or ``"char"``) to a CSS color. Kinds missing from the mapping
    are not drawn by :meth:`place_rect` / :meth:`place_text`.
    """

    # Color sets merged into the active mappings when ``debug`` is truthy.
    RECT_COLORS = {
        "figure": "yellow",
        "textline": "magenta",
        "textbox": "cyan",
        "textgroup": "red",
        "curve": "black",
        "page": "gray",
    }

    TEXT_COLORS = {
        "textbox": "blue",
        "char": "black",
    }

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        scale: float = 1,
        fontscale: float = 1.0,
        layoutmode: str = "normal",
        showpageno: bool = True,
        pagemargin: int = 50,
        imagewriter: Optional[ImageWriter] = None,
        debug: int = 0,
        rect_colors: Optional[Dict[str, str]] = None,
        text_colors: Optional[Dict[str, str]] = None,
    ) -> None:
        """Configure the converter and write the HTML header.

        Raises:
            PDFValueError: if a binary stream is given without a codec, or a
                text stream is given together with a codec.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        if text_colors is None:
            text_colors = {"char": "black"}
        if rect_colors is None:
            rect_colors = {"curve": "black", "page": "gray"}

        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.imagewriter = imagewriter
        # Work on private copies: the debug update below must not modify a
        # mapping that belongs to the caller.
        self.rect_colors: Dict[str, str] = dict(rect_colors)
        self.text_colors: Dict[str, str] = dict(text_colors)
        if debug:
            self.rect_colors.update(self.RECT_COLORS)
            self.text_colors.update(self.TEXT_COLORS)
        # Running vertical offset; grows by each page height plus the margin.
        self._yoffset: float = self.pagemargin
        # Font of the currently open <span>, and the fonts saved by begin_div.
        self._font: Optional[Tuple[str, float]] = None
        self._fontstack: List[Optional[Tuple[str, float]]] = []
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``codec`` when one is set."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening tags and the Content-Type meta element."""
        self.write("<html><head>\n")
        if self.codec:
            s = (
                '<meta http-equiv="Content-Type" content="text/html; '
                'charset=%s">\n' % self.codec
            )
        else:
            s = '<meta http-equiv="Content-Type" content="text/html">\n'
        self.write(s)
        self.write("</head><body>\n")

    def write_footer(self) -> None:
        """Write the page-link index and close the document."""
        page_links = [f'<a href="#{i}">{i}</a>' for i in range(1, self.pageno)]
        s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % ", ".join(
            page_links,
        )
        self.write(s)
        self.write("</body></html>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` after escaping it with ``enc``."""
        self.write(enc(text))

    def place_rect(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Draw a bordered box if ``color`` is a key of ``rect_colors``."""
        color2 = self.rect_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; border: %s %dpx solid; '
                'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
                % (
                    color2,
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
        """Draw a box around the bounding box of ``item``."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

    def place_image(
        self,
        item: LTImage,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Export the image and emit an <img> tag; no-op without an image writer."""
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            s = (
                '<img src="%s" border="%d" style="position:absolute; '
                'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
                % (
                    enc(name),
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_text(
        self,
        color: str,
        text: str,
        x: float,
        y: float,
        size: float,
    ) -> None:
        """Write positioned text if ``color`` is a key of ``text_colors``."""
        color2 = self.text_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; color:%s; left:%dpx; '
                'top:%dpx; font-size:%dpx;">'
                % (
                    color2,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    size * self.scale * self.fontscale,
                )
            )
            self.write(s)
            self.write_text(text)
            self.write("</span>\n")

    def begin_div(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
        writing_mode: str = "False",
    ) -> None:
        """Open a positioned <div> and save the current font state.

        NOTE(review): unlike place_rect, ``color`` is written verbatim into
        the border style rather than looked up in ``rect_colors``, and the
        default ``writing_mode`` is the literal string "False". Both are kept
        as-is to leave the generated markup unchanged; confirm the intent.
        """
        self._fontstack.append(self._font)
        self._font = None
        s = (
            '<div style="position:absolute; border: %s %dpx solid; '
            "writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
            'height:%dpx;">'
            % (
                color,
                borderwidth,
                writing_mode,
                x * self.scale,
                (self._yoffset - y) * self.scale,
                w * self.scale,
                h * self.scale,
            )
        )
        self.write(s)

    def end_div(self, color: str) -> None:
        """Close any open font <span>, restore the saved font, close the <div>."""
        if self._font is not None:
            self.write("</span>")
        self._font = self._fontstack.pop()
        self.write("</div>")

    def put_text(self, text: str, fontname: str, fontsize: float) -> None:
        """Write ``text``, opening a new font <span> when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write("</span>")
            # Remove subset tag from fontname, see PDF Reference 5.5.3
            fontname_without_subset_tag = fontname.split("+")[-1]
            # The font name comes from the PDF and lands inside a quoted
            # attribute, so escape it like every other document string.
            self.write(
                '<span style="font-family: %s; font-size:%dpx">'
                % (
                    enc(fontname_without_subset_tag),
                    fontsize * self.scale * self.fontscale,
                ),
            )
            self._font = font
        self.write_text(text)

    def put_newline(self) -> None:
        """Write a line break."""
        self.write("<br>")

    def receive_layout(self, ltpage: LTPage) -> None:
        """Render one analyzed page and advance the vertical offset."""

        def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
            # Only text groups get a border; other group elements are skipped.
            if isinstance(item, LTTextGroup):
                self.place_border("textgroup", 1, item)
                for child in item:
                    show_group(child)

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                self._yoffset += item.y1
                self.place_border("page", 1, item)
                if self.showpageno:
                    self.write(
                        '<div style="position:absolute; top:%dpx;">'
                        % ((self._yoffset - item.y1) * self.scale),
                    )
                    self.write(
                        f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
                    )
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border("curve", 1, item)
            elif isinstance(item, LTFigure):
                self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
                self.end_div("figure")
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
            elif self.layoutmode == "exact":
                # "exact" mode positions every text line, box and character.
                if isinstance(item, LTTextLine):
                    self.place_border("textline", 1, item)
                    for child in item:
                        render(child)
                elif isinstance(item, LTTextBox):
                    self.place_border("textbox", 1, item)
                    self.place_text(
                        "textbox",
                        str(item.index + 1),
                        item.x0,
                        item.y1,
                        20,
                    )
                    for child in item:
                        render(child)
                elif isinstance(item, LTChar):
                    self.place_border("char", 1, item)
                    self.place_text(
                        "char",
                        item.get_text(),
                        item.x0,
                        item.y1,
                        item.size,
                    )
            elif isinstance(item, LTTextLine):
                for child in item:
                    render(child)
                if self.layoutmode != "loose":
                    self.put_newline()
            elif isinstance(item, LTTextBox):
                self.begin_div(
                    "textbox",
                    1,
                    item.x0,
                    item.y1,
                    item.width,
                    item.height,
                    item.get_writing_mode(),
                )
                for child in item:
                    render(child)
                self.end_div("textbox")
            elif isinstance(item, LTChar):
                fontname = make_compat_str(item.fontname)
                self.put_text(item.get_text(), fontname, item.size)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())

        render(ltpage)
        self._yoffset += self.pagemargin

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()

695 

696 

class XMLConverter(PDFConverter[AnyIO]):
    """Converter that writes the layout tree of each page as XML."""

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        imagewriter: Optional[ImageWriter] = None,
        stripcontrol: bool = False,
    ) -> None:
        """Configure the converter and write the XML header.

        Raises:
            PDFValueError: if a binary stream is given without a codec, or a
                text stream is given together with a codec.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        # The two failures are reported separately so the message names the
        # actual problem.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``codec`` when one is set."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the XML declaration and open the <pages> root element."""
        if self.codec:
            self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write("<pages>\n")

    def write_footer(self) -> None:
        """Close the <pages> root element."""
        self.write("</pages>\n")

    def write_text(self, text: str) -> None:
        """Write escaped text, stripping control characters if configured."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(enc(text))

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one analyzed page as a <page> element."""

        def show_group(item: LTItem) -> None:
            # Text boxes are referenced by id; text groups nest recursively.
            if isinstance(item, LTTextBox):
                self.write(
                    '<textbox id="%d" bbox="%s" />\n'
                    % (item.index, bbox2str(item.bbox)),
                )
            elif isinstance(item, LTTextGroup):
                self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    show_group(child)
                self.write("</textgroup>\n")

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
                    item.pageid,
                    bbox2str(item.bbox),
                    item.rotate,
                )
                self.write(s)
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.write("<layout>\n")
                    for group in item.groups:
                        show_group(group)
                    self.write("</layout>\n")
                self.write("</page>\n")
            elif isinstance(item, LTLine):
                s = '<line linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTRect):
                s = '<rect linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTCurve):
                s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                    item.get_pts(),
                )
                self.write(s)
            elif isinstance(item, LTFigure):
                # The figure name originates in the PDF; escape it so it
                # cannot break out of the attribute.
                s = f'<figure name="{enc(item.name)}" bbox="{bbox2str(item.bbox)}">\n'
                self.write(s)
                for child in item:
                    render(child)
                self.write("</figure>\n")
            elif isinstance(item, LTTextLine):
                self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    render(child)
                self.write("</textline>\n")
            elif isinstance(item, LTTextBox):
                wmode = ""
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                s = '<textbox id="%d" bbox="%s"%s>\n' % (
                    item.index,
                    bbox2str(item.bbox),
                    wmode,
                )
                self.write(s)
                for child in item:
                    render(child)
                self.write("</textbox>\n")
            elif isinstance(item, LTChar):
                s = (
                    '<text font="%s" bbox="%s" colourspace="%s" '
                    'ncolour="%s" size="%.3f">'
                    % (
                        enc(item.fontname),
                        bbox2str(item.bbox),
                        item.ncs.name,
                        item.graphicstate.ncolor,
                        item.size,
                    )
                )
                self.write(s)
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTText):
                # Escape the text so markup characters stay well-formed XML.
                self.write("<text>%s</text>\n" % enc(item.get_text()))
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    name = self.imagewriter.export_image(item)
                    self.write(
                        '<image src="%s" width="%d" height="%d" />\n'
                        % (enc(name), item.width, item.height),
                    )
                else:
                    self.write(
                        '<image width="%d" height="%d" />\n'
                        % (item.width, item.height),
                    )
            else:
                assert False, str(("Unhandled", item))

        render(ltpage)

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()

857 

858 

class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF."""

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Control characters removed by write_text when ``stripcontrol`` is set.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        stripcontrol: bool = False,
    ) -> None:
        """Configure the converter and write the hOCR header."""
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )
        self.stripcontrol = stripcontrol
        # True while characters are being accumulated into a pending word.
        self.within_chars = False
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Format ``bbox`` as an hOCR ``bbox`` string with a flipped y axis.

        Reads ``self.page_bbox``, which receive_layout sets for each page.
        """
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``codec`` when one is set."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the <html>/<head> preamble and open <body>."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec,
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en'>\n",
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        # NOTE(review): this meta tag always declares utf-8, whatever
        # ``codec`` is; left unchanged to keep the default output stable.
        self.write(
            "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
        )
        self.write(
            "<meta name='ocr-system' content='pdfminer.six HOCR Converter' />\n",
        )
        self.write(
            "  <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Write the closing tags, with a commented-out debug script."""
        self.write("<!-- comment in the following line to debug -->\n")
        self.write(
            "<!--script src='https://unpkg.com/hocrjs'></script--></body></html>\n",
        )

    def write_text(self, text: str) -> None:
        """Write ``text``, stripping control characters if configured."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Write the pending word as an ``ocrx_word`` span and end the word."""
        if len(self.working_text) > 0:
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            # Font name and text come from the PDF; escape them so markup
            # characters cannot corrupt the generated document.
            safe_font = enc(self.working_font)
            self.write(
                "<span style='font:\"%s\"; font-size:%d; %s' "
                "class='ocrx_word' title='%s; x_font %s; "
                "x_fsize %d'>%s</span>"
                % (
                    safe_font,
                    self.working_size,
                    bold_and_italic_styles,
                    self.bbox_repr(self.working_bbox),
                    safe_font,
                    self.working_size,
                    enc(self.working_text.strip()),
                ),
            )
        self.within_chars = False

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one analyzed page as an ``ocr_page`` element."""

        def render(item: LTItem) -> None:
            # An LTAnno ends the word that is being accumulated.
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    "<div class='ocr_page' id='%s' title='%s'>\n"
                    % (item.pageid, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                # Flush a word that no trailing LTAnno terminated; otherwise
                # it would be dropped or leak into the next page.
                if self.within_chars:
                    self.write_word()
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    "<span class='ocr_line' title='%s'>" % (self.bbox_repr(item.bbox)),
                )
                for child_line in item:
                    render(child_line)
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    "<div class='ocr_block' id='%d' title='%s'>\n"
                    % (item.index, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                if not self.within_chars:
                    # First character of a new word.
                    self.within_chars = True
                    self.working_text = item.get_text()
                    self.working_bbox = item.bbox
                    self.working_font = item.fontname
                    self.working_size = item.size
                elif len(item.get_text().strip()) == 0:
                    # Whitespace ends the word and is passed through as-is.
                    self.write_word()
                    self.write(item.get_text())
                else:
                    if (
                        self.working_bbox[1] != item.bbox[1]
                        or self.working_font != item.fontname
                        or self.working_size != item.size
                    ):
                        # Baseline, font or size changed: emit the word so far
                        # and start a fresh one with this character.
                        # write_word() clears within_chars, so restore it and
                        # reset the text; otherwise this character would be
                        # appended to the already-written word and then lost.
                        self.write_word()
                        self.within_chars = True
                        self.working_text = ""
                        self.working_bbox = item.bbox
                        self.working_font = item.fontname
                        self.working_size = item.size
                    self.working_text += item.get_text()
                    # Extend the word box to the right edge of this character.
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()