Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/converter.py: 30%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

472 statements  

1import io 

2import logging 

3import re 

4from typing import ( 

5 BinaryIO, 

6 Dict, 

7 Generic, 

8 List, 

9 Optional, 

10 Sequence, 

11 TextIO, 

12 Tuple, 

13 TypeVar, 

14 Union, 

15 cast, 

16) 

17 

18from pdfminer import utils 

19from pdfminer.image import ImageWriter 

20from pdfminer.layout import ( 

21 LAParams, 

22 LTAnno, 

23 LTChar, 

24 LTComponent, 

25 LTContainer, 

26 LTCurve, 

27 LTFigure, 

28 LTImage, 

29 LTItem, 

30 LTLayoutContainer, 

31 LTLine, 

32 LTPage, 

33 LTRect, 

34 LTText, 

35 LTTextBox, 

36 LTTextBoxVertical, 

37 LTTextGroup, 

38 LTTextLine, 

39 TextGroupElement, 

40) 

41from pdfminer.pdfcolor import PDFColorSpace 

42from pdfminer.pdfdevice import PDFTextDevice 

43from pdfminer.pdfexceptions import PDFValueError 

44from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined 

45from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager 

46from pdfminer.pdfpage import PDFPage 

47from pdfminer.pdftypes import PDFStream 

48from pdfminer.utils import ( 

49 AnyIO, 

50 Matrix, 

51 PathSegment, 

52 Point, 

53 Rect, 

54 apply_matrix_pt, 

55 apply_matrix_rect, 

56 bbox2str, 

57 enc, 

58 make_compat_str, 

59 mult_matrix, 

60) 

61 

log = logging.getLogger(__name__)


class PDFLayoutAnalyzer(PDFTextDevice):
    """Text device that accumulates rendered content into layout objects.

    Items produced while a page is rendered are added to ``cur_item``; when
    the page ends the finished ``LTPage`` is handed to ``receive_layout``.
    """

    cur_item: LTLayoutContainer
    ctm: Matrix

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Store the page counter and layout parameters.

        :param rsrcmgr: resource manager passed to ``PDFTextDevice``.
        :param pageno: number given to the first page that is rendered.
        :param laparams: layout-analysis parameters; when ``None`` the page
            is not analyzed in ``end_page``.
        """
        PDFTextDevice.__init__(self, rsrcmgr)
        # Containers suspended while a nested figure is being filled.
        self._stack: List[LTLayoutContainer] = []
        self.laparams = laparams
        self.pageno = pageno

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Start a fresh ``LTPage`` sized from the transformed media box."""
        left, bottom, right, top = apply_matrix_rect(ctm, page.mediabox)
        page_box = (0, 0, abs(left - right), abs(bottom - top))
        self.cur_item = LTPage(self.pageno, page_box)

    def end_page(self, page: PDFPage) -> None:
        """Finish the current page and pass it to ``receive_layout``."""
        assert not self._stack, str(len(self._stack))
        finished = self.cur_item
        assert isinstance(finished, LTPage), str(type(finished))
        if self.laparams is not None:
            finished.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(finished)

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Suspend the current container and start collecting into a figure."""
        self._stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))

    def end_figure(self, _: str) -> None:
        """Close the current figure and add it to the suspended container."""
        figure = self.cur_item
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        self.cur_item = parent = self._stack.pop()
        parent.add(figure)

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Add an ``LTImage`` covering the bounding box of the current figure."""
        figure = self.cur_item
        assert isinstance(figure, LTFigure), str(type(figure))
        image_box = (figure.x0, figure.y0, figure.x1, figure.y1)
        figure.add(LTImage(name, stream, image_box))

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Paint paths described in section 4.4 of the PDF reference manual"""
        shape = "".join(segment[0] for segment in path)

        if shape[:1] != "m":
            # Per PDF Reference Section 4.4.1, "path construction operators may
            # be invoked in any sequence, but the first one invoked must be m
            # or re to begin a new subpath." Since pdfminer.six already
            # converts all `re` (rectangle) operators to their equivalent
            # `mlllh` representation, paths ingested by `.paint_path(...)` that
            # do not begin with the `m` operator are invalid.
            return

        if shape.count("m") > 1:
            # Several subpaths: paint each "m..." run on its own.
            for found in re.finditer(r"m[^m]+", shape):
                piece = path[found.start(0) : found.end(0)]
                self.paint_path(gstate, stroke, fill, evenodd, piece)
            return

        # Although the 'h' command does not literally provide a
        # point-position, its position is (by definition) equal to the
        # subpath's starting point.
        #
        # And, per Section 4.4's Table 4.9, all other path commands place
        # their point-position in their final two arguments. (Any preceding
        # arguments represent control points on Bézier curves.)
        start_point = path[0][-2:]
        raw_pts = [
            cast(Point, start_point if seg[0] == "h" else seg[-2:]) for seg in path
        ]
        pts = [apply_matrix_pt(self.ctm, raw) for raw in raw_pts]

        # Rebuild the path with every operand pair mapped through the CTM.
        transformed_path = []
        for seg in path:
            moved = [
                apply_matrix_pt(self.ctm, (float(a), float(b)))
                for a, b in zip(seg[1::2], seg[2::2])
            ]
            transformed_path.append(cast(PathSegment, (str(seg[0]), *moved)))

        # Drop a redundant "l" on a path closed with "h"
        if len(shape) > 3 and shape[-2:] == "lh" and pts[-2] == pts[0]:
            shape = shape[:-2] + "h"
            pts.pop()

        if shape in {"mlh", "ml"}:
            # Single line segment. 'ml' (no closing 'h') is a frequent
            # anomaly that is deliberately supported.
            self.cur_item.add(
                LTLine(
                    gstate.linewidth,
                    pts[0],
                    pts[1],
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    original_path=transformed_path,
                    dashing_style=gstate.dash,
                )
            )
            return

        if shape in {"mlllh", "mllll"}:
            (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
            vertical_first = x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
            horizontal_first = y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0
            # An axis-aligned closed loop is reported as a rectangle.
            if pts[0] == pts[4] and (vertical_first or horizontal_first):
                self.cur_item.add(
                    LTRect(
                        gstate.linewidth,
                        (*pts[0], *pts[2]),
                        stroke,
                        fill,
                        evenodd,
                        gstate.scolor,
                        gstate.ncolor,
                        transformed_path,
                        gstate.dash,
                    )
                )
                return

        # Everything else is kept as a generic curve.
        self.cur_item.add(
            LTCurve(
                gstate.linewidth,
                pts,
                stroke,
                fill,
                evenodd,
                gstate.scolor,
                gstate.ncolor,
                transformed_path,
                gstate.dash,
            )
        )

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Add one character to the current container and return its advance."""
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            # No Unicode mapping for this cid: use the placeholder text.
            text = self.handle_undefined_char(font, cid)
        width = font.char_width(cid)
        displacement = font.char_disp(cid)
        char_item = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            width,
            displacement,
            ncs,
            graphicstate,
        )
        self.cur_item.add(char_item)
        return char_item.adv

    def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
        """Return the ``(cid:N)`` placeholder used for unmapped characters."""
        log.debug("undefined: %r, %r", font, cid)
        return "(cid:%d)" % cid

    def receive_layout(self, ltpage: LTPage) -> None:
        """Hook called with each finished page; does nothing here."""

274 

275 

class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that keeps the most recently finished page."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Initialise the analyzer with no page stored yet."""
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        # Replaced by receive_layout() every time a page is finished.
        self.result: Optional[LTPage] = None

    def receive_layout(self, ltpage: LTPage) -> None:
        """Remember ``ltpage`` as the latest result."""
        self.result = ltpage

    def get_result(self) -> LTPage:
        """Return the stored page; asserts that one has been received."""
        page = self.result
        assert page is not None
        return page

292 

293 

# Some PDFConverter children support only binary I/O
IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)


class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
    """Base class for converters that write rendered pages to a stream."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: IOType,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Store the output stream and codec and detect the stream kind.

        :param rsrcmgr: resource manager passed to ``PDFLayoutAnalyzer``.
        :param outfp: stream the converter writes to.
        :param codec: codec name stored on the instance for subclasses.
        :param pageno: number given to the first page that is rendered.
        :param laparams: layout-analysis parameters, or ``None``.
        """
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp: IOType = outfp
        self.codec = codec
        self.outfp_binary = self._is_binary_stream(self.outfp)

    @staticmethod
    def _is_binary_stream(outfp: AnyIO) -> bool:
        """Test if a stream is binary or not.

        A string ``mode`` attribute decides on its own: the stream is binary
        exactly when the mode contains ``"b"``. Otherwise the stream's type is
        inspected, and anything unrecognised is assumed to be binary.
        """
        mode = getattr(outfp, "mode", None)
        if isinstance(mode, str):
            return "b" in mode
        # A missing or non-string ``mode`` (some file-like objects expose an
        # integer constant) says nothing about text vs. binary, and testing
        # ``"b" in mode`` on it would raise TypeError, so fall back to the type.
        if isinstance(outfp, io.BytesIO):
            return True
        # io.StringIO is a TextIOBase, so this single check covers both.
        if isinstance(outfp, io.TextIOBase):
            return False

        return True

326 

327 

class TextConverter(PDFConverter[AnyIO]):
    """Converter that writes the plain text of each page to ``outfp``."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        showpageno: bool = False,
        imagewriter: Optional[ImageWriter] = None,
    ) -> None:
        """Set up the converter.

        :param showpageno: when true, a ``Page N`` line precedes each page.
        :param imagewriter: when given, images found on a page are exported
            through it; otherwise images are not recorded at all.
        """
        super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.imagewriter = imagewriter
        self.showpageno = showpageno

    def write_text(self, text: str) -> None:
        """Write ``text`` to the output, encoding it for binary streams."""
        text = utils.compatible_encode_method(text, self.codec, "ignore")
        if not self.outfp_binary:
            cast(TextIO, self.outfp).write(text)
            return
        cast(BinaryIO, self.outfp).write(text.encode())

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the text of ``ltpage`` followed by a form feed."""

        def emit(node: LTItem) -> None:
            # Containers are walked recursively; other text items are
            # written directly.
            if isinstance(node, LTContainer):
                for sub in node:
                    emit(sub)
            elif isinstance(node, LTText):
                self.write_text(node.get_text())
            # Separate check on purpose: a text box is a container as well
            # and gets a trailing newline after its children.
            if isinstance(node, LTTextBox):
                self.write_text("\n")
            elif isinstance(node, LTImage) and self.imagewriter is not None:
                self.imagewriter.export_image(node)

        if self.showpageno:
            self.write_text("Page %s\n" % ltpage.pageid)
        emit(ltpage)
        self.write_text("\f")

    # Some dummy functions to save memory/CPU when all that is wanted
    # is text.  This stops all the image and drawing output from being
    # recorded and taking up RAM.
    def render_image(self, name: str, stream: PDFStream) -> None:
        """Record the image only when an image writer is configured."""
        if self.imagewriter is None:
            return
        PDFConverter.render_image(self, name, stream)

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Ignore vector paths; they are not needed for text output."""
        return None

384 

385 

class HTMLConverter(PDFConverter[AnyIO]):
    """Converter that writes pages as absolutely positioned HTML."""

    # Extra border colors enabled in debug mode, keyed by item kind.
    RECT_COLORS = {
        "figure": "yellow",
        "textline": "magenta",
        "textbox": "cyan",
        "textgroup": "red",
        "curve": "black",
        "page": "gray",
    }

    # Extra text colors enabled in debug mode, keyed by item kind.
    TEXT_COLORS = {
        "textbox": "blue",
        "char": "black",
    }

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        scale: float = 1,
        fontscale: float = 1.0,
        layoutmode: str = "normal",
        showpageno: bool = True,
        pagemargin: int = 50,
        imagewriter: Optional[ImageWriter] = None,
        debug: int = 0,
        rect_colors: Optional[Dict[str, str]] = None,
        text_colors: Optional[Dict[str, str]] = None,
    ) -> None:
        """Set up the converter and write the HTML header.

        :param scale: factor applied to every coordinate and size written.
        :param fontscale: additional factor applied to font sizes.
        :param layoutmode: ``"exact"`` and ``"loose"`` are treated specially
            by ``receive_layout``; any other value takes the default path.
        :param showpageno: when true, each page gets a named ``Page N`` anchor.
        :param pagemargin: vertical offset added before the first page and
            after every page.
        :param imagewriter: when given, images are exported and referenced.
        :param debug: when truthy, ``RECT_COLORS`` / ``TEXT_COLORS`` are
            merged over the supplied color maps.
        :param rect_colors: item kind -> border color; kinds without an
            entry get no border from ``place_rect``.
        :param text_colors: item kind -> text color; kinds without an entry
            are not written by ``place_text``.
        :raises PDFValueError: if a binary stream is given without a codec,
            or a text stream is given with one.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        if text_colors is None:
            text_colors = {"char": "black"}
        if rect_colors is None:
            rect_colors = {"curve": "black", "page": "gray"}
        if debug:
            # Merge into new dicts so that dictionaries owned by the caller
            # are never modified as a side effect of enabling debug output.
            rect_colors = {**rect_colors, **self.RECT_COLORS}
            text_colors = {**text_colors, **self.TEXT_COLORS}

        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.imagewriter = imagewriter
        self.rect_colors = rect_colors
        self.text_colors = text_colors
        # Running vertical offset; grows by each page's height plus margin.
        self._yoffset: float = self.pagemargin
        # (fontname, fontsize) of the currently open <span>, if any.
        self._font: Optional[Tuple[str, float]] = None
        self._fontstack: List[Optional[Tuple[str, float]]] = []
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoding it when a codec is configured."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening tags and the content-type meta element."""
        self.write("<html><head>\n")
        if self.codec:
            s = (
                '<meta http-equiv="Content-Type" content="text/html; '
                'charset=%s">\n' % self.codec
            )
        else:
            s = '<meta http-equiv="Content-Type" content="text/html">\n'
        self.write(s)
        self.write("</head><body>\n")

    def write_footer(self) -> None:
        """Write links to every rendered page and close the document."""
        page_links = [f'<a href="#{i}">{i}</a>' for i in range(1, self.pageno)]
        s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % ", ".join(
            page_links,
        )
        self.write(s)
        self.write("</body></html>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` after escaping it with ``enc``."""
        self.write(enc(text))

    def place_rect(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Write a bordered box if ``color`` has an entry in ``rect_colors``."""
        color2 = self.rect_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; border: %s %dpx solid; '
                'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
                % (
                    color2,
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
        """Write a box around ``item`` using its own coordinates."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

    def place_image(
        self,
        item: LTImage,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Export ``item`` and write an ``<img>`` tag; no-op without a writer."""
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            s = (
                '<img src="%s" border="%d" style="position:absolute; '
                'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
                % (
                    enc(name),
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_text(
        self,
        color: str,
        text: str,
        x: float,
        y: float,
        size: float,
    ) -> None:
        """Write positioned text if ``color`` has an entry in ``text_colors``."""
        color2 = self.text_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; color:%s; left:%dpx; '
                'top:%dpx; font-size:%dpx;">'
                % (
                    color2,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    size * self.scale * self.fontscale,
                )
            )
            self.write(s)
            self.write_text(text)
            self.write("</span>\n")

    def begin_div(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
        writing_mode: str = "False",
    ) -> None:
        """Open a positioned ``<div>`` and start a fresh font context.

        NOTE(review): ``color`` is written as-is rather than looked up in
        ``rect_colors``, and the default ``writing_mode`` is the string
        ``"False"``; both are kept unchanged to preserve existing output.
        """
        self._fontstack.append(self._font)
        self._font = None
        s = (
            '<div style="position:absolute; border: %s %dpx solid; '
            "writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
            'height:%dpx;">'
            % (
                color,
                borderwidth,
                writing_mode,
                x * self.scale,
                (self._yoffset - y) * self.scale,
                w * self.scale,
                h * self.scale,
            )
        )
        self.write(s)

    def end_div(self, color: str) -> None:
        """Close any open font span and the ``<div>``; restore the outer font."""
        if self._font is not None:
            self.write("</span>")
        self._font = self._fontstack.pop()
        self.write("</div>")

    def put_text(self, text: str, fontname: str, fontsize: float) -> None:
        """Write ``text``, opening a new font ``<span>`` when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write("</span>")
            # Remove subset tag from fontname, see PDF Reference 5.5.3
            fontname_without_subset_tag = fontname.split("+")[-1]
            # The font name comes from the PDF, so escape it before placing
            # it inside the style attribute.
            self.write(
                '<span style="font-family: %s; font-size:%dpx">'
                % (
                    enc(fontname_without_subset_tag),
                    fontsize * self.scale * self.fontscale,
                ),
            )
            self._font = font
        self.write_text(text)

    def put_newline(self) -> None:
        """Write a line break."""
        self.write("<br>")

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the HTML for one page and advance the vertical offset."""

        def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
            # Only groups get a border; their members are walked recursively.
            if isinstance(item, LTTextGroup):
                self.place_border("textgroup", 1, item)
                for child in item:
                    show_group(child)

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                self._yoffset += item.y1
                self.place_border("page", 1, item)
                if self.showpageno:
                    self.write(
                        '<div style="position:absolute; top:%dpx;">'
                        % ((self._yoffset - item.y1) * self.scale),
                    )
                    self.write(
                        f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
                    )
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border("curve", 1, item)
            elif isinstance(item, LTFigure):
                self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
                self.end_div("figure")
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
            elif self.layoutmode == "exact":
                # Exact mode: every text item is positioned individually.
                if isinstance(item, LTTextLine):
                    self.place_border("textline", 1, item)
                    for child in item:
                        render(child)
                elif isinstance(item, LTTextBox):
                    self.place_border("textbox", 1, item)
                    self.place_text(
                        "textbox",
                        str(item.index + 1),
                        item.x0,
                        item.y1,
                        20,
                    )
                    for child in item:
                        render(child)
                elif isinstance(item, LTChar):
                    self.place_border("char", 1, item)
                    self.place_text(
                        "char",
                        item.get_text(),
                        item.x0,
                        item.y1,
                        item.size,
                    )
            elif isinstance(item, LTTextLine):
                for child in item:
                    render(child)
                if self.layoutmode != "loose":
                    self.put_newline()
            elif isinstance(item, LTTextBox):
                self.begin_div(
                    "textbox",
                    1,
                    item.x0,
                    item.y1,
                    item.width,
                    item.height,
                    item.get_writing_mode(),
                )
                for child in item:
                    render(child)
                self.end_div("textbox")
            elif isinstance(item, LTChar):
                fontname = make_compat_str(item.fontname)
                self.put_text(item.get_text(), fontname, item.size)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())

        render(ltpage)
        self._yoffset += self.pagemargin

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()

699 

700 

class XMLConverter(PDFConverter[AnyIO]):
    """Converter that writes the layout tree of each page as XML."""

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        imagewriter: Optional[ImageWriter] = None,
        stripcontrol: bool = False,
    ) -> None:
        """Set up the converter and write the XML header.

        :param imagewriter: when given, images are exported and their
            exported name is written as the ``src`` attribute.
        :param stripcontrol: when true, characters matching ``CONTROL`` are
            removed from text written through ``write_text``.
        :raises PDFValueError: if a binary stream is given without a codec,
            or a text stream is given with one.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        # Report the two mismatches separately so the message names the
        # actual problem.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoding it when a codec is configured."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the XML declaration and open the ``<pages>`` element."""
        if self.codec:
            self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write("<pages>\n")

    def write_footer(self) -> None:
        """Close the ``<pages>`` element."""
        self.write("</pages>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` escaped, optionally stripping control characters."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(enc(text))

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the XML for one page."""

        def show_group(item: LTItem) -> None:
            # Text boxes are referenced by id; groups nest recursively.
            if isinstance(item, LTTextBox):
                self.write(
                    '<textbox id="%d" bbox="%s" />\n'
                    % (item.index, bbox2str(item.bbox)),
                )
            elif isinstance(item, LTTextGroup):
                self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    show_group(child)
                self.write("</textgroup>\n")

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
                    item.pageid,
                    bbox2str(item.bbox),
                    item.rotate,
                )
                self.write(s)
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.write("<layout>\n")
                    for group in item.groups:
                        show_group(group)
                    self.write("</layout>\n")
                self.write("</page>\n")
            elif isinstance(item, LTLine):
                s = '<line linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTRect):
                s = '<rect linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTCurve):
                s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                    item.get_pts(),
                )
                self.write(s)
            elif isinstance(item, LTFigure):
                # The figure name comes from the PDF, so escape it before
                # placing it inside an attribute value.
                s = f'<figure name="{enc(item.name)}" bbox="{bbox2str(item.bbox)}">\n'
                self.write(s)
                for child in item:
                    render(child)
                self.write("</figure>\n")
            elif isinstance(item, LTTextLine):
                self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    render(child)
                self.write("</textline>\n")
            elif isinstance(item, LTTextBox):
                wmode = ""
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                s = '<textbox id="%d" bbox="%s"%s>\n' % (
                    item.index,
                    bbox2str(item.bbox),
                    wmode,
                )
                self.write(s)
                for child in item:
                    render(child)
                self.write("</textbox>\n")
            elif isinstance(item, LTChar):
                s = (
                    '<text font="%s" bbox="%s" colourspace="%s" '
                    'ncolour="%s" size="%.3f">'
                    % (
                        enc(item.fontname),
                        bbox2str(item.bbox),
                        item.ncs.name,
                        item.graphicstate.ncolor,
                        item.size,
                    )
                )
                self.write(s)
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTText):
                # Escape the content so markup characters cannot break the
                # document; plain whitespace is unaffected.
                self.write("<text>%s</text>\n" % enc(item.get_text()))
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    name = self.imagewriter.export_image(item)
                    self.write(
                        '<image src="%s" width="%d" height="%d" />\n'
                        % (enc(name), item.width, item.height),
                    )
                else:
                    self.write(
                        '<image width="%d" height="%d" />\n'
                        % (item.width, item.height),
                    )
            else:
                assert False, str(("Unhandled", item))

        render(ltpage)

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()

861 

862 

class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF."""

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        stripcontrol: bool = False,
    ) -> None:
        """Set up the converter and write the hOCR header.

        :param stripcontrol: when true, characters matching ``CONTROL`` are
            removed from the text that is written.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )
        self.stripcontrol = stripcontrol
        # True while characters are being accumulated into a pending word.
        self.within_chars = False
        # State of the pending word and the page it belongs to; defined up
        # front so every method can rely on the attributes existing.
        self.page_bbox: Rect = (0, 0, 0, 0)
        self.working_text = ""
        self.working_bbox: Rect = (0, 0, 0, 0)
        self.working_font = ""
        self.working_size: float = 0
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Return ``bbox`` as an hOCR ``bbox`` property, relative to the page."""
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write raw markup, encoding it when a codec is configured."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the document head, including the hOCR meta elements."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec,
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en'>\n",
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        # NOTE(review): the charset here is fixed to utf-8 regardless of
        # ``codec``; left unchanged to preserve the existing output.
        self.write(
            "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
        )
        self.write(
            "<meta name='ocr-system' content='pdfminer.six HOCR Converter' />\n",
        )
        self.write(
            "  <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Write the closing tags, with a commented-out debug script."""
        self.write("<!-- comment in the following line to debug -->\n")
        self.write(
            "<!--script src='https://unpkg.com/hocrjs'></script--></body></html>\n",
        )

    def _strip_control(self, text: str) -> str:
        """Remove control characters from ``text`` when ``stripcontrol`` is set."""
        if self.stripcontrol:
            return self.CONTROL.sub("", text)
        return text

    def write_text(self, text: str) -> None:
        """Write ``text`` as-is apart from optional control-character removal."""
        self.write(self._strip_control(text))

    def _start_word(self, item: LTChar) -> None:
        """Begin a new pending word consisting of the single char ``item``."""
        self.within_chars = True
        self.working_text = item.get_text()
        self.working_bbox = item.bbox
        self.working_font = item.fontname
        self.working_size = item.size

    def write_word(self) -> None:
        """Write the pending word as an ``ocrx_word`` span and clear it."""
        if len(self.working_text) > 0:
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            # Text and font name come from the PDF: honour ``stripcontrol``
            # and escape both so they cannot break the surrounding markup.
            word = enc(self._strip_control(self.working_text.strip()))
            font = enc(self.working_font)
            self.write(
                "<span style='font:\"%s\"; font-size:%d; %s' "
                "class='ocrx_word' title='%s; x_font %s; "
                "x_fsize %d'>%s</span>"
                % (
                    (
                        font,
                        self.working_size,
                        bold_and_italic_styles,
                        self.bbox_repr(self.working_bbox),
                        font,
                        self.working_size,
                        word,
                    )
                ),
            )
        # Clear the word so it can never be written a second time.
        self.working_text = ""
        self.within_chars = False

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the hOCR markup for one page."""

        def render(item: LTItem) -> None:
            # An annotation item ends the word that is being accumulated.
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    "<div class='ocr_page' id='%s' title='%s'>\n"
                    % (item.pageid, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                # Flush a word still pending at the end of the page so it is
                # neither lost nor carried over into the next page.
                if self.within_chars:
                    self.write_word()
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    "<span class='ocr_line' title='%s'>" % (self.bbox_repr(item.bbox)),
                )
                for child_line in item:
                    render(child_line)
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    "<div class='ocr_block' id='%d' title='%s'>\n"
                    % (item.index, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                if not self.within_chars:
                    self._start_word(item)
                elif len(item.get_text().strip()) == 0:
                    # Whitespace ends the word and is passed through.
                    self.write_word()
                    self.write_text(item.get_text())
                elif (
                    self.working_bbox[1] != item.bbox[1]
                    or self.working_font != item.fontname
                    or self.working_size != item.size
                ):
                    # Baseline, font or size changed: finish the old word and
                    # start a new one with this character, so the character
                    # is kept and the old text is not repeated.
                    self.write_word()
                    self._start_word(item)
                else:
                    # Same style: extend the word and its right edge.
                    self.working_text += item.get_text()
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()