Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/converter.py: 38%

1import io

2import logging

3import re

4from typing import (

5 BinaryIO,

6 Dict,

7 Generic,

8 List,

9 Optional,

10 Sequence,

11 TextIO,

12 Tuple,

13 TypeVar,

14 Union,

15 cast,

16)

18from pdfminer import utils

19from pdfminer.image import ImageWriter

20from pdfminer.layout import (

21 LAParams,

22 LTAnno,

23 LTChar,

24 LTComponent,

25 LTContainer,

26 LTCurve,

27 LTFigure,

28 LTImage,

29 LTItem,

30 LTLayoutContainer,

31 LTLine,

32 LTPage,

33 LTRect,

34 LTText,

35 LTTextBox,

36 LTTextBoxVertical,

37 LTTextGroup,

38 LTTextLine,

39 TextGroupElement,

40)

41from pdfminer.pdfcolor import PDFColorSpace

42from pdfminer.pdfdevice import PDFTextDevice

43from pdfminer.pdfexceptions import PDFValueError

44from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined

45from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager

46from pdfminer.pdfpage import PDFPage

47from pdfminer.pdftypes import PDFStream

48from pdfminer.utils import (

49 AnyIO,

50 Matrix,

51 PathSegment,

52 Point,

53 Rect,

54 apply_matrix_pt,

55 bbox2str,

56 enc,

57 make_compat_str,

58 mult_matrix,

59)

log = logging.getLogger(__name__)


class PDFLayoutAnalyzer(PDFTextDevice):
    """Device that assembles interpreter callbacks into layout objects.

    Content of the page being processed is collected in ``cur_item``.
    Figures nest: entering one pushes the current container on ``_stack``
    and leaving it pops the container back. When a page ends, the page is
    analysed with ``laparams`` (if given) and passed to
    :meth:`receive_layout`, which subclasses override.
    """

    # Container currently receiving items (a page, or a figure inside it).
    cur_item: LTLayoutContainer
    # Current transformation matrix; it is not assigned anywhere in this class.
    ctm: Matrix

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Store the page counter and the optional layout parameters."""
        PDFTextDevice.__init__(self, rsrcmgr)
        self.pageno = pageno
        self.laparams = laparams
        # Containers suspended while a nested figure is being filled.
        self._stack: List[LTLayoutContainer] = []

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Start a new ``LTPage`` sized from the transformed media box."""
        left, bottom, right, top = page.mediabox
        corner_a = apply_matrix_pt(ctm, (left, bottom))
        corner_b = apply_matrix_pt(ctm, (right, top))
        width = abs(corner_a[0] - corner_b[0])
        height = abs(corner_a[1] - corner_b[1])
        self.cur_item = LTPage(self.pageno, (0, 0, width, height))

    def end_page(self, page: PDFPage) -> None:
        """Finish the current page, analyse it and hand it to the receiver."""
        assert not self._stack, str(len(self._stack))
        assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
        if self.laparams is not None:
            self.cur_item.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(self.cur_item)

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Suspend the current container and start collecting into a figure."""
        self._stack.append(self.cur_item)
        figure_matrix = mult_matrix(matrix, self.ctm)
        self.cur_item = LTFigure(name, bbox, figure_matrix)

    def end_figure(self, _: str) -> None:
        """Close the current figure and attach it to its parent container."""
        finished = self.cur_item
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        self.cur_item = self._stack.pop()
        self.cur_item.add(finished)

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Add an image spanning the bounding box of the enclosing figure."""
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        figure = self.cur_item
        image = LTImage(
            name,
            stream,
            (figure.x0, figure.y0, figure.x1, figure.y1),
        )
        self.cur_item.add(image)

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Paint paths described in section 4.4 of the PDF reference manual.

        The path is classified by the sequence of its operators: a single
        segment becomes an ``LTLine``, a closed axis-aligned four-sided loop
        becomes an ``LTRect`` and everything else becomes an ``LTCurve``.
        Paths containing several ``m`` operators are split and painted one
        subpath at a time.
        """
        shape = "".join(segment[0] for segment in path)

        if not shape.startswith("m"):
            # Per PDF Reference Section 4.4.1, "path construction operators may
            # be invoked in any sequence, but the first one invoked must be m
            # or re to begin a new subpath." Since pdfminer.six already
            # converts all `re` (rectangle) operators to their equivalent
            # `mlllh` representation, paths ingested by `.paint_path(...)` that
            # do not begin with the `m` operator are invalid.
            return

        if shape.count("m") > 1:
            # Recurse once per subpath when there are multiple m's.
            for match in re.finditer(r"m[^m]+", shape):
                subpath = path[match.start(0) : match.end(0)]
                self.paint_path(gstate, stroke, fill, evenodd, subpath)
            return

        # Although the 'h' command does not literally provide a
        # point-position, its position is (by definition) equal to the
        # subpath's starting point.
        #
        # And, per Section 4.4's Table 4.9, all other path commands place
        # their point-position in their final two arguments. (Any preceding
        # arguments represent control points on Bézier curves.)
        start_point = path[0][-2:]
        raw_pts = [
            cast(Point, start_point if segment[0] == "h" else segment[-2:])
            for segment in path
        ]
        pts = [apply_matrix_pt(self.ctm, raw) for raw in raw_pts]

        # Rebuild the path with every operand pair mapped through the CTM.
        transformed_path = []
        for segment in path:
            mapped = [
                apply_matrix_pt(self.ctm, (float(a), float(b)))
                for a, b in zip(segment[1::2], segment[2::2])
            ]
            transformed_path.append(cast(PathSegment, (str(segment[0]), *mapped)))

        if shape in {"mlh", "ml"}:
            # Single line segment.
            #
            # Note: 'ml' is a frequent anomaly that we want to support.
            line = LTLine(
                gstate.linewidth,
                pts[0],
                pts[1],
                stroke,
                fill,
                evenodd,
                gstate.scolor,
                gstate.ncolor,
                original_path=transformed_path,
                dashing_style=gstate.dash,
            )
            self.cur_item.add(line)
            return

        is_rectangle = False
        if shape in {"mlllh", "mllll"}:
            (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
            is_closed_loop = pts[0] == pts[4]
            vertical_first = x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
            horizontal_first = y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0
            is_rectangle = is_closed_loop and (vertical_first or horizontal_first)

        if is_rectangle:
            rect = LTRect(
                gstate.linewidth,
                (*pts[0], *pts[2]),
                stroke,
                fill,
                evenodd,
                gstate.scolor,
                gstate.ncolor,
                transformed_path,
                gstate.dash,
            )
            self.cur_item.add(rect)
            return

        curve = LTCurve(
            gstate.linewidth,
            pts,
            stroke,
            fill,
            evenodd,
            gstate.scolor,
            gstate.ncolor,
            transformed_path,
            gstate.dash,
        )
        self.cur_item.add(curve)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Add one character to the current container and return its advance.

        When the font has no Unicode mapping for ``cid`` the text comes from
        :meth:`handle_undefined_char` instead.
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        char = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            font.char_width(cid),
            font.char_disp(cid),
            ncs,
            graphicstate,
        )
        self.cur_item.add(char)
        return char.adv

    def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
        """Return the placeholder text used for a character without Unicode."""
        log.debug("undefined: %r, %r", font, cid)
        return "(cid:%d)" % cid

    def receive_layout(self, ltpage: LTPage) -> None:
        """Consume a finished page; the base implementation does nothing."""
        pass

270

271

class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that keeps the most recently finished page."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Initialise the analyzer with no page stored yet."""
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        # Last page passed to receive_layout(); None until a page has ended.
        self.result: Optional[LTPage] = None

    def receive_layout(self, ltpage: LTPage) -> None:
        """Remember ``ltpage`` as the current result."""
        self.result = ltpage

    def get_result(self) -> LTPage:
        """Return the stored page; asserts that one has been received."""
        page = self.result
        assert page is not None
        return page

288

289

# Some PDFConverter children support only binary I/O
IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)


class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
    """Base class for layout analyzers that write their result to a stream.

    Stores the output stream and codec, and records in ``outfp_binary``
    whether the stream expects bytes.
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: IOType,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Remember the output stream and detect whether it is binary."""
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp: IOType = outfp
        self.codec = codec
        self.outfp_binary = self._is_binary_stream(self.outfp)

    @staticmethod
    def _is_binary_stream(outfp: AnyIO) -> bool:
        """Test if a stream is binary or not.

        A string ``mode`` attribute decides the answer when present.
        Otherwise the stream type is inspected, and anything unrecognised is
        treated as binary.
        """
        mode = getattr(outfp, "mode", None)
        if isinstance(mode, str):
            # The stream advertises a textual mode: binary iff it contains 'b'.
            return "b" in mode
        # No mode, or a mode that is not a string: `"b" in mode` would raise
        # TypeError for such values, so decide from the stream type instead.
        if isinstance(outfp, io.BytesIO):
            return True
        if isinstance(outfp, io.TextIOBase):
            # io.StringIO is a TextIOBase subclass, so it is covered here too.
            return False
        return True

322

323

class TextConverter(PDFConverter[AnyIO]):
    """Converter that writes the plain text of each page to the stream."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        showpageno: bool = False,
        imagewriter: Optional[ImageWriter] = None,
    ) -> None:
        """Configure page-number output and optional image export."""
        super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        self.imagewriter = imagewriter

    def write_text(self, text: str) -> None:
        """Write ``text`` to the stream, as bytes when the stream is binary."""
        text = utils.compatible_encode_method(text, self.codec, "ignore")
        if not self.outfp_binary:
            cast(TextIO, self.outfp).write(text)
            return
        # NOTE(review): encode() is called without arguments, so the bytes
        # use the default encoding rather than self.codec - confirm intended.
        cast(BinaryIO, self.outfp).write(text.encode())

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the page text, followed by a form feed."""

        def emit(node: LTItem) -> None:
            # Containers are walked recursively; text leaves are written out.
            if isinstance(node, LTContainer):
                for member in node:
                    emit(member)
            elif isinstance(node, LTText):
                self.write_text(node.get_text())
            # Checked separately from the chain above: a text box gets a
            # trailing newline after its children have been written.
            if isinstance(node, LTTextBox):
                self.write_text("\n")
            elif isinstance(node, LTImage):
                if self.imagewriter is not None:
                    self.imagewriter.export_image(node)

        if self.showpageno:
            self.write_text("Page %s\n" % ltpage.pageid)
        emit(ltpage)
        self.write_text("\f")

    # Some dummy functions to save memory/CPU when all that is wanted
    # is text. This stops all the image and drawing output from being
    # recorded and taking up RAM.
    def render_image(self, name: str, stream: PDFStream) -> None:
        """Record the image only when an image writer is configured."""
        if self.imagewriter is None:
            return
        PDFConverter.render_image(self, name, stream)

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Ignore vector graphics; they carry no text."""
        pass

380

381

class HTMLConverter(PDFConverter[AnyIO]):
    """Converter that renders pages as absolutely positioned HTML.

    Pages are stacked vertically: ``_yoffset`` grows by each page's height
    plus ``pagemargin``, and vertical positions are computed from it.
    ``rect_colors`` and ``text_colors`` map element kinds to CSS colours;
    kinds missing from a map are not drawn by ``place_rect``/``place_text``.
    """

    # Colours added to the maps when ``debug`` is enabled.
    RECT_COLORS = {
        "figure": "yellow",
        "textline": "magenta",
        "textbox": "cyan",
        "textgroup": "red",
        "curve": "black",
        "page": "gray",
    }

    TEXT_COLORS = {
        "textbox": "blue",
        "char": "black",
    }

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        scale: float = 1,
        fontscale: float = 1.0,
        layoutmode: str = "normal",
        showpageno: bool = True,
        pagemargin: int = 50,
        imagewriter: Optional[ImageWriter] = None,
        debug: int = 0,
        rect_colors: Optional[Dict[str, str]] = None,
        text_colors: Optional[Dict[str, str]] = None,
    ) -> None:
        """Configure the converter and write the HTML header.

        Raises:
            PDFValueError: if a binary stream is given without a codec, or a
                text stream is given together with a codec.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        # Copy caller-supplied maps: the debug branch below updates them in
        # place, which must not leak into the caller's dictionaries.
        if text_colors is None:
            text_colors = {"char": "black"}
        else:
            text_colors = dict(text_colors)
        if rect_colors is None:
            rect_colors = {"curve": "black", "page": "gray"}
        else:
            rect_colors = dict(rect_colors)

        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.imagewriter = imagewriter
        self.rect_colors = rect_colors
        self.text_colors = text_colors
        if debug:
            self.rect_colors.update(self.RECT_COLORS)
            self.text_colors.update(self.TEXT_COLORS)
        # Vertical position of the top of the current page in the output.
        self._yoffset: float = self.pagemargin
        # (fontname, fontsize) of the currently open <span>, if any.
        self._font: Optional[Tuple[str, float]] = None
        # Fonts saved while nested <div> elements are open.
        self._fontstack: List[Optional[Tuple[str, float]]] = []
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``codec`` when one is set."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the document head, declaring the charset when known."""
        self.write("<html><head>\n")
        if self.codec:
            s = (
                '<meta http-equiv="Content-Type" content="text/html; '
                'charset=%s">\n' % self.codec
            )
        else:
            s = '<meta http-equiv="Content-Type" content="text/html">\n'
        self.write(s)
        self.write("</head><body>\n")

    def write_footer(self) -> None:
        """Write links to every page processed so far and close the document."""
        page_links = [f'<a href="#{i}">{i}</a>' for i in range(1, self.pageno)]
        s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % ", ".join(
            page_links,
        )
        self.write(s)
        self.write("</body></html>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` after escaping it with ``enc``."""
        self.write(enc(text))

    def place_rect(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Draw a bordered box if ``color`` has an entry in ``rect_colors``."""
        color2 = self.rect_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; border: %s %dpx solid; '
                'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
                % (
                    color2,
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
        """Draw a box around ``item`` using its bounding box."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

    def place_image(
        self,
        item: LTImage,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Export the image and reference it; no-op without an image writer."""
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            s = (
                '<img src="%s" border="%d" style="position:absolute; '
                'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
                % (
                    enc(name),
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_text(
        self,
        color: str,
        text: str,
        x: float,
        y: float,
        size: float,
    ) -> None:
        """Write positioned text if ``color`` has an entry in ``text_colors``."""
        color2 = self.text_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; color:%s; left:%dpx; '
                'top:%dpx; font-size:%dpx;">'
                % (
                    color2,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    size * self.scale * self.fontscale,
                )
            )
            self.write(s)
            self.write_text(text)
            self.write("</span>\n")

    def begin_div(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
        writing_mode: str = "False",
    ) -> None:
        """Open a positioned <div> and reset the current font.

        NOTE(review): unlike place_rect(), ``color`` is written verbatim
        rather than looked up in ``rect_colors``, and the default
        ``writing_mode`` is the literal string "False". Both produce CSS
        values that are not valid; the output is kept unchanged here because
        existing consumers may match on it.
        """
        self._fontstack.append(self._font)
        self._font = None
        s = (
            '<div style="position:absolute; border: %s %dpx solid; '
            "writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
            'height:%dpx;">'
            % (
                color,
                borderwidth,
                writing_mode,
                x * self.scale,
                (self._yoffset - y) * self.scale,
                w * self.scale,
                h * self.scale,
            )
        )
        self.write(s)

    def end_div(self, color: str) -> None:
        """Close any open font <span>, restore the saved font, close the <div>."""
        if self._font is not None:
            self.write("</span>")
        self._font = self._fontstack.pop()
        self.write("</div>")

    def put_text(self, text: str, fontname: str, fontsize: float) -> None:
        """Write flowing text, opening a new <span> when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write("</span>")
            # Remove subset tag from fontname, see PDF Reference 5.5.3
            fontname_without_subset_tag = fontname.split("+")[-1]
            # The font name comes from the document, so escape it before it
            # is placed inside the style attribute.
            self.write(
                '<span style="font-family: %s; font-size:%dpx">'
                % (
                    enc(fontname_without_subset_tag),
                    fontsize * self.scale * self.fontscale,
                ),
            )
            self._font = font
        self.write_text(text)

    def put_newline(self) -> None:
        """Write a line break."""
        self.write("<br>")

    def receive_layout(self, ltpage: LTPage) -> None:
        """Render one analysed page and advance the vertical offset."""

        def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
            # Outline every text group, descending into nested groups.
            if isinstance(item, LTTextGroup):
                self.place_border("textgroup", 1, item)
                for child in item:
                    show_group(child)

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                self._yoffset += item.y1
                self.place_border("page", 1, item)
                if self.showpageno:
                    self.write(
                        '<div style="position:absolute; top:%dpx;">'
                        % ((self._yoffset - item.y1) * self.scale),
                    )
                    self.write(
                        f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
                    )
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border("curve", 1, item)
            elif isinstance(item, LTFigure):
                self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
                self.end_div("figure")
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
            elif self.layoutmode == "exact":
                # Exact mode positions every text element individually.
                if isinstance(item, LTTextLine):
                    self.place_border("textline", 1, item)
                    for child in item:
                        render(child)
                elif isinstance(item, LTTextBox):
                    self.place_border("textbox", 1, item)
                    self.place_text(
                        "textbox",
                        str(item.index + 1),
                        item.x0,
                        item.y1,
                        20,
                    )
                    for child in item:
                        render(child)
                elif isinstance(item, LTChar):
                    self.place_border("char", 1, item)
                    self.place_text(
                        "char",
                        item.get_text(),
                        item.x0,
                        item.y1,
                        item.size,
                    )
            elif isinstance(item, LTTextLine):
                for child in item:
                    render(child)
                if self.layoutmode != "loose":
                    self.put_newline()
            elif isinstance(item, LTTextBox):
                self.begin_div(
                    "textbox",
                    1,
                    item.x0,
                    item.y1,
                    item.width,
                    item.height,
                    item.get_writing_mode(),
                )
                for child in item:
                    render(child)
                self.end_div("textbox")
            elif isinstance(item, LTChar):
                fontname = make_compat_str(item.fontname)
                self.put_text(item.get_text(), fontname, item.size)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())

        render(ltpage)
        self._yoffset += self.pagemargin

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()

695

696

class XMLConverter(PDFConverter[AnyIO]):
    """Converter that serialises the layout tree of each page as XML."""

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        imagewriter: Optional[ImageWriter] = None,
        stripcontrol: bool = False,
    ) -> None:
        """Configure the converter and write the XML header.

        Raises:
            PDFValueError: if a binary stream is given without a codec, or a
                text stream is given together with a codec.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        # Report each mismatch with its own message instead of blaming a
        # missing codec in both cases.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``codec`` when one is set."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the XML declaration and open the root element."""
        if self.codec:
            self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write("<pages>\n")

    def write_footer(self) -> None:
        """Close the root element."""
        self.write("</pages>\n")

    def write_text(self, text: str) -> None:
        """Write escaped text, dropping control characters if configured."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(enc(text))

    def receive_layout(self, ltpage: LTPage) -> None:
        """Serialise one analysed page."""

        def show_group(item: LTItem) -> None:
            # Emit the grouping hierarchy: boxes as leaves, groups as parents.
            if isinstance(item, LTTextBox):
                self.write(
                    '<textbox id="%d" bbox="%s" />\n'
                    % (item.index, bbox2str(item.bbox)),
                )
            elif isinstance(item, LTTextGroup):
                self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    show_group(child)
                self.write("</textgroup>\n")

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
                    item.pageid,
                    bbox2str(item.bbox),
                    item.rotate,
                )
                self.write(s)
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.write("<layout>\n")
                    for group in item.groups:
                        show_group(group)
                    self.write("</layout>\n")
                self.write("</page>\n")
            # LTLine and LTRect are tested before LTCurve on purpose.
            elif isinstance(item, LTLine):
                s = '<line linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTRect):
                s = '<rect linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTCurve):
                s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                    item.get_pts(),
                )
                self.write(s)
            elif isinstance(item, LTFigure):
                # The figure name originates from the document: escape it.
                s = '<figure name="%s" bbox="%s">\n' % (
                    enc(str(item.name)),
                    bbox2str(item.bbox),
                )
                self.write(s)
                for child in item:
                    render(child)
                self.write("</figure>\n")
            elif isinstance(item, LTTextLine):
                self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    render(child)
                self.write("</textline>\n")
            elif isinstance(item, LTTextBox):
                wmode = ""
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                s = '<textbox id="%d" bbox="%s"%s>\n' % (
                    item.index,
                    bbox2str(item.bbox),
                    wmode,
                )
                self.write(s)
                for child in item:
                    render(child)
                self.write("</textbox>\n")
            elif isinstance(item, LTChar):
                # Every attribute value derived from the document is escaped,
                # not only the font name.
                s = (
                    '<text font="%s" bbox="%s" colourspace="%s" '
                    'ncolour="%s" size="%.3f">'
                    % (
                        enc(item.fontname),
                        bbox2str(item.bbox),
                        enc(str(item.ncs.name)),
                        enc(str(item.graphicstate.ncolor)),
                        item.size,
                    )
                )
                self.write(s)
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTText):
                # Go through write_text() so this text is escaped and
                # honours ``stripcontrol`` like character text does.
                self.write("<text>")
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    name = self.imagewriter.export_image(item)
                    self.write(
                        '<image src="%s" width="%d" height="%d" />\n'
                        % (enc(name), item.width, item.height),
                    )
                else:
                    self.write(
                        '<image width="%d" height="%d" />\n'
                        % (item.width, item.height),
                    )
            else:
                assert False, str(("Unhandled", item))

        render(ltpage)

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()

857

858

class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF."""

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        stripcontrol: bool = False,
    ) -> None:
        """Configure the converter and write the hOCR header."""
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )
        self.stripcontrol = stripcontrol
        # True while characters are being accumulated into a word.
        self.within_chars = False
        # State of the word being accumulated; replaced when a word starts.
        self.working_text = ""
        self.working_font = ""
        self.working_size: float = 0
        self.working_bbox: Rect = (0, 0, 0, 0)
        # Bounding box of the page being rendered; set in receive_layout().
        self.page_bbox: Rect = (0, 0, 0, 0)
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Format ``bbox`` as an hOCR ``bbox`` property in page coordinates."""
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``codec`` when one is set."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the document head with the hOCR metadata."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec,
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en'>\n",
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        # NOTE(review): this charset is fixed to utf-8 even when ``codec``
        # names another encoding - confirm whether it should follow codec.
        self.write(
            "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
        )
        self.write(
            "<meta name='ocr-system' content='pdfminer.six HOCR Converter' />\n",
        )
        self.write(
            " <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Close the document."""
        self.write("\n")
        self.write(
            "</body></html>\n",
        )

    def write_text(self, text: str) -> None:
        """Write escaped text, dropping control characters if configured."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(enc(text))

    def _start_word(self, item: LTChar) -> None:
        """Begin accumulating a new word from the character ``item``."""
        self.within_chars = True
        self.working_text = item.get_text()
        self.working_bbox = item.bbox
        self.working_font = item.fontname
        self.working_size = item.size

    def write_word(self) -> None:
        """Write the accumulated word as an ``ocrx_word`` span and reset."""
        if len(self.working_text) > 0:
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            word = self.working_text
            if self.stripcontrol:
                word = self.CONTROL.sub("", word)
            # Text and font name come from the document: escape them so they
            # cannot break out of the element or its attributes.
            font = enc(self.working_font)
            self.write(
                "<span style='font:\"%s\"; font-size:%d; %s' "
                "class='ocrx_word' title='%s; x_font %s; "
                "x_fsize %d'>%s</span>"
                % (
                    (
                        font,
                        self.working_size,
                        bold_and_italic_styles,
                        self.bbox_repr(self.working_bbox),
                        font,
                        self.working_size,
                        enc(word.strip()),
                    )
                ),
            )
        self.within_chars = False

    def receive_layout(self, ltpage: LTPage) -> None:
        """Render one analysed page as hOCR."""

        def render(item: LTItem) -> None:
            # An annotation (inserted space or newline) ends the current word.
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    "<div class='ocr_page' id='%s' title='%s'>\n"
                    % (item.pageid, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                # Flush a word still pending when the page runs out of items,
                # so it is neither lost nor carried into the next page.
                if self.within_chars:
                    self.write_word()
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    "<span class='ocr_line' title='%s'>" % (self.bbox_repr(item.bbox)),
                )
                for child_line in item:
                    render(child_line)
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    "<div class='ocr_block' id='%d' title='%s'>\n"
                    % (item.index, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                if not self.within_chars:
                    self._start_word(item)
                elif len(item.get_text().strip()) == 0:
                    # Whitespace terminates the word and is passed through.
                    self.write_word()
                    self.write_text(item.get_text())
                elif (
                    self.working_bbox[1] != item.bbox[1]
                    or self.working_font != item.fontname
                    or self.working_size != item.size
                ):
                    # Baseline, font or size changed: emit the finished word
                    # and start a new one with this character. write_word()
                    # clears ``within_chars``, so without restarting here the
                    # character would be overwritten by the next one.
                    self.write_word()
                    self._start_word(item)
                else:
                    # Same run: append and stretch the box to the right edge.
                    self.working_text += item.get_text()
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()