Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/converter.py: 30%

1import io

2import logging

3import re

4from typing import (

5 BinaryIO,

6 Dict,

7 Generic,

8 List,

9 Optional,

10 Sequence,

11 TextIO,

12 Tuple,

13 TypeVar,

14 Union,

15 cast,

16)

18from pdfminer import utils

19from pdfminer.image import ImageWriter

20from pdfminer.layout import (

21 LAParams,

22 LTAnno,

23 LTChar,

24 LTComponent,

25 LTContainer,

26 LTCurve,

27 LTFigure,

28 LTImage,

29 LTItem,

30 LTLayoutContainer,

31 LTLine,

32 LTPage,

33 LTRect,

34 LTText,

35 LTTextBox,

36 LTTextBoxVertical,

37 LTTextGroup,

38 LTTextLine,

39 TextGroupElement,

40)

41from pdfminer.pdfcolor import PDFColorSpace

42from pdfminer.pdfdevice import PDFTextDevice

43from pdfminer.pdfexceptions import PDFValueError

44from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined

45from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager

46from pdfminer.pdfpage import PDFPage

47from pdfminer.pdftypes import PDFStream

48from pdfminer.utils import (

49 AnyIO,

50 Matrix,

51 PathSegment,

52 Point,

53 Rect,

54 apply_matrix_pt,

55 apply_matrix_rect,

56 bbox2str,

57 enc,

58 make_compat_str,

59 mult_matrix,

60)

# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)

class PDFLayoutAnalyzer(PDFTextDevice):
    """Device that collects rendered page content into a layout tree.

    Characters, images, figures and paths are added to ``cur_item``. When a
    page ends it is optionally analyzed with ``laparams`` and then handed to
    :meth:`receive_layout`, which does nothing in this class.
    """

    # Container that currently receives new layout items (a page or a figure).
    cur_item: LTLayoutContainer
    # Current transformation matrix; read by this class but not assigned here.
    ctm: Matrix

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Set up the analyzer.

        :param rsrcmgr: resource manager forwarded to ``PDFTextDevice``.
        :param pageno: page number used for the next page that is begun.
        :param laparams: layout parameters; when ``None`` the page is not
            analyzed in :meth:`end_page`.
        """
        PDFTextDevice.__init__(self, rsrcmgr)
        self.pageno = pageno
        self.laparams = laparams
        # Containers set aside while a nested figure is being filled.
        self._stack: List[LTLayoutContainer] = []

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Start a new ``LTPage`` sized from the transformed media box."""
        (x0, y0, x1, y1) = apply_matrix_rect(ctm, page.mediabox)
        # The page box is anchored at the origin; only width/height are kept.
        mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
        self.cur_item = LTPage(self.pageno, mediabox)

    def end_page(self, page: PDFPage) -> None:
        """Finish the current page and pass it to :meth:`receive_layout`."""
        # Every figure must have been closed before the page ends.
        assert not self._stack, str(len(self._stack))
        assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
        if self.laparams is not None:
            self.cur_item.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(self.cur_item)

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Open a figure; items added afterwards go into the new ``LTFigure``."""
        self._stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))

    def end_figure(self, _: str) -> None:
        """Close the current figure and add it to the container it was opened in."""
        fig = self.cur_item
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        self.cur_item = self._stack.pop()
        self.cur_item.add(fig)

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Add an ``LTImage`` to the current figure, using the figure's bounds."""
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        item = LTImage(
            name,
            stream,
            (self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1),
        )
        self.cur_item.add(item)

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Paint paths described in section 4.4 of the PDF reference manual

        The path is classified by the string of its operators (``shape``):
        paths not starting with ``m`` are ignored, paths with several ``m``
        operators are split and painted one sub-path at a time, and a single
        sub-path is added to ``cur_item`` as an ``LTLine``, ``LTRect`` or
        ``LTCurve``.
        """
        # One character per segment: the segment's operator.
        shape = "".join(x[0] for x in path)

        if shape[:1] != "m":
            # Per PDF Reference Section 4.4.1, "path construction operators may
            # be invoked in any sequence, but the first one invoked must be m
            # or re to begin a new subpath." Since pdfminer.six already
            # converts all `re` (rectangle) operators to their equivalent
            # `mlllh` representation, paths ingested by `.paint_path(...)` that
            # do not begin with the `m` operator are invalid.
            pass

        elif shape.count("m") > 1:
            # recurse if there are multiple m's in this shape
            # (indices into `shape` are also indices into `path`, because
            # `shape` holds exactly one character per segment)
            for m in re.finditer(r"m[^m]+", shape):
                subpath = path[m.start(0) : m.end(0)]
                self.paint_path(gstate, stroke, fill, evenodd, subpath)

        else:
            # Although the 'h' command does not literally provide a
            # point-position, its position is (by definition) equal to the
            # subpath's starting point.
            #
            # And, per Section 4.4's Table 4.9, all other path commands place
            # their point-position in their final two arguments. (Any preceding
            # arguments represent control points on Bézier curves.)
            raw_pts = [
                cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path
            ]
            pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]

            # Rebuild the full path (operator plus every operand pair) with
            # all points transformed by the current matrix.
            operators = [str(operation[0]) for operation in path]
            transformed_points = [
                [
                    apply_matrix_pt(self.ctm, (float(operand1), float(operand2)))
                    for operand1, operand2 in zip(operation[1::2], operation[2::2])
                ]
                for operation in path
            ]
            transformed_path = [
                cast(PathSegment, (o, *p))
                for o, p in zip(operators, transformed_points)
            ]

            # Drop a redundant "l" on a path closed with "h"
            if len(shape) > 3 and shape[-2:] == "lh" and pts[-2] == pts[0]:
                shape = shape[:-2] + "h"
                pts.pop()

            if shape in {"mlh", "ml"}:
                # single line segment
                #
                # Note: 'ml', in conditional above, is a frequent anomaly
                # that we want to support.
                line = LTLine(
                    gstate.linewidth,
                    pts[0],
                    pts[1],
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    original_path=transformed_path,
                    dashing_style=gstate.dash,
                )
                self.cur_item.add(line)

            elif shape in {"mlllh", "mllll"}:
                (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts

                # A rectangle needs the last point back on the first one and
                # every edge parallel to an axis.
                is_closed_loop = pts[0] == pts[4]
                has_square_coordinates = (
                    x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
                ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
                if is_closed_loop and has_square_coordinates:
                    rect = LTRect(
                        gstate.linewidth,
                        (*pts[0], *pts[2]),
                        stroke,
                        fill,
                        evenodd,
                        gstate.scolor,
                        gstate.ncolor,
                        transformed_path,
                        gstate.dash,
                    )
                    self.cur_item.add(rect)
                else:
                    curve = LTCurve(
                        gstate.linewidth,
                        pts,
                        stroke,
                        fill,
                        evenodd,
                        gstate.scolor,
                        gstate.ncolor,
                        transformed_path,
                        gstate.dash,
                    )
                    self.cur_item.add(curve)
            else:
                curve = LTCurve(
                    gstate.linewidth,
                    pts,
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    transformed_path,
                    gstate.dash,
                )
                self.cur_item.add(curve)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Add one character to the current container and return its advance.

        When the font raises ``PDFUnicodeNotDefined`` for ``cid``, the text
        comes from :meth:`handle_undefined_char` instead.
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)
        item = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
        )
        self.cur_item.add(item)
        return item.adv

    def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
        """Log the undefined character and return a ``(cid:N)`` placeholder."""
        log.debug("undefined: %r, %r", font, cid)
        return "(cid:%d)" % cid

    def receive_layout(self, ltpage: LTPage) -> None:
        """Hook called from :meth:`end_page` with the finished page; no-op here."""
        pass

274

275

class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that keeps the most recently finished page."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Forward all arguments to the base analyzer and clear the result."""
        PDFLayoutAnalyzer.__init__(
            self,
            rsrcmgr,
            pageno=pageno,
            laparams=laparams,
        )
        # Filled in by receive_layout(); None until a page has been finished.
        self.result: Optional[LTPage] = None

    def receive_layout(self, ltpage: LTPage) -> None:
        """Store the finished page, replacing any earlier one."""
        self.result = ltpage

    def get_result(self) -> LTPage:
        """Return the stored page; asserts that one has been received."""
        page = self.result
        assert page is not None
        return page

292

293

# Type variable for the output stream held by a PDFConverter.
# Some PDFConverter children support only binary I/O, so the stream type is
# constrained to TextIO, BinaryIO or AnyIO.
IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)

296

297

class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
    """Base class for layout analyzers that write their result to a stream."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: IOType,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Store the output stream and codec and detect the stream kind.

        :param rsrcmgr: resource manager forwarded to the base analyzer.
        :param outfp: stream that converter output is written to.
        :param codec: codec name kept in ``self.codec`` for subclasses.
        :param pageno: page number used for the next page that is begun.
        :param laparams: layout parameters forwarded to the base analyzer.
        """
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp: IOType = outfp
        self.codec = codec
        self.outfp_binary = self._is_binary_stream(self.outfp)

    @staticmethod
    def _is_binary_stream(outfp: AnyIO) -> bool:
        """Test if a stream is binary or not.

        A string ``mode`` attribute decides the answer when present. Without
        one, ``io.BytesIO`` is binary, any ``io.TextIOBase`` (which includes
        ``io.StringIO``) is text, and everything else is assumed binary.

        :param outfp: the stream to inspect.
        :return: ``True`` for a binary stream, ``False`` for a text stream.
        """
        mode = getattr(outfp, "mode", None)
        # Only trust ``mode`` when it is a string: some file objects expose a
        # non-string mode, and a substring test on that raises TypeError.
        if isinstance(mode, str):
            return "b" in mode
        if isinstance(outfp, io.BytesIO):
            return True
        if isinstance(outfp, io.TextIOBase):
            return False

        return True

326

327

class TextConverter(PDFConverter[AnyIO]):
    """Converter that writes the plain text of each page to the output stream."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        showpageno: bool = False,
        imagewriter: Optional[ImageWriter] = None,
    ) -> None:
        """Set up the converter.

        :param rsrcmgr: resource manager forwarded to ``PDFConverter``.
        :param outfp: text or binary stream that receives the text.
        :param codec: codec used to encode text for a binary stream.
        :param pageno: page number used for the next page that is begun.
        :param laparams: layout parameters forwarded to ``PDFConverter``.
        :param showpageno: when true, a ``Page N`` line precedes each page.
        :param imagewriter: when given, images are exported through it.
        """
        super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        self.imagewriter = imagewriter

    def write_text(self, text: str) -> None:
        """Write ``text`` to the output stream.

        For a binary stream the text is encoded with ``self.codec`` (UTF-8
        when no codec is set); characters the codec cannot represent are
        dropped. For a text stream the string is written as is.
        """
        text = utils.compatible_encode_method(text, self.codec, "ignore")
        if self.outfp_binary:
            # Encode with the configured codec; a bare ``encode()`` would
            # always produce UTF-8 and silently ignore ``codec``.
            encoded = text.encode(self.codec or "utf-8", "ignore")
            cast(BinaryIO, self.outfp).write(encoded)
        else:
            cast(TextIO, self.outfp).write(text)

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the text of a finished page followed by a form feed."""

        def render(item: LTItem) -> None:
            if isinstance(item, LTContainer):
                for child in item:
                    render(child)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())
            # Not part of the chain above: a text box is also a container,
            # and gets a trailing newline after its children are written.
            if isinstance(item, LTTextBox):
                self.write_text("\n")
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    self.imagewriter.export_image(item)

        if self.showpageno:
            self.write_text("Page %s\n" % ltpage.pageid)
        render(ltpage)
        self.write_text("\f")

    # Some dummy functions to save memory/CPU when all that is wanted
    # is text. This stops all the image and drawing output from being
    # recorded and taking up RAM.
    def render_image(self, name: str, stream: PDFStream) -> None:
        """Record the image only when an image writer is configured."""
        if self.imagewriter is not None:
            PDFConverter.render_image(self, name, stream)

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Ignore paths; they are not needed for text output."""
        pass

384

385

class HTMLConverter(PDFConverter[AnyIO]):
    """Converter that writes pages as absolutely positioned HTML."""

    # Border colors merged into ``rect_colors`` when ``debug`` is set.
    RECT_COLORS = {
        "figure": "yellow",
        "textline": "magenta",
        "textbox": "cyan",
        "textgroup": "red",
        "curve": "black",
        "page": "gray",
    }

    # Text colors merged into ``text_colors`` when ``debug`` is set.
    TEXT_COLORS = {
        "textbox": "blue",
        "char": "black",
    }

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        scale: float = 1,
        fontscale: float = 1.0,
        layoutmode: str = "normal",
        showpageno: bool = True,
        pagemargin: int = 50,
        imagewriter: Optional[ImageWriter] = None,
        debug: int = 0,
        rect_colors: Optional[Dict[str, str]] = None,
        text_colors: Optional[Dict[str, str]] = None,
    ) -> None:
        """Set up the converter and write the HTML header.

        :param rsrcmgr: resource manager forwarded to ``PDFConverter``.
        :param outfp: output stream; binary when ``codec`` is set, text
            otherwise.
        :param codec: codec used to encode output for a binary stream.
        :param pageno: page number used for the next page that is begun.
        :param laparams: layout parameters forwarded to ``PDFConverter``.
        :param scale: factor applied to every position and size written.
        :param fontscale: additional factor applied to font sizes.
        :param layoutmode: ``"exact"`` and ``"loose"`` are treated specially
            in :meth:`receive_layout`.
        :param showpageno: when true, a page anchor is written per page.
        :param pagemargin: vertical gap kept before and between pages.
        :param imagewriter: when given, images are exported and linked.
        :param debug: when true, the class-level color tables are merged in.
        :param rect_colors: border color per item kind; not modified.
        :param text_colors: text color per item kind; not modified.
        :raises PDFValueError: if the codec does not match the stream kind.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        # Work on copies so the debug merge below never alters a dictionary
        # owned by the caller.
        if text_colors is None:
            text_colors = {"char": "black"}
        else:
            text_colors = dict(text_colors)
        if rect_colors is None:
            rect_colors = {"curve": "black", "page": "gray"}
        else:
            rect_colors = dict(rect_colors)

        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.imagewriter = imagewriter
        self.rect_colors = rect_colors
        self.text_colors = text_colors
        if debug:
            self.rect_colors.update(self.RECT_COLORS)
            self.text_colors.update(self.TEXT_COLORS)
        # Running vertical offset of the current page within the document.
        self._yoffset: float = self.pagemargin
        # (fontname, fontsize) of the currently open font span, if any.
        self._font: Optional[Tuple[str, float]] = None
        self._fontstack: List[Optional[Tuple[str, float]]] = []
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoding it when a codec is set."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening HTML, including a Content-Type meta tag."""
        self.write("<html><head>\n")
        if self.codec:
            s = (
                '<meta http-equiv="Content-Type" content="text/html; '
                'charset=%s">\n' % self.codec
            )
        else:
            s = '<meta http-equiv="Content-Type" content="text/html">\n'
        self.write(s)
        self.write("</head><body>\n")

    def write_footer(self) -> None:
        """Write the page index links and close the document."""
        page_links = [f'<a href="#{i}">{i}</a>' for i in range(1, self.pageno)]
        s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % ", ".join(
            page_links,
        )
        self.write(s)
        self.write("</body></html>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` after passing it through ``enc``."""
        self.write(enc(text))

    def place_rect(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Write a bordered span, if ``color`` has an entry in ``rect_colors``."""
        color2 = self.rect_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; border: %s %dpx solid; '
                'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
                % (
                    color2,
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
        """Write a border around the bounding box of ``item``."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

    def place_image(
        self,
        item: LTImage,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Export the image and write an ``<img>`` tag; no-op without a writer."""
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            s = (
                '<img src="%s" border="%d" style="position:absolute; '
                'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
                % (
                    enc(name),
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_text(
        self,
        color: str,
        text: str,
        x: float,
        y: float,
        size: float,
    ) -> None:
        """Write positioned text, if ``color`` has an entry in ``text_colors``."""
        color2 = self.text_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; color:%s; left:%dpx; '
                'top:%dpx; font-size:%dpx;">'
                % (
                    color2,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    size * self.scale * self.fontscale,
                )
            )
            self.write(s)
            self.write_text(text)
            self.write("</span>\n")

    def begin_div(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
        writing_mode: str = "False",
    ) -> None:
        """Open a positioned ``<div>`` and start a fresh font context."""
        # Remember the enclosing font so end_div() can restore it.
        self._fontstack.append(self._font)
        self._font = None
        s = (
            '<div style="position:absolute; border: %s %dpx solid; '
            "writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
            'height:%dpx;">'
            % (
                color,
                borderwidth,
                writing_mode,
                x * self.scale,
                (self._yoffset - y) * self.scale,
                w * self.scale,
                h * self.scale,
            )
        )
        self.write(s)

    def end_div(self, color: str) -> None:
        """Close any open font span, restore the outer font, close the div."""
        if self._font is not None:
            self.write("</span>")
        self._font = self._fontstack.pop()
        self.write("</div>")

    def put_text(self, text: str, fontname: str, fontsize: float) -> None:
        """Write ``text``, opening a new font span when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write("</span>")
            # Remove subset tag from fontname, see PDF Reference 5.5.3
            fontname_without_subset_tag = fontname.split("+")[-1]
            # The font name comes from the document, so it is passed through
            # enc() before being placed inside the style attribute.
            self.write(
                '<span style="font-family: %s; font-size:%dpx">'
                % (
                    enc(fontname_without_subset_tag),
                    fontsize * self.scale * self.fontscale,
                ),
            )
            self._font = font
        self.write_text(text)

    def put_newline(self) -> None:
        """Write a line break."""
        self.write("<br>")

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one finished page and advance the vertical offset."""

        def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
            if isinstance(item, LTTextGroup):
                self.place_border("textgroup", 1, item)
                for child in item:
                    show_group(child)

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                self._yoffset += item.y1
                self.place_border("page", 1, item)
                if self.showpageno:
                    self.write(
                        '<div style="position:absolute; top:%dpx;">'
                        % ((self._yoffset - item.y1) * self.scale),
                    )
                    self.write(
                        f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
                    )
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border("curve", 1, item)
            elif isinstance(item, LTFigure):
                self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
                self.end_div("figure")
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
            elif self.layoutmode == "exact":
                # Exact mode: every text item is positioned individually.
                if isinstance(item, LTTextLine):
                    self.place_border("textline", 1, item)
                    for child in item:
                        render(child)
                elif isinstance(item, LTTextBox):
                    self.place_border("textbox", 1, item)
                    self.place_text(
                        "textbox",
                        str(item.index + 1),
                        item.x0,
                        item.y1,
                        20,
                    )
                    for child in item:
                        render(child)
                elif isinstance(item, LTChar):
                    self.place_border("char", 1, item)
                    self.place_text(
                        "char",
                        item.get_text(),
                        item.x0,
                        item.y1,
                        item.size,
                    )
            elif isinstance(item, LTTextLine):
                for child in item:
                    render(child)
                if self.layoutmode != "loose":
                    self.put_newline()
            elif isinstance(item, LTTextBox):
                self.begin_div(
                    "textbox",
                    1,
                    item.x0,
                    item.y1,
                    item.width,
                    item.height,
                    item.get_writing_mode(),
                )
                for child in item:
                    render(child)
                self.end_div("textbox")
            elif isinstance(item, LTChar):
                fontname = make_compat_str(item.fontname)
                self.put_text(item.get_text(), fontname, item.size)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())

        render(ltpage)
        self._yoffset += self.pagemargin

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()

699

700

class XMLConverter(PDFConverter[AnyIO]):
    """Converter that writes the layout tree of each page as XML."""

    # Characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        imagewriter: Optional[ImageWriter] = None,
        stripcontrol: bool = False,
    ) -> None:
        """Set up the converter and write the XML header.

        :param rsrcmgr: resource manager forwarded to ``PDFConverter``.
        :param outfp: output stream; binary when ``codec`` is set, text
            otherwise.
        :param codec: codec used to encode output for a binary stream.
        :param pageno: page number used for the next page that is begun.
        :param laparams: layout parameters forwarded to ``PDFConverter``.
        :param imagewriter: when given, images are exported and referenced.
        :param stripcontrol: when true, characters matching ``CONTROL`` are
            removed from text.
        :raises PDFValueError: if the codec does not match the stream kind.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        # Each mismatch gets its own message so the error names the actual
        # problem.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoding it when a codec is set."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the XML declaration and open the ``<pages>`` element."""
        if self.codec:
            self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write("<pages>\n")

    def write_footer(self) -> None:
        """Close the ``<pages>`` element."""
        self.write("</pages>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` through ``enc``, stripping control characters first
        when ``stripcontrol`` is set."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(enc(text))

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one finished page as a ``<page>`` element.

        An item of a type not handled by ``render`` triggers an assertion
        failure.
        """

        def show_group(item: LTItem) -> None:
            if isinstance(item, LTTextBox):
                self.write(
                    '<textbox id="%d" bbox="%s" />\n'
                    % (item.index, bbox2str(item.bbox)),
                )
            elif isinstance(item, LTTextGroup):
                self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    show_group(child)
                self.write("</textgroup>\n")

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
                    item.pageid,
                    bbox2str(item.bbox),
                    item.rotate,
                )
                self.write(s)
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.write("<layout>\n")
                    for group in item.groups:
                        show_group(group)
                    self.write("</layout>\n")
                self.write("</page>\n")
            elif isinstance(item, LTLine):
                s = '<line linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTRect):
                s = '<rect linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTCurve):
                s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                    item.get_pts(),
                )
                self.write(s)
            elif isinstance(item, LTFigure):
                # The figure name comes from the document; pass it through
                # enc() before placing it in an attribute.
                s = f'<figure name="{enc(item.name)}" bbox="{bbox2str(item.bbox)}">\n'
                self.write(s)
                for child in item:
                    render(child)
                self.write("</figure>\n")
            elif isinstance(item, LTTextLine):
                self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    render(child)
                self.write("</textline>\n")
            elif isinstance(item, LTTextBox):
                wmode = ""
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                s = '<textbox id="%d" bbox="%s"%s>\n' % (
                    item.index,
                    bbox2str(item.bbox),
                    wmode,
                )
                self.write(s)
                for child in item:
                    render(child)
                self.write("</textbox>\n")
            elif isinstance(item, LTChar):
                s = (
                    '<text font="%s" bbox="%s" colourspace="%s" '
                    'ncolour="%s" size="%.3f">'
                    % (
                        enc(item.fontname),
                        bbox2str(item.bbox),
                        item.ncs.name,
                        item.graphicstate.ncolor,
                        item.size,
                    )
                )
                self.write(s)
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTText):
                # Go through write_text() like the LTChar branch, so the
                # content is passed through enc() rather than written raw.
                self.write("<text>")
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    name = self.imagewriter.export_image(item)
                    self.write(
                        '<image src="%s" width="%d" height="%d" />\n'
                        % (enc(name), item.width, item.height),
                    )
                else:
                    self.write(
                        '<image width="%d" height="%d" />\n'
                        % (item.width, item.height),
                    )
            else:
                assert False, str(("Unhandled", item))

        render(ltpage)

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()

861

862

class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF."""

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        stripcontrol: bool = False,
    ) -> None:
        """Set up the converter and write the hOCR header.

        :param rsrcmgr: resource manager forwarded to ``PDFConverter``.
        :param outfp: stream that receives the hOCR output.
        :param codec: codec used by :meth:`write`; when empty, text is
            written unencoded.
        :param pageno: page number used for the next page that is begun.
        :param laparams: layout parameters forwarded to ``PDFConverter``.
        :param stripcontrol: when true, :meth:`write_text` removes characters
            matching ``CONTROL``.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )
        self.stripcontrol = stripcontrol
        # True while characters of a not-yet-written word are being collected.
        self.within_chars = False
        # State of the word being collected and of the page being written.
        # Set here so the attributes exist before the first page arrives;
        # receive_layout() overwrites them as items are rendered.
        self.working_text: str = ""
        self.working_bbox: Rect = (0, 0, 0, 0)
        self.working_font: str = ""
        self.working_size: float = 0
        self.page_bbox: Rect = (0, 0, 0, 0)
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Format ``bbox`` as an hOCR ``bbox x0 y0 x1 y1`` string of integers."""
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write raw markup, encoding it when a codec is set."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening ``<html>``, the ``<head>`` block and ``<body>``."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec,
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en'>\n",
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        self.write(
            "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
        )
        self.write(
            "<meta name='ocr-system' content='pdfminer.six HOCR Converter' />\n",
        )
        self.write(
            " <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Close the ``<body>`` and ``<html>`` elements."""
        self.write("\n")
        self.write(
            "</body></html>\n",
        )

    def write_text(self, text: str) -> None:
        """Write ``text``, stripping control characters first when
        ``stripcontrol`` is set."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Write the collected word as an ``ocrx_word`` span and end collection.

        Nothing is written when no text has been collected. The font name
        and the word text come from the document, so both are passed through
        ``enc`` before being placed in the markup.
        """
        if len(self.working_text) > 0:
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            # The style flags above are derived from the raw font name; only
            # the values written into the markup are escaped.
            escaped_font = enc(self.working_font)
            self.write(
                "<span style='font:\"%s\"; font-size:%d; %s' "
                "class='ocrx_word' title='%s; x_font %s; "
                "x_fsize %d'>%s</span>"
                % (
                    (
                        escaped_font,
                        self.working_size,
                        bold_and_italic_styles,
                        self.bbox_repr(self.working_bbox),
                        escaped_font,
                        self.working_size,
                        enc(self.working_text.strip()),
                    )
                ),
            )
        self.within_chars = False

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one finished page as an ``ocr_page`` div."""

        def render(item: LTItem) -> None:
            # An annotation item ends the word that is being collected.
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    "<div class='ocr_page' id='%s' title='%s'>\n"
                    % (item.pageid, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    "<span class='ocr_line' title='%s'>" % (self.bbox_repr(item.bbox)),
                )
                for child_line in item:
                    render(child_line)
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    "<div class='ocr_block' id='%d' title='%s'>\n"
                    % (item.index, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                if not self.within_chars:
                    # First character of a new word.
                    self.within_chars = True
                    self.working_text = item.get_text()
                    self.working_bbox = item.bbox
                    self.working_font = item.fontname
                    self.working_size = item.size
                elif len(item.get_text().strip()) == 0:
                    # A whitespace character ends the word and is written
                    # as is.
                    self.write_word()
                    self.write(item.get_text())
                else:
                    # A change of baseline, font or size flushes the word
                    # collected so far before this character is appended.
                    if (
                        self.working_bbox[1] != item.bbox[1]
                        or self.working_font != item.fontname
                        or self.working_size != item.size
                    ):
                        self.write_word()
                        self.working_bbox = item.bbox
                        self.working_font = item.fontname
                        self.working_size = item.size
                    self.working_text += item.get_text()
                    # Extend the word box to this character's right edge.
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()