Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/converter.py: 30%

1import io

2import logging

3import re

4from collections.abc import Sequence

5from typing import (

6 BinaryIO,

7 ClassVar,

8 Generic,

9 TextIO,

10 TypeVar,

11 cast,

12)

14from pdfminer import utils

15from pdfminer.image import ImageWriter

16from pdfminer.layout import (

17 LAParams,

18 LTAnno,

19 LTChar,

20 LTComponent,

21 LTContainer,

22 LTCurve,

23 LTFigure,

24 LTImage,

25 LTItem,

26 LTLayoutContainer,

27 LTLine,

28 LTPage,

29 LTRect,

30 LTText,

31 LTTextBox,

32 LTTextBoxVertical,

33 LTTextGroup,

34 LTTextLine,

35 TextGroupElement,

36)

37from pdfminer.pdfcolor import PDFColorSpace

38from pdfminer.pdfdevice import PDFTextDevice

39from pdfminer.pdfexceptions import PDFValueError

40from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined

41from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager

42from pdfminer.pdfpage import PDFPage

43from pdfminer.pdftypes import PDFStream

44from pdfminer.utils import (

45 AnyIO,

46 Matrix,

47 PathSegment,

48 Point,

49 Rect,

50 apply_matrix_pt,

51 apply_matrix_rect,

52 bbox2str,

53 enc,

54 make_compat_str,

55 mult_matrix,

56)

# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)

class PDFLayoutAnalyzer(PDFTextDevice):
    """Text device that assembles rendered content into layout objects.

    Characters, images and painted paths are added to ``cur_item``. When a
    page ends, the finished ``LTPage`` is (optionally) analysed with
    ``laparams`` and passed to :meth:`receive_layout`, which subclasses
    override to consume it.
    """

    # Container currently receiving items: the page, or the innermost figure.
    cur_item: LTLayoutContainer
    # Current transformation matrix. It is read by this class but not
    # assigned anywhere in it.
    ctm: Matrix

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: LAParams | None = None,
    ) -> None:
        """Remember the first page number and the optional layout parameters."""
        PDFTextDevice.__init__(self, rsrcmgr)
        # Enclosing containers are parked here while a figure is open.
        self._stack: list[LTLayoutContainer] = []
        self.laparams = laparams
        self.pageno = pageno

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Start a new ``LTPage`` sized from the transformed media box."""
        left, bottom, right, top = apply_matrix_rect(ctm, page.mediabox)
        # The page box is anchored at the origin; only its extent is kept.
        extent = (0, 0, abs(left - right), abs(bottom - top))
        self.cur_item = LTPage(self.pageno, extent)

    def end_page(self, page: PDFPage) -> None:
        """Finish the current page and pass it to :meth:`receive_layout`."""
        assert not self._stack, str(len(self._stack))
        finished = self.cur_item
        assert isinstance(finished, LTPage), str(type(finished))
        params = self.laparams
        if params is not None:
            finished.analyze(params)
        self.pageno += 1
        self.receive_layout(finished)

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Open a nested ``LTFigure`` and make it the current container."""
        parent = self.cur_item
        self._stack.append(parent)
        figure_matrix = mult_matrix(matrix, self.ctm)
        self.cur_item = LTFigure(name, bbox, figure_matrix)

    def end_figure(self, _: str) -> None:
        """Close the current figure and attach it to its parent container."""
        figure = self.cur_item
        assert isinstance(figure, LTFigure), str(type(figure))
        parent = self._stack.pop()
        self.cur_item = parent
        parent.add(figure)

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Add an ``LTImage`` covering the bounding box of the current figure."""
        figure = self.cur_item
        assert isinstance(figure, LTFigure), str(type(figure))
        bounds = (figure.x0, figure.y0, figure.x1, figure.y1)
        figure.add(LTImage(name, stream, bounds))

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Paint paths described in section 4.4 of the PDF reference manual"""
        ops = "".join(segment[0] for segment in path)

        # Per PDF Reference Section 4.4.1, "path construction operators may
        # be invoked in any sequence, but the first one invoked must be m
        # or re to begin a new subpath." Since pdfminer.six already
        # converts all `re` (rectangle) operators to their equivalent
        # `mlllh` representation, paths ingested by `.paint_path(...)` that
        # do not begin with the `m` operator are invalid and are skipped.
        if not ops.startswith("m"):
            return

        # Several subpaths: paint each `m...` run on its own.
        if ops.count("m") > 1:
            for found in re.finditer(r"m[^m]+", ops):
                begin, stop = found.span()
                self.paint_path(gstate, stroke, fill, evenodd, path[begin:stop])
            return

        # Although the 'h' command does not literally provide a
        # point-position, its position is (by definition) equal to the
        # subpath's starting point.
        #
        # And, per Section 4.4's Table 4.9, all other path commands place
        # their point-position in their final two arguments. (Any preceding
        # arguments represent control points on Bézier curves.)
        origin = path[0][-2:]
        raw_pts = [
            cast(Point, origin if segment[0] == "h" else segment[-2:])
            for segment in path
        ]
        pts = [apply_matrix_pt(self.ctm, raw) for raw in raw_pts]

        # The same path with every operand pair mapped through the CTM.
        transformed_path = []
        for segment in path:
            moved = [
                apply_matrix_pt(self.ctm, (float(first), float(second)))
                for first, second in zip(segment[1::2], segment[2::2], strict=False)
            ]
            transformed_path.append(cast(PathSegment, (str(segment[0]), *moved)))

        # Drop a redundant "l" on a path closed with "h"
        if len(ops) > 3 and ops.endswith("lh") and pts[-2] == pts[0]:
            ops = f"{ops[:-2]}h"
            del pts[-1]

        if ops in {"mlh", "ml"}:
            # Single line segment. 'ml' (never closed) is a frequent anomaly
            # that is deliberately supported.
            item = LTLine(
                gstate.linewidth,
                pts[0],
                pts[1],
                stroke,
                fill,
                evenodd,
                gstate.scolor,
                gstate.ncolor,
                original_path=transformed_path,
                dashing_style=gstate.dash,
            )
        else:
            is_rect = False
            if ops in {"mlllh", "mllll"}:
                (x0, y0), (x1, y1), (x2, y2), (x3, y3), last = pts
                # Either traversal direction of an axis-aligned box.
                axis_aligned = (
                    x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
                ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
                is_rect = pts[0] == last and axis_aligned

            if is_rect:
                # Opposite corners (first and third point) define the box.
                item = LTRect(
                    gstate.linewidth,
                    (x0, y0, x2, y2),
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    transformed_path,
                    gstate.dash,
                )
            else:
                # Everything else, including non-rectangular four-sided
                # shapes, is kept as a generic curve.
                item = LTCurve(
                    gstate.linewidth,
                    pts,
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    transformed_path,
                    gstate.dash,
                )

        self.cur_item.add(item)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Add one ``LTChar`` to the current container and return its ``adv``."""
        try:
            glyph_text = font.to_unichr(cid)
        except PDFUnicodeNotDefined:
            # No Unicode mapping for this CID: use the placeholder text.
            glyph_text = self.handle_undefined_char(font, cid)
        else:
            assert isinstance(glyph_text, str), str(type(glyph_text))
        width = font.char_width(cid)
        displacement = font.char_disp(cid)
        char_item = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            glyph_text,
            width,
            displacement,
            ncs,
            graphicstate,
        )
        self.cur_item.add(char_item)
        return char_item.adv

    def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
        """Log the unmapped CID and return the ``(cid:N)`` placeholder text."""
        log.debug(f"undefined: {font!r}, {cid!r}")
        return f"(cid:{cid})"

    def receive_layout(self, ltpage: LTPage) -> None:
        """Hook called by :meth:`end_page`; the base implementation does nothing."""
        return None

272

273

class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that keeps the most recently finished page.

    Each call to :meth:`receive_layout` replaces ``result``; read it back
    with :meth:`get_result`.
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: LAParams | None = None,
    ) -> None:
        """Initialise the analyzer with no page stored yet."""
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        # Stays None until the first page has been received.
        self.result: LTPage | None = None

    def receive_layout(self, ltpage: LTPage) -> None:
        """Store ``ltpage`` as the latest result."""
        self.result = ltpage

    def get_result(self) -> LTPage:
        """Return the stored page; asserts that one has been received."""
        page = self.result
        assert page is not None
        return page

290

291

# Type variable for the output stream handed to a PDFConverter, constrained
# to TextIO, BinaryIO or AnyIO.
# Some PDFConverter children support only binary I/O
IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)

294

295

class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
    """Base class for converters that write analysed pages to a stream.

    Stores the output stream, the codec name, and whether the stream was
    detected as binary (``outfp_binary``).
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: IOType,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
    ) -> None:
        """Store the output stream and codec, and detect binary output.

        :param rsrcmgr: resource manager passed to ``PDFLayoutAnalyzer``.
        :param outfp: stream the converter writes to.
        :param codec: codec name kept on ``self.codec`` for subclasses.
        :param pageno: number assigned to the first page.
        :param laparams: layout parameters, or ``None`` to skip analysis.
        """
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp: IOType = outfp
        self.codec = codec
        self.outfp_binary = self._is_binary_stream(self.outfp)

    @staticmethod
    def _is_binary_stream(outfp: AnyIO) -> bool:
        """Test if a stream is binary or not.

        A string ``mode`` attribute decides the answer when present.
        Otherwise ``io.BytesIO`` counts as binary and ``io.StringIO`` /
        ``io.TextIOBase`` as text. Anything unrecognised is treated as binary.
        """
        mode = getattr(outfp, "mode", None)
        # Only trust ``mode`` when it is a string. Some file-like objects
        # expose a non-string mode (for example an integer constant), and
        # ``"b" in mode`` would raise TypeError on those.
        if isinstance(mode, str):
            return "b" in mode
        if isinstance(outfp, io.BytesIO):
            return True
        if isinstance(outfp, (io.StringIO, io.TextIOBase)):
            return False

        return True

324

325

class TextConverter(PDFConverter[AnyIO]):
    """Converter that writes the plain text of each page to ``outfp``.

    Each page is followed by a form feed. Images are exported only when an
    ``imagewriter`` is supplied, and painted paths are ignored.
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        showpageno: bool = False,
        imagewriter: ImageWriter | None = None,
    ) -> None:
        """Create a text converter.

        :param rsrcmgr: resource manager passed to the base class.
        :param outfp: text or binary stream receiving the output.
        :param codec: encoding used when ``outfp`` is binary.
        :param pageno: number assigned to the first page.
        :param laparams: layout parameters, or ``None`` to skip analysis.
        :param showpageno: write a ``Page N`` line before each page.
        :param imagewriter: exporter for images, or ``None`` to drop them.
        """
        super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        self.imagewriter = imagewriter

    def write_text(self, text: str) -> None:
        """Write ``text`` to the output, encoding it for binary streams."""
        # compatible_encode_method lives in pdfminer.utils (not shown here);
        # NOTE(review): presumably it normalises bytes input to str. Confirm.
        text = utils.compatible_encode_method(text, self.codec, "ignore")
        if self.outfp_binary:
            # Honour the configured codec instead of always emitting UTF-8,
            # matching the write() methods of the other converters. The
            # "ignore" policy mirrors the one requested above. UTF-8 is the
            # fallback when no codec was given.
            encoded = text.encode(self.codec or "utf-8", "ignore")
            cast(BinaryIO, self.outfp).write(encoded)
        else:
            cast(TextIO, self.outfp).write(text)

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the text of ``ltpage`` followed by a form feed."""

        def render(item: LTItem) -> None:
            # Containers are walked depth-first; text leaves are written out.
            if isinstance(item, LTContainer):
                for child in item:
                    render(child)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())
            # Checked separately from the chain above, so a newline follows
            # every text box after its children have been rendered.
            if isinstance(item, LTTextBox):
                self.write_text("\n")
            elif isinstance(item, LTImage) and self.imagewriter is not None:
                self.imagewriter.export_image(item)

        if self.showpageno:
            self.write_text(f"Page {ltpage.pageid}\n")
        render(ltpage)
        self.write_text("\f")

    # Some dummy functions to save memory/CPU when all that is wanted
    # is text. This stops all the image and drawing output from being
    # recorded and taking up RAM.
    def render_image(self, name: str, stream: PDFStream) -> None:
        """Record the image only when an image writer is configured."""
        if self.imagewriter is not None:
            PDFConverter.render_image(self, name, stream)

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Ignore painted paths; they carry no text."""
        pass

381

382

class HTMLConverter(PDFConverter[AnyIO]):
    """Converter that writes analysed pages as absolutely positioned HTML."""

    # Border colours added to ``rect_colors`` when ``debug`` is enabled.
    RECT_COLORS: ClassVar[dict[str, str]] = {
        "figure": "yellow",
        "textline": "magenta",
        "textbox": "cyan",
        "textgroup": "red",
        "curve": "black",
        "page": "gray",
    }

    # Text colours added to ``text_colors`` when ``debug`` is enabled.
    TEXT_COLORS: ClassVar[dict[str, str]] = {
        "textbox": "blue",
        "char": "black",
    }

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        scale: float = 1,
        fontscale: float = 1.0,
        layoutmode: str = "normal",
        showpageno: bool = True,
        pagemargin: int = 50,
        imagewriter: ImageWriter | None = None,
        debug: int = 0,
        rect_colors: dict[str, str] | None = None,
        text_colors: dict[str, str] | None = None,
    ) -> None:
        """Create the converter and immediately write the HTML header.

        :param rsrcmgr: resource manager passed to the base class.
        :param outfp: output stream; binary needs a codec, text needs none.
        :param codec: output encoding for binary streams, falsy for text.
        :param pageno: number assigned to the first page.
        :param laparams: layout parameters, or ``None`` to skip analysis.
        :param scale: factor applied to every emitted position and size.
        :param fontscale: extra factor applied to font sizes.
        :param layoutmode: ``"exact"`` and ``"loose"`` are special-cased in
            :meth:`receive_layout`; other values use the default flow.
        :param showpageno: emit a ``Page N`` anchor above each page.
        :param pagemargin: vertical gap between consecutive pages.
        :param imagewriter: exporter for images, or ``None`` to drop them.
        :param debug: truthy to add ``RECT_COLORS`` / ``TEXT_COLORS``.
        :param rect_colors: element kind -> border colour; kinds without an
            entry get no rectangle.
        :param text_colors: element kind -> text colour.
        :raises PDFValueError: if ``codec`` does not match the stream kind.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        if text_colors is None:
            text_colors = {"char": "black"}
        if rect_colors is None:
            rect_colors = {"curve": "black", "page": "gray"}

        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.imagewriter = imagewriter
        # Work on private copies so the debug update below never mutates
        # dictionaries owned by the caller.
        self.rect_colors = dict(rect_colors)
        self.text_colors = dict(text_colors)
        if debug:
            self.rect_colors.update(self.RECT_COLORS)
            self.text_colors.update(self.TEXT_COLORS)
        # Running vertical offset; grows by each page height plus the margin.
        self._yoffset: float = self.pagemargin
        # (fontname, fontsize) of the currently open <span>, if any.
        self._font: tuple[str, float] | None = None
        self._fontstack: list[tuple[str, float] | None] = []
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoding it when a codec is configured."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening ``<html>``/``<head>``/``<body>`` markup."""
        self.write("<html><head>\n")
        if self.codec:
            s = (
                '<meta http-equiv="Content-Type" content="text/html; '
                f'charset={self.codec}">\n'
            )
        else:
            s = '<meta http-equiv="Content-Type" content="text/html">\n'
        self.write(s)
        self.write("</head><body>\n")

    def write_footer(self) -> None:
        """Write the page-link bar and close the document."""
        # self.pageno is one past the last page, so this covers every page.
        page_links = [f'<a href="#{i}">{i}</a>' for i in range(1, self.pageno)]
        s = (
            '<div style="position:absolute; top:0px;">'
            f"Page: {', '.join(page_links)}</div>\n"
        )
        self.write(s)
        self.write("</body></html>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` after passing it through ``enc``."""
        self.write(enc(text))

    def place_rect(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Emit a bordered box if ``color`` has an entry in ``rect_colors``."""
        color2 = self.rect_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; '
                f"border: {color2} {borderwidth}px solid; "
                f"left:{x * self.scale}px; "
                f"top:{(self._yoffset - y) * self.scale}px; "
                f"width:{w * self.scale}px; "
                f'height:{h * self.scale}px;"></span>\n'
            )
            self.write(s)

    def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
        """Draw a box around ``item`` using its own bounding box."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

    def place_image(
        self,
        item: LTImage,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Export ``item`` and emit an ``<img>``; no-op without an image writer."""
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            s = (
                f'<img src="{enc(name)}" border="{borderwidth}" '
                'style="position:absolute; '
                f"left:{x * self.scale}px; "
                f'top:{(self._yoffset - y) * self.scale}px;" '
                f'width="{w * self.scale}" '
                f'height="{h * self.scale}" />\n'
            )
            self.write(s)

    def place_text(
        self,
        color: str,
        text: str,
        x: float,
        y: float,
        size: float,
    ) -> None:
        """Emit positioned text if ``color`` has an entry in ``text_colors``."""
        color2 = self.text_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; '
                f"color:{color2}; "
                f"left:{x * self.scale}px; "
                f"top:{(self._yoffset - y) * self.scale}px; "
                f'font-size:{size * self.scale * self.fontscale}px;">'
            )
            self.write(s)
            self.write_text(text)
            self.write("</span>\n")

    def begin_div(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
        writing_mode: str = "False",
    ) -> None:
        """Open a positioned ``<div>`` and suspend the current font span."""
        self._fontstack.append(self._font)
        self._font = None
        # NOTE(review): ``color`` is written verbatim here, not looked up in
        # ``rect_colors`` as place_rect() does; callers pass kind names such
        # as "figure". Kept unchanged to preserve the existing output.
        s = (
            '<div style="position:absolute; '
            f"border: {color} {borderwidth}px solid; "
            f"writing-mode:{writing_mode}; "
            f"left:{x * self.scale}px; "
            f"top:{(self._yoffset - y) * self.scale}px; "
            f"width:{w * self.scale}px; "
            f'height:{h * self.scale}px;">'
        )
        self.write(s)

    def end_div(self, color: str) -> None:
        """Close any open font span, restore the outer font, close the div."""
        if self._font is not None:
            self.write("</span>")
        self._font = self._fontstack.pop()
        self.write("</div>")

    def put_text(self, text: str, fontname: str, fontsize: float) -> None:
        """Write ``text``, opening a new font ``<span>`` when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write("</span>")
            # Remove subset tag from fontname, see PDF Reference 5.5.3
            fontname_without_subset_tag = fontname.split("+")[-1]
            # The font name comes from the PDF, so escape it before it is
            # embedded in the style attribute.
            self.write(
                '<span style="'
                f"font-family: {enc(fontname_without_subset_tag)}; "
                f'font-size:{fontsize * self.scale * self.fontscale}px">'
            )
            self._font = font
        self.write_text(text)

    def put_newline(self) -> None:
        """Write a line break."""
        self.write("<br>")

    def receive_layout(self, ltpage: LTPage) -> None:
        """Render one analysed page and advance the vertical offset."""

        def show_group(item: LTTextGroup | TextGroupElement) -> None:
            # Only groups get a border; their children are visited recursively.
            if isinstance(item, LTTextGroup):
                self.place_border("textgroup", 1, item)
                for child in item:
                    show_group(child)

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                self._yoffset += item.y1
                self.place_border("page", 1, item)
                if self.showpageno:
                    # Interpolate the offset into the style attribute. int()
                    # truncates the same way a "%d" conversion would.
                    top = int((self._yoffset - item.y1) * self.scale)
                    self.write(
                        f'<div style="position:absolute; top:{top}px;">',
                    )
                    self.write(
                        f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
                    )
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border("curve", 1, item)
            elif isinstance(item, LTFigure):
                self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
                self.end_div("figure")
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
            elif self.layoutmode == "exact":
                # Exact mode positions every character individually.
                if isinstance(item, LTTextLine):
                    self.place_border("textline", 1, item)
                    for child in item:
                        render(child)
                elif isinstance(item, LTTextBox):
                    self.place_border("textbox", 1, item)
                    self.place_text(
                        "textbox",
                        str(item.index + 1),
                        item.x0,
                        item.y1,
                        20,
                    )
                    for child in item:
                        render(child)
                elif isinstance(item, LTChar):
                    self.place_border("char", 1, item)
                    self.place_text(
                        "char",
                        item.get_text(),
                        item.x0,
                        item.y1,
                        item.size,
                    )
            elif isinstance(item, LTTextLine):
                for child in item:
                    render(child)
                if self.layoutmode != "loose":
                    self.put_newline()
            elif isinstance(item, LTTextBox):
                self.begin_div(
                    "textbox",
                    1,
                    item.x0,
                    item.y1,
                    item.width,
                    item.height,
                    item.get_writing_mode(),
                )
                for child in item:
                    render(child)
                self.end_div("textbox")
            elif isinstance(item, LTChar):
                fontname = make_compat_str(item.fontname)
                self.put_text(item.get_text(), fontname, item.size)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())

        render(ltpage)
        self._yoffset += self.pagemargin

    def close(self) -> None:
        """Write the footer; the output stream itself is not closed here."""
        self.write_footer()

682

683

class XMLConverter(PDFConverter[AnyIO]):
    """Converter that writes analysed pages as an XML document."""

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        imagewriter: ImageWriter | None = None,
        stripcontrol: bool = False,
    ) -> None:
        """Create the converter and immediately write the XML header.

        :param rsrcmgr: resource manager passed to the base class.
        :param outfp: output stream; binary needs a codec, text needs none.
        :param codec: output encoding for binary streams, falsy for text.
        :param pageno: number assigned to the first page.
        :param laparams: layout parameters, or ``None`` to skip analysis.
        :param imagewriter: exporter for images, or ``None`` to only emit
            their dimensions.
        :param stripcontrol: remove control characters from character text.
        :raises PDFValueError: if ``codec`` does not match the stream kind.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        # The two mismatches are reported separately so the message names
        # the actual problem (same wording as HTMLConverter).
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoding it when a codec is configured."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the XML declaration and open the ``<pages>`` root."""
        if self.codec:
            self.write(f'<?xml version="1.0" encoding="{self.codec}" ?>\n')
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write("<pages>\n")

    def write_footer(self) -> None:
        """Close the ``<pages>`` root element."""
        self.write("</pages>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` via ``enc``, stripping control characters if asked."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(enc(text))

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one analysed page as a ``<page>`` element."""

        def show_group(item: LTItem) -> None:
            # Text boxes are referenced by id; groups nest recursively.
            if isinstance(item, LTTextBox):
                self.write(
                    f'<textbox id="{item.index}" bbox="{bbox2str(item.bbox)}" />\n'
                )
            elif isinstance(item, LTTextGroup):
                self.write(f'<textgroup bbox="{bbox2str(item.bbox)}">\n')
                for child in item:
                    show_group(child)
                self.write("</textgroup>\n")

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                s = (
                    f'<page id="{item.pageid}" '
                    f'bbox="{bbox2str(item.bbox)}" '
                    f'rotate="{item.rotate}">\n'
                )
                self.write(s)
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.write("<layout>\n")
                    for group in item.groups:
                        show_group(group)
                    self.write("</layout>\n")
                self.write("</page>\n")
            # LTLine and LTRect are tested before LTCurve so they get their
            # own element names.
            elif isinstance(item, LTLine):
                s = (
                    f"<line "
                    f'linewidth="{item.linewidth}" '
                    f'bbox="{bbox2str(item.bbox)}" />\n'
                )
                self.write(s)
            elif isinstance(item, LTRect):
                s = (
                    f"<rect "
                    f'linewidth="{item.linewidth}" '
                    f'bbox="{bbox2str(item.bbox)}" />\n'
                )
                self.write(s)
            elif isinstance(item, LTCurve):
                s = (
                    f"<curve "
                    f'linewidth="{item.linewidth}" '
                    f'bbox="{bbox2str(item.bbox)}" '
                    f'pts="{item.get_pts()}"/>\n'
                )
                self.write(s)
            elif isinstance(item, LTFigure):
                # The figure name originates from the PDF; escape it so it
                # cannot break out of the attribute.
                s = (
                    f'<figure name="{enc(str(item.name))}" '
                    f'bbox="{bbox2str(item.bbox)}">\n'
                )
                self.write(s)
                for child in item:
                    render(child)
                self.write("</figure>\n")
            elif isinstance(item, LTTextLine):
                self.write(f'<textline bbox="{bbox2str(item.bbox)}">\n')
                for child in item:
                    render(child)
                self.write("</textline>\n")
            elif isinstance(item, LTTextBox):
                wmode = ""
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                s = f'<textbox id="{item.index}" bbox="{bbox2str(item.bbox)}"{wmode}>\n'
                self.write(s)
                for child in item:
                    render(child)
                self.write("</textbox>\n")
            elif isinstance(item, LTChar):
                s = (
                    f"<text "
                    f'font="{enc(item.fontname)}" '
                    f'bbox="{bbox2str(item.bbox)}" '
                    f'colourspace="{item.ncs.name}" '
                    f'ncolour="{item.graphicstate.ncolor}" '
                    f'size="{item.size:.3f}">'
                )
                self.write(s)
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTText):
                # Other text items carry no attributes; their content is
                # escaped so markup characters cannot corrupt the XML.
                self.write(f"<text>{enc(item.get_text())}</text>\n")
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    name = self.imagewriter.export_image(item)
                    self.write(
                        f"<image "
                        f'src="{enc(name)}" '
                        f'width="{item.width}" '
                        f'height="{item.height}" />\n'
                    )
                else:
                    self.write(
                        f'<image width="{item.width}" height="{item.height}" />\n'
                    )
            else:
                raise AssertionError(str(("Unhandled", item)))

        render(ltpage)

    def close(self) -> None:
        """Write the footer; the output stream itself is not closed here."""
        self.write_footer()

840

841

class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF."""

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Control characters removed by write_text() when ``stripcontrol`` is set.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        stripcontrol: bool = False,
    ) -> None:
        """Create the converter and immediately write the hOCR header.

        :param rsrcmgr: resource manager passed to the base class.
        :param outfp: output stream.
        :param codec: encoding used by :meth:`write`; falsy writes ``str``.
        :param pageno: number assigned to the first page.
        :param laparams: layout parameters, or ``None`` to skip analysis.
        :param stripcontrol: remove control characters in :meth:`write_text`.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )
        self.stripcontrol = stripcontrol
        # True while characters are being accumulated into a word.
        self.within_chars = False
        # State of the word under construction. Initialised here so that
        # write_word() and bbox_repr() never hit a missing attribute.
        self.working_text = ""
        self.working_bbox: Rect = (0, 0, 0, 0)
        self.working_font = ""
        self.working_size: float = 0
        # Bounding box of the page being rendered; set per page.
        self.page_bbox: Rect = (0, 0, 0, 0)
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Return ``bbox`` as an hOCR ``bbox x0 y0 x1 y1`` string."""
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write raw markup, encoding it when a codec is configured."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the XHTML preamble, hOCR metadata and opening ``<body>``."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                f"xml:lang='en' lang='en' charset='{self.codec}'>\n",
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en' lang='en'>\n",
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        # NOTE(review): this charset is fixed to utf-8 regardless of
        # ``self.codec``; left unchanged to keep the default output stable.
        self.write(
            "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
        )
        self.write(
            "<meta name='ocr-system' content='pdfminer.six HOCR Converter' />\n",
        )
        self.write(
            " <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Close the ``<body>`` and ``<html>`` elements."""
        self.write("\n")
        self.write(
            "</body></html>\n",
        )

    def write_text(self, text: str) -> None:
        """Write ``text``, stripping control characters if asked."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Emit the accumulated word as an ``ocrx_word`` span and end it."""
        if len(self.working_text) > 0:
            # Bold/italic are inferred from the raw font name.
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            # Font name and text come from the PDF; escape both so that
            # quotes or markup characters cannot break the XHTML output.
            font = enc(self.working_font)
            word = enc(self.working_text.strip())
            self.write(
                f'<span style=\'font:"{font}"; '
                f"font-size:{self.working_size}; "
                f"{bold_and_italic_styles}' "
                f"class='ocrx_word' "
                f"title='{self.bbox_repr(self.working_bbox)}; "
                f"x_font {font}; "
                f"x_fsize {self.working_size}'>"
                f"{word}</span>"
            )
        self.within_chars = False

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one analysed page as an ``ocr_page`` element."""

        def render(item: LTItem) -> None:
            # An annotation ends the word currently being accumulated.
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    f"<div "
                    f"class='ocr_page' "
                    f"id='{item.pageid}' "
                    f"title='{self.bbox_repr(item.bbox)}'>\n",
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    f"<span class='ocr_line' title='{self.bbox_repr(item.bbox)}'>",
                )
                for child_line in item:
                    render(child_line)
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    f"<div "
                    f"class='ocr_block' "
                    f"id='{item.index}' "
                    f"title='{self.bbox_repr(item.bbox)}'>\n"
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                if not self.within_chars:
                    # First character of a new word.
                    self.within_chars = True
                    self.working_text = item.get_text()
                    self.working_bbox = item.bbox
                    self.working_font = item.fontname
                    self.working_size = item.size
                elif len(item.get_text().strip()) == 0:
                    # Whitespace ends the word and is passed through as-is.
                    self.write_word()
                    self.write(item.get_text())
                else:
                    if (
                        self.working_bbox[1] != item.bbox[1]
                        or self.working_font != item.fontname
                        or self.working_size != item.size
                    ):
                        # Baseline, font or size changed: flush the word so
                        # far and start a fresh one with this character.
                        # write_word() clears ``within_chars`` and keeps
                        # the old text, so both are reset here. Otherwise
                        # this character would be lost when the next one
                        # restarts the buffer.
                        self.write_word()
                        self.within_chars = True
                        self.working_text = ""
                        self.working_bbox = item.bbox
                        self.working_font = item.fontname
                        self.working_size = item.size
                    self.working_text += item.get_text()
                    # Extend the word box to this character's right edge.
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Write the footer; the output stream itself is not closed here."""
        self.write_footer()