Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/converter.py: 30%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import io
2import logging
3import re
4from collections.abc import Sequence
5from typing import (
6 BinaryIO,
7 ClassVar,
8 Generic,
9 TextIO,
10 TypeVar,
11 cast,
12)
14from pdfminer import utils
15from pdfminer.image import ImageWriter
16from pdfminer.layout import (
17 LAParams,
18 LTAnno,
19 LTChar,
20 LTComponent,
21 LTContainer,
22 LTCurve,
23 LTFigure,
24 LTImage,
25 LTItem,
26 LTLayoutContainer,
27 LTLine,
28 LTPage,
29 LTRect,
30 LTText,
31 LTTextBox,
32 LTTextBoxVertical,
33 LTTextGroup,
34 LTTextLine,
35 TextGroupElement,
36)
37from pdfminer.pdfcolor import PDFColorSpace
38from pdfminer.pdfdevice import PDFTextDevice
39from pdfminer.pdfexceptions import PDFValueError
40from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined
41from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
42from pdfminer.pdfpage import PDFPage
43from pdfminer.pdftypes import PDFStream
44from pdfminer.utils import (
45 AnyIO,
46 Matrix,
47 PathSegment,
48 Point,
49 Rect,
50 apply_matrix_pt,
51 apply_matrix_rect,
52 bbox2str,
53 enc,
54 make_compat_str,
55 mult_matrix,
56)
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
class PDFLayoutAnalyzer(PDFTextDevice):
    """Device that collects rendered content into a tree of layout items.

    Characters, images and painted paths are added to ``cur_item``, the
    container currently being filled (a page, or a figure nested inside
    one). When a page ends, the finished page is handed to
    ``receive_layout``, which subclasses override to consume it.
    """

    # Container that newly rendered items are added to.
    cur_item: LTLayoutContainer
    # Transformation matrix applied to path and figure coordinates; it is
    # read here but assigned outside this class.
    ctm: Matrix

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: LAParams | None = None,
    ) -> None:
        """Set up the analyzer.

        :param rsrcmgr: resource manager passed through to the base device.
        :param pageno: number given to the first page that is processed.
        :param laparams: layout-analysis parameters; when ``None`` pages are
            not analysed before being passed to ``receive_layout``.
        """
        super().__init__(rsrcmgr)
        self.pageno = pageno
        self.laparams = laparams
        # Containers suspended while a nested figure is being filled.
        self._stack: list[LTLayoutContainer] = []

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Start a fresh ``LTPage`` sized from the transformed media box."""
        left, bottom, right, top = apply_matrix_rect(ctm, page.mediabox)
        page_width = abs(left - right)
        page_height = abs(bottom - top)
        self.cur_item = LTPage(self.pageno, (0, 0, page_width, page_height))

    def end_page(self, page: PDFPage) -> None:
        """Finish the current page, analyse it if requested, and deliver it."""
        assert not self._stack, str(len(self._stack))
        finished_page = self.cur_item
        assert isinstance(finished_page, LTPage), str(type(finished_page))
        if self.laparams is not None:
            finished_page.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(finished_page)

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Suspend the current container and start filling a new figure."""
        self._stack.append(self.cur_item)
        figure_matrix = mult_matrix(matrix, self.ctm)
        self.cur_item = LTFigure(name, bbox, figure_matrix)

    def end_figure(self, _: str) -> None:
        """Close the current figure and attach it to its parent container."""
        finished_figure = self.cur_item
        assert isinstance(finished_figure, LTFigure), str(type(finished_figure))
        parent = self._stack.pop()
        self.cur_item = parent
        parent.add(finished_figure)

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Add an image covering the bounding box of the enclosing figure."""
        figure = self.cur_item
        assert isinstance(figure, LTFigure), str(type(figure))
        figure_bbox = (figure.x0, figure.y0, figure.x1, figure.y1)
        figure.add(LTImage(name, stream, figure_bbox))

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Paint paths described in section 4.4 of the PDF reference manual

        The path is classified by the sequence of its operators and added to
        the current container as an ``LTLine``, ``LTRect`` or ``LTCurve``.
        """
        shape = "".join(segment[0] for segment in path)

        if not shape.startswith("m"):
            # Per PDF Reference Section 4.4.1, "path construction operators may
            # be invoked in any sequence, but the first one invoked must be m
            # or re to begin a new subpath." Since pdfminer.six already
            # converts all `re` (rectangle) operators to their equivalent
            # `mlllh` representation, paths ingested by `.paint_path(...)` that
            # do not begin with the `m` operator are invalid.
            return

        if shape.count("m") > 1:
            # Several subpaths: paint each "m..." run on its own.
            for match in re.finditer(r"m[^m]+", shape):
                subpath = path[match.start(0) : match.end(0)]
                self.paint_path(gstate, stroke, fill, evenodd, subpath)
            return

        # Although the 'h' command does not literally provide a
        # point-position, its position is (by definition) equal to the
        # subpath's starting point.
        #
        # And, per Section 4.4's Table 4.9, all other path commands place
        # their point-position in their final two arguments. (Any preceding
        # arguments represent control points on Bézier curves.)
        start_point = path[0][-2:]
        raw_pts = [
            cast(Point, start_point if segment[0] == "h" else segment[-2:])
            for segment in path
        ]
        pts = [apply_matrix_pt(self.ctm, raw_pt) for raw_pt in raw_pts]

        # The same path with every operand pair mapped through the CTM.
        transformed_path = []
        for operation in path:
            operator = str(operation[0])
            moved_points = [
                apply_matrix_pt(self.ctm, (float(first), float(second)))
                for first, second in zip(
                    operation[1::2], operation[2::2], strict=False
                )
            ]
            transformed_path.append(cast(PathSegment, (operator, *moved_points)))

        # Drop a redundant "l" on a path closed with "h"
        if len(shape) > 3 and shape.endswith("lh") and pts[-2] == pts[0]:
            shape = shape[:-2] + "h"
            pts.pop()

        if shape in {"mlh", "ml"}:
            # single line segment
            #
            # Note: 'ml', in conditional above, is a frequent anomaly
            # that we want to support.
            self.cur_item.add(
                LTLine(
                    gstate.linewidth,
                    pts[0],
                    pts[1],
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    original_path=transformed_path,
                    dashing_style=gstate.dash,
                )
            )
            return

        if shape in {"mlllh", "mllll"}:
            (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts

            is_closed_loop = pts[0] == pts[4]
            # Consecutive corners must alternate between sharing an x and
            # sharing a y coordinate, starting with either axis.
            vertical_first = x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
            horizontal_first = y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0
            has_square_coordinates = vertical_first or horizontal_first

            if is_closed_loop and has_square_coordinates:
                self.cur_item.add(
                    LTRect(
                        gstate.linewidth,
                        (*pts[0], *pts[2]),
                        stroke,
                        fill,
                        evenodd,
                        gstate.scolor,
                        gstate.ncolor,
                        transformed_path,
                        gstate.dash,
                    )
                )
                return

        # Anything that is neither a line nor a rectangle is kept as a curve.
        self.cur_item.add(
            LTCurve(
                gstate.linewidth,
                pts,
                stroke,
                fill,
                evenodd,
                gstate.scolor,
                gstate.ncolor,
                transformed_path,
                gstate.dash,
            )
        )

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Add one character to the current container.

        :return: the ``adv`` attribute of the created ``LTChar``.
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            # No unicode mapping: fall back to a placeholder string.
            text = self.handle_undefined_char(font, cid)
        width = font.char_width(cid)
        displacement = font.char_disp(cid)
        char_item = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            width,
            displacement,
            ncs,
            graphicstate,
        )
        self.cur_item.add(char_item)
        return char_item.adv

    def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
        """Return the ``(cid:N)`` placeholder used for an unmapped character."""
        log.debug(f"undefined: {font!r}, {cid!r}")
        placeholder = f"(cid:{cid})"
        return placeholder

    def receive_layout(self, ltpage: LTPage) -> None:
        """Consume a finished page; a no-op here, overridden by subclasses."""
class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that keeps the most recently finished page."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: LAParams | None = None,
    ) -> None:
        """Forward all arguments to ``PDFLayoutAnalyzer`` and clear the result."""
        super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
        # Last page passed to receive_layout(); None until a page finishes.
        self.result: LTPage | None = None

    def receive_layout(self, ltpage: LTPage) -> None:
        """Remember ``ltpage`` as the current result."""
        self.result = ltpage

    def get_result(self) -> LTPage:
        """Return the stored page; asserts that one has been received."""
        page = self.result
        assert page is not None
        return page
# Some PDFConverter children support only binary I/O
# Type variable for the output stream a converter writes to; it is
# constrained to TextIO, BinaryIO or AnyIO.
IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)
class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
    """Base class for layout analyzers that write their result to a stream."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: IOType,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
    ) -> None:
        """Store the output stream and codec.

        :param rsrcmgr: resource manager passed through to the analyzer.
        :param outfp: stream that converted output is written to.
        :param codec: encoding name stored on ``self.codec`` for subclasses.
        :param pageno: number given to the first page that is processed.
        :param laparams: layout-analysis parameters, or ``None``.
        """
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp: IOType = outfp
        self.codec = codec
        # Decided once up front; subclasses branch on this when writing.
        self.outfp_binary = self._is_binary_stream(self.outfp)

    @staticmethod
    def _is_binary_stream(outfp: AnyIO) -> bool:
        """Test if a stream is binary or not.

        A string ``mode`` attribute decides the answer when present.
        Otherwise the stream's type is inspected, and unknown stream types
        are treated as binary.
        """
        mode = getattr(outfp, "mode", None)
        if isinstance(mode, str):
            # The stream declares its mode: binary exactly when it has a "b".
            return "b" in mode
        # A missing or non-string ``mode`` (for example the integer mode of
        # gzip.GzipFile on older Pythons) tells us nothing, and testing
        # ``"b" in mode`` on it would raise TypeError; fall back to the type.
        if isinstance(outfp, io.BytesIO):
            return True
        if isinstance(outfp, (io.StringIO, io.TextIOBase)):
            return False

        return True
class TextConverter(PDFConverter[AnyIO]):
    """Converter that writes the plain text of each page to the output."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        showpageno: bool = False,
        imagewriter: ImageWriter | None = None,
    ) -> None:
        """Set up the converter.

        :param showpageno: when true, a ``Page N`` line precedes each page.
        :param imagewriter: when given, images are exported through it.
        """
        super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        self.imagewriter = imagewriter

    def write_text(self, text: str) -> None:
        """Write ``text`` to the output, encoding it for binary streams."""
        text = utils.compatible_encode_method(text, self.codec, "ignore")
        if not self.outfp_binary:
            cast(TextIO, self.outfp).write(text)
            return
        # NOTE(review): encode() is called without arguments, so binary
        # output looks like it is always UTF-8 regardless of self.codec;
        # confirm against utils.compatible_encode_method.
        cast(BinaryIO, self.outfp).write(text.encode())

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the text of ``ltpage`` followed by a form feed."""

        def emit(node: LTItem) -> None:
            # Containers are walked recursively; text items are written.
            if isinstance(node, LTContainer):
                for member in node:
                    emit(member)
            elif isinstance(node, LTText):
                self.write_text(node.get_text())
            # Independent of the chain above: a text box is followed by a
            # newline, and images are exported when a writer is available.
            if isinstance(node, LTTextBox):
                self.write_text("\n")
            elif isinstance(node, LTImage) and self.imagewriter is not None:
                self.imagewriter.export_image(node)

        if self.showpageno:
            self.write_text(f"Page {ltpage.pageid}\n")
        emit(ltpage)
        self.write_text("\f")

    # Some dummy functions to save memory/CPU when all that is wanted
    # is text. This stops all the image and drawing output from being
    # recorded and taking up RAM.
    def render_image(self, name: str, stream: PDFStream) -> None:
        """Record the image only when an image writer is configured."""
        if self.imagewriter is None:
            return
        super().render_image(name, stream)

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Ignore painted paths; they are not recorded by this converter."""
        return None
class HTMLConverter(PDFConverter[AnyIO]):
    """Converter that writes pages as absolutely positioned HTML.

    Pages are stacked vertically; ``_yoffset`` tracks the running vertical
    offset used to turn item coordinates into CSS ``top`` values.
    """

    # Border colours merged into ``rect_colors`` when ``debug`` is set.
    RECT_COLORS: ClassVar[dict[str, str]] = {
        "figure": "yellow",
        "textline": "magenta",
        "textbox": "cyan",
        "textgroup": "red",
        "curve": "black",
        "page": "gray",
    }

    # Text colours merged into ``text_colors`` when ``debug`` is set.
    TEXT_COLORS: ClassVar[dict[str, str]] = {
        "textbox": "blue",
        "char": "black",
    }

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        scale: float = 1,
        fontscale: float = 1.0,
        layoutmode: str = "normal",
        showpageno: bool = True,
        pagemargin: int = 50,
        imagewriter: ImageWriter | None = None,
        debug: int = 0,
        rect_colors: dict[str, str] | None = None,
        text_colors: dict[str, str] | None = None,
    ) -> None:
        """Set up the converter and write the HTML header.

        :param scale: factor applied to every emitted coordinate and size.
        :param fontscale: extra factor applied to font sizes.
        :param layoutmode: ``"exact"`` and ``"loose"`` are handled specially
            by ``receive_layout``; any other value uses the default flow.
        :param showpageno: when true, a page anchor is written per page.
        :param pagemargin: vertical gap added before the first and after
            every page.
        :param imagewriter: when given, images are exported and referenced.
        :param debug: when truthy, ``RECT_COLORS`` and ``TEXT_COLORS`` are
            merged into the colour tables.
        :param rect_colors: border colour per item kind; kinds without an
            entry get no border.
        :param text_colors: text colour per item kind; kinds without an
            entry are not placed.
        :raises PDFValueError: if a binary stream is given without a codec,
            or a text stream is given with one.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        if text_colors is None:
            text_colors = {"char": "black"}
        if rect_colors is None:
            rect_colors = {"curve": "black", "page": "gray"}
        if debug:
            # Merge into new dicts so the caller's dictionaries are not
            # modified as a side effect of enabling debug output.
            rect_colors = {**rect_colors, **self.RECT_COLORS}
            text_colors = {**text_colors, **self.TEXT_COLORS}

        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.imagewriter = imagewriter
        self.rect_colors = rect_colors
        self.text_colors = text_colors
        self._yoffset: float = self.pagemargin
        # (fontname, fontsize) of the currently open font <span>, if any.
        self._font: tuple[str, float] | None = None
        # Fonts suspended while a nested <div> is open.
        self._fontstack: list[tuple[str, float] | None] = []
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``codec`` when one is set."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening ``<html>``, ``<head>`` and ``<body>`` markup."""
        self.write("<html><head>\n")
        if self.codec:
            s = (
                '<meta http-equiv="Content-Type" content="text/html; '
                f'charset={self.codec}">\n'
            )
        else:
            s = '<meta http-equiv="Content-Type" content="text/html">\n'
        self.write(s)
        self.write("</head><body>\n")

    def write_footer(self) -> None:
        """Write links to every processed page and close the document."""
        # self.pageno has already been advanced past the last page.
        page_links = [f'<a href="#{i}">{i}</a>' for i in range(1, self.pageno)]
        s = (
            '<div style="position:absolute; top:0px;">'
            f"Page: {', '.join(page_links)}</div>\n"
        )
        self.write(s)
        self.write("</body></html>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` after passing it through ``enc``."""
        self.write(enc(text))

    def place_rect(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Write a bordered box; ``color`` is a key into ``rect_colors``.

        Nothing is written when ``rect_colors`` has no entry for ``color``.
        """
        color2 = self.rect_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; '
                f"border: {color2} {borderwidth}px solid; "
                f"left:{x * self.scale}px; "
                f"top:{(self._yoffset - y) * self.scale}px; "
                f"width:{w * self.scale}px; "
                f'height:{h * self.scale}px;"></span>\n'
            )
            self.write(s)

    def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
        """Write a bordered box around the bounding box of ``item``."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

    def place_image(
        self,
        item: LTImage,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Export ``item`` and write an ``<img>`` tag referencing it.

        Nothing is written when no image writer is configured.
        """
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            s = (
                f'<img src="{enc(name)}" border="{borderwidth}" '
                'style="position:absolute; '
                f"left:{x * self.scale}px; "
                f'top:{(self._yoffset - y) * self.scale}px;" '
                f'width="{w * self.scale}" '
                f'height="{h * self.scale}" />\n'
            )
            self.write(s)

    def place_text(
        self,
        color: str,
        text: str,
        x: float,
        y: float,
        size: float,
    ) -> None:
        """Write positioned text; ``color`` is a key into ``text_colors``.

        Nothing is written when ``text_colors`` has no entry for ``color``.
        """
        color2 = self.text_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; '
                f"color:{color2}; "
                f"left:{x * self.scale}px; "
                f"top:{(self._yoffset - y) * self.scale}px; "
                f'font-size:{size * self.scale * self.fontscale}px;">'
            )
            self.write(s)
            self.write_text(text)
            self.write("</span>\n")

    def begin_div(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
        writing_mode: str = "False",
    ) -> None:
        """Open a positioned ``<div>`` and suspend the current font.

        NOTE(review): ``color`` is written into the border style as given
        (callers in this class pass kind names such as ``"figure"``), and the
        default ``writing_mode`` is emitted verbatim as ``writing-mode:False``;
        confirm whether either is intended.
        """
        self._fontstack.append(self._font)
        self._font = None
        s = (
            '<div style="position:absolute; '
            f"border: {color} {borderwidth}px solid; "
            f"writing-mode:{writing_mode}; "
            f"left:{x * self.scale}px; "
            f"top:{(self._yoffset - y) * self.scale}px; "
            f"width:{w * self.scale}px; "
            f'height:{h * self.scale}px;">'
        )
        self.write(s)

    def end_div(self, color: str) -> None:
        """Close any open font span, restore the outer font, close the div."""
        if self._font is not None:
            self.write("</span>")
        self._font = self._fontstack.pop()
        self.write("</div>")

    def put_text(self, text: str, fontname: str, fontsize: float) -> None:
        """Write ``text``, opening a new font ``<span>`` when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write("</span>")
            # Remove subset tag from fontname, see PDF Reference 5.5.3
            fontname_without_subset_tag = fontname.split("+")[-1]
            # The name comes from the document, so it is escaped before being
            # placed inside the style attribute.
            self.write(
                '<span style="'
                f"font-family: {enc(fontname_without_subset_tag)}; "
                f'font-size:{fontsize * self.scale * self.fontscale}px">'
            )
            self._font = font
        self.write_text(text)

    def put_newline(self) -> None:
        """Write a ``<br>`` line break."""
        self.write("<br>")

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the HTML for ``ltpage`` and advance the vertical offset."""

        def show_group(item: LTTextGroup | TextGroupElement) -> None:
            # Only groups get a border; their children are visited in turn.
            if isinstance(item, LTTextGroup):
                self.place_border("textgroup", 1, item)
                for child in item:
                    show_group(child)

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                self._yoffset += item.y1
                self.place_border("page", 1, item)
                if self.showpageno:
                    # Truncated to a whole number of pixels, as "%d" would.
                    top = int((self._yoffset - item.y1) * self.scale)
                    self.write(
                        f'<div style="position:absolute; top:{top}px;">',
                    )
                    self.write(
                        f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
                    )
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border("curve", 1, item)
            elif isinstance(item, LTFigure):
                self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
                self.end_div("figure")
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
            elif self.layoutmode == "exact":
                # Exact mode places every character individually.
                if isinstance(item, LTTextLine):
                    self.place_border("textline", 1, item)
                    for child in item:
                        render(child)
                elif isinstance(item, LTTextBox):
                    self.place_border("textbox", 1, item)
                    self.place_text(
                        "textbox",
                        str(item.index + 1),
                        item.x0,
                        item.y1,
                        20,
                    )
                    for child in item:
                        render(child)
                elif isinstance(item, LTChar):
                    self.place_border("char", 1, item)
                    self.place_text(
                        "char",
                        item.get_text(),
                        item.x0,
                        item.y1,
                        item.size,
                    )
            elif isinstance(item, LTTextLine):
                for child in item:
                    render(child)
                if self.layoutmode != "loose":
                    self.put_newline()
            elif isinstance(item, LTTextBox):
                self.begin_div(
                    "textbox",
                    1,
                    item.x0,
                    item.y1,
                    item.width,
                    item.height,
                    item.get_writing_mode(),
                )
                for child in item:
                    render(child)
                self.end_div("textbox")
            elif isinstance(item, LTChar):
                fontname = make_compat_str(item.fontname)
                self.put_text(item.get_text(), fontname, item.size)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())

        render(ltpage)
        self._yoffset += self.pagemargin

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()
class XMLConverter(PDFConverter[AnyIO]):
    """Converter that writes the layout tree of each page as XML."""

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        imagewriter: ImageWriter | None = None,
        stripcontrol: bool = False,
    ) -> None:
        """Set up the converter and write the XML header.

        :param imagewriter: when given, images are exported and referenced
            by a ``src`` attribute.
        :param stripcontrol: when true, characters matching ``CONTROL`` are
            removed from character text.
        :raises PDFValueError: if a binary stream is given without a codec,
            or a text stream is given with one.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        # The two mismatches are reported separately so that the message
        # describes the actual problem.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``codec`` when one is set."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the XML declaration and open the ``<pages>`` element."""
        if self.codec:
            self.write(f'<?xml version="1.0" encoding="{self.codec}" ?>\n')
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write("<pages>\n")

    def write_footer(self) -> None:
        """Close the ``<pages>`` element."""
        self.write("</pages>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` through ``enc``, stripping controls if requested."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(enc(text))

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the XML for ``ltpage``.

        :raises AssertionError: if the tree contains an item of a type that
            is not handled here.
        """

        def show_group(item: LTItem) -> None:
            # Text boxes are referenced by id; groups nest their children.
            if isinstance(item, LTTextBox):
                self.write(
                    f'<textbox id="{item.index}" bbox="{bbox2str(item.bbox)}" />\n'
                )
            elif isinstance(item, LTTextGroup):
                self.write(f'<textgroup bbox="{bbox2str(item.bbox)}">\n')
                for child in item:
                    show_group(child)
                self.write("</textgroup>\n")

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                s = (
                    f'<page id="{item.pageid}" '
                    f'bbox="{bbox2str(item.bbox)}" '
                    f'rotate="{item.rotate}">\n'
                )
                self.write(s)
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.write("<layout>\n")
                    for group in item.groups:
                        show_group(group)
                    self.write("</layout>\n")
                self.write("</page>\n")
            elif isinstance(item, LTLine):
                s = (
                    f"<line "
                    f'linewidth="{item.linewidth}" '
                    f'bbox="{bbox2str(item.bbox)}" />\n'
                )
                self.write(s)
            elif isinstance(item, LTRect):
                s = (
                    f"<rect "
                    f'linewidth="{item.linewidth}" '
                    f'bbox="{bbox2str(item.bbox)}" />\n'
                )
                self.write(s)
            elif isinstance(item, LTCurve):
                s = (
                    f"<curve "
                    f'linewidth="{item.linewidth}" '
                    f'bbox="{bbox2str(item.bbox)}" '
                    f'pts="{item.get_pts()}"/>\n'
                )
                self.write(s)
            elif isinstance(item, LTFigure):
                # NOTE(review): the figure name is written unescaped; confirm
                # it can never contain XML-significant characters.
                s = f'<figure name="{item.name}" bbox="{bbox2str(item.bbox)}">\n'
                self.write(s)
                for child in item:
                    render(child)
                self.write("</figure>\n")
            elif isinstance(item, LTTextLine):
                self.write(f'<textline bbox="{bbox2str(item.bbox)}">\n')
                for child in item:
                    render(child)
                self.write("</textline>\n")
            elif isinstance(item, LTTextBox):
                wmode = ""
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                s = f'<textbox id="{item.index}" bbox="{bbox2str(item.bbox)}"{wmode}>\n'
                self.write(s)
                for child in item:
                    render(child)
                self.write("</textbox>\n")
            elif isinstance(item, LTChar):
                s = (
                    f"<text "
                    f'font="{enc(item.fontname)}" '
                    f'bbox="{bbox2str(item.bbox)}" '
                    f'colourspace="{item.ncs.name}" '
                    f'ncolour="{item.graphicstate.ncolor}" '
                    f'size="{item.size:.3f}">'
                )
                self.write(s)
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTText):
                # Text items other than LTChar are written without attributes.
                self.write(f"<text>{item.get_text()}</text>\n")
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    name = self.imagewriter.export_image(item)
                    self.write(
                        f"<image "
                        f'src="{enc(name)}" '
                        f'width="{item.width}" '
                        f'height="{item.height}" />\n'
                    )
                else:
                    self.write(
                        f'<image width="{item.width}" height="{item.height}" />\n'
                    )
            else:
                raise AssertionError(str(("Unhandled", item)))

        render(ltpage)

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()
class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF."""

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        stripcontrol: bool = False,
    ) -> None:
        """Set up the converter and write the hOCR header.

        :param stripcontrol: when true, characters matching ``CONTROL`` are
            removed from the text that is written.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )
        self.stripcontrol = stripcontrol
        # True while characters are being accumulated into a pending word.
        self.within_chars = False
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Format ``bbox`` as an hOCR ``bbox`` property in integer units.

        Reads ``self.page_bbox``, which ``receive_layout`` sets when it
        starts a page.
        """
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``codec`` when one is set."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening ``<html>``, ``<head>`` and ``<body>`` markup."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                f"xml:lang='en' lang='en' charset='{self.codec}'>\n",
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en' lang='en'>\n",
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        # NOTE(review): this meta tag always declares utf-8, whatever codec
        # the output is actually encoded with; confirm that is intended.
        self.write(
            "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
        )
        self.write(
            "<meta name='ocr-system' content='pdfminer.six HOCR Converter' />\n",
        )
        self.write(
            "  <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Write the closing markup, including a commented-out debug script."""
        self.write("<!-- comment in the following line to debug -->\n")
        self.write(
            "<!--script src='https://unpkg.com/hocrjs'></script--></body></html>\n",
        )

    def write_text(self, text: str) -> None:
        """Write ``text`` as is, stripping control characters if requested."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Write the pending word as an ``ocrx_word`` span and end the word.

        Nothing is written when no text has been accumulated. The word text
        and the font name are escaped with ``enc`` because they come from the
        document and are placed inside markup.
        """
        if len(self.working_text) > 0:
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            word = self.working_text.strip()
            if self.stripcontrol:
                word = self.CONTROL.sub("", word)
            font = enc(self.working_font)
            self.write(
                f'<span style=\'font:"{font}"; '
                f"font-size:{self.working_size}; "
                f"{bold_and_italic_styles}' "
                f"class='ocrx_word' "
                f"title='{self.bbox_repr(self.working_bbox)}; "
                f"x_font {font}; "
                f"x_fsize {self.working_size}'>"
                f"{enc(word)}</span>"
            )
        self.within_chars = False

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the hOCR markup for ``ltpage``.

        Consecutive characters are accumulated into a word, which is written
        when an ``LTAnno`` or whitespace character is reached, or when the
        font, size or bottom coordinate changes.
        """

        def render(item: LTItem) -> None:
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    f"<div "
                    f"class='ocr_page' "
                    f"id='{item.pageid}' "
                    f"title='{self.bbox_repr(item.bbox)}'>\n",
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    f"<span class='ocr_line' title='{self.bbox_repr(item.bbox)}'>",
                )
                for child_line in item:
                    render(child_line)
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    f"<div "
                    f"class='ocr_block' "
                    f"id='{item.index}' "
                    f"title='{self.bbox_repr(item.bbox)}'>\n"
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                if not self.within_chars:
                    # First character of a new word.
                    self.within_chars = True
                    self.working_text = item.get_text()
                    self.working_bbox = item.bbox
                    self.working_font = item.fontname
                    self.working_size = item.size
                elif len(item.get_text().strip()) == 0:
                    # Whitespace ends the word and is passed through as is.
                    self.write_word()
                    self.write(item.get_text())
                else:
                    if (
                        self.working_bbox[1] != item.bbox[1]
                        or self.working_font != item.fontname
                        or self.working_size != item.size
                    ):
                        # Style changed mid-run: flush what was collected and
                        # start a new word with this character. write_word()
                        # leaves the old text in place and clears
                        # within_chars, so both are reset here; otherwise this
                        # character would be appended to the old text and then
                        # discarded by the next character.
                        self.write_word()
                        self.within_chars = True
                        self.working_text = ""
                        self.working_bbox = item.bbox
                        self.working_font = item.fontname
                        self.working_size = item.size
                    self.working_text += item.get_text()
                    # Extend the word's box to the right edge of this char.
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()