Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/converter.py: 30%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import io
2import logging
3import re
4from typing import (
5 BinaryIO,
6 Dict,
7 Generic,
8 List,
9 Optional,
10 Sequence,
11 TextIO,
12 Tuple,
13 TypeVar,
14 Union,
15 cast,
16)
18from pdfminer import utils
19from pdfminer.image import ImageWriter
20from pdfminer.layout import (
21 LAParams,
22 LTAnno,
23 LTChar,
24 LTComponent,
25 LTContainer,
26 LTCurve,
27 LTFigure,
28 LTImage,
29 LTItem,
30 LTLayoutContainer,
31 LTLine,
32 LTPage,
33 LTRect,
34 LTText,
35 LTTextBox,
36 LTTextBoxVertical,
37 LTTextGroup,
38 LTTextLine,
39 TextGroupElement,
40)
41from pdfminer.pdfcolor import PDFColorSpace
42from pdfminer.pdfdevice import PDFTextDevice
43from pdfminer.pdfexceptions import PDFValueError
44from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined
45from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
46from pdfminer.pdfpage import PDFPage
47from pdfminer.pdftypes import PDFStream
48from pdfminer.utils import (
49 AnyIO,
50 Matrix,
51 PathSegment,
52 Point,
53 Rect,
54 apply_matrix_pt,
55 apply_matrix_rect,
56 bbox2str,
57 enc,
58 make_compat_str,
59 mult_matrix,
60)
# Module-level logger named after this module; used below for debug output
# about characters that have no Unicode mapping.
log = logging.getLogger(__name__)
class PDFLayoutAnalyzer(PDFTextDevice):
    """Text device that assembles layout objects from interpreter callbacks.

    Characters, images and painted paths are added to ``cur_item`` (an
    ``LTPage`` or a nested ``LTFigure``). When a page ends it is optionally
    analysed with ``laparams`` and handed to :meth:`receive_layout`, which
    subclasses override to consume the result.
    """

    # Container currently receiving items: the page, or the innermost figure.
    cur_item: LTLayoutContainer
    # Current transformation matrix; read here but assigned outside this class.
    ctm: Matrix

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Store the starting page number and optional layout parameters."""
        PDFTextDevice.__init__(self, rsrcmgr)
        self.pageno = pageno
        self.laparams = laparams
        # Containers suspended while a nested figure is being filled.
        self._stack: List[LTLayoutContainer] = []

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Start a new ``LTPage`` sized from the transformed media box."""
        left, bottom, right, top = apply_matrix_rect(ctm, page.mediabox)
        width = abs(left - right)
        height = abs(bottom - top)
        self.cur_item = LTPage(self.pageno, (0, 0, width, height))

    def end_page(self, page: PDFPage) -> None:
        """Analyse the finished page (if requested) and pass it on."""
        assert not self._stack, str(len(self._stack))
        finished_page = self.cur_item
        assert isinstance(finished_page, LTPage), str(type(finished_page))
        if self.laparams is not None:
            finished_page.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(finished_page)

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Suspend the current container and make a new figure current."""
        self._stack.append(self.cur_item)
        figure_matrix = mult_matrix(matrix, self.ctm)
        self.cur_item = LTFigure(name, bbox, figure_matrix)

    def end_figure(self, _: str) -> None:
        """Close the current figure and attach it to its parent container."""
        finished_figure = self.cur_item
        assert isinstance(finished_figure, LTFigure), str(type(finished_figure))
        self.cur_item = self._stack.pop()
        self.cur_item.add(finished_figure)

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Add an ``LTImage`` covering the whole of the current figure."""
        figure = self.cur_item
        assert isinstance(figure, LTFigure), str(type(figure))
        figure_bbox = (figure.x0, figure.y0, figure.x1, figure.y1)
        figure.add(LTImage(name, stream, figure_bbox))

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Paint paths described in section 4.4 of the PDF reference manual.

        The path is classified by the sequence of its operator letters and
        added to the current container as an ``LTLine``, ``LTRect`` or
        ``LTCurve``. Paths with several ``m`` operators are split and each
        subpath is painted separately.
        """
        shape = "".join(segment[0] for segment in path)

        if shape[:1] != "m":
            # Per PDF Reference Section 4.4.1, "path construction operators may
            # be invoked in any sequence, but the first one invoked must be m
            # or re to begin a new subpath." Since pdfminer.six already
            # converts all `re` (rectangle) operators to their equivalent
            # `mlllh` representation, paths ingested by `.paint_path(...)` that
            # do not begin with the `m` operator are invalid and are ignored.
            return

        if shape.count("m") > 1:
            # Several subpaths: paint each "m..." run on its own.
            for found in re.finditer(r"m[^m]+", shape):
                subpath = path[found.start(0) : found.end(0)]
                self.paint_path(gstate, stroke, fill, evenodd, subpath)
            return

        # Although the 'h' command does not literally provide a
        # point-position, its position is (by definition) equal to the
        # subpath's starting point.
        #
        # And, per Section 4.4's Table 4.9, all other path commands place
        # their point-position in their final two arguments. (Any preceding
        # arguments represent control points on Bézier curves.)
        start_point = path[0][-2:]
        raw_pts = [
            cast(Point, start_point if segment[0] == "h" else segment[-2:])
            for segment in path
        ]
        pts = [apply_matrix_pt(self.ctm, raw) for raw in raw_pts]

        # Same path, with every operand pair mapped through the CTM.
        transformed_path = []
        for segment in path:
            operands = segment[1:]
            moved = [
                apply_matrix_pt(self.ctm, (float(first), float(second)))
                for first, second in zip(operands[0::2], operands[1::2])
            ]
            transformed_path.append(cast(PathSegment, (str(segment[0]), *moved)))

        # Drop a redundant "l" on a path closed with "h"
        if len(shape) > 3 and shape.endswith("lh") and pts[-2] == pts[0]:
            shape = shape[:-2] + "h"
            pts.pop()

        if shape in {"mlh", "ml"}:
            # Single line segment. 'ml' (no closing 'h') is a frequent
            # anomaly that we want to support.
            self.cur_item.add(
                LTLine(
                    gstate.linewidth,
                    pts[0],
                    pts[1],
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    original_path=transformed_path,
                    dashing_style=gstate.dash,
                ),
            )
            return

        # A five-point path is a rectangle only if it closes on itself and
        # its edges alternate between vertical and horizontal.
        is_rect = False
        if shape in {"mlllh", "mllll"}:
            (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
            closes_on_start = pts[0] == pts[4]
            vertical_first = x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
            horizontal_first = y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0
            is_rect = closes_on_start and (vertical_first or horizontal_first)

        if is_rect:
            self.cur_item.add(
                LTRect(
                    gstate.linewidth,
                    (*pts[0], *pts[2]),
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    transformed_path,
                    gstate.dash,
                ),
            )
        else:
            self.cur_item.add(
                LTCurve(
                    gstate.linewidth,
                    pts,
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    transformed_path,
                    gstate.dash,
                ),
            )

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Add one character to the current container and return its advance.

        If the font has no Unicode mapping for ``cid`` the text comes from
        :meth:`handle_undefined_char` instead.
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        char_width = font.char_width(cid)
        char_disp = font.char_disp(cid)
        char_item = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            char_width,
            char_disp,
            ncs,
            graphicstate,
        )
        self.cur_item.add(char_item)
        return char_item.adv

    def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
        """Return the placeholder text ``(cid:N)`` for an unmapped character."""
        log.debug("undefined: %r, %r", font, cid)
        return "(cid:%d)" % cid

    def receive_layout(self, ltpage: LTPage) -> None:
        """Hook called with each finished page; does nothing by default."""
class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that keeps the most recently finished page."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Initialise the analyzer with no stored page yet."""
        super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
        self.result: Optional[LTPage] = None

    def receive_layout(self, ltpage: LTPage) -> None:
        """Remember ``ltpage``, replacing any previously stored page."""
        self.result = ltpage

    def get_result(self) -> LTPage:
        """Return the stored page; asserts that one has been received."""
        latest = self.result
        assert latest is not None
        return latest
# Type variable for the output stream a converter writes to. It is constrained
# to TextIO, BinaryIO or AnyIO because some PDFConverter children support only
# binary I/O.
IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)
class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
    """Base class for layout analyzers that write to an output stream."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: IOType,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Store the output stream and codec and detect the stream kind."""
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp: IOType = outfp
        self.codec = codec
        # True when ``outfp`` should be fed bytes rather than str.
        self.outfp_binary = self._is_binary_stream(self.outfp)

    @staticmethod
    def _is_binary_stream(outfp: AnyIO) -> bool:
        """Test if a stream is binary or not.

        A ``mode`` attribute, when present, decides the answer on its own.
        Otherwise in-memory streams are classified by type, and anything
        unrecognised is treated as binary.
        """
        if hasattr(outfp, "mode"):
            # The stream declares a mode: binary exactly when it contains "b".
            return "b" in outfp.mode
        if isinstance(outfp, io.BytesIO):
            return True
        if isinstance(outfp, (io.StringIO, io.TextIOBase)):
            return False
        return True
class TextConverter(PDFConverter[AnyIO]):
    """Converter that writes the text content of each page to ``outfp``."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        showpageno: bool = False,
        imagewriter: Optional[ImageWriter] = None,
    ) -> None:
        """Configure page-number output and optional image export."""
        super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        self.imagewriter = imagewriter

    def write_text(self, text: str) -> None:
        """Write ``text`` to the output, as bytes if the stream is binary."""
        prepared = utils.compatible_encode_method(text, self.codec, "ignore")
        if not self.outfp_binary:
            cast(TextIO, self.outfp).write(prepared)
            return
        cast(BinaryIO, self.outfp).write(prepared.encode())

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the text of ``ltpage`` followed by a form feed."""

        def emit(node: LTItem) -> None:
            # Containers are walked recursively; text leaves are written out.
            if isinstance(node, LTContainer):
                for member in node:
                    emit(member)
            elif isinstance(node, LTText):
                self.write_text(node.get_text())

            # Independently of the above: text boxes are followed by a
            # newline, and images are exported when a writer is configured.
            if isinstance(node, LTTextBox):
                self.write_text("\n")
            elif isinstance(node, LTImage) and self.imagewriter is not None:
                self.imagewriter.export_image(node)

        if self.showpageno:
            self.write_text("Page %s\n" % ltpage.pageid)
        emit(ltpage)
        self.write_text("\f")

    # Some dummy functions to save memory/CPU when all that is wanted
    # is text. This stops all the image and drawing output from being
    # recorded and taking up RAM.
    def render_image(self, name: str, stream: PDFStream) -> None:
        """Record the image only when an image writer is configured."""
        if self.imagewriter is None:
            return
        PDFConverter.render_image(self, name, stream)

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Ignore painted paths; they carry no text."""
class HTMLConverter(PDFConverter[AnyIO]):
    """Converter that writes each page as absolutely positioned HTML."""

    # Border colours merged into ``rect_colors`` when ``debug`` is set.
    RECT_COLORS = {
        "figure": "yellow",
        "textline": "magenta",
        "textbox": "cyan",
        "textgroup": "red",
        "curve": "black",
        "page": "gray",
    }

    # Text colours merged into ``text_colors`` when ``debug`` is set.
    TEXT_COLORS = {
        "textbox": "blue",
        "char": "black",
    }

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        scale: float = 1,
        fontscale: float = 1.0,
        layoutmode: str = "normal",
        showpageno: bool = True,
        pagemargin: int = 50,
        imagewriter: Optional[ImageWriter] = None,
        debug: int = 0,
        rect_colors: Optional[Dict[str, str]] = None,
        text_colors: Optional[Dict[str, str]] = None,
    ) -> None:
        """Configure the converter and immediately write the HTML header.

        :raises PDFValueError: if ``outfp`` is binary and no codec is given,
            or if ``outfp`` is a text stream and a codec is given.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        if text_colors is None:
            text_colors = {"char": "black"}
        if rect_colors is None:
            rect_colors = {"curve": "black", "page": "gray"}

        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.imagewriter = imagewriter
        # Work on private copies: the debug branch below updates these
        # mappings, and that must not leak into the caller's dictionaries.
        self.rect_colors = dict(rect_colors)
        self.text_colors = dict(text_colors)
        if debug:
            self.rect_colors.update(self.RECT_COLORS)
            self.text_colors.update(self.TEXT_COLORS)
        # Running vertical offset of the current page within the document.
        self._yoffset: float = self.pagemargin
        # Font of the currently open <span>, if any, as (name, size).
        self._font: Optional[Tuple[str, float]] = None
        self._fontstack: List[Optional[Tuple[str, float]]] = []
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoding it first when a codec is configured."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening tags and the Content-Type meta element."""
        self.write("<html><head>\n")
        if self.codec:
            s = (
                '<meta http-equiv="Content-Type" content="text/html; '
                'charset=%s">\n' % self.codec
            )
        else:
            s = '<meta http-equiv="Content-Type" content="text/html">\n'
        self.write(s)
        self.write("</head><body>\n")

    def write_footer(self) -> None:
        """Write links to every page number below ``pageno`` and close the document."""
        page_links = [f'<a href="#{i}">{i}</a>' for i in range(1, self.pageno)]
        s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % ", ".join(
            page_links,
        )
        self.write(s)
        self.write("</body></html>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` after passing it through ``enc``."""
        self.write(enc(text))

    def place_rect(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Write a bordered span, if ``color`` is a key of ``rect_colors``."""
        color2 = self.rect_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; border: %s %dpx solid; '
                'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
                % (
                    color2,
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
        """Draw a border around ``item`` using :meth:`place_rect`."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

    def place_image(
        self,
        item: LTImage,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Export ``item`` and write an <img> tag; no-op without a writer."""
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            s = (
                '<img src="%s" border="%d" style="position:absolute; '
                'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
                % (
                    enc(name),
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_text(
        self,
        color: str,
        text: str,
        x: float,
        y: float,
        size: float,
    ) -> None:
        """Write positioned text, if ``color`` is a key of ``text_colors``."""
        color2 = self.text_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; color:%s; left:%dpx; '
                'top:%dpx; font-size:%dpx;">'
                % (
                    color2,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    size * self.scale * self.fontscale,
                )
            )
            self.write(s)
            self.write_text(text)
            self.write("</span>\n")

    def begin_div(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
        writing_mode: str = "False",
    ) -> None:
        """Open a positioned <div> and suspend the current font span state.

        ``color`` is written into the border style as given; it is not
        looked up in ``rect_colors``.
        """
        self._fontstack.append(self._font)
        self._font = None
        s = (
            '<div style="position:absolute; border: %s %dpx solid; '
            "writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
            'height:%dpx;">'
            % (
                color,
                borderwidth,
                writing_mode,
                x * self.scale,
                (self._yoffset - y) * self.scale,
                w * self.scale,
                h * self.scale,
            )
        )
        self.write(s)

    def end_div(self, color: str) -> None:
        """Close any open font span, restore the outer font and close the div."""
        if self._font is not None:
            self.write("</span>")
        self._font = self._fontstack.pop()
        self.write("</div>")

    def put_text(self, text: str, fontname: str, fontsize: float) -> None:
        """Write ``text``, opening a new font span when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write("</span>")
            # Remove subset tag from fontname, see PDF Reference 5.5.3
            fontname_without_subset_tag = fontname.split("+")[-1]
            # The font name comes from the PDF, so escape it before placing
            # it inside the style attribute.
            self.write(
                '<span style="font-family: %s; font-size:%dpx">'
                % (
                    enc(fontname_without_subset_tag),
                    fontsize * self.scale * self.fontscale,
                ),
            )
            self._font = font
        self.write_text(text)

    def put_newline(self) -> None:
        """Write a line break."""
        self.write("<br>")

    def receive_layout(self, ltpage: LTPage) -> None:
        """Render ``ltpage`` and advance the vertical offset past it."""

        def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
            # Outline text groups recursively; other elements are ignored.
            if isinstance(item, LTTextGroup):
                self.place_border("textgroup", 1, item)
                for child in item:
                    show_group(child)

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                self._yoffset += item.y1
                self.place_border("page", 1, item)
                if self.showpageno:
                    self.write(
                        '<div style="position:absolute; top:%dpx;">'
                        % ((self._yoffset - item.y1) * self.scale),
                    )
                    self.write(
                        f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
                    )
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border("curve", 1, item)
            elif isinstance(item, LTFigure):
                self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
                self.end_div("figure")
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
            elif self.layoutmode == "exact":
                # Exact mode positions every character individually.
                if isinstance(item, LTTextLine):
                    self.place_border("textline", 1, item)
                    for child in item:
                        render(child)
                elif isinstance(item, LTTextBox):
                    self.place_border("textbox", 1, item)
                    self.place_text(
                        "textbox",
                        str(item.index + 1),
                        item.x0,
                        item.y1,
                        20,
                    )
                    for child in item:
                        render(child)
                elif isinstance(item, LTChar):
                    self.place_border("char", 1, item)
                    self.place_text(
                        "char",
                        item.get_text(),
                        item.x0,
                        item.y1,
                        item.size,
                    )
            elif isinstance(item, LTTextLine):
                for child in item:
                    render(child)
                if self.layoutmode != "loose":
                    self.put_newline()
            elif isinstance(item, LTTextBox):
                self.begin_div(
                    "textbox",
                    1,
                    item.x0,
                    item.y1,
                    item.width,
                    item.height,
                    item.get_writing_mode(),
                )
                for child in item:
                    render(child)
                self.end_div("textbox")
            elif isinstance(item, LTChar):
                fontname = make_compat_str(item.fontname)
                self.put_text(item.get_text(), fontname, item.size)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())

        render(ltpage)
        self._yoffset += self.pagemargin

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()
class XMLConverter(PDFConverter[AnyIO]):
    """Converter that writes the layout tree of each page as XML."""

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        imagewriter: Optional[ImageWriter] = None,
        stripcontrol: bool = False,
    ) -> None:
        """Configure the converter and immediately write the XML header.

        :raises PDFValueError: if ``outfp`` is binary and no codec is given,
            or if ``outfp`` is a text stream and a codec is given.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        # The two mismatches are reported separately so the message names
        # the actual problem.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoding it first when a codec is configured."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the XML declaration and open the <pages> root element."""
        if self.codec:
            self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write("<pages>\n")

    def write_footer(self) -> None:
        """Close the <pages> root element."""
        self.write("</pages>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` via ``enc``, stripping control characters if asked."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(enc(text))

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write ``ltpage`` and all of its children as XML elements."""

        def show_group(item: LTItem) -> None:
            # Emit the <layout> summary: text boxes as empty elements, text
            # groups as nested elements.
            if isinstance(item, LTTextBox):
                self.write(
                    '<textbox id="%d" bbox="%s" />\n'
                    % (item.index, bbox2str(item.bbox)),
                )
            elif isinstance(item, LTTextGroup):
                self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    show_group(child)
                self.write("</textgroup>\n")

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
                    item.pageid,
                    bbox2str(item.bbox),
                    item.rotate,
                )
                self.write(s)
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.write("<layout>\n")
                    for group in item.groups:
                        show_group(group)
                    self.write("</layout>\n")
                self.write("</page>\n")
            # LTLine and LTRect are tested before the more general LTCurve.
            elif isinstance(item, LTLine):
                s = '<line linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTRect):
                s = '<rect linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTCurve):
                s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                    item.get_pts(),
                )
                self.write(s)
            elif isinstance(item, LTFigure):
                # The figure name comes from the PDF, so escape it before
                # placing it inside an attribute value.
                s = f'<figure name="{enc(item.name)}" bbox="{bbox2str(item.bbox)}">\n'
                self.write(s)
                for child in item:
                    render(child)
                self.write("</figure>\n")
            elif isinstance(item, LTTextLine):
                self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    render(child)
                self.write("</textline>\n")
            elif isinstance(item, LTTextBox):
                wmode = ""
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                s = '<textbox id="%d" bbox="%s"%s>\n' % (
                    item.index,
                    bbox2str(item.bbox),
                    wmode,
                )
                self.write(s)
                for child in item:
                    render(child)
                self.write("</textbox>\n")
            elif isinstance(item, LTChar):
                s = (
                    '<text font="%s" bbox="%s" colourspace="%s" '
                    'ncolour="%s" size="%.3f">'
                    % (
                        enc(item.fontname),
                        bbox2str(item.bbox),
                        item.ncs.name,
                        item.graphicstate.ncolor,
                        item.size,
                    )
                )
                self.write(s)
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTText):
                # Route through write_text so this text is escaped and
                # honours ``stripcontrol`` like character text does.
                self.write("<text>")
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    name = self.imagewriter.export_image(item)
                    self.write(
                        '<image src="%s" width="%d" height="%d" />\n'
                        % (enc(name), item.width, item.height),
                    )
                else:
                    self.write(
                        '<image width="%d" height="%d" />\n'
                        % (item.width, item.height),
                    )
            else:
                assert False, str(("Unhandled", item))

        render(ltpage)

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()
class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF."""

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        stripcontrol: bool = False,
    ) -> None:
        """Configure the converter and immediately write the hOCR header."""
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )
        self.stripcontrol = stripcontrol
        # True while characters are being accumulated into a word.
        self.within_chars = False
        # State of the word being accumulated. Given defined starting values
        # here so that no method can hit a missing attribute.
        self.working_text: str = ""
        self.working_bbox: Rect = (0, 0, 0, 0)
        self.working_font: str = ""
        self.working_size: float = 0
        # Bounding box of the page being rendered; set by receive_layout.
        self.page_bbox: Rect = (0, 0, 0, 0)
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Return ``bbox`` as an hOCR ``bbox x0 y0 x1 y1`` property."""
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write raw markup, encoding it first when a codec is configured."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening <html>, the <head> metadata and <body>."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec,
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en'>\n",
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        self.write(
            "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
        )
        self.write(
            "<meta name='ocr-system' content='pdfminer.six HOCR Converter' />\n",
        )
        self.write(
            "  <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Write the closing tags, with a commented-out debug script."""
        self.write("<!-- comment in the following line to debug -->\n")
        self.write(
            "<!--script src='https://unpkg.com/hocrjs'></script--></body></html>\n",
        )

    def write_text(self, text: str) -> None:
        """Write ``text`` unescaped, stripping control characters if asked."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Emit the accumulated word as an ``ocrx_word`` span and end the word.

        Nothing is written when no text has been accumulated. The word text
        and font name come from the PDF, so both are escaped with ``enc``
        before being placed in the markup.
        """
        if len(self.working_text) > 0:
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            word_text = self.working_text.strip()
            if self.stripcontrol:
                word_text = self.CONTROL.sub("", word_text)
            safe_font = enc(self.working_font)
            self.write(
                "<span style='font:\"%s\"; font-size:%d; %s' "
                "class='ocrx_word' title='%s; x_font %s; "
                "x_fsize %d'>%s</span>"
                % (
                    safe_font,
                    self.working_size,
                    bold_and_italic_styles,
                    self.bbox_repr(self.working_bbox),
                    safe_font,
                    self.working_size,
                    enc(word_text),
                ),
            )
        self.within_chars = False

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write ``ltpage`` as nested hOCR page, block, line and word elements."""

        def render(item: LTItem) -> None:
            # An annotation (inserted space or newline) ends the current word.
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    "<div class='ocr_page' id='%s' title='%s'>\n"
                    % (item.pageid, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    "<span class='ocr_line' title='%s'>" % (self.bbox_repr(item.bbox)),
                )
                for child_line in item:
                    render(child_line)
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    "<div class='ocr_block' id='%d' title='%s'>\n"
                    % (item.index, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                if not self.within_chars:
                    # First character of a new word.
                    self.within_chars = True
                    self.working_text = item.get_text()
                    self.working_bbox = item.bbox
                    self.working_font = item.fontname
                    self.working_size = item.size
                elif len(item.get_text().strip()) == 0:
                    # Whitespace character: finish the word, keep the space.
                    self.write_word()
                    self.write_text(item.get_text())
                else:
                    if (
                        self.working_bbox[1] != item.bbox[1]
                        or self.working_font != item.fontname
                        or self.working_size != item.size
                    ):
                        # Baseline, font or size changed mid-word: flush what
                        # we have and start a fresh word with this character.
                        # write_word() clears within_chars, so it is set again
                        # and the buffer emptied; otherwise this character
                        # would be appended to the stale text and then lost
                        # when the next character restarts the word.
                        self.write_word()
                        self.within_chars = True
                        self.working_text = ""
                        self.working_bbox = item.bbox
                        self.working_font = item.fontname
                        self.working_size = item.size
                    self.working_text += item.get_text()
                    # Extend the word's box to the right edge of this char.
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Write the document footer."""
        self.write_footer()