Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/converter.py: 38%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import io
2import logging
3import re
4from typing import (
5 BinaryIO,
6 Dict,
7 Generic,
8 List,
9 Optional,
10 Sequence,
11 TextIO,
12 Tuple,
13 TypeVar,
14 Union,
15 cast,
16)
18from pdfminer import utils
19from pdfminer.image import ImageWriter
20from pdfminer.layout import (
21 LAParams,
22 LTAnno,
23 LTChar,
24 LTComponent,
25 LTContainer,
26 LTCurve,
27 LTFigure,
28 LTImage,
29 LTItem,
30 LTLayoutContainer,
31 LTLine,
32 LTPage,
33 LTRect,
34 LTText,
35 LTTextBox,
36 LTTextBoxVertical,
37 LTTextGroup,
38 LTTextLine,
39 TextGroupElement,
40)
41from pdfminer.pdfcolor import PDFColorSpace
42from pdfminer.pdfdevice import PDFTextDevice
43from pdfminer.pdfexceptions import PDFValueError
44from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined
45from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
46from pdfminer.pdfpage import PDFPage
47from pdfminer.pdftypes import PDFStream
48from pdfminer.utils import (
49 AnyIO,
50 Matrix,
51 PathSegment,
52 Point,
53 Rect,
54 apply_matrix_pt,
55 bbox2str,
56 enc,
57 make_compat_str,
58 mult_matrix,
59)
# Module-level logger named after this module; used below to report
# character ids that have no Unicode mapping at debug level.
log = logging.getLogger(__name__)
class PDFLayoutAnalyzer(PDFTextDevice):
    """Text device that gathers rendered page content into layout objects.

    Characters, images and painted paths are added to the container that is
    currently open (a page, or a figure nested inside it). When a page ends
    the finished ``LTPage`` is passed to :meth:`receive_layout`, which
    subclasses override to consume it.
    """

    # Container currently receiving new layout items.
    cur_item: LTLayoutContainer
    # Current transformation matrix; read here but assigned elsewhere.
    ctm: Matrix

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Set up the analyzer.

        :param rsrcmgr: resource manager forwarded to ``PDFTextDevice``.
        :param pageno: id given to the first page; incremented per page.
        :param laparams: layout parameters; when ``None`` the page is
            delivered without calling ``analyze`` on it.
        """
        PDFTextDevice.__init__(self, rsrcmgr)
        self.pageno = pageno
        self.laparams = laparams
        # Parents of the currently open figure, innermost last.
        self._stack: List[LTLayoutContainer] = []

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Open a new ``LTPage`` sized from the transformed media box."""
        left, bottom, right, top = page.mediabox
        corner_a = apply_matrix_pt(ctm, (left, bottom))
        corner_b = apply_matrix_pt(ctm, (right, top))
        width = abs(corner_a[0] - corner_b[0])
        height = abs(corner_a[1] - corner_b[1])
        self.cur_item = LTPage(self.pageno, (0, 0, width, height))

    def end_page(self, page: PDFPage) -> None:
        """Finish the page: analyze it if requested and deliver it."""
        assert not self._stack, str(len(self._stack))
        assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
        finished_page = self.cur_item
        if self.laparams is not None:
            finished_page.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(finished_page)

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Open a figure; the current container is saved until it closes."""
        self._stack.append(self.cur_item)
        figure_matrix = mult_matrix(matrix, self.ctm)
        self.cur_item = LTFigure(name, bbox, figure_matrix)

    def end_figure(self, _: str) -> None:
        """Close the open figure and attach it to its parent container."""
        finished_figure = self.cur_item
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        parent = self._stack.pop()
        self.cur_item = parent
        parent.add(finished_figure)

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Add an image that fills the bounding box of the open figure."""
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        figure = self.cur_item
        figure_bbox = (figure.x0, figure.y0, figure.x1, figure.y1)
        figure.add(LTImage(name, stream, figure_bbox))

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Paint paths described in section 4.4 of the PDF reference manual"""
        shape = "".join(segment[0] for segment in path)

        if not shape.startswith("m"):
            # Per PDF Reference Section 4.4.1, "path construction operators may
            # be invoked in any sequence, but the first one invoked must be m
            # or re to begin a new subpath." Since pdfminer.six already
            # converts all `re` (rectangle) operators to their equivalent
            # `mlllh` representation, paths ingested by `.paint_path(...)` that
            # do not begin with the `m` operator are invalid and are ignored.
            return

        if shape.count("m") > 1:
            # Several subpaths: paint each "m..." run on its own.
            for match in re.finditer(r"m[^m]+", shape):
                subpath = path[match.start(0) : match.end(0)]
                self.paint_path(gstate, stroke, fill, evenodd, subpath)
            return

        # Although the 'h' command does not literally provide a
        # point-position, its position is (by definition) equal to the
        # subpath's starting point.
        #
        # And, per Section 4.4's Table 4.9, all other path commands place
        # their point-position in their final two arguments. (Any preceding
        # arguments represent control points on Bézier curves.)
        start_point = path[0][-2:]
        raw_pts = [
            cast(Point, start_point if segment[0] == "h" else segment[-2:])
            for segment in path
        ]
        pts = [apply_matrix_pt(self.ctm, raw_pt) for raw_pt in raw_pts]

        # The same path with every coordinate pair (control points included)
        # mapped through the current transformation matrix.
        transformed_path = []
        for segment in path:
            operator = str(segment[0])
            coords = [
                apply_matrix_pt(self.ctm, (float(first), float(second)))
                for first, second in zip(segment[1::2], segment[2::2])
            ]
            transformed_path.append(cast(PathSegment, (operator, *coords)))

        if shape in {"mlh", "ml"}:
            # single line segment
            #
            # Note: 'ml' is a frequent anomaly that we want to support.
            self.cur_item.add(
                LTLine(
                    gstate.linewidth,
                    pts[0],
                    pts[1],
                    stroke,
                    fill,
                    evenodd,
                    gstate.scolor,
                    gstate.ncolor,
                    original_path=transformed_path,
                    dashing_style=gstate.dash,
                ),
            )
            return

        if shape in {"mlllh", "mllll"}:
            (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts

            is_closed_loop = pts[0] == pts[4]
            starts_vertical = x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
            starts_horizontal = y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0
            if is_closed_loop and (starts_vertical or starts_horizontal):
                self.cur_item.add(
                    LTRect(
                        gstate.linewidth,
                        (*pts[0], *pts[2]),
                        stroke,
                        fill,
                        evenodd,
                        gstate.scolor,
                        gstate.ncolor,
                        transformed_path,
                        gstate.dash,
                    ),
                )
                return

        # Anything that is neither a single line nor a rectangle is a curve.
        self.cur_item.add(
            LTCurve(
                gstate.linewidth,
                pts,
                stroke,
                fill,
                evenodd,
                gstate.scolor,
                gstate.ncolor,
                transformed_path,
                gstate.dash,
            ),
        )

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Add one character to the current container and return its ``adv``."""
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        char = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            font.char_width(cid),
            font.char_disp(cid),
            ncs,
            graphicstate,
        )
        self.cur_item.add(char)
        return char.adv

    def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
        """Return the placeholder text used when ``cid`` has no Unicode mapping."""
        log.debug("undefined: %r, %r", font, cid)
        placeholder = "(cid:%d)" % cid
        return placeholder

    def receive_layout(self, ltpage: LTPage) -> None:
        """Hook called with each finished page; does nothing by default."""
class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that simply keeps the most recently finished page."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Forward all arguments to ``PDFLayoutAnalyzer``; no page is held yet."""
        super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
        # Last page handed to receive_layout, or None before the first one.
        self.result: Optional[LTPage] = None

    def receive_layout(self, ltpage: LTPage) -> None:
        """Remember ``ltpage`` so that :meth:`get_result` can return it."""
        self.result = ltpage

    def get_result(self) -> LTPage:
        """Return the stored page; asserts that one has been received."""
        page = self.result
        assert page is not None
        return page
# Some PDFConverter children support only binary I/O
#
# IOType is the type variable PDFConverter is generic over; it is constrained
# to a text stream, a binary stream, or the AnyIO alias from pdfminer.utils.
IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)
class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
    """Base class for converters that write analysed pages to a stream."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: IOType,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """Store the output stream and codec and detect the stream kind.

        :param rsrcmgr: resource manager forwarded to ``PDFLayoutAnalyzer``.
        :param outfp: stream the converter writes to.
        :param codec: codec name kept on ``self.codec`` for subclasses.
        :param pageno: id given to the first page.
        :param laparams: layout parameters, or ``None`` to skip analysis.
        """
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp: IOType = outfp
        self.codec = codec
        # True when outfp expects bytes rather than str.
        self.outfp_binary = self._is_binary_stream(self.outfp)

    @staticmethod
    def _is_binary_stream(outfp: AnyIO) -> bool:
        """Test if a stream is binary or not.

        A string ``mode`` attribute is authoritative: the stream is binary
        exactly when the mode contains ``"b"``. Otherwise the decision falls
        back to the stream's type, and anything unrecognised is treated as
        binary.
        """
        mode = getattr(outfp, "mode", None)
        # Only trust ``mode`` when it is a string. File-like objects are free
        # to expose something else there (gzip.GzipFile has historically used
        # an int), and a bare ``"b" in mode`` would raise TypeError for them.
        if isinstance(mode, str):
            return "b" in mode
        if isinstance(outfp, io.BytesIO):
            return True
        if isinstance(outfp, (io.StringIO, io.TextIOBase)):
            return False
        return True
class TextConverter(PDFConverter[AnyIO]):
    """Converter that writes the plain text of each page to ``outfp``."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        showpageno: bool = False,
        imagewriter: Optional[ImageWriter] = None,
    ) -> None:
        """Set up the converter.

        :param showpageno: write a ``Page <id>`` line before each page.
        :param imagewriter: when given, images are recorded and exported
            through it; when ``None`` images are dropped.

        The remaining arguments are forwarded to ``PDFConverter``.
        """
        super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        self.imagewriter = imagewriter

    def write_text(self, text: str) -> None:
        """Write ``text`` to the output, encoding it for binary streams."""
        normalised = utils.compatible_encode_method(text, self.codec, "ignore")
        if not self.outfp_binary:
            cast(TextIO, self.outfp).write(normalised)
            return
        cast(BinaryIO, self.outfp).write(normalised.encode())

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the text of ``ltpage`` followed by a form feed."""

        def render(item: LTItem) -> None:
            # Containers are walked recursively; text leaves are written out.
            if isinstance(item, LTContainer):
                for child in item:
                    render(child)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())

            # Checked separately from the chain above: a text box gets a
            # trailing newline after its children have been rendered.
            if isinstance(item, LTTextBox):
                self.write_text("\n")
            elif isinstance(item, LTImage) and self.imagewriter is not None:
                self.imagewriter.export_image(item)

        if self.showpageno:
            self.write_text("Page %s\n" % ltpage.pageid)
        render(ltpage)
        self.write_text("\f")

    # Some dummy functions to save memory/CPU when all that is wanted
    # is text. This stops all the image and drawing output from being
    # recorded and taking up RAM.
    def render_image(self, name: str, stream: PDFStream) -> None:
        """Record the image only when an image writer is configured."""
        if self.imagewriter is None:
            return
        PDFConverter.render_image(self, name, stream)

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Ignore painted paths; they carry no text."""
class HTMLConverter(PDFConverter[AnyIO]):
    """Converter that writes pages as absolutely positioned HTML."""

    # Border colours merged into ``rect_colors`` when ``debug`` is set.
    RECT_COLORS = {
        "figure": "yellow",
        "textline": "magenta",
        "textbox": "cyan",
        "textgroup": "red",
        "curve": "black",
        "page": "gray",
    }

    # Text colours merged into ``text_colors`` when ``debug`` is set.
    TEXT_COLORS = {
        "textbox": "blue",
        "char": "black",
    }

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        scale: float = 1,
        fontscale: float = 1.0,
        layoutmode: str = "normal",
        showpageno: bool = True,
        pagemargin: int = 50,
        imagewriter: Optional[ImageWriter] = None,
        debug: int = 0,
        rect_colors: Optional[Dict[str, str]] = None,
        text_colors: Optional[Dict[str, str]] = None,
    ) -> None:
        """Set up the converter and write the HTML header.

        :param scale: factor applied to every position and size written.
        :param fontscale: extra factor applied to font sizes.
        :param layoutmode: ``"exact"`` places every character absolutely;
            ``"loose"`` suppresses the line break after each text line.
        :param showpageno: write a named anchor with the page id per page.
        :param pagemargin: vertical gap added around each page.
        :param imagewriter: when given, images are exported and referenced.
        :param debug: when truthy, the class-level debug colours are merged
            into the rectangle and text colour tables.
        :param rect_colors: border colour per item kind; kinds without an
            entry are not drawn by ``place_rect``.
        :param text_colors: text colour per item kind for ``place_text``.
        :raises PDFValueError: if ``codec`` does not match the stream kind.

        The remaining arguments are forwarded to ``PDFConverter``.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.imagewriter = imagewriter
        # Work on private copies: the debug update below must not modify
        # dictionaries owned by the caller.
        self.rect_colors: Dict[str, str] = (
            {"curve": "black", "page": "gray"}
            if rect_colors is None
            else dict(rect_colors)
        )
        self.text_colors: Dict[str, str] = (
            {"char": "black"} if text_colors is None else dict(text_colors)
        )
        if debug:
            self.rect_colors.update(self.RECT_COLORS)
            self.text_colors.update(self.TEXT_COLORS)
        # Id of the first page, so the footer links match the page anchors.
        self._first_pageno = pageno
        self._yoffset: float = self.pagemargin
        self._font: Optional[Tuple[str, float]] = None
        self._fontstack: List[Optional[Tuple[str, float]]] = []
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoding it when a codec is configured."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening tags and the content-type meta element."""
        self.write("<html><head>\n")
        if self.codec:
            s = (
                '<meta http-equiv="Content-Type" content="text/html; '
                'charset=%s">\n' % self.codec
            )
        else:
            s = '<meta http-equiv="Content-Type" content="text/html">\n'
        self.write(s)
        self.write("</head><body>\n")

    def write_footer(self) -> None:
        """Write the list of page links and close the document."""
        # self.pageno has been incremented once per finished page, so the
        # pages written are first_pageno .. pageno - 1.
        page_links = [
            f'<a href="#{i}">{i}</a>'
            for i in range(self._first_pageno, self.pageno)
        ]
        s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % ", ".join(
            page_links,
        )
        self.write(s)
        self.write("</body></html>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` after escaping it with ``enc``."""
        self.write(enc(text))

    def place_rect(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Draw a bordered box if ``color`` has an entry in ``rect_colors``."""
        color2 = self.rect_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; border: %s %dpx solid; '
                'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
                % (
                    color2,
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
        """Draw a box around ``item`` using its bounding box."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

    def place_image(
        self,
        item: LTImage,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Export ``item`` and reference it; no-op without an image writer."""
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            s = (
                '<img src="%s" border="%d" style="position:absolute; '
                'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
                % (
                    enc(name),
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_text(
        self,
        color: str,
        text: str,
        x: float,
        y: float,
        size: float,
    ) -> None:
        """Write positioned text if ``color`` has an entry in ``text_colors``."""
        color2 = self.text_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; color:%s; left:%dpx; '
                'top:%dpx; font-size:%dpx;">'
                % (
                    color2,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    size * self.scale * self.fontscale,
                )
            )
            self.write(s)
            self.write_text(text)
            self.write("</span>\n")

    def begin_div(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
        writing_mode: str = "False",
    ) -> None:
        """Open a positioned ``div`` and suspend the current font span.

        ``color`` is written into the border style as given; it is not
        looked up in ``rect_colors``.
        """
        self._fontstack.append(self._font)
        self._font = None
        s = (
            '<div style="position:absolute; border: %s %dpx solid; '
            "writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
            'height:%dpx;">'
            % (
                color,
                borderwidth,
                writing_mode,
                x * self.scale,
                (self._yoffset - y) * self.scale,
                w * self.scale,
                h * self.scale,
            )
        )
        self.write(s)

    def end_div(self, color: str) -> None:
        """Close any open font span, restore the outer font, close the div."""
        if self._font is not None:
            self.write("</span>")
        self._font = self._fontstack.pop()
        self.write("</div>")

    def put_text(self, text: str, fontname: str, fontsize: float) -> None:
        """Write ``text``, opening a new font span when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write("</span>")
            # Remove subset tag from fontname, see PDF Reference 5.5.3
            fontname_without_subset_tag = fontname.split("+")[-1]
            # The font name comes from the PDF, so escape it before placing
            # it inside the style attribute.
            self.write(
                '<span style="font-family: %s; font-size:%dpx">'
                % (
                    enc(fontname_without_subset_tag),
                    fontsize * self.scale * self.fontscale,
                ),
            )
            self._font = font
        self.write_text(text)

    def put_newline(self) -> None:
        """Write a line break."""
        self.write("<br>")

    def receive_layout(self, ltpage: LTPage) -> None:
        """Render one page below the pages already written."""

        def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
            # Outline text groups recursively; other elements are skipped.
            if isinstance(item, LTTextGroup):
                self.place_border("textgroup", 1, item)
                for child in item:
                    show_group(child)

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                self._yoffset += item.y1
                self.place_border("page", 1, item)
                if self.showpageno:
                    self.write(
                        '<div style="position:absolute; top:%dpx;">'
                        % ((self._yoffset - item.y1) * self.scale),
                    )
                    self.write(
                        f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
                    )
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border("curve", 1, item)
            elif isinstance(item, LTFigure):
                self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
                self.end_div("figure")
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
            elif self.layoutmode == "exact":
                # Exact mode: every text item is positioned absolutely.
                if isinstance(item, LTTextLine):
                    self.place_border("textline", 1, item)
                    for child in item:
                        render(child)
                elif isinstance(item, LTTextBox):
                    self.place_border("textbox", 1, item)
                    self.place_text(
                        "textbox",
                        str(item.index + 1),
                        item.x0,
                        item.y1,
                        20,
                    )
                    for child in item:
                        render(child)
                elif isinstance(item, LTChar):
                    self.place_border("char", 1, item)
                    self.place_text(
                        "char",
                        item.get_text(),
                        item.x0,
                        item.y1,
                        item.size,
                    )
            elif isinstance(item, LTTextLine):
                for child in item:
                    render(child)
                if self.layoutmode != "loose":
                    self.put_newline()
            elif isinstance(item, LTTextBox):
                self.begin_div(
                    "textbox",
                    1,
                    item.x0,
                    item.y1,
                    item.width,
                    item.height,
                    item.get_writing_mode(),
                )
                for child in item:
                    render(child)
                self.end_div("textbox")
            elif isinstance(item, LTChar):
                fontname = make_compat_str(item.fontname)
                self.put_text(item.get_text(), fontname, item.size)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())

        render(ltpage)
        self._yoffset += self.pagemargin

    def close(self) -> None:
        """Finish the document by writing the footer."""
        self.write_footer()
class XMLConverter(PDFConverter[AnyIO]):
    """Converter that writes the layout tree of each page as XML."""

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        imagewriter: Optional[ImageWriter] = None,
        stripcontrol: bool = False,
    ) -> None:
        """Set up the converter and write the XML header.

        :param imagewriter: when given, images are exported and their names
            written as the ``src`` attribute.
        :param stripcontrol: remove control characters from character text.
        :raises PDFValueError: if ``codec`` does not match the stream kind.

        The remaining arguments are forwarded to ``PDFConverter``.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        # Both mismatches are rejected, each with a message naming the
        # actual problem.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoding it when a codec is configured."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the XML declaration and open the root ``pages`` element."""
        if self.codec:
            self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write("<pages>\n")

    def write_footer(self) -> None:
        """Close the root ``pages`` element."""
        self.write("</pages>\n")

    def write_text(self, text: str) -> None:
        """Write escaped text, dropping control characters if configured."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(enc(text))

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one ``page`` element for ``ltpage``."""

        def show_group(item: LTItem) -> None:
            # Text boxes are written as empty references; groups nest.
            if isinstance(item, LTTextBox):
                self.write(
                    '<textbox id="%d" bbox="%s" />\n'
                    % (item.index, bbox2str(item.bbox)),
                )
            elif isinstance(item, LTTextGroup):
                self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    show_group(child)
                self.write("</textgroup>\n")

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
                    item.pageid,
                    bbox2str(item.bbox),
                    item.rotate,
                )
                self.write(s)
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.write("<layout>\n")
                    for group in item.groups:
                        show_group(group)
                    self.write("</layout>\n")
                self.write("</page>\n")
            elif isinstance(item, LTLine):
                s = '<line linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTRect):
                s = '<rect linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTCurve):
                s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                    item.get_pts(),
                )
                self.write(s)
            elif isinstance(item, LTFigure):
                # The figure name comes from the PDF; escape it like the
                # font and image names so it cannot break the attribute.
                s = f'<figure name="{enc(item.name)}" bbox="{bbox2str(item.bbox)}">\n'
                self.write(s)
                for child in item:
                    render(child)
                self.write("</figure>\n")
            elif isinstance(item, LTTextLine):
                self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    render(child)
                self.write("</textline>\n")
            elif isinstance(item, LTTextBox):
                wmode = ""
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                s = '<textbox id="%d" bbox="%s"%s>\n' % (
                    item.index,
                    bbox2str(item.bbox),
                    wmode,
                )
                self.write(s)
                for child in item:
                    render(child)
                self.write("</textbox>\n")
            elif isinstance(item, LTChar):
                s = (
                    '<text font="%s" bbox="%s" colourspace="%s" '
                    'ncolour="%s" size="%.3f">'
                    % (
                        enc(item.fontname),
                        bbox2str(item.bbox),
                        item.ncs.name,
                        item.graphicstate.ncolor,
                        item.size,
                    )
                )
                self.write(s)
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTText):
                self.write("<text>%s</text>\n" % item.get_text())
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    name = self.imagewriter.export_image(item)
                    self.write(
                        '<image src="%s" width="%d" height="%d" />\n'
                        % (enc(name), item.width, item.height),
                    )
                else:
                    self.write(
                        '<image width="%d" height="%d" />\n'
                        % (item.width, item.height),
                    )
            else:
                assert False, str(("Unhandled", item))

        render(ltpage)

    def close(self) -> None:
        """Finish the document by writing the footer."""
        self.write_footer()
class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF."""

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        stripcontrol: bool = False,
    ) -> None:
        """Set up the converter and write the hOCR header.

        :param stripcontrol: remove control characters from the text written.

        The remaining arguments are forwarded to ``PDFConverter``.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )
        self.stripcontrol = stripcontrol
        # True while characters are being accumulated into a pending word.
        self.within_chars = False
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Format ``bbox`` as an hOCR ``bbox`` property for the current page."""
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write raw markup, encoding it when a codec is configured."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening ``html`` tag, the ``head`` section and ``<body>``."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec,
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en'>\n",
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        self.write(
            "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
        )
        self.write(
            "<meta name='ocr-system' content='pdfminer.six HOCR Converter' />\n",
        )
        self.write(
            " <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Write the closing tags, with a commented-out debugging script."""
        self.write("<!-- comment in the following line to debug -->\n")
        self.write(
            "<!--script src='https://unpkg.com/hocrjs'></script--></body></html>\n",
        )

    def write_text(self, text: str) -> None:
        """Write ``text`` as is, dropping control characters if configured."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Write the pending word as an ``ocrx_word`` span and reset the state.

        Nothing is written when no text has been accumulated. The word text
        and font name are passed through ``enc`` before being placed in the
        markup, and control characters are removed first when
        ``stripcontrol`` is set.
        """
        if self.working_text:
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            word = self.working_text
            if self.stripcontrol:
                word = self.CONTROL.sub("", word)
            # Both values originate from the PDF, so escape them for markup.
            font = enc(self.working_font)
            self.write(
                "<span style='font:\"%s\"; font-size:%d; %s' "
                "class='ocrx_word' title='%s; x_font %s; "
                "x_fsize %d'>%s</span>"
                % (
                    (
                        font,
                        self.working_size,
                        bold_and_italic_styles,
                        self.bbox_repr(self.working_bbox),
                        font,
                        self.working_size,
                        enc(word.strip()),
                    )
                ),
            )
        self.within_chars = False

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one ``ocr_page`` element for ``ltpage``."""

        def start_word(item: LTChar) -> None:
            # Begin accumulating a new word whose first character is ``item``.
            self.within_chars = True
            self.working_text = item.get_text()
            self.working_bbox = item.bbox
            self.working_font = item.fontname
            self.working_size = item.size

        def render(item: LTItem) -> None:
            # An annotation (e.g. an inserted space or newline) ends a word.
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    "<div class='ocr_page' id='%s' title='%s'>\n"
                    % (item.pageid, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                # Flush a word that was not terminated by an annotation so
                # that it is not lost or carried into the next page.
                if self.within_chars:
                    self.write_word()
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    "<span class='ocr_line' title='%s'>" % (self.bbox_repr(item.bbox)),
                )
                for child_line in item:
                    render(child_line)
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    "<div class='ocr_block' id='%d' title='%s'>\n"
                    % (item.index, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                if not self.within_chars:
                    start_word(item)
                elif len(item.get_text().strip()) == 0:
                    # Whitespace ends the word and is passed through.
                    self.write_word()
                    self.write_text(item.get_text())
                elif (
                    self.working_bbox[1] != item.bbox[1]
                    or self.working_font != item.fontname
                    or self.working_size != item.size
                ):
                    # A change of baseline, font or size ends the pending
                    # word; this character becomes the start of the next one.
                    # write_word() clears within_chars, so the new word must
                    # be started explicitly or this character would be lost.
                    self.write_word()
                    start_word(item)
                else:
                    # Same run: extend the word and stretch its right edge.
                    self.working_text += item.get_text()
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Finish the document by writing the footer."""
        self.write_footer()