Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/high_level.py: 42%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Functions that can be used for the most common use-cases for pdfminer.six"""
3import logging
4import sys
5from collections.abc import Container, Iterator
6from io import StringIO
7from typing import Any, BinaryIO, cast
9from pdfminer.converter import (
10 HOCRConverter,
11 HTMLConverter,
12 PDFPageAggregator,
13 TextConverter,
14 XMLConverter,
15)
16from pdfminer.image import ImageWriter
17from pdfminer.layout import LAParams, LTPage
18from pdfminer.pdfdevice import PDFDevice, TagExtractor
19from pdfminer.pdfexceptions import PDFValueError
20from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
21from pdfminer.pdfpage import PDFPage
22from pdfminer.utils import AnyIO, FileOrName, open_filename
def extract_text_to_fp(
    inf: BinaryIO,
    outfp: AnyIO,
    output_type: str = "text",
    codec: str = "utf-8",
    laparams: LAParams | None = None,
    maxpages: int = 0,
    page_numbers: Container[int] | None = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: str | None = None,
    strip_control: bool = False,
    debug: bool = False,
    disable_caching: bool = False,
    **kwargs: Any,
) -> None:
    """Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to. If it is
        ``sys.stdout`` and the output type is not 'text',
        ``sys.stdout.buffer`` is written to instead.
    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
        Only 'text' works properly.
    :param codec: Text decoding codec, passed on to the selected converter.
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor; only passed to the 'html' converter.
    :param rotation: Added to every page's ``rotate`` value, modulo 360.
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter; only passed to the 'html' converter.
    :param output_dir: If given, creates an ImageWriter for extracted images.
    :param strip_control: Passed as ``stripcontrol`` to the 'xml' and 'hocr'
        converters.
    :param debug: If true, sets the root logger level to DEBUG.
    :param disable_caching: If true, caching is switched off for both the
        resource manager and the page iteration.
    :param kwargs: Extra keyword arguments are accepted but not used by this
        function.
    :raises PDFValueError: if ``output_type`` is not one of the supported
        values.
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    """
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    rsrcmgr = PDFResourceManager(caching=not disable_caching)
    # Every branch below either assigns the device or raises.
    device: PDFDevice

    # Non-text converters are handed the underlying binary buffer of stdout.
    if output_type != "text" and outfp == sys.stdout:
        outfp = sys.stdout.buffer

    if output_type == "text":
        device = TextConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
        )

    elif output_type == "xml":
        device = XMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
            stripcontrol=strip_control,
        )

    elif output_type == "html":
        device = HTMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            scale=scale,
            layoutmode=layoutmode,
            laparams=laparams,
            imagewriter=imagewriter,
        )

    elif output_type == "hocr":
        device = HOCRConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            stripcontrol=strip_control,
        )

    elif output_type == "tag":
        # Binary I/O is required, but we have no good way to test it here.
        device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)

    else:
        msg = (
            "Output type can be text, html, xml, hocr or tag "
            f"but is {output_type}"
        )
        raise PDFValueError(msg)

    # Close the device even when a page fails to parse, so that it is never
    # left open on the error path.
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(
            inf,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=not disable_caching,
        ):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    finally:
        device.close()
def extract_text(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Container[int] | None = None,
    maxpages: int = 0,
    caching: bool = True,
    codec: str = "utf-8",
    laparams: LAParams | None = None,
) -> str:
    """Return all text found in a PDF file as a single string.

    :param pdf_file: Path to the PDF, or an already opened file-like object.
    :param password: Password used to decrypt an encrypted PDF.
    :param page_numbers: Zero-indexed page numbers to extract.
    :param maxpages: Upper bound on the number of pages to parse.
    :param caching: Whether resources should be cached.
    :param codec: Text decoding codec handed to the text converter.
    :param laparams: Layout parameters from pdfminer.layout. When None, a
        default-constructed LAParams is used.
    :return: the extracted text.
    """
    layout_params = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as raw_fp:
        with StringIO() as text_buffer:
            pdf_stream = cast(BinaryIO, raw_fp)  # opened in binary mode above
            manager = PDFResourceManager(caching=caching)
            converter = TextConverter(
                manager,
                text_buffer,
                codec=codec,
                laparams=layout_params,
            )
            page_interpreter = PDFPageInterpreter(manager, converter)

            pages = PDFPage.get_pages(
                pdf_stream,
                page_numbers,
                maxpages=maxpages,
                password=password,
                caching=caching,
            )
            for pdf_page in pages:
                page_interpreter.process_page(pdf_page)

            # Read the buffer before the StringIO context closes it.
            return text_buffer.getvalue()
def extract_pages(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Container[int] | None = None,
    maxpages: int = 0,
    caching: bool = True,
    laparams: LAParams | None = None,
) -> Iterator[LTPage]:
    """Lazily yield one LTPage layout per processed page of a PDF file.

    :param pdf_file: Path to the PDF, or an already opened file-like object.
    :param password: Password used to decrypt an encrypted PDF.
    :param page_numbers: Zero-indexed page numbers to extract.
    :param maxpages: Upper bound on the number of pages to parse.
    :param caching: Whether resources should be cached.
    :param laparams: Layout parameters from pdfminer.layout. When None, a
        default-constructed LAParams is used.
    :return: LTPage objects, yielded one at a time.
    """
    layout_params = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as raw_fp:
        pdf_stream = cast(BinaryIO, raw_fp)  # opened in binary mode above
        manager = PDFResourceManager(caching=caching)
        aggregator = PDFPageAggregator(manager, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(manager, aggregator)

        pages = PDFPage.get_pages(
            pdf_stream,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for pdf_page in pages:
            page_interpreter.process_page(pdf_page)
            # Fetch the aggregator's result right after each page is processed.
            yield aggregator.get_result()