Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/high_level.py: 57%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Functions that can be used for the most common use-cases for pdfminer.six"""
3import logging
4import sys
5from io import StringIO
6from typing import Any, BinaryIO, Container, Iterator, Optional, cast
8from pdfminer.converter import (
9 HOCRConverter,
10 HTMLConverter,
11 PDFPageAggregator,
12 TextConverter,
13 XMLConverter,
14)
15from pdfminer.image import ImageWriter
16from pdfminer.layout import LAParams, LTPage
17from pdfminer.pdfdevice import PDFDevice, TagExtractor
18from pdfminer.pdfexceptions import PDFValueError
19from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
20from pdfminer.pdfpage import PDFPage
21from pdfminer.utils import AnyIO, FileOrName, open_filename
def extract_text_to_fp(
    inf: BinaryIO,
    outfp: AnyIO,
    output_type: str = "text",
    codec: str = "utf-8",
    laparams: Optional[LAParams] = None,
    maxpages: int = 0,
    page_numbers: Optional[Container[int]] = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: Optional[str] = None,
    strip_control: bool = False,
    debug: bool = False,
    disable_caching: bool = False,
    **kwargs: Any,
) -> None:
    """Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to. If it is
        `sys.stdout` and output_type is not 'text', `sys.stdout.buffer` is
        used instead.
    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
        Only 'text' works properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly. Not used for the 'tag' output type.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor; only passed on for the 'html' output type.
    :param rotation: Rotation factor; added to each page's own rotation,
        with the sum taken modulo 360.
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter. Only passed on for the 'html'
        output type.
    :param output_dir: If given, creates an ImageWriter for extracted images.
        The writer is handed to the 'text', 'xml' and 'html' converters only.
    :param strip_control: Passed as `stripcontrol` to the 'xml' and 'hocr'
        converters; has no effect for the other output types.
    :param debug: Output more logging data (sets the root logger to DEBUG).
    :param disable_caching: If True, `caching=False` is passed to the
        resource manager and to the page iterator.
    :param kwargs: Additional keyword arguments. They are accepted but not
        used by this function.
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    :raises PDFValueError: if output_type is not one of the supported values.
    """
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    rsrcmgr = PDFResourceManager(caching=not disable_caching)

    # Every converter except the plain-text one is handed the binary layer of
    # stdout. Identity is what is meant here, so compare with `is`.
    if output_type != "text" and outfp is sys.stdout:
        outfp = sys.stdout.buffer

    # Each branch below either assigns the device or raises, so no Optional
    # placeholder (and no runtime assert) is needed.
    device: PDFDevice
    if output_type == "text":
        device = TextConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
        )

    elif output_type == "xml":
        device = XMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
            stripcontrol=strip_control,
        )

    elif output_type == "html":
        device = HTMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            scale=scale,
            layoutmode=layoutmode,
            laparams=laparams,
            imagewriter=imagewriter,
        )

    elif output_type == "hocr":
        device = HOCRConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            stripcontrol=strip_control,
        )

    elif output_type == "tag":
        # Binary I/O is required, but we have no good way to test it here.
        device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)

    else:
        # Keep this list in sync with the branches above ('hocr' included).
        msg = (
            "Output type can be text, html, xml, hocr or tag "
            f"but is {output_type}"
        )
        raise PDFValueError(msg)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(
        inf,
        page_numbers,
        maxpages=maxpages,
        password=password,
        caching=not disable_caching,
    ):
        # Apply the caller's extra rotation on top of the page's own value.
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)

    device.close()
def extract_text(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Optional[Container[int]] = None,
    maxpages: int = 0,
    caching: bool = True,
    codec: str = "utf-8",
    laparams: Optional[LAParams] = None,
) -> str:
    """Parse a PDF file and return its text as a single string.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, a
        default-constructed LAParams() is used.
    :return: a string containing all of the text extracted.
    """
    layout_params = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as opened, StringIO() as text_buffer:
        # open_filename was asked for binary mode, so narrow the type.
        binary_stream = cast(BinaryIO, opened)

        resources = PDFResourceManager(caching=caching)
        converter = TextConverter(
            resources,
            text_buffer,
            codec=codec,
            laparams=layout_params,
        )
        page_interpreter = PDFPageInterpreter(resources, converter)

        selected_pages = PDFPage.get_pages(
            binary_stream,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for pdf_page in selected_pages:
            page_interpreter.process_page(pdf_page)

        # Read the buffer before the `with` block closes it.
        return text_buffer.getvalue()
def extract_pages(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Optional[Container[int]] = None,
    maxpages: int = 0,
    caching: bool = True,
    laparams: Optional[LAParams] = None,
) -> Iterator[LTPage]:
    """Lazily yield one LTPage layout object per processed page.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, a
        default-constructed LAParams() is used.
    :return: LTPage objects
    """
    layout_params = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as opened:
        # open_filename was asked for binary mode, so narrow the type.
        binary_stream = cast(BinaryIO, opened)

        resources = PDFResourceManager(caching=caching)
        aggregator = PDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        selected_pages = PDFPage.get_pages(
            binary_stream,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for pdf_page in selected_pages:
            page_interpreter.process_page(pdf_page)
            # The aggregator holds the layout of the page just processed.
            yield aggregator.get_result()