Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/high_level.py

1"""Functions that can be used for the most common use-cases for pdfminer.six"""

3import logging

4import sys

5from io import StringIO

6from typing import Any, BinaryIO, Container, Iterator, Optional, cast

8from pdfminer.converter import (

9 HOCRConverter,

10 HTMLConverter,

11 PDFPageAggregator,

12 TextConverter,

13 XMLConverter,

14)

15from pdfminer.image import ImageWriter

16from pdfminer.layout import LAParams, LTPage

17from pdfminer.pdfdevice import PDFDevice, TagExtractor

18from pdfminer.pdfexceptions import PDFValueError

19from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager

20from pdfminer.pdfpage import PDFPage

21from pdfminer.utils import AnyIO, FileOrName, open_filename

def extract_text_to_fp(
    inf: BinaryIO,
    outfp: AnyIO,
    output_type: str = "text",
    codec: str = "utf-8",
    laparams: Optional[LAParams] = None,
    maxpages: int = 0,
    page_numbers: Optional[Container[int]] = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: Optional[str] = None,
    strip_control: bool = False,
    debug: bool = False,
    disable_caching: bool = False,
    **kwargs: Any,
) -> None:
    """Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to. When it is
        `sys.stdout` and `output_type` is not 'text', `sys.stdout.buffer`
        is written to instead.
    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
        Only 'text' works properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly. Not passed on for 'tag' output.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor; only passed on for 'html' output.
    :param rotation: Rotation added to every page's own rotation
        (the sum is taken modulo 360).
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter; only passed on for 'html' output.
    :param output_dir: If given, creates an ImageWriter for extracted images.
        The writer is handed to the 'text', 'xml' and 'html' converters.
    :param strip_control: Forwarded as `stripcontrol` to the 'xml' and
        'hocr' converters.
    :param debug: Output more logging data (sets the root logger to DEBUG).
    :param disable_caching: Turns off caching in both the resource manager
        and the page iterator.
    :param kwargs: Accepted so callers may pass extra keyword arguments;
        they are not used by this function.
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    :raises PDFValueError: if `output_type` is not one of the supported
        values.
    """
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    rsrcmgr = PDFResourceManager(caching=not disable_caching)

    # Every output type except plain text is written to the underlying
    # binary buffer when the destination is standard output.
    if output_type != "text" and outfp == sys.stdout:
        outfp = sys.stdout.buffer

    # Each branch below either binds `device` or raises, so no Optional
    # placeholder (and no follow-up assert) is needed.
    device: PDFDevice
    if output_type == "text":
        device = TextConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
        )
    elif output_type == "xml":
        device = XMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
            stripcontrol=strip_control,
        )
    elif output_type == "html":
        device = HTMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            scale=scale,
            layoutmode=layoutmode,
            laparams=laparams,
            imagewriter=imagewriter,
        )
    elif output_type == "hocr":
        device = HOCRConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            stripcontrol=strip_control,
        )
    elif output_type == "tag":
        # Binary I/O is required, but we have no good way to test it here.
        device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)
    else:
        # List every accepted value, including 'hocr', so the message
        # matches what the branches above actually support.
        msg = (
            "Output type can be text, xml, html, hocr or tag "
            f"but is {output_type}"
        )
        raise PDFValueError(msg)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(
        inf,
        page_numbers,
        maxpages=maxpages,
        password=password,
        caching=not disable_caching,
    ):
        # Apply the caller's extra rotation on top of the page's own.
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)

    device.close()

144

145

def extract_text(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Optional[Container[int]] = None,
    maxpages: int = 0,
    caching: bool = True,
    codec: str = "utf-8",
    laparams: Optional[LAParams] = None,
) -> str:
    """Parse a PDF file and return its text as a single string.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, a
        default-constructed LAParams is used.
    :return: a string containing all of the text extracted.
    """
    layout_params = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as raw_fp:
        with StringIO() as text_buffer:
            binary_fp = cast(BinaryIO, raw_fp)  # we opened in binary mode
            manager = PDFResourceManager(caching=caching)
            converter = TextConverter(
                manager,
                text_buffer,
                codec=codec,
                laparams=layout_params,
            )
            page_interpreter = PDFPageInterpreter(manager, converter)

            pages = PDFPage.get_pages(
                binary_fp,
                page_numbers,
                maxpages=maxpages,
                password=password,
                caching=caching,
            )
            for pdf_page in pages:
                page_interpreter.process_page(pdf_page)

            # Read the buffer before the `with` block closes it.
            return text_buffer.getvalue()

187

188

def extract_pages(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Optional[Container[int]] = None,
    maxpages: int = 0,
    caching: bool = True,
    laparams: Optional[LAParams] = None,
) -> Iterator[LTPage]:
    """Lazily yield one analysed LTPage per processed page of a PDF.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, a
        default-constructed LAParams is used.
    :return: LTPage objects
    """
    layout_params = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as raw_fp:
        binary_fp = cast(BinaryIO, raw_fp)  # we opened in binary mode
        manager = PDFResourceManager(caching=caching)
        aggregator = PDFPageAggregator(manager, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(manager, aggregator)

        pages = PDFPage.get_pages(
            binary_fp,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for pdf_page in pages:
            page_interpreter.process_page(pdf_page)
            # The aggregator holds the layout of the page just processed.
            yield aggregator.get_result()

Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/high_level.py: 57%

63 statements