Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/high_level.py

1"""Functions that can be used for the most common use-cases for pdfminer.six"""

3import logging

4import sys

5from collections.abc import Container, Iterator

6from io import StringIO

7from typing import Any, BinaryIO, cast

9from pdfminer.converter import (

10 HOCRConverter,

11 HTMLConverter,

12 PDFPageAggregator,

13 TextConverter,

14 XMLConverter,

15)

16from pdfminer.image import ImageWriter

17from pdfminer.layout import LAParams, LTPage

18from pdfminer.pdfdevice import PDFDevice, TagExtractor

19from pdfminer.pdfexceptions import PDFValueError

20from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager

21from pdfminer.pdfpage import PDFPage

22from pdfminer.utils import AnyIO, FileOrName, open_filename

def extract_text_to_fp(
    inf: BinaryIO,
    outfp: AnyIO,
    output_type: str = "text",
    codec: str = "utf-8",
    laparams: LAParams | None = None,
    maxpages: int = 0,
    page_numbers: Container[int] | None = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: str | None = None,
    strip_control: bool = False,
    debug: bool = False,
    disable_caching: bool = False,
    **kwargs: Any,
) -> None:
    """Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to. If it is
        `sys.stdout` and `output_type` is not 'text', `sys.stdout.buffer`
        is used instead.
    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
        Only 'text' works properly.
    :param codec: Text decoding codec, forwarded to the selected converter.
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly. Not used for 'tag' output.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor. Only forwarded for 'html' output.
    :param rotation: Rotation factor; added to each page's own rotation,
        modulo 360.
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter. Only forwarded for 'html' output.
    :param output_dir: If given, creates an ImageWriter for extracted images.
        The writer is passed to the 'text', 'xml' and 'html' converters only.
    :param strip_control: Forwarded as `stripcontrol` to the 'xml' and
        'hocr' converters; ignored for the other output types.
    :param debug: Output more logging data. This sets the level of the root
        logger to DEBUG, so it affects logging process-wide.
    :param disable_caching: If true, `caching=False` is passed to the
        resource manager and to the page iterator.
    :param kwargs: Additional keyword arguments are accepted but not used
        by this function.
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    :raises PDFValueError: if `output_type` is not one of the supported
        values.
    """
    if debug:
        # getLogger() without a name is the root logger: this is a global
        # side effect, not something scoped to this call.
        logging.getLogger().setLevel(logging.DEBUG)

    imagewriter = ImageWriter(output_dir) if output_dir else None

    rsrcmgr = PDFResourceManager(caching=not disable_caching)

    # NOTE(review): presumably the non-text converters emit encoded bytes and
    # therefore need the binary layer of stdout -- confirm against
    # pdfminer.converter.
    if output_type != "text" and outfp == sys.stdout:
        outfp = sys.stdout.buffer

    # Every branch below either assigns the device or raises, so no
    # placeholder value (and no later assert) is needed.
    device: PDFDevice
    if output_type == "text":
        device = TextConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
        )

    elif output_type == "xml":
        device = XMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
            stripcontrol=strip_control,
        )

    elif output_type == "html":
        device = HTMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            scale=scale,
            layoutmode=layoutmode,
            laparams=laparams,
            imagewriter=imagewriter,
        )

    elif output_type == "hocr":
        device = HOCRConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            stripcontrol=strip_control,
        )

    elif output_type == "tag":
        # Binary I/O is required, but we have no good way to test it here.
        device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)

    else:
        # The message names every accepted value, including 'hocr'.
        msg = (
            "Output type can be text, xml, html, hocr or tag "
            f"but is {output_type}"
        )
        raise PDFValueError(msg)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(
        inf,
        page_numbers,
        maxpages=maxpages,
        password=password,
        caching=not disable_caching,
    ):
        # Apply the caller's extra rotation on top of the page's own value.
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)

    device.close()

145

146

def extract_text(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Container[int] | None = None,
    maxpages: int = 0,
    caching: bool = True,
    codec: str = "utf-8",
    laparams: LAParams | None = None,
) -> str:
    """Return the text of a PDF file as one string.

    :param pdf_file: Either a file path or a file-like object for the PDF
        file to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec, forwarded to the text converter.
    :param laparams: An LAParams object from pdfminer.layout. If None, a
        default-constructed LAParams() is used.
    :return: a string containing all of the text extracted.
    """
    layout_params = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as opened:
        with StringIO() as text_buffer:
            # The file was opened in binary mode, so narrowing is safe.
            pdf_stream = cast(BinaryIO, opened)

            manager = PDFResourceManager(caching=caching)
            converter = TextConverter(
                manager,
                text_buffer,
                codec=codec,
                laparams=layout_params,
            )
            page_interpreter = PDFPageInterpreter(manager, converter)

            pages = PDFPage.get_pages(
                pdf_stream,
                page_numbers,
                maxpages=maxpages,
                password=password,
                caching=caching,
            )
            for pdf_page in pages:
                page_interpreter.process_page(pdf_page)

            # Read the buffer before the context manager closes it.
            return text_buffer.getvalue()

188

189

def extract_pages(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Container[int] | None = None,
    maxpages: int = 0,
    caching: bool = True,
    laparams: LAParams | None = None,
) -> Iterator[LTPage]:
    """Lazily yield one LTPage layout object per processed page.

    :param pdf_file: Either a file path or a file-like object for the PDF
        file to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, a
        default-constructed LAParams() is used.
    :return: LTPage objects
    """
    layout_params = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as opened:
        # The file was opened in binary mode, so narrowing is safe.
        pdf_stream = cast(BinaryIO, opened)

        manager = PDFResourceManager(caching=caching)
        aggregator = PDFPageAggregator(manager, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(manager, aggregator)

        pages = PDFPage.get_pages(
            pdf_stream,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for pdf_page in pages:
            page_interpreter.process_page(pdf_page)
            # Fetch the aggregator's result for the page just processed.
            yield aggregator.get_result()

Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/high_level.py: 42%

65 statements