Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/high_level.py: 42%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

65 statements  

1"""Functions that can be used for the most common use-cases for pdfminer.six""" 

2 

3import logging 

4import sys 

5from collections.abc import Container, Iterator 

6from io import StringIO 

7from typing import Any, BinaryIO, cast 

8 

9from pdfminer.converter import ( 

10 HOCRConverter, 

11 HTMLConverter, 

12 PDFPageAggregator, 

13 TextConverter, 

14 XMLConverter, 

15) 

16from pdfminer.image import ImageWriter 

17from pdfminer.layout import LAParams, LTPage 

18from pdfminer.pdfdevice import PDFDevice, TagExtractor 

19from pdfminer.pdfexceptions import PDFValueError 

20from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager 

21from pdfminer.pdfpage import PDFPage 

22from pdfminer.utils import AnyIO, FileOrName, open_filename 

23 

24 

def extract_text_to_fp(
    inf: BinaryIO,
    outfp: AnyIO,
    output_type: str = "text",
    codec: str = "utf-8",
    laparams: LAParams | None = None,
    maxpages: int = 0,
    page_numbers: Container[int] | None = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: str | None = None,
    strip_control: bool = False,
    debug: bool = False,
    disable_caching: bool = False,
    **kwargs: Any,
) -> None:
    """Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to.
    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
        Only 'text' works properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor; only forwarded to the 'html' converter.
    :param rotation: Rotation in degrees added to each page's own rotation
        (the sum is taken modulo 360).
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter; only forwarded to the 'html'
        converter.
    :param output_dir: If given, creates an ImageWriter for extracted images.
    :param strip_control: Forwarded as `stripcontrol` to the 'xml' and 'hocr'
        converters; not used by the other output types.
    :param debug: Output more logging data (sets the root logger to DEBUG).
    :param disable_caching: If True, resource caching is turned off for both
        the resource manager and page retrieval.
    :param kwargs: Accepted but not used by this function.
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    :raises PDFValueError: if output_type is not one of the supported values.
    """
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    rsrcmgr = PDFResourceManager(caching=not disable_caching)

    # Non-text converters are handed the binary buffer behind stdout rather
    # than the text-mode stdout wrapper itself.
    if output_type != "text" and outfp == sys.stdout:
        outfp = sys.stdout.buffer

    # Every branch below either binds `device` or raises, so no Optional
    # placeholder (and no follow-up assert) is needed for type narrowing.
    device: PDFDevice

    if output_type == "text":
        device = TextConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
        )

    elif output_type == "xml":
        device = XMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
            stripcontrol=strip_control,
        )

    elif output_type == "html":
        device = HTMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            scale=scale,
            layoutmode=layoutmode,
            laparams=laparams,
            imagewriter=imagewriter,
        )

    elif output_type == "hocr":
        device = HOCRConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            stripcontrol=strip_control,
        )

    elif output_type == "tag":
        # Binary I/O is required, but we have no good way to test it here.
        device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)

    else:
        # List every accepted value, including 'hocr', so the message agrees
        # with the branches above.
        msg = (
            "Output type can be text, xml, html, hocr or tag "
            f"but is {output_type}"
        )
        raise PDFValueError(msg)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(
        inf,
        page_numbers,
        maxpages=maxpages,
        password=password,
        caching=not disable_caching,
    ):
        # Apply the caller's extra rotation on top of the page's own value.
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)

    device.close()

145 

146 

def extract_text(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Container[int] | None = None,
    maxpages: int = 0,
    caching: bool = True,
    codec: str = "utf-8",
    laparams: LAParams | None = None,
) -> str:
    """Return the text of a PDF file as one string.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: Zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec, handed to the TextConverter
    :param laparams: An LAParams object from pdfminer.layout. When None, a
        default-constructed LAParams() is used instead.
    :return: a string containing all of the text extracted.
    """
    layout_params = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as raw_fp:
        with StringIO() as text_buffer:
            resources = PDFResourceManager(caching=caching)
            converter = TextConverter(
                resources,
                text_buffer,
                codec=codec,
                laparams=layout_params,
            )
            page_interpreter = PDFPageInterpreter(resources, converter)

            pages = PDFPage.get_pages(
                cast(BinaryIO, raw_fp),  # opened in binary mode above
                page_numbers,
                maxpages=maxpages,
                password=password,
                caching=caching,
            )
            for page in pages:
                page_interpreter.process_page(page)

            # Read the buffer before the `with` block closes it.
            return text_buffer.getvalue()

188 

189 

def extract_pages(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Container[int] | None = None,
    maxpages: int = 0,
    caching: bool = True,
    laparams: LAParams | None = None,
) -> Iterator[LTPage]:
    """Lazily yield one LTPage layout object per processed page.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: Zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. When None, a
        default-constructed LAParams() is used instead.
    :return: LTPage objects
    """
    layout_params = laparams if laparams is not None else LAParams()

    # This is a generator, so the file stays open while pages are consumed.
    with open_filename(pdf_file, "rb") as raw_fp:
        resources = PDFResourceManager(caching=caching)
        aggregator = PDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        pages = PDFPage.get_pages(
            cast(BinaryIO, raw_fp),  # opened in binary mode above
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for page in pages:
            page_interpreter.process_page(page)
            # Fetch this page's result from the aggregator and yield it.
            yield aggregator.get_result()