Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/high_level.py: 57%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

63 statements  

1"""Functions that can be used for the most common use-cases for pdfminer.six""" 

2 

3import logging 

4import sys 

5from io import StringIO 

6from typing import Any, BinaryIO, Container, Iterator, Optional, cast 

7 

8from pdfminer.converter import ( 

9 HOCRConverter, 

10 HTMLConverter, 

11 PDFPageAggregator, 

12 TextConverter, 

13 XMLConverter, 

14) 

15from pdfminer.image import ImageWriter 

16from pdfminer.layout import LAParams, LTPage 

17from pdfminer.pdfdevice import PDFDevice, TagExtractor 

18from pdfminer.pdfexceptions import PDFValueError 

19from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager 

20from pdfminer.pdfpage import PDFPage 

21from pdfminer.utils import AnyIO, FileOrName, open_filename 

22 

23 

def extract_text_to_fp(
    inf: BinaryIO,
    outfp: AnyIO,
    output_type: str = "text",
    codec: str = "utf-8",
    laparams: Optional[LAParams] = None,
    maxpages: int = 0,
    page_numbers: Optional[Container[int]] = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: Optional[str] = None,
    strip_control: bool = False,
    debug: bool = False,
    disable_caching: bool = False,
    **kwargs: Any,
) -> None:
    """Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to. If it is
        `sys.stdout` and `output_type` is not 'text', `sys.stdout.buffer`
        is written to instead.
    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
        Only 'text' works properly.
    :param codec: Text decoding codec, forwarded to the selected device.
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly. Not forwarded for 'tag' output.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor; only forwarded for 'html' output.
    :param rotation: Rotation in degrees added to each page's own rotation
        (the sum is taken modulo 360).
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter; only forwarded for 'html' output.
    :param output_dir: If given, creates an ImageWriter for extracted images.
        The writer is only forwarded for 'text', 'xml' and 'html' output.
    :param strip_control: Forwarded as `stripcontrol` for 'xml' and 'hocr'
        output; unused for the other output types.
    :param debug: If true, sets the root logger level to DEBUG.
    :param disable_caching: If true, the resource manager and the page
        iterator are both created with caching turned off.
    :param kwargs: Accepted for call compatibility; not used by this
        function.
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    :raises PDFValueError: if `output_type` is not one of the supported
        values.
    """
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    rsrcmgr = PDFResourceManager(caching=not disable_caching)

    # Every non-text output type is pointed at the binary buffer behind
    # stdout rather than at the text wrapper itself.
    if output_type != "text" and outfp == sys.stdout:
        outfp = sys.stdout.buffer

    # Every branch below either binds ``device`` or raises, so no
    # Optional/None placeholder (and no assert) is needed.
    device: PDFDevice
    if output_type == "text":
        device = TextConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
        )

    elif output_type == "xml":
        device = XMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
            stripcontrol=strip_control,
        )

    elif output_type == "html":
        device = HTMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            scale=scale,
            layoutmode=layoutmode,
            laparams=laparams,
            imagewriter=imagewriter,
        )

    elif output_type == "hocr":
        device = HOCRConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            stripcontrol=strip_control,
        )

    elif output_type == "tag":
        # Binary I/O is required, but we have no good way to test it here.
        device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)

    else:
        # The message lists every accepted value, including 'hocr'.
        msg = f"Output type can be text, html, xml, hocr or tag but is {output_type}"
        raise PDFValueError(msg)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(
        inf,
        page_numbers,
        maxpages=maxpages,
        password=password,
        caching=not disable_caching,
    ):
        # Apply the caller's extra rotation on top of the page's own value.
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)

    device.close()

144 

145 

def extract_text(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Optional[Container[int]] = None,
    maxpages: int = 0,
    caching: bool = True,
    codec: str = "utf-8",
    laparams: Optional[LAParams] = None,
) -> str:
    """Parse a PDF file and return its text as a single string.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, a
        default-constructed LAParams is used.
    :return: a string containing all of the text extracted.
    """
    layout_params = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as raw_fp, StringIO() as text_buffer:
        binary_fp = cast(BinaryIO, raw_fp)  # opened with mode "rb" above
        resource_manager = PDFResourceManager(caching=caching)
        converter = TextConverter(
            resource_manager,
            text_buffer,
            codec=codec,
            laparams=layout_params,
        )
        interpreter = PDFPageInterpreter(resource_manager, converter)

        pages = PDFPage.get_pages(
            binary_fp,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for pdf_page in pages:
            interpreter.process_page(pdf_page)

        # Read the buffer while it is still open; it is closed on leaving
        # the ``with`` block.
        extracted = text_buffer.getvalue()

    return extracted

187 

188 

def extract_pages(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Optional[Container[int]] = None,
    maxpages: int = 0,
    caching: bool = True,
    laparams: Optional[LAParams] = None,
) -> Iterator[LTPage]:
    """Lazily yield one LTPage layout object per processed page.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, a
        default-constructed LAParams is used.
    :return: LTPage objects
    """
    layout_params = laparams if laparams is not None else LAParams()

    with open_filename(pdf_file, "rb") as raw_fp:
        binary_fp = cast(BinaryIO, raw_fp)  # opened with mode "rb" above
        rsrcmgr = PDFResourceManager(caching=caching)
        aggregator = PDFPageAggregator(rsrcmgr, laparams=layout_params)
        interpreter = PDFPageInterpreter(rsrcmgr, aggregator)

        page_iter = PDFPage.get_pages(
            binary_fp,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for pdf_page in page_iter:
            interpreter.process_page(pdf_page)
            # Fetch the aggregator's result for the page just processed and
            # hand it to the caller.
            yield aggregator.get_result()

226 yield layout