Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/image.py: 16%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

173 statements  

1import os 

2import os.path 

3import struct 

4from io import BytesIO 

5from typing import BinaryIO, Literal, Tuple 

6 

7from pdfminer.jbig2 import JBIG2StreamReader, JBIG2StreamWriter 

8from pdfminer.layout import LTImage 

9from pdfminer.pdfcolor import ( 

10 LITERAL_DEVICE_CMYK, 

11 LITERAL_DEVICE_GRAY, 

12 LITERAL_DEVICE_RGB, 

13 LITERAL_INLINE_DEVICE_GRAY, 

14 LITERAL_INLINE_DEVICE_RGB, 

15) 

16from pdfminer.pdfexceptions import PDFValueError 

17from pdfminer.pdftypes import ( 

18 LITERALS_DCT_DECODE, 

19 LITERALS_FLATE_DECODE, 

20 LITERALS_JBIG2_DECODE, 

21 LITERALS_JPX_DECODE, 

22) 

23 

# Message of the ImportError raised when an image export needs Pillow but the
# optional dependency cannot be imported.
PIL_ERROR_MESSAGE = (
    "Could not import Pillow. This dependency of pdfminer.six is not "
    "installed by default. You need it to save jpg images to a file. Install it "
    "with `pip install 'pdfminer.six[image]'`"
)

29 

30 

def align32(x: int) -> int:
    """Round *x* up to the nearest multiple of four.

    Values that are already a multiple of four are returned unchanged.
    """
    padded = x + 3
    # Dropping the remainder of (x + 3) lands on the multiple of four that is
    # greater than or equal to x.
    return padded - padded % 4

33 

34 

class BMPWriter:
    """Write an uncompressed BMP image to a seekable binary stream.

    The constructor writes the 14-byte file header, the 40-byte info header
    and, for 1-bit and 8-bit images, a grayscale colour table. Pixel rows are
    added afterwards with :meth:`write_line`.
    """

    # Gray levels stored in the colour table for each supported bit depth.
    # 24-bit images carry no colour table.
    _GRAY_LEVELS = {
        1: (0, 255),
        8: tuple(range(256)),
        24: (),
    }

    def __init__(self, fp: BinaryIO, bits: int, width: int, height: int) -> None:
        """Write the BMP headers for an image with the given geometry.

        :param fp: binary stream; it must support ``tell`` and ``seek``
            because rows are written at computed offsets.
        :param bits: bits per pixel; must be 1, 8 or 24.
        :param width: image width in pixels.
        :param height: image height in pixels.
        :raises PDFValueError: if ``bits`` is not 1, 8 or 24.
        """
        self.fp = fp
        self.bits = bits
        self.width = width
        self.height = height

        gray_levels = self._GRAY_LEVELS.get(bits)
        if gray_levels is None:
            raise PDFValueError(bits)
        ncols = len(gray_levels)

        # Size of one stored row: the packed pixel bytes rounded up to a
        # multiple of four.
        self.linesize = align32((self.width * self.bits + 7) // 8)
        self.datasize = self.linesize * self.height
        headersize = 14 + 40 + ncols * 4

        info = struct.pack(
            "<IiiHHIIIIII",
            40,  # size of this info header
            self.width,
            self.height,
            1,  # colour planes
            self.bits,
            0,  # compression: none
            self.datasize,
            0,
            0,
            ncols,
            0,
        )
        assert len(info) == 40, str(len(info))
        header = struct.pack(
            "<ccIHHI",
            b"B",
            b"M",
            headersize + self.datasize,  # total file size
            0,
            0,
            headersize,  # offset of the pixel data
        )
        assert len(header) == 14, str(len(header))
        self.fp.write(header)
        self.fp.write(info)
        if gray_levels:
            # Colour table: one (blue, green, red, pad) entry per gray level.
            self.fp.write(
                b"".join(struct.pack("BBBx", v, v, v) for v in gray_levels)
            )

        # pos0 is where the pixel data starts, pos1 is where it ends.
        self.pos0 = self.fp.tell()
        self.pos1 = self.pos0 + self.datasize

    def write_line(self, y: int, data: bytes) -> None:
        """Store the pixel row ``y`` (0 is the top row of the image).

        Rows are placed bottom-up: row 0 occupies the last ``linesize`` bytes
        of the pixel data. ``data`` is zero-padded to ``linesize`` so the file
        reaches the size announced in the header even when the packed row is
        not a multiple of four bytes; bytes beyond ``linesize`` are dropped so
        a row can never spill into its neighbour.

        :param y: zero-based row index counted from the top of the image.
        :param data: packed pixel bytes of the row.
        """
        row = data[: self.linesize].ljust(self.linesize, b"\x00")
        self.fp.seek(self.pos1 - (y + 1) * self.linesize)
        self.fp.write(row)

93 

94 

class ImageWriter:
    """Write image to a file

    Supports various image types: JPEG, JPEG 2000, JBIG2 and bitmaps. Images
    that match none of these are written out as raw bytes. All files are
    created inside ``outdir``; when a file name is already taken a numeric
    suffix is inserted before the extension.
    """

    def __init__(self, outdir: str) -> None:
        """Remember ``outdir`` and create it when it does not exist yet."""
        self.outdir = outdir
        if not os.path.exists(self.outdir):
            # exist_ok tolerates the directory being created by someone else
            # between the check above and this call.
            os.makedirs(self.outdir, exist_ok=True)

    def export_image(self, image: LTImage) -> str:
        """Save an LTImage to disk

        The output format is picked from the last stream filter, the bit
        depth and the colour space of the image.

        :param image: the image to export.
        :return: the name of the file created inside ``outdir`` (not the full
            path).
        """
        (width, height) = image.srcsize

        filters = image.stream.get_filters()
        # A stream without any filter yields an empty list; guard the
        # ``filters[-1]`` lookups so such images fall through to the bitmap
        # and raw writers instead of raising IndexError.
        has_filters = bool(filters)

        if has_filters and filters[-1][0] in LITERALS_DCT_DECODE:
            return self._save_jpeg(image)

        if has_filters and filters[-1][0] in LITERALS_JPX_DECODE:
            return self._save_jpeg2000(image)

        if self._is_jbig2_iamge(image):
            return self._save_jbig2(image)

        if image.bits == 1:
            return self._save_bmp(image, width, height, (width + 7) // 8, image.bits)

        if image.bits == 8 and (
            LITERAL_DEVICE_RGB in image.colorspace
            or LITERAL_INLINE_DEVICE_RGB in image.colorspace
        ):
            return self._save_bmp(image, width, height, width * 3, image.bits * 3)

        if image.bits == 8 and (
            LITERAL_DEVICE_GRAY in image.colorspace
            or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
        ):
            return self._save_bmp(image, width, height, width, image.bits)

        if len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
            return self._save_bytes(image)

        return self._save_raw(image)

    def _save_jpeg(self, image: LTImage) -> str:
        """Save a JPEG encoded image

        The stream data is written unchanged, except for DeviceCMYK images,
        which are inverted and converted to RGB with Pillow first.

        :raises ImportError: if the image is CMYK and Pillow is not installed.
        """
        data = image.stream.get_data()
        is_cmyk = LITERAL_DEVICE_CMYK in image.colorspace

        if is_cmyk:
            # Import before the output file is created so a missing Pillow
            # does not leave an empty file behind.
            try:
                from PIL import Image, ImageChops  # type: ignore[import]
            except ImportError as err:
                raise ImportError(PIL_ERROR_MESSAGE) from err

        name, path = self._create_unique_image_name(image, ".jpg")
        with open(path, "wb") as fp:
            if is_cmyk:
                ifp = BytesIO(data)
                i = Image.open(ifp)
                i = ImageChops.invert(i)
                i = i.convert("RGB")
                i.save(fp, "JPEG")
            else:
                fp.write(data)

        return name

    def _save_jpeg2000(self, image: LTImage) -> str:
        """Save a JPEG 2000 encoded image

        :raises ImportError: if Pillow is not installed.
        """
        # Import before the output file is created so a missing Pillow does
        # not leave an empty file behind.
        try:
            from PIL import Image  # type: ignore[import]
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        data = image.stream.get_data()

        name, path = self._create_unique_image_name(image, ".jp2")
        with open(path, "wb") as fp:
            # if we just write the raw data, most image programs
            # that I have tried cannot open the file. However,
            # open and saving with PIL produces a file that
            # seems to be easily opened by other programs
            ifp = BytesIO(data)
            i = Image.open(ifp)
            i.save(fp, "JPEG2000")
        return name

    def _save_jbig2(self, image: LTImage) -> str:
        """Save a JBIG2 encoded image

        The global segments referenced by the filter parameters (if any) are
        put in front of the image data, and the result is re-written as a
        JBIG2 file.

        :raises PDFValueError: if more than one JBIG2Globals stream is found.
        """
        global_streams = []
        for filter_name, params in image.stream.get_filters():
            if filter_name not in LITERALS_JBIG2_DECODE:
                continue
            # The filter may come without a JBIG2Globals entry; the code
            # below already copes with zero global streams, so skip the
            # lookup instead of failing with KeyError.
            if params and "JBIG2Globals" in params:
                global_streams.append(params["JBIG2Globals"].resolve())

        if len(global_streams) > 1:
            msg = (
                "There should never be more than one JBIG2Globals "
                "associated with a JBIG2 embedded image"
            )
            raise PDFValueError(msg)

        input_stream = BytesIO()
        if global_streams:
            input_stream.write(global_streams[0].get_data().rstrip(b"\n"))
        input_stream.write(image.stream.get_data())
        input_stream.seek(0)
        reader = JBIG2StreamReader(input_stream)
        segments = reader.get_segments()

        # The file is only created once the segments were read successfully.
        name, path = self._create_unique_image_name(image, ".jb2")
        with open(path, "wb") as fp:
            writer = JBIG2StreamWriter(fp)
            writer.write_file(segments)
        return name

    def _save_bmp(
        self,
        image: LTImage,
        width: int,
        height: int,
        bytes_per_line: int,
        bits: int,
    ) -> str:
        """Save a BMP encoded image

        :param image: the image whose decoded stream holds the pixel rows.
        :param width: image width in pixels.
        :param height: image height in pixels.
        :param bytes_per_line: number of stream bytes that make up one row.
        :param bits: bits per pixel passed on to ``BMPWriter``.
        """
        # Decode first so a failure does not leave a header-only file behind.
        data = image.stream.get_data()

        name, path = self._create_unique_image_name(image, ".bmp")
        with open(path, "wb") as fp:
            bmp = BMPWriter(fp, bits, width, height)
            for y in range(height):
                start = y * bytes_per_line
                bmp.write_line(y, data[start : start + bytes_per_line])
        return name

    def _save_bytes(self, image: LTImage) -> str:
        """Save an image without encoding, just bytes

        The decoded stream is interpreted as raw pixels and saved through
        Pillow. The pixel layout is derived from the bit depth and, for 8-bit
        images, from the number of bytes per pixel.

        :raises ImportError: if Pillow is not installed.
        :raises PDFValueError: if the bit depth / channel count combination
            is not one of 1-bit, or 8-bit with 1, 3 or 4 channels.
        """
        try:
            from PIL import (
                Image,  # type: ignore[import]
                ImageOps,
            )
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        data = image.stream.get_data()
        width, height = image.srcsize

        # Only 8-bit images need the channel count; one sample is one byte
        # there. Degenerate sizes keep ``channels`` at zero rather than
        # dividing by zero.
        channels = 0.0
        if image.bits == 8 and width > 0 and height > 0:
            channels = len(data) / width / height

        mode: Literal["1", "L", "RGB", "CMYK"]
        if image.bits == 1:
            mode = "1"
        elif channels == 1:
            mode = "L"
        elif channels == 3:
            mode = "RGB"
        elif channels == 4:
            mode = "CMYK"
        else:
            # Previously this fell through with ``mode`` unbound.
            msg = "Unsupported raw image layout: %r bits, %r bytes for size %r" % (
                image.bits,
                len(data),
                image.srcsize,
            )
            raise PDFValueError(msg)

        img = Image.frombytes(mode, image.srcsize, data, "raw")
        if mode == "L":
            img = ImageOps.invert(img)

        # Create the file only after the pixels were decoded successfully.
        name, path = self._create_unique_image_name(image, ".jpg")
        with open(path, "wb") as fp:
            img.save(fp)

        return name

    def _save_raw(self, image: LTImage) -> str:
        """Save an image with unknown encoding

        The decoded stream is written as is; bit depth and size are encoded
        in the file extension (``.<bits>.<width>x<height>.img``).
        """
        ext = ".%d.%dx%d.img" % (image.bits, image.srcsize[0], image.srcsize[1])
        name, path = self._create_unique_image_name(image, ext)

        with open(path, "wb") as fp:
            fp.write(image.stream.get_data())
        return name

    @staticmethod
    def _is_jbig2_iamge(image: LTImage) -> bool:
        """Return True when any filter of the image stream is a JBIG2 filter.

        The misspelled method name is kept so existing callers keep working.
        """
        return any(
            filter_name in LITERALS_JBIG2_DECODE
            for filter_name, _params in image.stream.get_filters()
        )

    def _create_unique_image_name(self, image: LTImage, ext: str) -> Tuple[str, str]:
        """Pick a file name in ``outdir`` that does not exist yet.

        Starts with ``<image.name><ext>`` and, while that path exists, tries
        ``<image.name>.<n><ext>`` for n = 0, 1, 2, ...

        :return: tuple of (file name, path of that file inside ``outdir``).
        """
        name = image.name + ext
        path = os.path.join(self.outdir, name)
        img_index = 0
        while os.path.exists(path):
            name = f"{image.name}.{img_index}{ext}"
            path = os.path.join(self.outdir, name)
            img_index += 1
        return name, path

290 img_index += 1 

291 return name, path