Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/image.py: 15%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import os
2import os.path
3import struct
4from io import BytesIO
5from typing import BinaryIO, Literal, Tuple
7from pdfminer.jbig2 import JBIG2StreamReader, JBIG2StreamWriter
8from pdfminer.layout import LTImage
9from pdfminer.pdfcolor import (
10 LITERAL_DEVICE_CMYK,
11 LITERAL_DEVICE_GRAY,
12 LITERAL_DEVICE_RGB,
13 LITERAL_INLINE_DEVICE_GRAY,
14 LITERAL_INLINE_DEVICE_RGB,
15)
16from pdfminer.pdfexceptions import PDFValueError
17from pdfminer.pdftypes import (
18 LITERALS_DCT_DECODE,
19 LITERALS_FLATE_DECODE,
20 LITERALS_JBIG2_DECODE,
21 LITERALS_JPX_DECODE,
22)
# Message used for the ImportError raised when an export path needs Pillow
# but the optional dependency is not installed.
PIL_ERROR_MESSAGE = (
    "Could not import Pillow. This dependency of pdfminer.six is not "
    "installed by default. You need it to save jpg images to a file. Install it "
    "with `pip install 'pdfminer.six[image]'`"
)
def align32(x: int) -> int:
    """Round ``x`` up to the nearest multiple of 4.

    ``BMPWriter`` uses this to pad the byte length of a scan line to a
    32-bit (4-byte) boundary.
    """
    return ((x + 3) // 4) * 4
class BMPWriter:
    """Write an uncompressed BMP file to a seekable binary stream.

    The constructor emits the 14-byte file header, the 40-byte info header
    and, for 1-bit and 8-bit images, a gray colour table. Pixel rows are
    supplied afterwards, one at a time, through :meth:`write_line`.
    """

    def __init__(self, fp: BinaryIO, bits: int, width: int, height: int) -> None:
        """Write the BMP headers to ``fp`` and record where pixel data lives.

        :param fp: binary stream that supports ``write``, ``tell`` and ``seek``.
        :param bits: bits per pixel; must be 1, 8 or 24.
        :param width: image width in pixels.
        :param height: image height in pixels.
        :raises PDFValueError: if ``bits`` is not 1, 8 or 24.
        """
        self.fp = fp
        self.bits = bits
        self.width = width
        self.height = height

        # Number of colour-table entries that follow the info header.
        if bits == 1:
            ncols = 2
        elif bits == 8:
            ncols = 256
        elif bits == 24:
            ncols = 0
        else:
            raise PDFValueError(bits)

        # Every stored row is padded to a multiple of four bytes.
        self.linesize = align32((self.width * self.bits + 7) // 8)
        self.datasize = self.linesize * self.height
        headersize = 14 + 40 + ncols * 4

        info = struct.pack(
            "<IiiHHIIIIII",
            40,  # size of this info header
            self.width,
            self.height,
            1,  # colour planes
            self.bits,
            0,  # compression: none
            self.datasize,
            0,  # horizontal resolution (unspecified)
            0,  # vertical resolution (unspecified)
            ncols,  # colours in the colour table
            0,  # important colours
        )
        assert len(info) == 40, str(len(info))
        header = struct.pack(
            "<ccIHHI",
            b"B",
            b"M",
            headersize + self.datasize,  # total file size
            0,
            0,
            headersize,  # offset of the pixel data
        )
        assert len(header) == 14, str(len(header))
        self.fp.write(header)
        self.fp.write(info)

        if ncols == 2:
            # B&W color table
            levels = (0, 255)
        elif ncols == 256:
            # grayscale color table
            levels = range(256)
        else:
            levels = ()
        for level in levels:
            self.fp.write(struct.pack("BBBx", level, level, level))

        # pos0/pos1 delimit the pixel-data area inside the stream.
        self.pos0 = self.fp.tell()
        self.pos1 = self.pos0 + self.datasize

    def write_line(self, y: int, data: bytes) -> None:
        """Store pixel row ``y`` (0 is the top row of the image).

        Rows are stored bottom-up: row 0 occupies the last ``linesize`` bytes
        of the pixel-data area. ``data`` is truncated or zero-padded to
        exactly ``linesize`` bytes, so the file always reaches the size
        declared in the header and a row can never spill into its neighbour.

        :param y: row index, ``0 <= y < height``.
        :param data: raw bytes for the row.
        :raises PDFValueError: if ``y`` is outside the image.
        """
        if not 0 <= y < self.height:
            # An out-of-range row would seek into the headers or before the
            # start of the stream.
            raise PDFValueError(y)
        row = bytes(data[: self.linesize]).ljust(self.linesize, b"\x00")
        self.fp.seek(self.pos1 - (y + 1) * self.linesize)
        self.fp.write(row)
class ImageWriter:
    """Write image to a file

    Supports various image types: JPEG, JBIG2 and bitmaps
    """

    def __init__(self, outdir: str) -> None:
        """Remember the output directory, creating it when it does not exist.

        :param outdir: directory that exported images are written to.
        """
        self.outdir = outdir
        if not os.path.exists(self.outdir):
            # exist_ok tolerates another process creating the directory
            # between the check above and this call.
            os.makedirs(self.outdir, exist_ok=True)

    def export_image(self, image: LTImage) -> str:
        """Save an LTImage to disk

        The output format is chosen from the stream filters, the bit depth
        and the colour space, in the order of the branches below.

        :param image: the image to export.
        :return: the file name (relative to ``outdir``) that was written.
        """
        (width, height) = image.srcsize

        filters = image.stream.get_filters()

        if not filters:
            name = self._save_bytes(image)

        elif filters[-1][0] in LITERALS_DCT_DECODE:
            name = self._save_jpeg(image)

        elif filters[-1][0] in LITERALS_JPX_DECODE:
            name = self._save_jpeg2000(image)

        elif self._is_jbig2_image(image):
            name = self._save_jbig2(image)

        elif image.bits == 1:
            name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits)

        elif image.bits == 8 and (
            LITERAL_DEVICE_RGB in image.colorspace
            or LITERAL_INLINE_DEVICE_RGB in image.colorspace
        ):
            name = self._save_bmp(image, width, height, width * 3, image.bits * 3)

        elif image.bits == 8 and (
            LITERAL_DEVICE_GRAY in image.colorspace
            or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
        ):
            name = self._save_bmp(image, width, height, width, image.bits)

        elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
            name = self._save_bytes(image)

        else:
            name = self._save_raw(image)

        return name

    def _save_jpeg(self, image: LTImage) -> str:
        """Save a JPEG encoded image

        CMYK images are inverted and converted to RGB with Pillow; all other
        images are written as the stream data, unchanged.

        :raises ImportError: if the image is CMYK and Pillow is missing.
        """
        data = image.stream.get_data()
        is_cmyk = LITERAL_DEVICE_CMYK in image.colorspace

        if is_cmyk:
            # Import before creating the file so that a missing Pillow does
            # not leave an empty file behind.
            try:
                from PIL import Image, ImageChops  # type: ignore[import]
            except ImportError as err:
                raise ImportError(PIL_ERROR_MESSAGE) from err

        name, path = self._create_unique_image_name(image, ".jpg")
        with open(path, "wb") as fp:
            if is_cmyk:
                ifp = BytesIO(data)
                i = Image.open(ifp)
                i = ImageChops.invert(i)
                i = i.convert("RGB")
                i.save(fp, "JPEG")
            else:
                fp.write(data)

        return name

    def _save_jpeg2000(self, image: LTImage) -> str:
        """Save a JPEG 2000 encoded image

        :raises ImportError: if Pillow is missing.
        """
        data = image.stream.get_data()

        # Import before creating the file so that a missing Pillow does not
        # leave an empty file behind.
        try:
            from PIL import Image  # type: ignore[import]
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        name, path = self._create_unique_image_name(image, ".jp2")
        with open(path, "wb") as fp:
            # if we just write the raw data, most image programs
            # that I have tried cannot open the file. However,
            # open and saving with PIL produces a file that
            # seems to be easily opened by other programs
            ifp = BytesIO(data)
            i = Image.open(ifp)
            i.save(fp, "JPEG2000")
        return name

    def _save_jbig2(self, image: LTImage) -> str:
        """Save a JBIG2 encoded image

        The global segments (when a ``JBIG2Globals`` entry is present in the
        filter parameters) are prepended to the image stream, the combined
        data is parsed into segments and re-written as a JBIG2 file.

        :raises PDFValueError: if more than one JBIG2Globals stream is found.
        """
        global_streams = []
        for filter_name, params in image.stream.get_filters():
            if filter_name not in LITERALS_JBIG2_DECODE:
                continue
            # JBIG2Globals may be absent; only collect it when it is there.
            if isinstance(params, dict) and "JBIG2Globals" in params:
                global_streams.append(params["JBIG2Globals"].resolve())

        if len(global_streams) > 1:
            msg = (
                "There should never be more than one JBIG2Globals "
                "associated with a JBIG2 embedded image"
            )
            raise PDFValueError(msg)

        input_stream = BytesIO()
        if len(global_streams) == 1:
            input_stream.write(global_streams[0].get_data().rstrip(b"\n"))
        input_stream.write(image.stream.get_data())
        input_stream.seek(0)
        reader = JBIG2StreamReader(input_stream)
        segments = reader.get_segments()

        # The file is created only after the segments were read, so a
        # failure above does not leave an empty file behind.
        name, path = self._create_unique_image_name(image, ".jb2")
        with open(path, "wb") as fp:
            writer = JBIG2StreamWriter(fp)
            writer.write_file(segments)
        return name

    def _save_bmp(
        self,
        image: LTImage,
        width: int,
        height: int,
        bytes_per_line: int,
        bits: int,
    ) -> str:
        """Save a BMP encoded image

        :param image: the image whose stream data is written.
        :param width: image width in pixels.
        :param height: image height in pixels.
        :param bytes_per_line: number of stream bytes that make up one row.
        :param bits: bits per pixel passed on to ``BMPWriter``.
        """
        name, path = self._create_unique_image_name(image, ".bmp")
        with open(path, "wb") as fp:
            bmp = BMPWriter(fp, bits, width, height)
            data = image.stream.get_data()
            # NOTE(review): row bytes are copied unchanged; confirm whether
            # 24-bit data needs its channel order swapped for BMP.
            for y in range(height):
                start = y * bytes_per_line
                bmp.write_line(y, data[start : start + bytes_per_line])
        return name

    def _save_bytes(self, image: LTImage) -> str:
        """Save an image without encoding, just bytes

        The Pillow mode is derived from the bit depth and from the number of
        bytes per pixel found in the stream data.

        :raises ImportError: if Pillow is missing.
        :raises PDFValueError: if the image size is not positive, or the bit
            depth / channel count combination is not supported.
        """
        # Import before creating the file so that a missing Pillow does not
        # leave an empty file behind.
        try:
            from PIL import (
                Image,  # type: ignore[import]
                ImageOps,
            )
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        width, height = image.srcsize
        if width <= 0 or height <= 0:
            raise PDFValueError(f"Invalid image size: {width}x{height}")

        data = image.stream.get_data()
        pixel_count = width * height

        mode: Literal["1", "L", "RGB", "CMYK"]
        if image.bits == 1:
            mode = "1"
        elif image.bits == 8 and len(data) == pixel_count:
            mode = "L"
        elif image.bits == 8 and len(data) == 3 * pixel_count:
            mode = "RGB"
        elif image.bits == 8 and len(data) == 4 * pixel_count:
            mode = "CMYK"
        else:
            raise PDFValueError(
                f"Unsupported raw image: {image.bits} bits per component, "
                f"{len(data)} bytes for {width}x{height} pixels"
            )

        name, path = self._create_unique_image_name(image, ".jpg")
        with open(path, "wb") as fp:
            img = Image.frombytes(mode, image.srcsize, data, "raw")
            if mode == "L":
                # NOTE(review): grayscale data is inverted before saving;
                # the reason is not visible here.
                img = ImageOps.invert(img)

            img.save(fp)

        return name

    def _save_raw(self, image: LTImage) -> str:
        """Save an image with unknown encoding

        The bit depth and the pixel dimensions are encoded in the file
        extension.
        """
        ext = ".%d.%dx%d.img" % (image.bits, image.srcsize[0], image.srcsize[1])
        name, path = self._create_unique_image_name(image, ext)

        with open(path, "wb") as fp:
            fp.write(image.stream.get_data())
        return name

    @staticmethod
    def _is_jbig2_image(image: LTImage) -> bool:
        """Return True if any filter of the image stream is a JBIG2 filter."""
        return any(
            filter_name in LITERALS_JBIG2_DECODE
            for filter_name, _params in image.stream.get_filters()
        )

    # Backward-compatible alias for the original, misspelled method name.
    _is_jbig2_iamge = _is_jbig2_image

    def _create_unique_image_name(self, image: LTImage, ext: str) -> Tuple[str, str]:
        """Pick a file name in ``outdir`` that does not exist yet.

        The first candidate is ``<image.name><ext>``; while the candidate
        exists, ``<image.name>.<index><ext>`` is tried with index 0, 1, ...

        :return: tuple of (file name, full path).
        """
        name = image.name + ext
        path = os.path.join(self.outdir, name)
        img_index = 0
        while os.path.exists(path):
            name = "%s.%d%s" % (image.name, img_index, ext)
            path = os.path.join(self.outdir, name)
            img_index += 1
        return name, path