Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/image.py: 16%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import os
2import os.path
3import struct
4from io import BytesIO
5from typing import BinaryIO, Literal, Tuple
7from pdfminer.jbig2 import JBIG2StreamReader, JBIG2StreamWriter
8from pdfminer.layout import LTImage
9from pdfminer.pdfcolor import (
10 LITERAL_DEVICE_CMYK,
11 LITERAL_DEVICE_GRAY,
12 LITERAL_DEVICE_RGB,
13 LITERAL_INLINE_DEVICE_GRAY,
14 LITERAL_INLINE_DEVICE_RGB,
15)
16from pdfminer.pdfexceptions import PDFValueError
17from pdfminer.pdftypes import (
18 LITERALS_DCT_DECODE,
19 LITERALS_FLATE_DECODE,
20 LITERALS_JBIG2_DECODE,
21 LITERALS_JPX_DECODE,
22)
# Message raised as an ImportError when an image-saving helper needs Pillow
# but the optional dependency is not installed.
PIL_ERROR_MESSAGE = (
    "Could not import Pillow. This dependency of pdfminer.six is not "
    "installed by default. You need it to save jpg images to a file. Install it "
    "with `pip install 'pdfminer.six[image]'`"
)
def align32(x: int) -> int:
    """Round ``x`` up to the nearest multiple of four.

    When ``x`` is a byte count this pads it to a 32-bit (4-byte) boundary.
    """
    # Ceiling division: how many whole 4-byte words are needed to hold x.
    words = -(-x // 4)
    return words * 4
class BMPWriter:
    """Write an uncompressed BMP image to a seekable binary file.

    The constructor immediately writes the 14-byte file header, the 40-byte
    info header and, for 1-bit and 8-bit images, a grayscale colour table.
    Pixel rows are then supplied one at a time through :meth:`write_line`.

    NOTE(review): 24-bit BMP pixel data is conventionally stored in B, G, R
    order; this class writes the bytes it is given unchanged, so callers
    passing R, G, B samples may get swapped channels -- confirm.
    """

    def __init__(self, fp: BinaryIO, bits: int, width: int, height: int) -> None:
        """Write the BMP headers (and colour table) to ``fp``.

        :param fp: seekable binary file object; headers are written at its
            current position.
        :param bits: bits per pixel; must be 1, 8 or 24.
        :param width: image width in pixels.
        :param height: image height in pixels.
        :raises PDFValueError: if ``bits`` is not 1, 8 or 24.
        """
        self.fp = fp
        self.bits = bits
        self.width = width
        self.height = height
        # Number of colour-table entries for each supported depth; 24-bit
        # images carry no colour table.
        if bits == 1:
            ncols = 2
        elif bits == 8:
            ncols = 256
        elif bits == 24:
            ncols = 0
        else:
            raise PDFValueError(bits)
        # Every stored row occupies a whole number of 4-byte words.
        self.linesize = align32((self.width * self.bits + 7) // 8)
        self.datasize = self.linesize * self.height
        headersize = 14 + 40 + ncols * 4
        info = struct.pack(
            "<IiiHHIIIIII",
            40,
            self.width,
            self.height,
            1,
            self.bits,
            0,
            self.datasize,
            0,
            0,
            ncols,
            0,
        )
        assert len(info) == 40, str(len(info))
        header = struct.pack(
            "<ccIHHI",
            b"B",
            b"M",
            headersize + self.datasize,
            0,
            0,
            headersize,
        )
        assert len(header) == 14, str(len(header))
        self.fp.write(header)
        self.fp.write(info)
        if ncols == 2:
            # B&W color table
            for i in (0, 255):
                self.fp.write(struct.pack("BBBx", i, i, i))
        elif ncols == 256:
            # grayscale color table
            for i in range(256):
                self.fp.write(struct.pack("BBBx", i, i, i))
        # pos0 / pos1 delimit the pixel-data area that follows the headers.
        self.pos0 = self.fp.tell()
        self.pos1 = self.pos0 + self.datasize

    def write_line(self, y: int, data: bytes) -> None:
        """Write one pixel row.

        Rows are stored bottom-up: row ``y`` is placed ``y + 1`` row-lengths
        before the end of the pixel-data area, so ``y == 0`` is last in the
        file.

        :param y: row index, counted from the top of the image.
        :param data: raw bytes of the row; padded with zero bytes (or clipped)
            to exactly ``self.linesize`` bytes.
        """
        # Pad short rows to the aligned row length. Without this the final
        # row in the file (y == 0) leaves the file shorter than the size
        # declared in the headers. Clipping keeps an over-long row from
        # spilling into the neighbouring row or past the declared data area.
        row = data[: self.linesize].ljust(self.linesize, b"\x00")
        self.fp.seek(self.pos1 - (y + 1) * self.linesize)
        self.fp.write(row)
class ImageWriter:
    """Write image to a file

    Supports various image types: JPEG, JBIG2 and bitmaps
    """

    def __init__(self, outdir: str) -> None:
        """Remember the output directory, creating it if necessary.

        :param outdir: directory that exported images are written into.
        """
        self.outdir = outdir
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(self.outdir, exist_ok=True)

    def export_image(self, image: LTImage) -> str:
        """Save an LTImage to disk

        The output format is chosen from the stream's filters, the bit depth
        and the colour space; anything unrecognised is dumped as raw bytes.

        :param image: the image to export.
        :return: file name (relative to ``outdir``) of the written image.
        """
        (width, height) = image.srcsize

        filters = image.stream.get_filters()
        # An empty filter list must not be indexed with [-1]; such images
        # fall through to the bit-depth based branches below.
        has_filters = bool(filters)

        if has_filters and filters[-1][0] in LITERALS_DCT_DECODE:
            name = self._save_jpeg(image)

        elif has_filters and filters[-1][0] in LITERALS_JPX_DECODE:
            name = self._save_jpeg2000(image)

        elif self._is_jbig2_image(image):
            name = self._save_jbig2(image)

        elif image.bits == 1:
            name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits)

        elif image.bits == 8 and (
            LITERAL_DEVICE_RGB in image.colorspace
            or LITERAL_INLINE_DEVICE_RGB in image.colorspace
        ):
            name = self._save_bmp(image, width, height, width * 3, image.bits * 3)

        elif image.bits == 8 and (
            LITERAL_DEVICE_GRAY in image.colorspace
            or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
        ):
            name = self._save_bmp(image, width, height, width, image.bits)

        elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
            name = self._save_bytes(image)

        else:
            name = self._save_raw(image)

        return name

    def _save_jpeg(self, image: LTImage) -> str:
        """Save a JPEG encoded image

        CMYK images are inverted and converted to RGB with Pillow; all other
        images are written out unchanged.

        :raises ImportError: if a CMYK image is encountered and Pillow is not
            installed.
        """
        data = image.stream.get_data()
        is_cmyk = LITERAL_DEVICE_CMYK in image.colorspace

        if is_cmyk:
            # Import before creating the file so a missing Pillow does not
            # leave an empty output file behind.
            try:
                from PIL import Image, ImageChops  # type: ignore[import]
            except ImportError as err:
                raise ImportError(PIL_ERROR_MESSAGE) from err

        name, path = self._create_unique_image_name(image, ".jpg")
        with open(path, "wb") as fp:
            if is_cmyk:
                ifp = BytesIO(data)
                i = Image.open(ifp)
                i = ImageChops.invert(i)
                i = i.convert("RGB")
                i.save(fp, "JPEG")
            else:
                fp.write(data)

        return name

    def _save_jpeg2000(self, image: LTImage) -> str:
        """Save a JPEG 2000 encoded image

        :raises ImportError: if Pillow is not installed.
        """
        # Import before creating the file so a missing Pillow does not leave
        # an empty output file behind.
        try:
            from PIL import Image  # type: ignore[import]
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        data = image.stream.get_data()

        name, path = self._create_unique_image_name(image, ".jp2")
        with open(path, "wb") as fp:
            # if we just write the raw data, most image programs
            # that I have tried cannot open the file. However,
            # open and saving with PIL produces a file that
            # seems to be easily opened by other programs
            ifp = BytesIO(data)
            i = Image.open(ifp)
            i.save(fp, "JPEG2000")
        return name

    def _save_jbig2(self, image: LTImage) -> str:
        """Save a JBIG2 encoded image

        The optional ``JBIG2Globals`` stream, when present, is prepended to
        the image data before the segments are re-serialised as a file.

        :raises PDFValueError: if more than one ``JBIG2Globals`` stream is
            attached to the image.
        """
        global_streams = []
        for filter_name, params in image.stream.get_filters():
            # Skip JBIG2 filters that carry no JBIG2Globals entry instead of
            # failing on the lookup.
            if (
                filter_name in LITERALS_JBIG2_DECODE
                and params
                and "JBIG2Globals" in params
            ):
                global_streams.append(params["JBIG2Globals"].resolve())

        if len(global_streams) > 1:
            msg = (
                "There should never be more than one JBIG2Globals "
                "associated with a JBIG2 embedded image"
            )
            raise PDFValueError(msg)

        input_stream = BytesIO()
        if len(global_streams) == 1:
            input_stream.write(global_streams[0].get_data().rstrip(b"\n"))
        input_stream.write(image.stream.get_data())
        input_stream.seek(0)
        reader = JBIG2StreamReader(input_stream)
        segments = reader.get_segments()

        # The output file is only created once the input has been validated
        # and parsed, so a failure above leaves no empty file behind.
        name, path = self._create_unique_image_name(image, ".jb2")
        with open(path, "wb") as fp:
            writer = JBIG2StreamWriter(fp)
            writer.write_file(segments)
        return name

    def _save_bmp(
        self,
        image: LTImage,
        width: int,
        height: int,
        bytes_per_line: int,
        bits: int,
    ) -> str:
        """Save a BMP encoded image

        :param image: the image whose stream data is written.
        :param width: image width in pixels.
        :param height: image height in pixels (number of rows written).
        :param bytes_per_line: number of stream bytes that make up one row.
        :param bits: bits per pixel passed to the BMP writer.
        """
        name, path = self._create_unique_image_name(image, ".bmp")
        with open(path, "wb") as fp:
            bmp = BMPWriter(fp, bits, width, height)
            data = image.stream.get_data()
            for y in range(height):
                start = y * bytes_per_line
                bmp.write_line(y, data[start : start + bytes_per_line])
        return name

    def _save_bytes(self, image: LTImage) -> str:
        """Save an image without encoding, just bytes

        The raw samples are loaded into a Pillow image whose mode is derived
        from the bit depth and the number of channels implied by the data
        length.

        :raises ImportError: if Pillow is not installed.
        :raises PDFValueError: if the bit depth / channel count combination
            has no matching Pillow mode.
        """
        try:
            from PIL import (
                Image,  # type: ignore[import]
                ImageOps,
            )
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        width, height = image.srcsize
        data = image.stream.get_data()
        channels = len(data) / width / height / (image.bits / 8)

        mode: Literal["1", "L", "RGB", "CMYK"]
        if image.bits == 1:
            mode = "1"
        elif image.bits == 8 and channels == 1:
            mode = "L"
        elif image.bits == 8 and channels == 3:
            mode = "RGB"
        elif image.bits == 8 and channels == 4:
            mode = "CMYK"
        else:
            # Fail with a descriptive error instead of an unbound `mode`.
            raise PDFValueError(
                "Unsupported raw image layout: bits=%r, channels=%r"
                % (image.bits, channels)
            )

        # The file is created only after the mode is known, so an unsupported
        # image does not leave an empty file behind.
        name, path = self._create_unique_image_name(image, ".jpg")
        with open(path, "wb") as fp:
            img = Image.frombytes(mode, image.srcsize, data, "raw")
            if mode == "L":
                img = ImageOps.invert(img)

            img.save(fp)

        return name

    def _save_raw(self, image: LTImage) -> str:
        """Save an image with unknown encoding

        The bit depth and dimensions are encoded in the file extension.
        """
        ext = ".%d.%dx%d.img" % (image.bits, image.srcsize[0], image.srcsize[1])
        name, path = self._create_unique_image_name(image, ext)

        with open(path, "wb") as fp:
            fp.write(image.stream.get_data())
        return name

    @staticmethod
    def _is_jbig2_image(image: LTImage) -> bool:
        """Return True if any filter on the image stream is a JBIG2 filter."""
        return any(
            filter_name in LITERALS_JBIG2_DECODE
            for filter_name, _params in image.stream.get_filters()
        )

    # Backward-compatible alias for the original, misspelled method name.
    _is_jbig2_iamge = _is_jbig2_image

    def _create_unique_image_name(self, image: LTImage, ext: str) -> Tuple[str, str]:
        """Pick a file name in ``outdir`` that does not exist yet.

        The first candidate is ``<image.name><ext>``; if it is taken, a
        counter starting at 0 is inserted between the name and the extension.

        :return: ``(name, path)`` where ``path`` is ``name`` joined onto
            ``outdir``.
        """
        name = image.name + ext
        path = os.path.join(self.outdir, name)
        img_index = 0
        while os.path.exists(path):
            name = "%s.%d%s" % (image.name, img_index, ext)
            path = os.path.join(self.outdir, name)
            img_index += 1
        return name, path