Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/image.py: 15%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import os
2import os.path
3import struct
4from io import BytesIO
5from typing import BinaryIO, Literal
7from pdfminer.jbig2 import JBIG2StreamReader, JBIG2StreamWriter
8from pdfminer.layout import LTImage
9from pdfminer.pdfcolor import (
10 LITERAL_DEVICE_CMYK,
11 LITERAL_DEVICE_GRAY,
12 LITERAL_DEVICE_RGB,
13 LITERAL_INLINE_DEVICE_GRAY,
14 LITERAL_INLINE_DEVICE_RGB,
15)
16from pdfminer.pdfexceptions import PDFValueError
17from pdfminer.pdftypes import (
18 LITERALS_DCT_DECODE,
19 LITERALS_FLATE_DECODE,
20 LITERALS_JBIG2_DECODE,
21 LITERALS_JPX_DECODE,
22)
# Message of the ImportError raised when an export path needs Pillow but the
# optional dependency is not installed.
PIL_ERROR_MESSAGE = (
    "Could not import Pillow. This dependency of pdfminer.six is not "
    "installed by default. You need it to save jpg images to a file. Install it "
    "with `pip install 'pdfminer.six[image]'`"
)
def align32(x: int) -> int:
    """Round ``x`` up to the nearest multiple of four.

    Values that already are a multiple of four are returned unchanged.
    """
    # ``-x % 4`` is the distance from ``x`` to the next multiple of four.
    return x + (-x) % 4
class BMPWriter:
    """Write an uncompressed Windows bitmap (BMP) to a seekable binary stream.

    The file header, the 40-byte info header and, for 1-bit and 8-bit images,
    a grayscale colour table are written on construction. Pixel rows are
    added afterwards, in any order, with :meth:`write_line`.
    """

    def __init__(self, fp: BinaryIO, bits: int, width: int, height: int) -> None:
        """Write the BMP headers to ``fp``.

        :param fp: seekable binary stream that receives the file.
        :param bits: bits per pixel; 1, 8 or 24.
        :param width: image width in pixels.
        :param height: image height in pixels.
        :raises PDFValueError: if ``bits`` is not 1, 8 or 24.
        """
        self.fp = fp
        self.bits = bits
        self.width = width
        self.height = height
        # Number of entries in the colour table; 24-bit images have none.
        if bits == 1:
            ncols = 2
        elif bits == 8:
            ncols = 256
        elif bits == 24:
            ncols = 0
        else:
            raise PDFValueError(bits)
        # Bytes per stored row: the packed pixels rounded up to four bytes.
        self.linesize = align32((self.width * self.bits + 7) // 8)
        self.datasize = self.linesize * self.height
        # File header (14) + info header (40) + colour table (4 per entry).
        headersize = 14 + 40 + ncols * 4
        info = struct.pack(
            "<IiiHHIIIIII",
            40,  # size of this info header
            self.width,
            self.height,  # positive height: rows are stored bottom-up
            1,  # colour planes
            self.bits,
            0,  # compression: none
            self.datasize,
            0,  # horizontal resolution (unspecified)
            0,  # vertical resolution (unspecified)
            ncols,  # colours in the colour table
            0,  # important colours (0 = all)
        )
        assert len(info) == 40, str(len(info))
        header = struct.pack(
            "<ccIHHI",
            b"B",
            b"M",
            headersize + self.datasize,  # total file size
            0,
            0,
            headersize,  # offset of the pixel data
        )
        assert len(header) == 14, str(len(header))
        self.fp.write(header)
        self.fp.write(info)
        if ncols == 2:
            # B&W color table
            levels = (0, 255)
        elif ncols == 256:
            # grayscale color table
            levels = range(256)
        else:
            levels = ()
        if levels:
            # Each entry is three equal colour bytes followed by a pad byte.
            self.fp.write(b"".join(struct.pack("BBBx", i, i, i) for i in levels))
        # Start and end offsets of the pixel data within the stream.
        self.pos0 = self.fp.tell()
        self.pos1 = self.pos0 + self.datasize

    def write_line(self, y: int, data: bytes) -> None:
        """Write the pixel row ``y`` (0 is the top row of the image).

        ``data`` is zero-padded or truncated to the stored row size, so a
        short row cannot leave the file smaller than the size announced in
        the header and a long row cannot overwrite a neighbouring row.

        :raises PDFValueError: if ``y`` is not a row of the image.
        """
        if not 0 <= y < self.height:
            raise PDFValueError(
                f"Row {y} is outside of the image (height {self.height})"
            )
        row = bytes(data[: self.linesize]).ljust(self.linesize, b"\x00")
        # Rows are stored bottom-up, so the top row goes last in the file.
        self.fp.seek(self.pos1 - (y + 1) * self.linesize)
        self.fp.write(row)
class ImageWriter:
    """Write images to files in an output directory.

    Supports various image types: JPEG, JPEG 2000, JBIG2 and bitmaps.
    """

    # Image names come from the PDF, so path separators, drive separators and
    # NUL bytes are replaced before the name is used as a file name.
    _UNSAFE_NAME_CHARS = str.maketrans(
        {"/": "_", "\\": "_", ":": "_", "\x00": "_"},
    )

    def __init__(self, outdir: str) -> None:
        """Remember ``outdir`` and create it when it does not exist yet."""
        self.outdir = outdir
        os.makedirs(self.outdir, exist_ok=True)

    def export_image(self, image: LTImage) -> str:
        """Save an LTImage to disk.

        The output format is chosen from the stream filters, the bit depth
        and the colour space of the image.

        :return: the name of the written file, relative to ``outdir``.
        """
        (width, height) = image.srcsize

        filters = image.stream.get_filters()

        if not filters:
            return self._save_bytes(image)

        if filters[-1][0] in LITERALS_DCT_DECODE:
            return self._save_jpeg(image)

        if filters[-1][0] in LITERALS_JPX_DECODE:
            return self._save_jpeg2000(image)

        if self._is_jbig2_image(image):
            return self._save_jbig2(image)

        if image.bits == 1:
            return self._save_bmp(image, width, height, (width + 7) // 8, image.bits)

        if image.bits == 8 and (
            LITERAL_DEVICE_RGB in image.colorspace
            or LITERAL_INLINE_DEVICE_RGB in image.colorspace
        ):
            return self._save_bmp(image, width, height, width * 3, image.bits * 3)

        if image.bits == 8 and (
            LITERAL_DEVICE_GRAY in image.colorspace
            or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
        ):
            return self._save_bmp(image, width, height, width, image.bits)

        if len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
            return self._save_bytes(image)

        return self._save_raw(image)

    def _save_jpeg(self, image: LTImage) -> str:
        """Save a JPEG encoded image.

        CMYK images are re-encoded as RGB with Pillow; all other images are
        written unchanged.

        :raises ImportError: if a CMYK image is saved without Pillow.
        """
        data = image.stream.get_data()

        if LITERAL_DEVICE_CMYK not in image.colorspace:
            name, path = self._create_unique_image_name(image, ".jpg")
            with open(path, "wb") as fp:
                fp.write(data)
            return name

        # Import and convert before the file is created, so that a failure
        # does not leave an empty file behind.
        try:
            from PIL import Image, ImageChops  # type: ignore[import]
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        # NOTE(review): the channels are inverted before the conversion,
        # presumably because CMYK JPEGs in PDFs are stored inverted - confirm.
        inverted = ImageChops.invert(Image.open(BytesIO(data)))
        rgb_img = inverted.convert("RGB")

        name, path = self._create_unique_image_name(image, ".jpg")
        with open(path, "wb") as fp:
            rgb_img.save(fp, "JPEG")
        return name

    def _save_jpeg2000(self, image: LTImage) -> str:
        """Save a JPEG 2000 encoded image.

        :raises ImportError: if Pillow is not installed.
        """
        try:
            from PIL import Image  # type: ignore[import]
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        # if we just write the raw data, most image programs
        # that I have tried cannot open the file. However,
        # open and saving with PIL produces a file that
        # seems to be easily opened by other programs
        img = Image.open(BytesIO(image.stream.get_data()))

        name, path = self._create_unique_image_name(image, ".jp2")
        with open(path, "wb") as fp:
            img.save(fp, "JPEG2000")
        return name

    def _save_jbig2(self, image: LTImage) -> str:
        """Save a JBIG2 encoded image.

        The data of the JBIG2Globals stream, when the filter parameters have
        one, is placed in front of the image data before the segments are
        read and written out as a JBIG2 file.

        :raises PDFValueError: if more than one JBIG2Globals stream is found.
        """
        global_streams = []
        for filter_name, params in image.stream.get_filters():
            if filter_name not in LITERALS_JBIG2_DECODE:
                continue
            # JBIG2Globals is optional; a filter may have no parameters.
            if isinstance(params, dict) and "JBIG2Globals" in params:
                global_streams.append(params["JBIG2Globals"].resolve())

        if len(global_streams) > 1:
            msg = (
                "There should never be more than one JBIG2Globals "
                "associated with a JBIG2 embedded image"
            )
            raise PDFValueError(msg)

        input_stream = BytesIO()
        if global_streams:
            input_stream.write(global_streams[0].get_data().rstrip(b"\n"))
        input_stream.write(image.stream.get_data())
        input_stream.seek(0)
        segments = JBIG2StreamReader(input_stream).get_segments()

        # The file is created only after the input has been validated and
        # parsed, so that a failure does not leave an empty file behind.
        name, path = self._create_unique_image_name(image, ".jb2")
        with open(path, "wb") as fp:
            JBIG2StreamWriter(fp).write_file(segments)
        return name

    def _save_bmp(
        self,
        image: LTImage,
        width: int,
        height: int,
        bytes_per_line: int,
        bits: int,
    ) -> str:
        """Save the decoded pixel data as a BMP file.

        :param bytes_per_line: number of bytes of one row in the image data.
        :param bits: bits per pixel of the BMP file (1, 8 or 24).
        """
        data = image.stream.get_data()
        name, path = self._create_unique_image_name(image, ".bmp")
        with open(path, "wb") as fp:
            bmp = BMPWriter(fp, bits, width, height)
            for y in range(height):
                row = data[y * bytes_per_line : (y + 1) * bytes_per_line]
                if bits == 24:
                    # The image data is R, G, B; 24-bit BMP wants B, G, R.
                    row = self._rgb_to_bgr(row)
                bmp.write_line(y, row)
        return name

    @staticmethod
    def _rgb_to_bgr(row: bytes) -> bytes:
        """Swap the first and third byte of every complete 3-byte pixel."""
        swapped = bytearray(row)
        # A truncated row may end in the middle of a pixel; leave that tail.
        whole = len(swapped) - len(swapped) % 3
        swapped[0:whole:3], swapped[2:whole:3] = row[2:whole:3], row[0:whole:3]
        return bytes(swapped)

    def _save_bytes(self, image: LTImage) -> str:
        """Save an image without encoding, just bytes.

        The raw pixel data is loaded with Pillow and saved as a JPEG file.

        :raises ImportError: if Pillow is not installed.
        :raises PDFValueError: if the bit depth and the amount of data do not
            match a supported pixel layout.
        """
        try:
            from PIL import (
                Image,  # type: ignore[import]
                ImageOps,
            )
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        width, height = image.srcsize
        data = image.stream.get_data()
        mode = self._pil_mode(image.bits, width, height, len(data))

        img = Image.frombytes(mode, image.srcsize, data, "raw")
        if mode == "L":
            # NOTE(review): grayscale images are inverted before saving;
            # behaviour kept from the original code - confirm it is intended.
            img = ImageOps.invert(img)

        name, path = self._create_unique_image_name(image, ".jpg")
        with open(path, "wb") as fp:
            # The format is passed explicitly; it matches the ".jpg" name.
            img.save(fp, "JPEG")

        return name

    @staticmethod
    def _pil_mode(
        bits: int,
        width: int,
        height: int,
        nbytes: int,
    ) -> Literal["1", "L", "RGB", "CMYK"]:
        """Return the Pillow mode for raw data of the given size.

        1-bit images map to ``"1"``. 8-bit images map to ``"L"``, ``"RGB"`` or
        ``"CMYK"`` when the data holds exactly 1, 3 or 4 bytes per pixel.

        :raises PDFValueError: for every other combination.
        """
        if bits == 1:
            return "1"
        pixels = width * height
        if bits == 8 and pixels > 0 and nbytes % pixels == 0:
            channels = nbytes // pixels
            if channels == 1:
                return "L"
            if channels == 3:
                return "RGB"
            if channels == 4:
                return "CMYK"
        msg = (
            f"Cannot save a {bits}-bit image of {width}x{height} pixels "
            f"with {nbytes} bytes of data as raw pixels"
        )
        raise PDFValueError(msg)

    def _save_raw(self, image: LTImage) -> str:
        """Save an image with unknown encoding.

        The stream data is written unchanged; the bit depth and the size of
        the image are recorded in the file extension.
        """
        ext = f".{image.bits}.{image.srcsize[0]}x{image.srcsize[1]}.img"
        name, path = self._create_unique_image_name(image, ext)

        with open(path, "wb") as fp:
            fp.write(image.stream.get_data())
        return name

    @staticmethod
    def _is_jbig2_image(image: LTImage) -> bool:
        """Return True if any filter of the image stream is a JBIG2 filter."""
        return any(
            filter_name in LITERALS_JBIG2_DECODE
            for filter_name, _params in image.stream.get_filters()
        )

    # Misspelled original name, kept so that existing callers keep working.
    _is_jbig2_iamge = _is_jbig2_image

    def _create_unique_image_name(self, image: LTImage, ext: str) -> tuple[str, str]:
        """Return a ``(name, path)`` pair that does not exist in ``outdir`` yet.

        The name is the sanitised image name plus ``ext``. When that file
        already exists, a counter starting at 0 is inserted before ``ext``
        until an unused name is found.
        """
        # Keep the file inside ``outdir`` whatever name the PDF supplies.
        stem = image.name.translate(self._UNSAFE_NAME_CHARS)
        name = stem + ext
        path = os.path.join(self.outdir, name)
        img_index = 0
        while os.path.exists(path):
            name = f"{stem}.{img_index}{ext}"
            path = os.path.join(self.outdir, name)
            img_index += 1
        return name, path