Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/image.py: 16%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import os
2import os.path
3import struct
4from io import BytesIO
5from typing import BinaryIO, Tuple
7try:
8 from typing import Literal
9except ImportError:
10 # Literal was introduced in Python 3.8
11 from typing_extensions import Literal # type: ignore[assignment]
13from pdfminer.jbig2 import JBIG2StreamReader, JBIG2StreamWriter
14from pdfminer.layout import LTImage
15from pdfminer.pdfcolor import (
16 LITERAL_DEVICE_CMYK,
17 LITERAL_DEVICE_GRAY,
18 LITERAL_DEVICE_RGB,
19 LITERAL_INLINE_DEVICE_GRAY,
20 LITERAL_INLINE_DEVICE_RGB,
21)
22from pdfminer.pdfexceptions import PDFValueError
23from pdfminer.pdftypes import (
24 LITERALS_DCT_DECODE,
25 LITERALS_FLATE_DECODE,
26 LITERALS_JBIG2_DECODE,
27 LITERALS_JPX_DECODE,
28)
# Text of the ImportError raised when an export path needs Pillow but the
# optional dependency cannot be imported.
PIL_ERROR_MESSAGE = (
    "Could not import Pillow. This dependency of pdfminer.six is not "
    "installed by default. You need it to save jpg images to a file. Install it "
    "with `pip install 'pdfminer.six[image]'`"
)
def align32(x: int) -> int:
    """Round *x* up to the next multiple of 4.

    With *x* taken as a byte count, the result is that count padded to a
    32-bit (4-byte) boundary. Values that are already a multiple of 4 are
    returned unchanged.
    """
    bumped = x + 3
    # Dropping the remainder of (x + 3) lands on the multiple of 4 at or above x.
    return bumped - bumped % 4
class BMPWriter:
    """Write an uncompressed BMP image to a seekable binary stream.

    The constructor emits the 14-byte file header, the 40-byte info header
    and, for 1-bit and 8-bit images, a grayscale colour table. Pixel rows are
    then supplied one at a time through :meth:`write_line`.
    """

    def __init__(self, fp: BinaryIO, bits: int, width: int, height: int) -> None:
        """Write the BMP headers (and colour table) to *fp*.

        :param fp: binary stream to write to; must support ``seek``/``tell``.
        :param bits: bits per pixel; only 1, 8 and 24 are accepted.
        :param width: image width in pixels.
        :param height: image height in pixels.
        :raises PDFValueError: if *bits* is not 1, 8 or 24.
        """
        self.fp = fp
        self.bits = bits
        self.width = width
        self.height = height
        # Number of colour-table entries written after the headers.
        if bits == 1:
            ncols = 2
        elif bits == 8:
            ncols = 256
        elif bits == 24:
            ncols = 0
        else:
            raise PDFValueError(bits)
        # Bytes per stored row, rounded up to a multiple of 4.
        self.linesize = align32((self.width * self.bits + 7) // 8)
        self.datasize = self.linesize * self.height
        headersize = 14 + 40 + ncols * 4
        # Info header fields (BITMAPINFOHEADER layout): size, width, height,
        # planes, bit count, compression, image size, x/y resolution,
        # colours used, important colours.
        info = struct.pack(
            "<IiiHHIIIIII",
            40,
            self.width,
            self.height,
            1,
            self.bits,
            0,
            self.datasize,
            0,
            0,
            ncols,
            0,
        )
        assert len(info) == 40, str(len(info))
        # File header: "BM" signature, total file size, two reserved words,
        # offset of the pixel data.
        header = struct.pack(
            "<ccIHHI",
            b"B",
            b"M",
            headersize + self.datasize,
            0,
            0,
            headersize,
        )
        assert len(header) == 14, str(len(header))
        self.fp.write(header)
        self.fp.write(info)
        if ncols == 2:
            # B&W color table
            for i in (0, 255):
                self.fp.write(struct.pack("BBBx", i, i, i))
        elif ncols == 256:
            # grayscale color table
            for i in range(256):
                self.fp.write(struct.pack("BBBx", i, i, i))
        # pos0/pos1 delimit the pixel-data region of the stream.
        self.pos0 = self.fp.tell()
        self.pos1 = self.pos0 + self.datasize

    def write_line(self, y: int, data: bytes) -> None:
        """Store pixel row *y* (0 = first row passed by the caller).

        Row *y* is placed ``y + 1`` strides before the end of the pixel data,
        so row 0 ends up last in the file.

        :param y: zero-based row index.
        :param data: packed pixel bytes for the row; rows shorter than
            ``linesize`` are zero-padded on the right.
        """
        self.fp.seek(self.pos1 - (y + 1) * self.linesize)
        # Pad to the full stride. Without this the row stored last in the file
        # lacks its trailing padding and the file is shorter than the size
        # declared in the header.
        self.fp.write(data.ljust(self.linesize, b"\x00"))
class ImageWriter:
    """Write images extracted from a PDF to files in an output directory.

    Supports various image types: JPEG, JPEG 2000, JBIG2 and bitmaps.
    """

    def __init__(self, outdir: str) -> None:
        """Remember *outdir* and create it (with parents) if it is missing.

        :param outdir: directory that exported images are written to.
        :raises OSError: if the directory cannot be created, including when
            *outdir* exists but is not a directory.
        """
        self.outdir = outdir
        # exist_ok avoids the race between a separate exists() check and
        # makedirs() when another process creates the directory in between.
        os.makedirs(self.outdir, exist_ok=True)

    def export_image(self, image: LTImage) -> str:
        """Save an LTImage to disk.

        The writer is chosen from the stream's filters, the bit depth and the
        colour space; anything unrecognised is dumped as raw bytes.

        :param image: the image to export.
        :return: the file name (relative to ``outdir``) that was written.
        """
        (width, height) = image.srcsize

        filters = image.stream.get_filters()
        # A stream without any filter yields an empty list; indexing [-1]
        # unguarded would raise IndexError.
        has_filters = bool(filters)

        if has_filters and filters[-1][0] in LITERALS_DCT_DECODE:
            name = self._save_jpeg(image)

        elif has_filters and filters[-1][0] in LITERALS_JPX_DECODE:
            name = self._save_jpeg2000(image)

        elif self._is_jbig2_iamge(image):
            name = self._save_jbig2(image)

        elif image.bits == 1:
            name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits)

        elif image.bits == 8 and (
            LITERAL_DEVICE_RGB in image.colorspace
            or LITERAL_INLINE_DEVICE_RGB in image.colorspace
        ):
            name = self._save_bmp(image, width, height, width * 3, image.bits * 3)

        elif image.bits == 8 and (
            LITERAL_DEVICE_GRAY in image.colorspace
            or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
        ):
            name = self._save_bmp(image, width, height, width, image.bits)

        elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
            name = self._save_bytes(image)

        else:
            name = self._save_raw(image)

        return name

    def _save_jpeg(self, image: LTImage) -> str:
        """Save a JPEG encoded image.

        Non-CMYK data is written verbatim. CMYK data is decoded with Pillow,
        inverted, converted to RGB and re-encoded.

        :raises ImportError: if the image is CMYK and Pillow is not installed.
        """
        data = image.stream.get_data()

        if LITERAL_DEVICE_CMYK not in image.colorspace:
            name, path = self._create_unique_image_name(image, ".jpg")
            with open(path, "wb") as fp:
                fp.write(data)
            return name

        # Import before creating the file so that a missing Pillow does not
        # leave an empty file behind.
        try:
            from PIL import Image, ImageChops  # type: ignore[import]
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        name, path = self._create_unique_image_name(image, ".jpg")
        with open(path, "wb") as fp:
            decoded = Image.open(BytesIO(data))
            # NOTE(review): presumably the CMYK samples are stored inverted;
            # confirm against the producing files.
            decoded = ImageChops.invert(decoded)
            decoded = decoded.convert("RGB")
            decoded.save(fp, "JPEG")

        return name

    def _save_jpeg2000(self, image: LTImage) -> str:
        """Save a JPEG 2000 encoded image.

        :raises ImportError: if Pillow is not installed.
        """
        data = image.stream.get_data()

        # Import before creating the file so that a missing Pillow does not
        # leave an empty file behind.
        try:
            from PIL import Image  # type: ignore[import]
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        name, path = self._create_unique_image_name(image, ".jp2")
        with open(path, "wb") as fp:
            # if we just write the raw data, most image programs
            # that I have tried cannot open the file. However,
            # open and saving with PIL produces a file that
            # seems to be easily opened by other programs
            decoded = Image.open(BytesIO(data))
            decoded.save(fp, "JPEG2000")
        return name

    def _save_jbig2(self, image: LTImage) -> str:
        """Save a JBIG2 encoded image.

        The optional global segments, when present, are prepended to the image
        stream before the segments are re-serialised as a standalone file.

        :raises PDFValueError: if more than one JBIG2Globals stream is found.
        """
        global_streams = []
        for filter_name, params in image.stream.get_filters():
            if filter_name not in LITERALS_JBIG2_DECODE:
                continue
            # The globals entry may be absent (or the parameters may not be a
            # mapping at all); in that case the image data stands on its own.
            try:
                globals_ref = params["JBIG2Globals"]
            except (KeyError, TypeError):
                continue
            global_streams.append(globals_ref.resolve())

        # Validate before creating the output file so that a rejected image
        # does not leave an empty file behind.
        if len(global_streams) > 1:
            msg = (
                "There should never be more than one JBIG2Globals "
                "associated with a JBIG2 embedded image"
            )
            raise PDFValueError(msg)

        name, path = self._create_unique_image_name(image, ".jb2")
        with open(path, "wb") as fp:
            input_stream = BytesIO()
            if len(global_streams) == 1:
                input_stream.write(global_streams[0].get_data().rstrip(b"\n"))
            input_stream.write(image.stream.get_data())
            input_stream.seek(0)
            reader = JBIG2StreamReader(input_stream)
            segments = reader.get_segments()

            writer = JBIG2StreamWriter(fp)
            writer.write_file(segments)
        return name

    def _save_bmp(
        self,
        image: LTImage,
        width: int,
        height: int,
        bytes_per_line: int,
        bits: int,
    ) -> str:
        """Save a BMP encoded image.

        :param bytes_per_line: number of source bytes that make up one row.
        :param bits: bits per pixel passed on to the BMP writer.
        """
        name, path = self._create_unique_image_name(image, ".bmp")
        with open(path, "wb") as fp:
            bmp = BMPWriter(fp, bits, width, height)
            data = image.stream.get_data()
            for y in range(height):
                start = y * bytes_per_line
                bmp.write_line(y, data[start : start + bytes_per_line])
        return name

    def _save_bytes(self, image: LTImage) -> str:
        """Save an image without encoding, just bytes.

        The pixel mode is derived from the bit depth and from the number of
        channels implied by the data length.

        :raises ImportError: if Pillow is not installed.
        :raises PDFValueError: if the bit depth / channel count combination is
            not one of 1-bit, 8-bit gray, 8-bit RGB or 8-bit CMYK.
        """
        try:
            from PIL import (
                Image,  # type: ignore[import]
                ImageOps,
            )
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        # Decode once; the same bytes are used for the channel count and for
        # building the image.
        data = image.stream.get_data()
        width, height = image.srcsize
        channels = len(data) / width / height / (image.bits / 8)

        mode: Literal["1", "L", "RGB", "CMYK"]
        if image.bits == 1:
            mode = "1"
        elif image.bits == 8 and channels == 1:
            mode = "L"
        elif image.bits == 8 and channels == 3:
            mode = "RGB"
        elif image.bits == 8 and channels == 4:
            mode = "CMYK"
        else:
            # Previously fell through with ``mode`` unbound.
            raise PDFValueError(
                "Unsupported image format: %r bits, %r channels"
                % (image.bits, channels)
            )

        name, path = self._create_unique_image_name(image, ".jpg")
        with open(path, "wb") as fp:
            img = Image.frombytes(mode, image.srcsize, data, "raw")
            # NOTE(review): grayscale data is inverted before saving; the
            # reason is not evident from this file.
            if mode == "L":
                img = ImageOps.invert(img)

            img.save(fp)

        return name

    def _save_raw(self, image: LTImage) -> str:
        """Save an image with unknown encoding.

        The bit depth and source size are encoded in the file extension.
        """
        ext = ".%d.%dx%d.img" % (image.bits, image.srcsize[0], image.srcsize[1])
        name, path = self._create_unique_image_name(image, ext)

        with open(path, "wb") as fp:
            fp.write(image.stream.get_data())
        return name

    @staticmethod
    def _is_jbig2_iamge(image: LTImage) -> bool:
        """Return True if any filter of the image stream is a JBIG2 filter.

        The misspelled name is kept so existing callers continue to work.
        """
        return any(
            filter_name in LITERALS_JBIG2_DECODE
            for filter_name, _params in image.stream.get_filters()
        )

    def _create_unique_image_name(self, image: LTImage, ext: str) -> Tuple[str, str]:
        """Pick a file name in ``outdir`` that does not exist yet.

        Tries ``<name><ext>`` first, then ``<name>.0<ext>``, ``<name>.1<ext>``
        and so on.

        :return: ``(file name, full path)``.
        """
        name = image.name + ext
        path = os.path.join(self.outdir, name)
        img_index = 0
        while os.path.exists(path):
            name = "%s.%d%s" % (image.name, img_index, ext)
            path = os.path.join(self.outdir, name)
            img_index += 1
        return name, path