Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/image.py: 26%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Extract images embedded in PDF."""
from __future__ import annotations

from abc import ABC, abstractmethod
from collections.abc import Callable
from copy import copy
from decimal import Decimal
from io import BytesIO
from itertools import zip_longest
from pathlib import Path
from shutil import copyfileobj
from typing import Any, BinaryIO, NamedTuple, TypeVar, cast

from PIL import Image
from PIL.ImageCms import ImageCmsProfile

from pikepdf import jbig2
from pikepdf._core import Buffer, Pdf, PdfError, StreamDecodeLevel
from pikepdf._exceptions import DependencyError
from pikepdf.models import _transcoding
from pikepdf.models._transcoding import ImageDecompressionError
from pikepdf.objects import (
    Array,
    Dictionary,
    Name,
    Object,
    Stream,
    String,
)
35T = TypeVar('T')
37RGBDecodeArray = tuple[float, float, float, float, float, float]
38GrayDecodeArray = tuple[float, float]
39CMYKDecodeArray = tuple[float, float, float, float, float, float, float, float]
40DecodeArray = RGBDecodeArray | GrayDecodeArray | CMYKDecodeArray
class UnsupportedImageTypeError(Exception):
    """This image is formatted in a way pikepdf does not support."""
class NotExtractableError(Exception):
    """Indicates that an image cannot be directly extracted."""
class HifiPrintImageNotTranscodableError(NotExtractableError):
    """Image contains high fidelity printing information and cannot be extracted."""
class InvalidPdfImageError(Exception):
    """This image is not valid according to the PDF 1.7 specification."""
def _array_str(value: Object | str | list):
    """Simplify pikepdf objects to array of str. Keep streams, dictionaries intact.

    Args:
        value: a single object or a (possibly nested) list/Array of objects.

    Returns:
        A list. Names and str become ``str``, Strings become ``bytes``,
        nested lists/Arrays are converted recursively, and Stream,
        Dictionary, bytes and int items are passed through unchanged. A
        scalar input is wrapped in a one-element list.

    Raises:
        NotImplementedError: if an item of an unhandled type is encountered.
    """

    def _convert(item):
        if isinstance(item, list | Array):
            return [_convert(subitem) for subitem in item]
        if isinstance(item, Stream | Dictionary | bytes | int):
            return item
        if isinstance(item, Name | str):
            return str(item)
        if isinstance(item, (String)):
            return bytes(item)
        # Report the whole input, not just the offending item.
        raise NotImplementedError(value)

    result = _convert(value)
    if not isinstance(result, list):
        result = [result]
    return result
def _ensure_list(value: list[Object] | Dictionary | Array | Object) -> list[Object]:
    """Ensure value is a list of pikepdf.Object, if it was not already.

    To support DecodeParms which can be present as either an array of dicts or a single
    dict. It's easier to convert to an array of one dict.

    Args:
        value: a Python list (returned as-is, not copied) or an object that
            provides ``wrap_in_array()``.

    Returns:
        A Python list of the contained objects.
    """
    if isinstance(value, list):
        return value
    return list(value.wrap_in_array().as_list())
def _metadata_from_obj(
    obj: Dictionary | Stream, name: str, type_: Callable[[Any], T], default: T
) -> T | None:
    """Retrieve metadata from a dictionary or stream and wrangle types.

    Args:
        obj: object whose attribute *name* is read.
        name: attribute (key) name to look up.
        type_: callable used to convert the raw value.
        default: value used when *obj* has no attribute *name*; it is also
            passed through *type_*.

    Returns:
        ``type_(value)``, or ``None`` when the value is ``None`` and *type_*
        rejects it with ``TypeError``.

    Raises:
        NotImplementedError: if *type_* raises ``TypeError`` for a value
            that is not ``None``.
    """
    val = getattr(obj, name, default)
    try:
        return type_(val)
    except TypeError as exc:
        if val is None:
            return None
        raise NotImplementedError('Metadata access for ' + name) from exc
class PaletteData(NamedTuple):
    """Returns the color space and binary representation of the palette.

    ``base_colorspace`` is typically ``"RGB"`` or ``"L"`` (for grayscale).

    ``palette`` is typically 256 or 256*3=768 bytes, for grayscale and RGB color
    respectively, with each unit/triplet being the grayscale/RGB triplet values.
    """

    # Name of the colour space the palette entries are expressed in.
    base_colorspace: str
    # Raw lookup-table bytes.
    palette: bytes
class PdfImageBase(ABC):
    """Abstract base class for images.

    Subclasses provide :meth:`_metadata`, :attr:`icc` and
    :meth:`as_pil_image`; the remaining properties are derived from the
    values that :meth:`_metadata` returns.
    """

    SIMPLE_COLORSPACES = {'/DeviceRGB', '/DeviceGray', '/CalRGB', '/CalGray'}
    MAIN_COLORSPACES = SIMPLE_COLORSPACES | {'/DeviceCMYK', '/CalCMYK', '/ICCBased'}
    PRINT_COLORSPACES = {'/Separation', '/DeviceN'}

    @abstractmethod
    def _metadata(self, name: str, type_: Callable[[Any], T], default: T) -> T:
        """Get metadata for this image type."""

    @property
    def width(self) -> int:
        """Width of the image data in pixels."""
        return self._metadata('Width', int, 0)

    @property
    def height(self) -> int:
        """Height of the image data in pixels."""
        return self._metadata('Height', int, 0)

    @property
    def image_mask(self) -> bool:
        """Return ``True`` if this is an image mask."""
        return self._metadata('ImageMask', bool, False)

    @property
    def _bpc(self) -> int | None:
        """Bits per component for this image (low-level)."""
        return self._metadata('BitsPerComponent', int, 0)

    @property
    def _colorspaces(self):
        """Colorspace (low-level)."""
        return self._metadata('ColorSpace', _array_str, [])

    @property
    def filters(self):
        """List of names of the filters that we applied to encode this image."""
        return self._metadata('Filter', _array_str, [])

    @property
    def _decode_array(self) -> DecodeArray:
        """Extract the /Decode array.

        An explicit /Decode entry of length 2, 6 or 8 is returned as a tuple
        of floats. Otherwise a default is chosen from the colorspace.

        Raises:
            NotImplementedError: if no default is known for this image.
        """
        decode: list = self._metadata('Decode', _ensure_list, [])
        if decode and len(decode) in (2, 6, 8):
            return cast(DecodeArray, tuple(float(value) for value in decode))

        colorspace = self.colorspace
        if colorspace in ('/DeviceGray', '/CalGray'):
            return (0.0, 1.0)
        if colorspace in ('/DeviceRGB', '/CalRGB'):
            return (0.0, 1.0, 0.0, 1.0, 0.0, 1.0)
        if colorspace == '/DeviceCMYK':
            return (0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0)
        if colorspace == '/ICCBased':
            # Resolve the ICC mode once; it may have to open the profile.
            icc_mode = self._approx_mode_from_icc()
            if icc_mode == 'L':
                return (0.0, 1.0)
            if icc_mode == 'RGB':
                return (0.0, 1.0, 0.0, 1.0, 0.0, 1.0)
        if self.image_mask:
            return (0.0, 1.0)  # Default for image masks; per RM 8.9.6.2

        raise NotImplementedError(
            "Don't know how to retrieve default /Decode array for image "
            + repr(self)
        )

    @property
    def decode_parms(self):
        """List of the /DecodeParms, arguments to filters."""
        return self._metadata('DecodeParms', _ensure_list, [])

    @property
    def colorspace(self) -> str | None:
        """PDF name of the colorspace that best describes this image.

        Returns ``None`` for image masks. For ``/Indexed`` images the base
        colorspace is reported.

        Raises:
            NotImplementedError: if the colorspace cannot be determined.
        """
        if self.image_mask:
            return None  # Undefined for image masks
        colorspaces = self._colorspaces
        if colorspaces:
            primary = colorspaces[0]
            if primary in self.MAIN_COLORSPACES:
                return primary
            if primary == '/Indexed':
                subspace = colorspaces[1]
                if isinstance(subspace, str) and subspace in self.MAIN_COLORSPACES:
                    return subspace
                if isinstance(subspace, list) and subspace[0] in (
                    '/ICCBased',
                    '/DeviceN',
                    '/CalGray',
                    '/CalRGB',
                ):
                    return subspace[0]
            if primary == '/DeviceN':
                return '/DeviceN'

        raise NotImplementedError(
            "not sure how to get colorspace: " + repr(colorspaces)
        )

    @property
    def bits_per_component(self) -> int:
        """Bits per component of this image.

        When the entry is missing or zero, 1 is assumed for image masks and 8
        for everything else.
        """
        bpc = self._bpc
        if bpc is None or bpc == 0:
            return 1 if self.image_mask else 8
        return bpc

    @property
    @abstractmethod
    def icc(self) -> ImageCmsProfile | None:
        """Return ICC profile for this image if one is defined."""

    @property
    def indexed(self) -> bool:
        """Check if the image has a defined color palette."""
        return '/Indexed' in self._colorspaces

    def _colorspace_has_name(self, name):
        """Return True if the colorspace, or an /Indexed base, is *name*."""
        try:
            cs = self._colorspaces
            if cs[0] == '/Indexed' and cs[1][0] == name:
                return True
            if cs[0] == name:
                return True
        except (IndexError, AttributeError, KeyError):
            # Missing or oddly shaped colorspace: treat as "not this name".
            pass
        return False

    @property
    def is_device_n(self) -> bool:
        """Check if image has a /DeviceN (complex printing) colorspace."""
        return self._colorspace_has_name('/DeviceN')

    @property
    def is_separation(self) -> bool:
        """Check if image has a /Separation (complex printing) colorspace."""
        return self._colorspace_has_name('/Separation')

    @property
    def size(self) -> tuple[int, int]:
        """Size of image as (width, height)."""
        return self.width, self.height

    def _approx_mode_from_icc(self):
        """Guess a Pillow mode from the ICC profile attached to the colorspace.

        Returns ``'L'`` for single-channel profiles, ``'RGB'`` or ``'CMYK'``
        when the profile's color space says so, and ``''`` otherwise.
        """
        if self.indexed:
            icc_profile = self._colorspaces[1][1]
        else:
            icc_profile = self._colorspaces[1]
        icc_profile_nchannels = int(icc_profile['/N'])

        if icc_profile_nchannels == 1:
            return 'L'

        # Multiple channels, need to open the profile and look
        mode_from_xcolor_space = {'RGB ': 'RGB', 'CMYK': 'CMYK'}
        xcolor_space = self.icc.profile.xcolor_space
        return mode_from_xcolor_space.get(xcolor_space, '')

    @property
    def mode(self) -> str:
        """``PIL.Image.mode`` equivalent for this image, where possible.

        If an ICC profile is attached to the image, we still attempt to resolve a Pillow
        mode.

        Raises:
            NotImplementedError: if no mode can be determined.
        """
        m = ''
        if self.is_device_n:
            m = 'DeviceN'
        elif self.is_separation:
            m = 'Separation'
        elif self.indexed:
            m = 'P'
        elif self.colorspace == '/DeviceGray' and self.bits_per_component == 1:
            m = '1'
        elif self.colorspace == '/DeviceGray' and self.bits_per_component > 1:
            m = 'L'
        elif self.colorspace == '/DeviceRGB':
            m = 'RGB'
        elif self.colorspace == '/DeviceCMYK':
            m = 'CMYK'
        elif self.colorspace == '/ICCBased':
            try:
                m = self._approx_mode_from_icc()
            except (ValueError, TypeError) as e:
                raise NotImplementedError(
                    "Not sure how to handle PDF image of this type"
                ) from e
        if m == '':
            raise NotImplementedError(
                "Not sure how to handle PDF image of this type"
            ) from None
        return m

    @property
    def filter_decodeparms(self):
        """Return the normalized /Filter and /DecodeParms data.

        PDF has a lot of possible data structures concerning /Filter and
        /DecodeParms. /Filter can be absent or a name or an array, /DecodeParms
        can be absent or a dictionary (if /Filter is a name) or an array (if
        /Filter is an array). When both are arrays the lengths match.

        Normalize this into:
        [(/FilterName, {/DecodeParmName: Value, ...}), ...]

        The order of /Filter matters, as it indicates the encoding/decoding
        sequence. If one list is shorter, the missing entries are filled
        with ``{}``.
        """
        return list(zip_longest(self.filters, self.decode_parms, fillvalue={}))

    @property
    def palette(self) -> PaletteData | None:
        """Retrieve the color palette for this image if applicable.

        Returns:
            ``None`` if the image is not indexed, otherwise the base mode
            and the raw lookup table.

        Raises:
            ValueError: if the /Indexed colorspace does not have four entries.
            NotImplementedError: if the base colorspace is not understood.
        """
        if not self.indexed:
            return None
        try:
            _idx, base, _hival, lookup = self._colorspaces
        except ValueError as e:
            raise ValueError('Not sure how to interpret this palette') from e
        if self.icc or self.is_device_n or self.is_separation or isinstance(base, list):
            # Array-style base colorspace: its first element is the name.
            base = str(base[0])
        else:
            base = str(base)
        lookup = bytes(lookup)
        if base not in self.MAIN_COLORSPACES and base not in self.PRINT_COLORSPACES:
            raise NotImplementedError(f"not sure how to interpret this palette: {base}")
        if base in ('/DeviceRGB', '/CalRGB'):
            base = 'RGB'
        elif base in ('/DeviceGray', '/CalGray'):
            base = 'L'
        elif base == '/DeviceCMYK':
            base = 'CMYK'
        elif base == '/DeviceN':
            base = 'DeviceN'
        elif base == '/Separation':
            base = 'Separation'
        elif base == '/ICCBased':
            base = self._approx_mode_from_icc()
        else:
            raise NotImplementedError(f"not sure how to interpret this palette: {base}")
        return PaletteData(base, lookup)

    @abstractmethod
    def as_pil_image(self) -> Image.Image:
        """Convert this PDF image to a Python PIL (Pillow) image."""

    def _repr_png_(self) -> bytes:
        """Display hook for IPython/Jupyter."""
        b = BytesIO()
        with self.as_pil_image() as im:
            im.save(b, 'PNG')
        return b.getvalue()
class PdfImage(PdfImageBase):
    """Support class to provide a consistent API for manipulating PDF images.

    The data structure for images inside PDFs is irregular and complex,
    making it difficult to use without introducing errors for less
    typical cases. This class addresses these difficulties by providing a
    regular, Pythonic API similar in spirit (and convertible to) the Python
    Pillow imaging library.
    """

    # The wrapped image XObject.
    obj: Stream
    # Lazily created ICC profile, cached by the ``icc`` property.
    _icc: ImageCmsProfile | None
    # Optional reference to a Pdf, stored by ``_set_pdf_source``.
    _pdf_source: Pdf | None

    def __new__(cls, obj: Stream):
        """Construct a PdfImage... or a PdfJpxImage if that is what we really are."""
        try:
            # Check if JPXDecode is called for and initialize as PdfJpxImage
            filters = _ensure_list(obj.Filter)
            if Name.JPXDecode in filters:
                return super().__new__(PdfJpxImage)
        except (AttributeError, KeyError):
            # __init__ will deal with any other errors
            pass
        return super().__new__(PdfImage)

    def __init__(self, obj: Stream):
        """Construct a PDF image from a Image XObject inside a PDF.

        ``pim = PdfImage(page.Resources.XObject['/ImageNN'])``

        Args:
            obj: an Image XObject

        Raises:
            TypeError: if *obj* is a stream whose /Subtype is not /Image.
        """
        if isinstance(obj, Stream) and obj.stream_dict.get("/Subtype") != "/Image":
            raise TypeError("can't construct PdfImage from non-image")
        self.obj = obj
        self._icc = None
        # Always defined, so reading it before _set_pdf_source() is safe.
        self._pdf_source = None

    def __eq__(self, other):
        if not isinstance(other, PdfImageBase):
            return NotImplemented
        return self.obj == other.obj

    @classmethod
    def _from_pil_image(cls, *, pdf, page, name, image):  # pragma: no cover
        """Insert a PIL image into a PDF (rudimentary).

        Args:
            pdf (pikepdf.Pdf): the PDF to attach the image to
            page (pikepdf.Object): the page to attach the image to
            name (str or pikepdf.Name): the name to set the image
            image (PIL.Image.Image): the image to insert
        """
        data = image.tobytes()

        imstream = Stream(pdf, data)
        imstream.Type = Name('/XObject')
        imstream.Subtype = Name('/Image')
        if image.mode == 'RGB':
            imstream.ColorSpace = Name('/DeviceRGB')
        elif image.mode in ('1', 'L'):
            imstream.ColorSpace = Name('/DeviceGray')
        imstream.BitsPerComponent = 1 if image.mode == '1' else 8
        imstream.Width = image.width
        imstream.Height = image.height

        page.Resources.XObject[name] = imstream

        return cls(imstream)

    def _metadata(self, name, type_, default):
        return _metadata_from_obj(self.obj, name, type_, default)

    @property
    def _iccstream(self):
        """Return the stream holding the ICC profile of an /ICCBased image."""
        if self.colorspace == '/ICCBased':
            if not self.indexed:
                return self._colorspaces[1]
            assert isinstance(self._colorspaces[1], list)
            return self._colorspaces[1][1]
        raise NotImplementedError("Don't know how to find ICC stream for image")

    @property
    def icc(self) -> ImageCmsProfile | None:
        """If an ICC profile is attached, return a Pillow object that describe it.

        Most of the information may be found in ``icc.profile``.

        Raises:
            UnsupportedImageTypeError: if the profile cannot be opened.
        """
        if self.colorspace not in ('/ICCBased', '/Indexed'):
            return None
        if not self._icc:
            iccstream = self._iccstream
            iccbuffer = iccstream.get_stream_buffer()
            iccbytesio = BytesIO(iccbuffer)
            try:
                self._icc = ImageCmsProfile(iccbytesio)
            except OSError as e:
                if str(e) == 'cannot open profile from string':
                    # ICC profile is corrupt
                    raise UnsupportedImageTypeError(
                        "ICC profile corrupt or not readable"
                    ) from e
                # NOTE(review): any other OSError is swallowed and None is
                # returned below; confirm whether it should propagate.
        return self._icc

    def _remove_simple_filters(self):
        """Remove simple lossless compression where it appears.

        Returns:
            A ``(data, filters)`` pair: the stream data with the leading
            simple filters decoded, and the list of filters still applied.

        Raises:
            NotImplementedError: if more than one complex filter is present.
        """
        COMPLEX_FILTERS = {
            '/DCTDecode',
            '/JPXDecode',
            '/JBIG2Decode',
            '/CCITTFaxDecode',
        }
        indices = [n for n, filt in enumerate(self.filters) if filt in COMPLEX_FILTERS]
        if len(indices) > 1:
            raise NotImplementedError(
                f"Object {self.obj.objgen} has compound complex filters: "
                f"{self.filters}. We cannot decompress this."
            )
        if len(indices) == 0:
            # No complex filter indices, so all filters are simple - remove them all
            return self.obj.read_bytes(StreamDecodeLevel.specialized), []

        n = indices[0]
        if n == 0:
            # The first filter is the complex one, so nothing can be removed
            return self.obj.read_raw_bytes(), self.filters

        # Decode only the simple filters that precede the complex one, using
        # a copy so the original object keeps its filter chain.
        obj_copy = copy(self.obj)
        obj_copy.Filter = Array([Name(f) for f in self.filters[:n]])
        obj_copy.DecodeParms = Array(self.decode_parms[:n])
        return obj_copy.read_bytes(StreamDecodeLevel.specialized), self.filters[n:]

    def _extract_direct(self, *, stream: BinaryIO) -> str | None:
        """Attempt to extract the image directly to a usable image file.

        If there is no way to extract the image without decompressing or
        transcoding, return ``None`` and write nothing. The type and format
        of image generated will vary.

        Args:
            stream: Writable file stream to write data to, e.g. an open file

        Returns:
            The file extension of the written image, or ``None``.
        """

        def normal_dct_rgb() -> bool:
            # Normal DCTDecode RGB images have the default value of
            # /ColorTransform 1 and are actually in YUV. Such a file can be
            # saved as a standard JPEG. RGB JPEGs without YUV conversion can't
            # be saved as JPEGs, and are probably bugs. Some software in the
            # wild actually produces RGB JPEGs in PDFs (probably a bug).
            DEFAULT_CT_RGB = 1
            ct = DEFAULT_CT_RGB
            if self.filter_decodeparms[0][1] is not None:
                ct = self.filter_decodeparms[0][1].get(
                    '/ColorTransform', DEFAULT_CT_RGB
                )
            return self.mode == 'RGB' and ct == DEFAULT_CT_RGB

        def normal_dct_cmyk() -> bool:
            # Normal DCTDecode CMYKs have /ColorTransform 0 and can be saved.
            # There is a YUVK colorspace but CMYK JPEGs don't generally use it
            DEFAULT_CT_CMYK = 0
            ct = DEFAULT_CT_CMYK
            if self.filter_decodeparms[0][1] is not None:
                ct = self.filter_decodeparms[0][1].get(
                    '/ColorTransform', DEFAULT_CT_CMYK
                )
            return self.mode == 'CMYK' and ct == DEFAULT_CT_CMYK

        data, filters = self._remove_simple_filters()

        if filters == ['/CCITTFaxDecode']:
            if self.colorspace == '/ICCBased':
                icc = self._iccstream.read_bytes()
            else:
                icc = None
            stream.write(self._generate_ccitt_header(data, icc=icc))
            stream.write(data)
            return '.tif'
        if filters == ['/DCTDecode'] and (
            self.mode == 'L' or normal_dct_rgb() or normal_dct_cmyk()
        ):
            stream.write(data)
            return '.jpg'

        return None

    def _extract_transcoded_1248bits(self) -> Image.Image:
        """Extract an image when there are 1/2/4/8 bits packed in byte data."""
        stride = 0  # tell Pillow to calculate stride from line width
        scale = 0 if self.mode == 'L' else 1
        if self.bits_per_component in (2, 4):
            buffer, stride = _transcoding.unpack_subbyte_pixels(
                self.read_bytes(), self.size, self.bits_per_component, scale
            )
        elif self.bits_per_component == 8:
            buffer = cast(memoryview, self.get_stream_buffer())
        else:
            raise InvalidPdfImageError("BitsPerComponent must be 1, 2, 4, 8, or 16")

        if self.mode == 'P' and self.palette is not None:
            base_mode, palette = self.palette
            im = _transcoding.image_from_buffer_and_palette(
                buffer,
                self.size,
                stride,
                base_mode,
                palette,
            )
        else:
            im = _transcoding.image_from_byte_buffer(buffer, self.size, stride)
        return im

    def _extract_transcoded_1bit(self) -> Image.Image:
        """Decode a 1-bit image (or image mask) into a Pillow mode '1' image."""
        if not self.image_mask and self.mode in ('RGB', 'CMYK'):
            raise UnsupportedImageTypeError("1-bit RGB and CMYK are not supported")
        try:
            data = self.read_bytes()
        except (RuntimeError, PdfError) as e:
            if (
                'read_bytes called on unfilterable stream' in str(e)
                and not jbig2.get_decoder().available()
            ):
                raise DependencyError(
                    "jbig2dec - not installed or installed version is too old "
                    "(older than version 0.15)"
                ) from None
            raise

        im = Image.frombytes('1', self.size, data)

        if self.palette is not None:
            base_mode, palette = self.palette
            im = _transcoding.fix_1bit_palette_image(im, base_mode, palette)

        return im

    def _extract_transcoded_mask(self) -> Image.Image:
        return self._extract_transcoded_1bit()

    def _extract_transcoded(self) -> Image.Image:
        """Decompress the image data and build a Pillow image from it.

        Raises:
            HifiPrintImageNotTranscodableError: for DeviceN/Separation images.
            UnsupportedImageTypeError: if no decoding path applies.
        """
        if self.image_mask:
            return self._extract_transcoded_mask()

        if self.mode in {'DeviceN', 'Separation'}:
            raise HifiPrintImageNotTranscodableError()

        if self.mode == 'RGB' and self.bits_per_component == 8:
            # Cannot use the zero-copy .get_stream_buffer here, we have 3-byte
            # RGB and Pillow needs RGBX.
            im = Image.frombuffer(
                'RGB', self.size, self.read_bytes(), 'raw', 'RGB', 0, 1
            )
        elif self.mode == 'CMYK' and self.bits_per_component == 8:
            im = Image.frombuffer(
                'CMYK', self.size, self.get_stream_buffer(), 'raw', 'CMYK', 0, 1
            )
        elif self.bits_per_component == 1:
            # Dispatch on bit depth, not on mode == '1'.
            im = self._extract_transcoded_1bit()
        elif self.mode in ('L', 'P') and self.bits_per_component <= 8:
            im = self._extract_transcoded_1248bits()
        else:
            raise UnsupportedImageTypeError(repr(self) + ", " + repr(self.obj))

        if self.colorspace == '/ICCBased' and self.icc is not None:
            im.info['icc_profile'] = self.icc.tobytes()

        return im

    def _extract_to_stream(self, *, stream: BinaryIO) -> str:
        """Extract the image to a stream.

        If possible, the compressed data is extracted and inserted into
        a compressed image file format without transcoding the compressed
        content. If this is not possible, the data will be decompressed
        and extracted to an appropriate format.

        Args:
            stream: Writable stream to write data to

        Returns:
            The file format extension.
        """
        direct_extraction = self._extract_direct(stream=stream)
        if direct_extraction:
            return direct_extraction

        im = None
        try:
            im = self._extract_transcoded()
            if im.mode == 'CMYK':
                im.save(stream, format='tiff', compression='tiff_adobe_deflate')
                return '.tiff'
            if im:
                im.save(stream, format='png')
                return '.png'
        except PdfError as e:
            if 'called on unfilterable stream' in str(e):
                raise UnsupportedImageTypeError(repr(self)) from e
            raise
        finally:
            if im:
                im.close()

        raise UnsupportedImageTypeError(repr(self))

    def extract_to(
        self, *, stream: BinaryIO | None = None, fileprefix: str = ''
    ) -> str:
        """Extract the image directly to a usable image file.

        If possible, the compressed data is extracted and inserted into
        a compressed image file format without transcoding the compressed
        content. If this is not possible, the data will be decompressed
        and extracted to an appropriate format.

        Because it is not known until attempted what image format will be
        extracted, users should not assume what format they are getting back.
        When saving the image to a file, use a temporary filename, and then
        rename the file to its final name based on the returned file extension.

        Images might be saved as any of .png, .jpg, or .tiff.

        Examples:
            >>> im.extract_to(stream=bytes_io)  # doctest: +SKIP
            '.png'

            >>> im.extract_to(fileprefix='/tmp/image00')  # doctest: +SKIP
            '/tmp/image00.jpg'

        Args:
            stream: Writable stream to write data to.
            fileprefix (str or Path): The path to write the extracted image to,
                without the file extension.

        Returns:
            If *fileprefix* was provided, then the fileprefix with the
            appropriate extension. If no *fileprefix*, then an extension
            indicating the file type.

        Raises:
            ValueError: if both or neither of *stream* and *fileprefix* are
                given.
        """
        if stream and fileprefix:
            raise ValueError("Cannot set both stream and fileprefix")
        if not stream and not fileprefix:
            raise ValueError("Must set either stream or fileprefix")
        if stream:
            return self._extract_to_stream(stream=stream)

        # Extract to memory first, because the extension (and so the final
        # file name) is only known after extraction.
        bio = BytesIO()
        extension = self._extract_to_stream(stream=bio)
        bio.seek(0)
        filepath = Path(str(Path(fileprefix)) + extension)
        with filepath.open('wb') as target:
            copyfileobj(bio, target)
        return str(filepath)

    def read_bytes(
        self, decode_level: StreamDecodeLevel = StreamDecodeLevel.specialized
    ) -> bytes:
        """Decompress this image and return it as unencoded bytes."""
        return self.obj.read_bytes(decode_level=decode_level)

    def get_stream_buffer(
        self, decode_level: StreamDecodeLevel = StreamDecodeLevel.specialized
    ) -> Buffer:
        """Access this image with the buffer protocol."""
        return self.obj.get_stream_buffer(decode_level=decode_level)

    def as_pil_image(self) -> Image.Image:
        """Extract the image as a Pillow Image, using decompression as necessary.

        Caller must close the image.
        """
        bio = BytesIO()
        direct_extraction = self._extract_direct(stream=bio)
        if direct_extraction:
            bio.seek(0)
            return Image.open(bio)

        im = self._extract_transcoded()
        if not im:
            raise UnsupportedImageTypeError(repr(self))

        return im

    def _generate_ccitt_header(self, data: bytes, icc: bytes | None = None) -> bytes:
        """Construct a CCITT G3 or G4 header from the PDF metadata.

        Args:
            data: the CCITT-encoded image data; only its length is used.
            icc: optional ICC profile bytes to embed.

        Raises:
            ValueError: if the image has no /DecodeParms.
            UnsupportedImageTypeError: if /EncodedByteAlign is set.
        """
        # https://stackoverflow.com/questions/2641770/
        # https://www.itu.int/itudoc/itu-t/com16/tiff-fx/docs/tiff6.pdf

        if not self.decode_parms:
            raise ValueError("/CCITTFaxDecode without /DecodeParms")

        expected_defaults = [
            ("/EncodedByteAlign", False),
        ]
        for name, val in expected_defaults:
            if self.decode_parms[0].get(name, val) != val:
                raise UnsupportedImageTypeError(
                    f"/CCITTFaxDecode with decode parameter {name} not equal {val}"
                )

        k = self.decode_parms[0].get("/K", 0)
        t4_options = None
        if k < 0:
            ccitt_group = 4  # Group 4
        elif k > 0:
            ccitt_group = 3  # Group 3 2-D
            t4_options = 1
        else:
            ccitt_group = 3  # Group 3 1-D
        black_is_one = self.decode_parms[0].get("/BlackIs1", False)
        decode = self._decode_array
        # PDF spec says:
        # BlackIs1: A flag indicating whether 1 bits shall be interpreted as black
        # pixels and 0 bits as white pixels, the reverse of the normal
        # PDF convention for image data. Default value: false.
        # TIFF spec says:
        # use 0 for white_is_zero (=> black is 1) MINISWHITE
        # use 1 for black_is_zero (=> white is 1) MINISBLACK
        photometry = 1 if black_is_one else 0

        # If Decode is [1, 0] then the photometry is inverted
        if len(decode) == 2 and decode == (1.0, 0.0):
            photometry = 1 - photometry

        img_size = len(data)
        if icc is None:
            icc = b''

        return _transcoding.generate_ccitt_header(
            self.size,
            data_length=img_size,
            ccitt_group=ccitt_group,
            t4_options=t4_options,
            photometry=photometry,
            icc=icc,
        )

    def show(self):  # pragma: no cover
        """Show the image however PIL wants to."""
        self.as_pil_image().show()

    def _set_pdf_source(self, pdf: Pdf):
        """Store a reference to *pdf* on this image."""
        self._pdf_source = pdf

    def __repr__(self):
        try:
            mode = self.mode
        except NotImplementedError:
            mode = '?'
        return (
            f'<pikepdf.PdfImage image mode={mode} '
            f'size={self.width}x{self.height} at {hex(id(self))}>'
        )
class PdfJpxImage(PdfImage):
    """Support class for JPEG 2000 images. Implements the same API as :class:`PdfImage`.

    If you call PdfImage(object_that_is_actually_jpeg2000_image), pikepdf will return
    this class instead, due to the check in PdfImage.__new__.
    """

    def __init__(self, obj):
        """Initialize a JPEG 2000 image."""
        super().__init__(obj)
        # Decoded eagerly: _colorspaces falls back to the mode of this image
        # when the PDF gives no /ColorSpace.
        self._jpxpil = self.as_pil_image()

    def __eq__(self, other):
        if not isinstance(other, PdfImageBase):
            return NotImplemented
        return (
            self.obj == other.obj
            and isinstance(other, PdfJpxImage)
            and self._jpxpil == other._jpxpil
        )

    def _extract_direct(self, *, stream: BinaryIO) -> str | None:
        """Write the raw JPEG 2000 data to *stream* if no other filter remains.

        Returns:
            ``'.jp2'`` on success, otherwise ``None`` (nothing is written).
        """
        data, filters = self._remove_simple_filters()
        if filters != ['/JPXDecode']:
            return None
        stream.write(data)
        return '.jp2'

    def _extract_transcoded(self) -> Image.Image:
        return super()._extract_transcoded()

    @property
    def _colorspaces(self):
        """Return the effective colorspace of a JPEG 2000 image.

        If the ColorSpace dictionary is present, the colorspace embedded in the
        JPEG 2000 data will be ignored, as required by the specification.

        Raises:
            NotImplementedError: if there is no /ColorSpace and the decoded
                image is neither grayscale nor RGB.
        """
        # (PDF 1.7 Table 89) If ColorSpace is present, any colour space
        # specifications in the JPEG2000 data shall be ignored.
        super_colorspaces = super()._colorspaces
        if super_colorspaces:
            return super_colorspaces
        if self._jpxpil.mode == 'L':
            return ['/DeviceGray']
        if self._jpxpil.mode == 'RGB':
            return ['/DeviceRGB']
        raise NotImplementedError('Complex JP2 colorspace')

    @property
    def _bpc(self) -> int:
        """Return 8, since bpc is not meaningful for JPEG 2000 encoding."""
        # (PDF 1.7 Table 89) If the image stream uses the JPXDecode filter, this
        # entry is optional and shall be ignored if present. The bit depth is
        # determined by the conforming reader in the process of decoding the
        # JPEG2000 image.
        return 8

    @property
    def indexed(self) -> bool:
        """Return False, since JPEG 2000 should not be indexed."""
        # Nothing in the spec precludes an Indexed JPXDecode image, except for
        # the fact that doing so is madness. Let's assume no one is that
        # insane.
        return False

    def __repr__(self):
        # repr() must not raise; mode can raise NotImplementedError for
        # colorspaces this class does not handle.
        try:
            mode = self.mode
        except NotImplementedError:
            mode = '?'
        return (
            f'<pikepdf.PdfJpxImage JPEG2000 image mode={mode} '
            f'size={self.width}x{self.height} at {hex(id(self))}>'
        )
class PdfInlineImage(PdfImageBase):
    """Support class for PDF inline images."""

    # Inline images can contain abbreviations that we write automatically
    ABBREVS = {
        b'/W': b'/Width',
        b'/H': b'/Height',
        b'/BPC': b'/BitsPerComponent',
        b'/IM': b'/ImageMask',
        b'/CS': b'/ColorSpace',
        b'/F': b'/Filter',
        b'/DP': b'/DecodeParms',
        b'/G': b'/DeviceGray',
        b'/RGB': b'/DeviceRGB',
        b'/CMYK': b'/DeviceCMYK',
        b'/I': b'/Indexed',
        b'/AHx': b'/ASCIIHexDecode',
        b'/A85': b'/ASCII85Decode',
        b'/LZW': b'/LZWDecode',
        b'/RL': b'/RunLengthDecode',
        b'/CCF': b'/CCITTFaxDecode',
        b'/DCT': b'/DCTDecode',
    }
    # Full name -> abbreviation, used when writing the image back out.
    REVERSE_ABBREVS = {v: k for k, v in ABBREVS.items()}

    # Image data object taken from the content stream.
    _data: Object
    # Metadata operands of the inline image, in content-stream order.
    _image_object: tuple[Object, ...]

    def __init__(self, *, image_data: Object, image_object: tuple):
        """Construct wrapper for inline image.

        Args:
            image_data: data stream for image, extracted from content stream
            image_object: the metadata for image, also from content stream

        Raises:
            PdfError: if the expanded metadata cannot be parsed as a dictionary.
        """
        # Convert the sequence of pikepdf.Object from the content stream into
        # a dictionary object by unparsing it (to bytes), eliminating inline
        # image abbreviations, and constructing a bytes string equivalent to
        # what an image XObject would look like. Then retrieve data from there

        self._data = image_data
        self._image_object = image_object

        reparse = b' '.join(
            self._unparse_obj(obj, remap_names=self.ABBREVS) for obj in image_object
        )
        try:
            reparsed_obj = Object.parse(b'<< ' + reparse + b' >>')
        except PdfError as e:
            raise PdfError("parsing inline " + reparse.decode('unicode_escape')) from e
        self.obj = reparsed_obj

    def __eq__(self, other):
        if not isinstance(other, PdfImageBase):
            return NotImplemented
        return (
            self.obj == other.obj
            and isinstance(other, PdfInlineImage)
            and (
                self._data._inline_image_raw_bytes()
                == other._data._inline_image_raw_bytes()
            )
        )

    @classmethod
    def _unparse_obj(cls, obj, remap_names):
        """Serialize one metadata operand to bytes.

        Args:
            obj: a pikepdf Object, or a Python bool/int/Decimal/float.
            remap_names: mapping applied to unparsed Names; names not in the
                mapping are kept as they are.

        Raises:
            NotImplementedError: for any other operand type.
        """
        if isinstance(obj, Object):
            if isinstance(obj, Name):
                name = obj.unparse(resolved=True)
                assert isinstance(name, bytes)
                return remap_names.get(name, name)
            return obj.unparse(resolved=True)
        # bool must be tested before int, because bool is a subclass of int.
        if isinstance(obj, bool):
            return b'true' if obj else b'false'  # Lower case for PDF spec
        if isinstance(obj, int | Decimal | float):
            return str(obj).encode('ascii')
        raise NotImplementedError(repr(obj))

    def _metadata(self, name, type_, default):
        return _metadata_from_obj(self.obj, name, type_, default)

    def unparse(self) -> bytes:
        """Create the content stream bytes that reproduce this inline image."""

        def metadata_tokens():
            for metadata_obj in self._image_object:
                unparsed = self._unparse_obj(
                    metadata_obj, remap_names=self.REVERSE_ABBREVS
                )
                assert isinstance(unparsed, bytes)
                yield unparsed

        def inline_image_tokens():
            yield b'BI\n'
            yield b' '.join(metadata_tokens())
            yield b'\nID\n'
            yield self._data._inline_image_raw_bytes()
            yield b'EI'

        return b''.join(inline_image_tokens())

    @property
    def icc(self):  # pragma: no cover
        """Raise an exception since ICC profiles are not supported on inline images."""
        raise InvalidPdfImageError(
            "Inline images with ICC profiles are not supported in the PDF specification"
        )

    def __repr__(self):
        try:
            mode = self.mode
        except NotImplementedError:
            mode = '?'
        return (
            f'<pikepdf.PdfInlineImage image mode={mode} '
            f'size={self.width}x{self.height} at {hex(id(self))}>'
        )

    def _convert_to_pdfimage(self) -> PdfImage:
        """Build a temporary PDF containing this image; return it as a PdfImage."""
        # Construct a temporary PDF that holds this inline image, and...
        tmppdf = Pdf.new()
        tmppdf.add_blank_page(page_size=(self.width, self.height))
        tmppdf.pages[0].contents_add(
            f'{self.width} 0 0 {self.height} 0 0 cm'.encode('ascii'), prepend=True
        )
        tmppdf.pages[0].contents_add(self.unparse())

        # ...externalize it,
        tmppdf.pages[0].externalize_inline_images()
        raw_img = cast(Stream, next(iter(tmppdf.pages[0].images.values())))

        # ...then use the regular PdfImage API to extract it.
        img = PdfImage(raw_img)
        img._set_pdf_source(tmppdf)  # Hold tmppdf open while PdfImage exists
        return img

    def as_pil_image(self) -> Image.Image:
        """Return inline image as a Pillow Image."""
        return self._convert_to_pdfimage().as_pil_image()

    def extract_to(self, *, stream: BinaryIO | None = None, fileprefix: str = ''):
        """Extract the inline image directly to a usable image file.

        See:
            :meth:`PdfImage.extract_to`
        """
        return self._convert_to_pdfimage().extract_to(
            stream=stream, fileprefix=fileprefix
        )

    def read_bytes(self):
        """Return decompressed image bytes."""
        # qpdf does not have an API to return this directly, so convert it.
        return self._convert_to_pdfimage().read_bytes()

    def get_stream_buffer(self):
        """Return decompressed stream buffer."""
        # qpdf does not have an API to return this directly, so convert it.
        return self._convert_to_pdfimage().get_stream_buffer()
# Public API of this module. GrayDecodeArray and NotExtractableError are
# listed so that every decode-array alias, and the base class of
# HifiPrintImageNotTranscodableError, can be imported by name.
__all__ = [
    'CMYKDecodeArray',
    'DecodeArray',
    'GrayDecodeArray',
    'HifiPrintImageNotTranscodableError',
    'ImageDecompressionError',
    'InvalidPdfImageError',
    'NotExtractableError',
    'PaletteData',
    'PdfImage',
    'PdfImageBase',
    'PdfInlineImage',
    'PdfJpxImage',
    'RGBDecodeArray',
    'UnsupportedImageTypeError',
]