Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/image.py: 25%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# SPDX-FileCopyrightText: 2022 James R. Barlow
2# SPDX-License-Identifier: MPL-2.0
4"""Extract images embedded in PDF."""
6from __future__ import annotations
8from abc import ABC, abstractmethod
9from collections.abc import Callable
10from decimal import Decimal
11from io import BytesIO
12from itertools import zip_longest
13from pathlib import Path
14from shutil import copyfileobj
15from typing import TYPE_CHECKING, Any, BinaryIO, NamedTuple, TypeVar, cast
17from pikepdf import jbig2
18from pikepdf._core import Buffer, Pdf, PdfError, StreamDecodeLevel
19from pikepdf._exceptions import DependencyError
20from pikepdf.models import _transcoding
21from pikepdf.models._transcoding import ImageDecompressionError
22from pikepdf.objects import (
23 Array,
24 Dictionary,
25 Name,
26 Object,
27 Stream,
28 String,
29)
31if TYPE_CHECKING:
32 from PIL import Image
33 from PIL.ImageCms import ImageCmsProfile
# Type of the converted value returned by the metadata accessors below.
T = TypeVar('T')

# /Decode array shapes: the defaults used in this module hold one (min, max)
# pair of floats per color component, i.e. 6 values for RGB, 2 for gray and
# 8 for CMYK.
RGBDecodeArray = tuple[float, float, float, float, float, float]
GrayDecodeArray = tuple[float, float]
CMYKDecodeArray = tuple[float, float, float, float, float, float, float, float]
# Any of the three supported /Decode array shapes.
DecodeArray = RGBDecodeArray | GrayDecodeArray | CMYKDecodeArray
class UnsupportedImageTypeError(Exception):
    """This image is formatted in a way pikepdf does not support.

    Raised in this module for, among other things, 1-bit RGB/CMYK images,
    unreadable ICC profiles and streams that cannot be decoded.
    """
class NotExtractableError(Exception):
    """Indicates that an image cannot be directly extracted.

    Base class of :class:`HifiPrintImageNotTranscodableError`.
    """
class HifiPrintImageNotTranscodableError(NotExtractableError):
    """Image contains high fidelity printing information and cannot be extracted.

    Raised when transcoding is attempted on an image whose mode is ``DeviceN``
    or ``Separation``.
    """
class InvalidPdfImageError(Exception):
    """This image is not valid according to the PDF 1.7 specification.

    Raised in this module for an unusable ``BitsPerComponent`` value and for
    ICC profile access on inline images.
    """
def _array_str(value: Object | str | list):
    """Flatten pikepdf objects into a list of plain Python values.

    Names and strings become ``str``, pikepdf ``String`` becomes ``bytes``,
    lists and arrays are converted element by element (recursively), and
    streams, dictionaries, ``bytes`` and ``int`` are passed through untouched.
    A scalar input is wrapped in a one-element list so the result is always
    a list.

    Raises:
        NotImplementedError: if an element of an unsupported type is found.
    """

    def simplify(node):
        # Containers: recurse into each element, preserving order.
        if isinstance(node, list | Array):
            return list(map(simplify, node))
        if isinstance(node, Stream | Dictionary | bytes | int):
            simplified = node
        elif isinstance(node, Name | str):
            simplified = str(node)
        elif isinstance(node, String):
            simplified = bytes(node)
        else:
            # The whole input (not just the offending element) is reported.
            raise NotImplementedError(value)
        return simplified

    simplified_value = simplify(value)
    if isinstance(simplified_value, list):
        return simplified_value
    return [simplified_value]
def _ensure_list(value: list[Object] | Dictionary | Array | Object) -> list[Object]:
    """Return *value* as a Python list of pikepdf objects.

    /DecodeParms (and similar entries) may be stored either as a single
    object or as an array of objects; callers find it simpler to always work
    with a list. A value that is already a ``list`` is returned unchanged
    (the same object); anything else is wrapped in an array and converted.
    """
    if not isinstance(value, list):
        wrapped = value.wrap_in_array()
        return list(wrapped.as_list())
    return value
def _metadata_from_obj(
    obj: Dictionary | Stream, name: str, type_: Callable[[Any], T], default: T
) -> T | None:
    """Look up attribute *name* on *obj* and convert it with *type_*.

    Args:
        obj: object to read the attribute from.
        name: attribute name to read.
        type_: callable applied to the raw value (or to *default*).
        default: value used when *obj* has no such attribute.

    Returns:
        The converted value, or ``None`` when the raw value is ``None`` and
        *type_* rejects it with ``TypeError``.

    Raises:
        NotImplementedError: if *type_* raises ``TypeError`` for a value
            other than ``None``.
    """
    raw = getattr(obj, name, default)
    try:
        converted = type_(raw)
    except TypeError:
        # Fall through: decide below whether this is a None or a real failure.
        pass
    else:
        return converted
    if raw is None:
        return None
    raise NotImplementedError('Metadata access for ' + name)
class PaletteData(NamedTuple):
    """The color space and binary representation of an image palette.

    ``base_colorspace`` is typically ``"RGB"`` or ``"L"`` (for grayscale).

    ``palette`` is typically 256 or 256*3=768 bytes, for grayscale and RGB color
    respectively, with each unit/triplet being the grayscale/RGB triplet values.
    """

    # Mode-like name of the palette's base color space (e.g. 'RGB', 'L', 'CMYK').
    base_colorspace: str
    # Raw lookup table bytes taken from the /Indexed color space array.
    palette: bytes
class PdfImageBase(ABC):
    """Abstract base class for images.

    Concrete subclasses provide :meth:`_metadata` (access to the entries of
    the image dictionary), :attr:`icc` and :meth:`as_pil_image`. The
    properties defined here interpret the values returned by ``_metadata``
    and normalize the different shapes PDF allows for them.
    """

    SIMPLE_COLORSPACES = {'/DeviceRGB', '/DeviceGray', '/CalRGB', '/CalGray'}
    MAIN_COLORSPACES = SIMPLE_COLORSPACES | {'/DeviceCMYK', '/CalCMYK', '/ICCBased'}
    PRINT_COLORSPACES = {'/Separation', '/DeviceN'}

    @abstractmethod
    def _metadata(self, name: str, type_: Callable[[Any], T], default: T) -> T:
        """Get metadata for this image type.

        Args:
            name: entry to look up, written without the leading slash
                (e.g. ``'Width'``).
            type_: callable used to convert the raw value.
            default: value to use when the entry is absent.
        """

    @property
    def width(self) -> int:
        """Width of the image data in pixels."""
        return self._metadata('Width', int, 0)

    @property
    def height(self) -> int:
        """Height of the image data in pixels."""
        return self._metadata('Height', int, 0)

    @property
    def image_mask(self) -> bool:
        """Return ``True`` if this is an image mask."""
        return self._metadata('ImageMask', bool, False)

    @property
    def _bpc(self) -> int | None:
        """Bits per component for this image (low-level).

        Returns 0 when the entry is absent; see :attr:`bits_per_component`
        for the value with defaults applied.
        """
        return self._metadata('BitsPerComponent', int, 0)

    @property
    def _colorspaces(self):
        """Colorspace (low-level), as a list of simplified values."""
        return self._metadata('ColorSpace', _array_str, [])

    @property
    def filters(self):
        """List of names of the filters that were applied to encode this image."""
        return self._metadata('Filter', _array_str, [])

    @property
    def _decode_array(self) -> DecodeArray:
        """Extract the /Decode array, or the default one for the colorspace.

        An explicit /Decode array of 2, 6 or 8 entries is returned as a tuple
        of floats. Otherwise the default is chosen from the colorspace; for
        /ICCBased images it follows the mode reported by the ICC profile.

        Raises:
            NotImplementedError: if no default is known for this image.
        """
        decode: list = self._metadata('Decode', _ensure_list, [])
        if decode and len(decode) in (2, 6, 8):
            return cast(DecodeArray, tuple(float(value) for value in decode))

        gray_default: GrayDecodeArray = (0.0, 1.0)
        rgb_default: RGBDecodeArray = (0.0, 1.0, 0.0, 1.0, 0.0, 1.0)
        cmyk_default: CMYKDecodeArray = (0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0)

        colorspace = self.colorspace
        if colorspace in ('/DeviceGray', '/CalGray'):
            return gray_default
        if colorspace in ('/DeviceRGB', '/CalRGB'):
            return rgb_default
        if colorspace == '/DeviceCMYK':
            return cmyk_default
        if colorspace == '/ICCBased':
            # Resolve the profile's mode once; this may open the ICC profile.
            icc_default = {
                'L': gray_default,
                'RGB': rgb_default,
                'CMYK': cmyk_default,
            }.get(self._approx_mode_from_icc())
            if icc_default is not None:
                return icc_default
        if self.image_mask:
            return gray_default  # Default for image masks; per RM 8.9.6.2

        raise NotImplementedError(
            "Don't know how to retrieve default /Decode array for image "
            + repr(self)
        )

    @property
    def decode_parms(self):
        """List of the /DecodeParms, arguments to filters."""
        return self._metadata('DecodeParms', _ensure_list, [])

    @property
    def colorspace(self) -> str | None:
        """PDF name of the colorspace that best describes this image.

        For /Indexed images this is the name of the base colorspace. Returns
        ``None`` for image masks, which have no colorspace.

        Raises:
            NotImplementedError: if the colorspace cannot be determined.
        """
        if self.image_mask:
            return None  # Undefined for image masks
        colorspaces = self._colorspaces
        if colorspaces:
            if colorspaces[0] in self.MAIN_COLORSPACES:
                return colorspaces[0]
            if colorspaces[0] == '/Indexed':
                subspace = colorspaces[1]
                if isinstance(subspace, str) and subspace in self.MAIN_COLORSPACES:
                    return subspace
                if isinstance(subspace, list) and subspace[0] in (
                    '/ICCBased',
                    '/DeviceN',
                    '/CalGray',
                    '/CalRGB',
                ):
                    return subspace[0]
            if colorspaces[0] == '/DeviceN':
                return '/DeviceN'

        raise NotImplementedError(
            "not sure how to get colorspace: " + repr(colorspaces)
        )

    @property
    def bits_per_component(self) -> int:
        """Bits per component of this image.

        When the entry is missing or zero, defaults to 1 for image masks and
        8 otherwise.
        """
        if self._bpc is None or self._bpc == 0:
            return 1 if self.image_mask else 8
        return self._bpc

    @property
    @abstractmethod
    def icc(self) -> ImageCmsProfile | None:
        """Return ICC profile for this image if one is defined."""

    @property
    def indexed(self) -> bool:
        """Check if the image has a defined color palette."""
        return '/Indexed' in self._colorspaces

    def _colorspace_has_name(self, name):
        """Return True if the colorspace, or its /Indexed base, is *name*."""
        try:
            cs = self._colorspaces
            if cs[0] == '/Indexed' and cs[1][0] == name:
                return True
            if cs[0] == name:
                return True
        except (IndexError, AttributeError, KeyError):
            # Missing or oddly shaped colorspace: treat as "not that name".
            pass
        return False

    @property
    def is_device_n(self) -> bool:
        """Check if image has a /DeviceN (complex printing) colorspace."""
        return self._colorspace_has_name('/DeviceN')

    @property
    def is_separation(self) -> bool:
        """Check if image has a /Separation (complex printing) colorspace."""
        return self._colorspace_has_name('/Separation')

    @property
    def size(self) -> tuple[int, int]:
        """Size of image as (width, height)."""
        return self.width, self.height

    def _approx_mode_from_icc(self):
        """Guess a Pillow-style mode from the attached ICC profile.

        Returns ``'L'`` for single-channel profiles without opening them,
        ``'RGB'`` or ``'CMYK'`` according to the profile's color space, and
        ``''`` when the color space is something else.
        """
        if self.indexed:
            icc_profile = self._colorspaces[1][1]
        else:
            icc_profile = self._colorspaces[1]
        icc_profile_nchannels = int(icc_profile['/N'])

        if icc_profile_nchannels == 1:
            return 'L'

        # Multiple channels, need to open the profile and look
        mode_from_xcolor_space = {'RGB ': 'RGB', 'CMYK': 'CMYK'}
        xcolor_space = self.icc.profile.xcolor_space
        return mode_from_xcolor_space.get(xcolor_space, '')

    @property
    def mode(self) -> str:
        """``PIL.Image.mode`` equivalent for this image, where possible.

        If an ICC profile is attached to the image, we still attempt to resolve a Pillow
        mode. ``'DeviceN'`` and ``'Separation'`` are returned for the
        printing colorspaces even though they are not Pillow modes.

        Raises:
            NotImplementedError: if no mode can be determined.
        """
        m = ''
        if self.is_device_n:
            m = 'DeviceN'
        elif self.is_separation:
            m = 'Separation'
        elif self.indexed:
            m = 'P'
        elif self.colorspace == '/DeviceGray' and self.bits_per_component == 1:
            m = '1'
        elif self.colorspace == '/DeviceGray' and self.bits_per_component > 1:
            m = 'L'
        elif self.colorspace == '/DeviceRGB':
            m = 'RGB'
        elif self.colorspace == '/DeviceCMYK':
            m = 'CMYK'
        elif self.colorspace == '/ICCBased':
            try:
                m = self._approx_mode_from_icc()
            except (ValueError, TypeError) as e:
                raise NotImplementedError(
                    "Not sure how to handle PDF image of this type"
                ) from e
        if m == '':
            raise NotImplementedError(
                "Not sure how to handle PDF image of this type"
            ) from None
        return m

    @property
    def filter_decodeparms(self):
        """Return the normalized Filter and DecodeParms data.

        PDF has a lot of possible data structures concerning /Filter and
        /DecodeParms. /Filter can be absent or a name or an array, /DecodeParms
        can be absent or a dictionary (if /Filter is a name) or an array (if
        /Filter is an array). When both are arrays the lengths match.

        Normalize this into:
        [(/FilterName, {/DecodeParmName: Value, ...}), ...]

        The order of /Filter matters as it indicates the encoding/decoding
        sequence. Missing entries on either side are filled with ``{}``.
        """
        return list(zip_longest(self.filters, self.decode_parms, fillvalue={}))

    @property
    def palette(self) -> PaletteData | None:
        """Retrieve the color palette for this image if applicable.

        Returns:
            ``None`` if the image is not indexed, otherwise the base
            colorspace (as a mode-like string) and the lookup table bytes.

        Raises:
            ValueError: if the /Indexed array does not have four entries.
            NotImplementedError: if the base colorspace is not understood.
        """
        if not self.indexed:
            return None
        try:
            _idx, base, _hival, lookup = self._colorspaces
        except ValueError as e:
            raise ValueError('Not sure how to interpret this palette') from e
        # A base colorspace given as an array (/ICCBased, /DeviceN,
        # /Separation, ...) is identified by its first element. Checking the
        # shape directly avoids loading the ICC profile just to make this
        # decision, and keeps palettes usable on subclasses whose ``icc``
        # property raises.
        if isinstance(base, list):
            base = str(base[0])
        else:
            base = str(base)
        lookup = bytes(lookup)
        if base not in self.MAIN_COLORSPACES and base not in self.PRINT_COLORSPACES:
            raise NotImplementedError(f"not sure how to interpret this palette: {base}")
        if base in ('/DeviceRGB', '/CalRGB'):
            base = 'RGB'
        elif base in ('/DeviceGray', '/CalGray'):
            base = 'L'
        elif base == '/DeviceCMYK':
            base = 'CMYK'
        elif base == '/DeviceN':
            base = 'DeviceN'
        elif base == '/Separation':
            base = 'Separation'
        elif base == '/ICCBased':
            base = self._approx_mode_from_icc()
        else:
            raise NotImplementedError(f"not sure how to interpret this palette: {base}")
        return PaletteData(base, lookup)

    @abstractmethod
    def as_pil_image(self) -> Image.Image:
        """Convert this PDF image to a Python PIL (Pillow) image."""

    def _repr_png_(self) -> bytes:
        """Display hook for IPython/Jupyter: the image encoded as PNG bytes."""
        b = BytesIO()
        with self.as_pil_image() as im:
            im.save(b, 'PNG')
        return b.getvalue()
class PdfImage(PdfImageBase):
    """Support class to provide a consistent API for manipulating PDF images.

    The data structure for images inside PDFs is irregular and complex,
    making it difficult to use without introducing errors for less
    typical cases. This class addresses these difficulties by providing a
    regular, Pythonic API similar in spirit (and convertible to) the Python
    Pillow imaging library.
    """

    # The image XObject stream being wrapped.
    obj: Stream
    # Lazily loaded ICC profile, cached by the ``icc`` property.
    _icc: ImageCmsProfile | None
    # Optional reference that keeps an owning Pdf alive (see _set_pdf_source).
    _pdf_source: Pdf | None

    def __new__(cls, obj: Stream):
        """Construct a PdfImage... or a PdfJpxImage if that is what we really are."""
        try:
            # Check if JPXDecode is called for and initialize as PdfJpxImage
            filters = _ensure_list(obj.Filter)
            if Name.JPXDecode in filters:
                return super().__new__(PdfJpxImage)
        except (AttributeError, KeyError):
            # __init__ will deal with any other errors
            pass
        return super().__new__(PdfImage)

    def __init__(self, obj: Stream):
        """Construct a PDF image from a Image XObject inside a PDF.

        ``pim = PdfImage(page.Resources.XObject['/ImageNN'])``

        Args:
            obj: an Image XObject

        Raises:
            TypeError: if *obj* is a stream whose /Subtype is not /Image.
        """
        if isinstance(obj, Stream) and obj.stream_dict.get("/Subtype") != "/Image":
            raise TypeError("can't construct PdfImage from non-image")
        self.obj = obj
        self._icc = None
        # Always defined so the attribute can be read before _set_pdf_source().
        self._pdf_source = None

    def __eq__(self, other):
        if not isinstance(other, PdfImageBase):
            return NotImplemented
        return self.obj == other.obj

    @classmethod
    def _from_pil_image(cls, *, pdf, page, name, image):  # pragma: no cover
        """Insert a PIL image into a PDF (rudimentary).

        Only the 'RGB', '1' and 'L' modes get a /ColorSpace entry.

        Args:
            pdf (pikepdf.Pdf): the PDF to attach the image to
            page (pikepdf.Object): the page to attach the image to
            name (str or pikepdf.Name): the name to set the image
            image (PIL.Image.Image): the image to insert
        """
        data = image.tobytes()

        imstream = Stream(pdf, data)
        imstream.Type = Name('/XObject')
        imstream.Subtype = Name('/Image')
        if image.mode == 'RGB':
            imstream.ColorSpace = Name('/DeviceRGB')
        elif image.mode in ('1', 'L'):
            imstream.ColorSpace = Name('/DeviceGray')
        imstream.BitsPerComponent = 1 if image.mode == '1' else 8
        imstream.Width = image.width
        imstream.Height = image.height

        page.Resources.XObject[name] = imstream

        return cls(imstream)

    def _metadata(self, name, type_, default):
        return _metadata_from_obj(self.obj, name, type_, default)

    @property
    def _iccstream(self):
        """Return the stream holding the ICC profile of an /ICCBased image."""
        if self.colorspace == '/ICCBased':
            if not self.indexed:
                return self._colorspaces[1]
            assert isinstance(self._colorspaces[1], list)
            return self._colorspaces[1][1]
        raise NotImplementedError("Don't know how to find ICC stream for image")

    @property
    def icc(self) -> ImageCmsProfile | None:
        """If an ICC profile is attached, return a Pillow object that describe it.

        Most of the information may be found in ``icc.profile``.

        Raises:
            UnsupportedImageTypeError: if Pillow reports that the profile
                cannot be opened.
        """
        if self.colorspace not in ('/ICCBased', '/Indexed'):
            return None
        if not self._icc:
            iccstream = self._iccstream
            iccbuffer = iccstream.get_stream_buffer()
            iccbytesio = BytesIO(iccbuffer)
            try:
                from PIL.ImageCms import ImageCmsProfile

                self._icc = ImageCmsProfile(iccbytesio)
            except OSError as e:
                if str(e) == 'cannot open profile from string':
                    # ICC profile is corrupt
                    raise UnsupportedImageTypeError(
                        "ICC profile corrupt or not readable"
                    ) from e
                # NOTE(review): any other OSError is ignored here and the
                # property returns None - confirm this is intended.
        return self._icc

    def _remove_simple_filters(self):
        """Remove simple lossless compression where it appears.

        Returns:
            A tuple ``(data, filters)``: the stream bytes with the leading
            simple filters decoded, and the list of filters still applied
            to those bytes.

        Raises:
            NotImplementedError: if more than one complex filter is present.
        """
        COMPLEX_FILTERS = {
            '/DCTDecode',
            '/JPXDecode',
            '/JBIG2Decode',
            '/CCITTFaxDecode',
        }
        filters = self.filters
        indices = [n for n, filt in enumerate(filters) if filt in COMPLEX_FILTERS]
        if len(indices) > 1:
            raise NotImplementedError(
                f"Object {self.obj.objgen} has compound complex filters: "
                f"{filters}. We cannot decompress this."
            )
        if len(indices) == 0:
            # No complex filter indices, so all filters are simple - remove them all
            return self.obj.read_bytes(StreamDecodeLevel.specialized), []

        n = indices[0]
        if n == 0:
            # The complex filter comes first, so there is no simple filter in
            # front of it to strip: hand back the raw bytes and every filter.
            return self.obj.read_raw_bytes(), filters

        # Put copy in a temporary PDF to ensure we don't permanently modify self
        with Pdf.new() as tmp_pdf:
            obj_copy = tmp_pdf.copy_foreign(self.obj)
            obj_copy.Filter = Array([Name(f) for f in filters[:n]])
            obj_copy.DecodeParms = Array(self.decode_parms[:n])
            return obj_copy.read_bytes(StreamDecodeLevel.specialized), filters[n:]

    def _extract_direct(self, *, stream: BinaryIO) -> str | None:
        """Attempt to extract the image directly to a usable image file.

        If the image cannot be extracted without decompressing or
        transcoding, nothing is written and ``None`` is returned. The type
        and format of image generated will vary.

        Args:
            stream: Writable file stream to write data to, e.g. an open file

        Returns:
            ``'.tif'`` or ``'.jpg'`` when data was written, else ``None``.
        """

        def dct_color_transform(default: int):
            # /ColorTransform lives in the decode parameters of the
            # /DCTDecode filter itself, which is not necessarily the first
            # filter when simple filters precede it.
            for filter_name, parms in self.filter_decodeparms:
                if filter_name == '/DCTDecode':
                    if parms is not None:
                        return parms.get('/ColorTransform', default)
                    break
            return default

        def normal_dct_rgb() -> bool:
            # Normal DCTDecode RGB images have the default value of
            # /ColorTransform 1 and are actually in YUV. Such a file can be
            # saved as a standard JPEG. RGB JPEGs without YUV conversion can't
            # be saved as JPEGs, and are probably bugs. Some software in the
            # wild actually produces RGB JPEGs in PDFs (probably a bug).
            DEFAULT_CT_RGB = 1
            ct = dct_color_transform(DEFAULT_CT_RGB)
            return self.mode == 'RGB' and ct == DEFAULT_CT_RGB

        def normal_dct_cmyk() -> bool:
            # Normal DCTDecode CMYKs have /ColorTransform 0 and can be saved.
            # There is a YUVK colorspace but CMYK JPEGs don't generally use it
            DEFAULT_CT_CMYK = 0
            ct = dct_color_transform(DEFAULT_CT_CMYK)
            return self.mode == 'CMYK' and ct == DEFAULT_CT_CMYK

        data, filters = self._remove_simple_filters()

        if filters == ['/CCITTFaxDecode']:
            if self.colorspace == '/ICCBased':
                icc = self._iccstream.read_bytes()
            else:
                icc = None
            # The CCITT payload needs a generated TIFF header in front of it.
            stream.write(self._generate_ccitt_header(data, icc=icc))
            stream.write(data)
            return '.tif'
        if filters == ['/DCTDecode'] and (
            self.mode == 'L' or normal_dct_rgb() or normal_dct_cmyk()
        ):
            stream.write(data)
            return '.jpg'

        return None

    def _extract_transcoded_1248bits(self) -> Image.Image:
        """Extract an image when there are 1/2/4/8 bits packed in byte data."""
        stride = 0  # tell Pillow to calculate stride from line width
        scale = 0 if self.mode == 'L' else 1
        if self.bits_per_component in (2, 4):
            buffer, stride = _transcoding.unpack_subbyte_pixels(
                self.read_bytes(), self.size, self.bits_per_component, scale
            )
        elif self.bits_per_component == 8:
            buffer = cast(memoryview, self.get_stream_buffer())
        else:
            raise InvalidPdfImageError("BitsPerComponent must be 1, 2, 4, 8, or 16")

        if self.mode == 'P' and self.palette is not None:
            base_mode, palette = self.palette
            im = _transcoding.image_from_buffer_and_palette(
                buffer,
                self.size,
                stride,
                base_mode,
                palette,
            )
        else:
            im = _transcoding.image_from_byte_buffer(buffer, self.size, stride)
        return im

    def _extract_transcoded_1bit(self) -> Image.Image:
        """Decode a 1-bit image (or image mask) into a Pillow '1' image.

        Raises:
            UnsupportedImageTypeError: for 1-bit RGB or CMYK images.
            DependencyError: if the stream is unfilterable and no JBIG2
                decoder is available.
        """
        from PIL import Image

        if not self.image_mask and self.mode in ('RGB', 'CMYK'):
            raise UnsupportedImageTypeError("1-bit RGB and CMYK are not supported")
        try:
            data = self.read_bytes()
        except (RuntimeError, PdfError) as e:
            if (
                'read_bytes called on unfilterable stream' in str(e)
                and not jbig2.get_decoder().available()
            ):
                raise DependencyError(
                    "jbig2dec - not installed or installed version is too old "
                    "(older than version 0.15)"
                ) from None
            raise

        im = Image.frombytes('1', self.size, data)

        if self.palette is not None:
            base_mode, palette = self.palette
            im = _transcoding.fix_1bit_palette_image(im, base_mode, palette)

        return im

    def _extract_transcoded_mask(self) -> Image.Image:
        """Decode an image mask; masks are handled like other 1-bit images."""
        return self._extract_transcoded_1bit()

    def _extract_transcoded(self) -> Image.Image:
        """Decompress the image data and build a Pillow image from it.

        Raises:
            HifiPrintImageNotTranscodableError: for DeviceN/Separation images.
            UnsupportedImageTypeError: if no decoding path fits the image.
        """
        from PIL import Image

        if self.image_mask:
            return self._extract_transcoded_mask()

        if self.mode in {'DeviceN', 'Separation'}:
            raise HifiPrintImageNotTranscodableError()

        if self.mode == 'RGB' and self.bits_per_component == 8:
            # Cannot use the zero-copy .get_stream_buffer here, we have 3-byte
            # RGB and Pillow needs RGBX.
            im = Image.frombuffer(
                'RGB', self.size, self.read_bytes(), 'raw', 'RGB', 0, 1
            )
        elif self.mode == 'CMYK' and self.bits_per_component == 8:
            im = Image.frombuffer(
                'CMYK', self.size, self.get_stream_buffer(), 'raw', 'CMYK', 0, 1
            )
        # elif self.mode == '1':
        elif self.bits_per_component == 1:
            im = self._extract_transcoded_1bit()
        elif self.mode in ('L', 'P') and self.bits_per_component <= 8:
            im = self._extract_transcoded_1248bits()
        else:
            raise UnsupportedImageTypeError(repr(self) + ", " + repr(self.obj))

        if self.colorspace == '/ICCBased' and self.icc is not None:
            im.info['icc_profile'] = self.icc.tobytes()

        return im

    def _extract_to_stream(self, *, stream: BinaryIO) -> str:
        """Extract the image to a stream.

        If possible, the compressed data is extracted and inserted into
        a compressed image file format without transcoding the compressed
        content. If this is not possible, the data will be decompressed
        and extracted to an appropriate format.

        Args:
            stream: Writable stream to write data to

        Returns:
            The file format extension.

        Raises:
            UnsupportedImageTypeError: if the image cannot be decoded.
        """
        direct_extraction = self._extract_direct(stream=stream)
        if direct_extraction:
            return direct_extraction

        im = None
        try:
            im = self._extract_transcoded()
            # Check that we actually got an image before touching its mode.
            if im:
                if im.mode == 'CMYK':
                    im.save(stream, format='tiff', compression='tiff_adobe_deflate')
                    return '.tiff'
                im.save(stream, format='png')
                return '.png'
        except PdfError as e:
            if 'called on unfilterable stream' in str(e):
                raise UnsupportedImageTypeError(repr(self)) from e
            raise
        finally:
            if im:
                im.close()

        raise UnsupportedImageTypeError(repr(self))

    def extract_to(
        self, *, stream: BinaryIO | None = None, fileprefix: str = ''
    ) -> str:
        """Extract the image directly to a usable image file.

        If possible, the compressed data is extracted and inserted into
        a compressed image file format without transcoding the compressed
        content. If this is not possible, the data will be decompressed
        and extracted to an appropriate format.

        Because it is not known until attempted what image format will be
        extracted, users should not assume what format they are getting back.
        When saving the image to a file, use a temporary filename, and then
        rename the file to its final name based on the returned file extension.

        Images might be saved as any of .png, .jpg, .tif, .tiff or (for
        JPEG 2000 images) .jp2.

        Examples:
            >>> im.extract_to(stream=bytes_io)  # doctest: +SKIP
            '.png'

            >>> im.extract_to(fileprefix='/tmp/image00')  # doctest: +SKIP
            '/tmp/image00.jpg'

        Args:
            stream: Writable stream to write data to.
            fileprefix (str or Path): The path to write the extracted image to,
                without the file extension.

        Returns:
            If *fileprefix* was provided, then the fileprefix with the
            appropriate extension. If no *fileprefix*, then an extension
            indicating the file type.

        Raises:
            ValueError: unless exactly one of *stream* and *fileprefix* is
                given.
        """
        # Test the stream by identity so that a valid but falsy stream object
        # is not mistaken for "no stream given".
        have_stream = stream is not None
        have_prefix = bool(fileprefix)
        if have_stream and have_prefix:
            raise ValueError("Cannot set both stream and fileprefix")
        if not have_stream and not have_prefix:
            raise ValueError("Must set either stream or fileprefix")
        if have_stream:
            return self._extract_to_stream(stream=stream)

        # Extract to memory first: the extension is only known afterwards.
        bio = BytesIO()
        extension = self._extract_to_stream(stream=bio)
        bio.seek(0)
        filepath = Path(str(Path(fileprefix)) + extension)
        with filepath.open('wb') as target:
            copyfileobj(bio, target)
        return str(filepath)

    def read_bytes(
        self, decode_level: StreamDecodeLevel = StreamDecodeLevel.specialized
    ) -> bytes:
        """Decompress this image and return it as unencoded bytes."""
        return self.obj.read_bytes(decode_level=decode_level)

    def get_stream_buffer(
        self, decode_level: StreamDecodeLevel = StreamDecodeLevel.specialized
    ) -> Buffer:
        """Access this image with the buffer protocol."""
        return self.obj.get_stream_buffer(decode_level=decode_level)

    def as_pil_image(self) -> Image.Image:
        """Extract the image as a Pillow Image, using decompression as necessary.

        Caller must close the image.
        """
        from PIL import Image

        bio = BytesIO()
        direct_extraction = self._extract_direct(stream=bio)
        if direct_extraction:
            bio.seek(0)
            return Image.open(bio)

        im = self._extract_transcoded()
        if not im:
            raise UnsupportedImageTypeError(repr(self))

        return im

    def _generate_ccitt_header(self, data: bytes, icc: bytes | None = None) -> bytes:
        """Construct a CCITT G3 or G4 header from the PDF metadata.

        Args:
            data: the CCITT-encoded image data the header will precede.
            icc: optional ICC profile bytes to pass along.

        Raises:
            ValueError: if the image has no /DecodeParms.
            UnsupportedImageTypeError: if /EncodedByteAlign is set.
        """
        # https://stackoverflow.com/questions/2641770/
        # https://www.itu.int/itudoc/itu-t/com16/tiff-fx/docs/tiff6.pdf

        if not self.decode_parms:
            raise ValueError("/CCITTFaxDecode without /DecodeParms")

        expected_defaults = [
            ("/EncodedByteAlign", False),
        ]
        for name, val in expected_defaults:
            if self.decode_parms[0].get(name, val) != val:
                raise UnsupportedImageTypeError(
                    f"/CCITTFaxDecode with decode parameter {name} not equal {val}"
                )

        k = self.decode_parms[0].get("/K", 0)
        t4_options = None
        if k < 0:
            ccitt_group = 4  # Group 4
        elif k > 0:
            ccitt_group = 3  # Group 3 2-D
            t4_options = 1
        else:
            ccitt_group = 3  # Group 3 1-D
        black_is_one = self.decode_parms[0].get("/BlackIs1", False)
        decode = self._decode_array
        # PDF spec says:
        # BlackIs1: A flag indicating whether 1 bits shall be interpreted as black
        # pixels and 0 bits as white pixels, the reverse of the normal
        # PDF convention for image data. Default value: false.
        # TIFF spec says:
        # use 0 for white_is_zero (=> black is 1) MINISWHITE
        # use 1 for black_is_zero (=> white is 1) MINISBLACK
        photometry = 1 if black_is_one else 0

        # If Decode is [1, 0] then the photometry is inverted
        if len(decode) == 2 and decode == (1.0, 0.0):
            photometry = 1 - photometry

        img_size = len(data)
        if icc is None:
            icc = b''

        return _transcoding.generate_ccitt_header(
            self.size,
            data_length=img_size,
            ccitt_group=ccitt_group,
            t4_options=t4_options,
            photometry=photometry,
            icc=icc,
        )

    def show(self):  # pragma: no cover
        """Show the image however PIL wants to."""
        # Close the temporary Pillow image once the viewer has been launched.
        with self.as_pil_image() as im:
            im.show()

    def _set_pdf_source(self, pdf: Pdf):
        """Keep a reference to *pdf* for as long as this image exists."""
        self._pdf_source = pdf

    def __repr__(self):
        try:
            mode = self.mode
        except NotImplementedError:
            mode = '?'
        return (
            f'<pikepdf.PdfImage image mode={mode} '
            f'size={self.width}x{self.height} at {hex(id(self))}>'
        )
class PdfJpxImage(PdfImage):
    """Support class for JPEG 2000 images. Implements the same API as :class:`PdfImage`.

    If you call PdfImage(object_that_is_actually_jpeg2000_image), pikepdf will return
    this class instead, due to the check in PdfImage.__new__.
    """

    def __init__(self, obj):
        """Initialize a JPEG 2000 image.

        The image is opened with Pillow right away because the colorspace
        may have to be taken from the JPEG 2000 data itself.
        """
        super().__init__(obj)
        self._jpxpil = self.as_pil_image()

    def __eq__(self, other):
        if not isinstance(other, PdfImageBase):
            return NotImplemented
        return (
            self.obj == other.obj
            and isinstance(other, PdfJpxImage)
            and self._jpxpil == other._jpxpil
        )

    def _extract_direct(self, *, stream: BinaryIO) -> str | None:
        """Write the JPEG 2000 data to *stream* if no other filter remains.

        Returns:
            ``'.jp2'`` if data was written, otherwise ``None``.
        """
        data, filters = self._remove_simple_filters()
        if filters != ['/JPXDecode']:
            return None
        stream.write(data)
        return '.jp2'

    def _extract_transcoded(self) -> Image.Image:
        # No JPEG 2000 specific handling; defer to PdfImage.
        return super()._extract_transcoded()

    @property
    def _colorspaces(self):
        """Return the effective colorspace of a JPEG 2000 image.

        If the ColorSpace dictionary is present, the colorspace embedded in the
        JPEG 2000 data will be ignored, as required by the specification.

        Raises:
            NotImplementedError: if there is no ColorSpace entry and the
                decoded image is neither grayscale nor RGB.
        """
        # (PDF 1.7 Table 89) If ColorSpace is present, any colour space
        # specifications in the JPEG2000 data shall be ignored.
        super_colorspaces = super()._colorspaces
        if super_colorspaces:
            return super_colorspaces
        if self._jpxpil.mode == 'L':
            return ['/DeviceGray']
        if self._jpxpil.mode == 'RGB':
            return ['/DeviceRGB']
        raise NotImplementedError('Complex JP2 colorspace')

    @property
    def _bpc(self) -> int:
        """Return 8, since bpc is not meaningful for JPEG 2000 encoding."""
        # (PDF 1.7 Table 89) If the image stream uses the JPXDecode filter, this
        # entry is optional and shall be ignored if present. The bit depth is
        # determined by the conforming reader in the process of decoding the
        # JPEG2000 image.
        return 8

    @property
    def indexed(self) -> bool:
        """Return False, since JPEG 2000 should not be indexed."""
        # Nothing in the spec precludes an Indexed JPXDecode image, except for
        # the fact that doing so is madness. Let's assume that no one is that
        # insane.
        return False

    def __repr__(self):
        # The mode lookup can raise NotImplementedError (e.g. for a complex
        # JP2 colorspace); repr() should still work, as it does for the
        # other image classes.
        try:
            mode = self.mode
        except NotImplementedError:
            mode = '?'
        return (
            f'<pikepdf.PdfJpxImage JPEG2000 image mode={mode} '
            f'size={self.width}x{self.height} at {hex(id(self))}>'
        )
class PdfInlineImage(PdfImageBase):
    """Support class for PDF inline images.

    The metadata tokens taken from the content stream are re-parsed into a
    dictionary-like object so that the :class:`PdfImageBase` properties work.
    Extraction is delegated to a :class:`PdfImage` built from a temporary PDF.
    """

    # Inline images can contain abbreviations that we write automatically
    ABBREVS = {
        b'/W': b'/Width',
        b'/H': b'/Height',
        b'/BPC': b'/BitsPerComponent',
        b'/IM': b'/ImageMask',
        b'/CS': b'/ColorSpace',
        b'/F': b'/Filter',
        b'/DP': b'/DecodeParms',
        b'/G': b'/DeviceGray',
        b'/RGB': b'/DeviceRGB',
        b'/CMYK': b'/DeviceCMYK',
        b'/I': b'/Indexed',
        b'/AHx': b'/ASCIIHexDecode',
        b'/A85': b'/ASCII85Decode',
        b'/LZW': b'/LZWDecode',
        b'/RL': b'/RunLengthDecode',
        b'/CCF': b'/CCITTFaxDecode',
        b'/DCT': b'/DCTDecode',
    }
    # Full name -> abbreviation, used when writing the image back out.
    REVERSE_ABBREVS = {full: short for short, full in ABBREVS.items()}

    # The inline image data object from the content stream.
    _data: Object
    # The metadata operands of the inline image, in content stream order.
    _image_object: tuple[Object, ...]

    def __init__(self, *, image_data: Object, image_object: tuple):
        """Construct wrapper for inline image.

        Args:
            image_data: data stream for image, extracted from content stream
            image_object: the metadata for image, also from content stream

        Raises:
            PdfError: if the expanded metadata cannot be parsed.
        """
        self._data = image_data
        self._image_object = image_object

        # Turn the metadata operands back into bytes, expanding the inline
        # image abbreviations on the way, and parse the result as a
        # dictionary equivalent to what an image XObject would carry.
        expanded = b' '.join(
            self._unparse_obj(operand, remap_names=self.ABBREVS)
            for operand in image_object
        )
        try:
            parsed = Object.parse(b'<< ' + expanded + b' >>')
        except PdfError as e:
            raise PdfError("parsing inline " + expanded.decode('unicode_escape')) from e
        self.obj = parsed

    def __eq__(self, other):
        if not isinstance(other, PdfImageBase):
            return NotImplemented
        # Equal metadata, same class, and identical raw image bytes.
        return (
            self.obj == other.obj
            and isinstance(other, PdfInlineImage)
            and (
                self._data._inline_image_raw_bytes()
                == other._data._inline_image_raw_bytes()
            )
        )

    @classmethod
    def _unparse_obj(cls, obj, remap_names):
        """Serialize one metadata operand to bytes.

        Names are translated through *remap_names* when a mapping exists.

        Raises:
            NotImplementedError: for operand types that cannot be serialized.
        """
        if isinstance(obj, Object):
            unparsed = obj.unparse(resolved=True)
            if not isinstance(obj, Name):
                return unparsed
            assert isinstance(unparsed, bytes)
            return remap_names.get(unparsed, unparsed)
        # bool must be tested before int, since bool is a subclass of int.
        if isinstance(obj, bool):
            return b'true' if obj else b'false'  # Lower case for PDF spec
        if isinstance(obj, int | Decimal | float):
            return str(obj).encode('ascii')
        raise NotImplementedError(repr(obj))

    def _metadata(self, name, type_, default):
        return _metadata_from_obj(self.obj, name, type_, default)

    def unparse(self) -> bytes:
        """Create the content stream bytes that reproduce this inline image."""
        # Metadata is written with the abbreviated names.
        metadata_parts = []
        for metadata_obj in self._image_object:
            part = self._unparse_obj(metadata_obj, remap_names=self.REVERSE_ABBREVS)
            assert isinstance(part, bytes)
            metadata_parts.append(part)

        return b''.join(
            [
                b'BI\n',
                b' '.join(metadata_parts),
                b'\nID\n',
                self._data._inline_image_raw_bytes(),
                b'EI',
            ]
        )

    @property
    def icc(self):  # pragma: no cover
        """Raise an exception since ICC profiles are not supported on inline images."""
        raise InvalidPdfImageError(
            "Inline images with ICC profiles are not supported in the PDF specification"
        )

    def __repr__(self):
        try:
            mode = self.mode
        except NotImplementedError:
            mode = '?'
        return (
            f'<pikepdf.PdfInlineImage image mode={mode} '
            f'size={self.width}x{self.height} at {hex(id(self))}>'
        )

    def _convert_to_pdfimage(self) -> PdfImage:
        """Build a :class:`PdfImage` equivalent to this inline image."""
        # Construct a temporary PDF that holds this inline image, and...
        scratch_pdf = Pdf.new()
        scratch_pdf.add_blank_page(page_size=(self.width, self.height))
        scale_to_page = f'{self.width} 0 0 {self.height} 0 0 cm'.encode('ascii')
        scratch_pdf.pages[0].contents_add(scale_to_page, prepend=True)
        scratch_pdf.pages[0].contents_add(self.unparse())

        # ...externalize it,
        scratch_pdf.pages[0].externalize_inline_images()
        raw_img = cast(Stream, next(iter(scratch_pdf.pages[0].images.values())))

        # ...then use the regular PdfImage API to extract it.
        converted = PdfImage(raw_img)
        # Hold the temporary PDF open while the PdfImage exists
        converted._set_pdf_source(scratch_pdf)
        return converted

    def as_pil_image(self) -> Image.Image:
        """Return inline image as a Pillow Image."""
        pdf_image = self._convert_to_pdfimage()
        return pdf_image.as_pil_image()

    def extract_to(self, *, stream: BinaryIO | None = None, fileprefix: str = ''):
        """Extract the inline image directly to a usable image file.

        See:
            :meth:`PdfImage.extract_to`
        """
        pdf_image = self._convert_to_pdfimage()
        return pdf_image.extract_to(stream=stream, fileprefix=fileprefix)

    def read_bytes(self):
        """Return decompressed image bytes."""
        # qpdf does not have an API to return this directly, so convert it.
        pdf_image = self._convert_to_pdfimage()
        return pdf_image.read_bytes()

    def get_stream_buffer(self):
        """Return decompressed stream buffer."""
        # qpdf does not have an API to return this directly, so convert it.
        pdf_image = self._convert_to_pdfimage()
        return pdf_image.get_stream_buffer()
# Public API of this module. Every public decode-array alias and exception
# class defined above is exported, including GrayDecodeArray and
# NotExtractableError (the base class of HifiPrintImageNotTranscodableError).
__all__ = [
    'CMYKDecodeArray',
    'DecodeArray',
    'GrayDecodeArray',
    'HifiPrintImageNotTranscodableError',
    'ImageDecompressionError',
    'InvalidPdfImageError',
    'NotExtractableError',
    'PaletteData',
    'PdfImage',
    'PdfImageBase',
    'PdfInlineImage',
    'PdfJpxImage',
    'RGBDecodeArray',
    'UnsupportedImageTypeError',
]