Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/image.py: 26%

1# SPDX-FileCopyrightText: 2022 James R. Barlow

2# SPDX-License-Identifier: MPL-2.0

4"""Extract images embedded in PDF."""

6from __future__ import annotations

8from abc import ABC, abstractmethod

9from collections.abc import Callable

10from copy import copy

11from decimal import Decimal

12from io import BytesIO

13from itertools import zip_longest

14from pathlib import Path

15from shutil import copyfileobj

16from typing import Any, BinaryIO, NamedTuple, TypeVar, cast

18from PIL import Image

19from PIL.ImageCms import ImageCmsProfile

21from pikepdf import jbig2

22from pikepdf._core import Buffer, Pdf, PdfError, StreamDecodeLevel

23from pikepdf._exceptions import DependencyError

24from pikepdf.models import _transcoding

25from pikepdf.models._transcoding import ImageDecompressionError

26from pikepdf.objects import (

27 Array,

28 Dictionary,

29 Name,

30 Object,

31 Stream,

32 String,

33)

# Result type of the conversion callable handed to ``_metadata`` helpers.
T = TypeVar('T')

# Possible shapes of a /Decode array: two floats per color component, so
# 2 entries for grayscale, 6 for RGB and 8 for CMYK.
RGBDecodeArray = tuple[float, float, float, float, float, float]
GrayDecodeArray = tuple[float, float]
CMYKDecodeArray = tuple[float, float, float, float, float, float, float, float]
DecodeArray = RGBDecodeArray | GrayDecodeArray | CMYKDecodeArray

class UnsupportedImageTypeError(Exception):
    """This image is formatted in a way pikepdf does not support."""

class NotExtractableError(Exception):
    """Indicates that an image cannot be directly extracted.

    Base class of :class:`HifiPrintImageNotTranscodableError`.
    """

class HifiPrintImageNotTranscodableError(NotExtractableError):
    """Image contains high fidelity printing information and cannot be extracted.

    Raised when transcoding is attempted on an image whose mode is
    ``DeviceN`` or ``Separation``.
    """

class InvalidPdfImageError(Exception):
    """This image is not valid according to the PDF 1.7 specification.

    Raised, for example, for an unsupported ``BitsPerComponent`` value or when
    an ICC profile is requested from an inline image.
    """

def _array_str(value: Object | str | list):
    """Flatten pikepdf objects into a list of plain Python values.

    Names and strings become ``str``, pikepdf ``String`` becomes ``bytes``,
    arrays become (nested) lists. Streams, dictionaries, ``bytes`` and ``int``
    are passed through untouched. A scalar input is wrapped in a one-element
    list so the result is always a list.

    Raises:
        NotImplementedError: if an item of an unsupported type is encountered.
    """

    def _simplify(node):
        # The order of these checks is significant and mirrors the precedence
        # of the type tests: containers first, then pass-through types.
        if isinstance(node, list | Array):
            return [_simplify(child) for child in node]
        if isinstance(node, Stream | Dictionary | bytes | int):
            return node
        if isinstance(node, Name | str):
            return str(node)
        if isinstance(node, String):
            return bytes(node)
        raise NotImplementedError(value)

    simplified = _simplify(value)
    return simplified if isinstance(simplified, list) else [simplified]

79def _ensure_list(value: list[Object] | Dictionary | Array | Object) -> list[Object]:

80 """Ensure value is a list of pikepdf.Object, if it was not already.

82 To support DecodeParms which can be present as either an array of dicts or a single

83 dict. It's easier to convert to an array of one dict.

84 """

85 if isinstance(value, list):

86 return value

87 return list(value.wrap_in_array().as_list())

90def _metadata_from_obj(

91 obj: Dictionary | Stream, name: str, type_: Callable[[Any], T], default: T

92) -> T | None:

93 """Retrieve metadata from a dictionary or stream and wrangle types."""

94 val = getattr(obj, name, default)

95 try:

96 return type_(val)

97 except TypeError:

98 if val is None:

99 return None

100 raise NotImplementedError('Metadata access for ' + name)

101

102

class PaletteData(NamedTuple):
    """Returns the color space and binary representation of the palette.

    ``base_colorspace`` is typically ``"RGB"`` or ``"L"`` (for grayscale).

    ``palette`` is typically 256 or 256*3=768 bytes, for grayscale and RGB color
    respectively, with each unit/triplet being the grayscale/RGB triplet values.
    """

    # Mode-like name of the colorspace the palette entries are expressed in.
    base_colorspace: str
    # Raw bytes of the palette lookup table.
    palette: bytes

114

115

class PdfImageBase(ABC):
    """Abstract base class for images.

    Concrete subclasses provide :meth:`_metadata` (typed access to entries of
    the image dictionary), :attr:`icc` and :meth:`as_pil_image`. The properties
    defined here interpret that metadata.
    """

    SIMPLE_COLORSPACES = {'/DeviceRGB', '/DeviceGray', '/CalRGB', '/CalGray'}
    MAIN_COLORSPACES = SIMPLE_COLORSPACES | {'/DeviceCMYK', '/CalCMYK', '/ICCBased'}
    PRINT_COLORSPACES = {'/Separation', '/DeviceN'}

    @abstractmethod
    def _metadata(self, name: str, type_: Callable[[Any], T], default: T) -> T:
        """Get metadata for this image type."""

    @property
    def width(self) -> int:
        """Width of the image data in pixels."""
        return self._metadata('Width', int, 0)

    @property
    def height(self) -> int:
        """Height of the image data in pixels."""
        return self._metadata('Height', int, 0)

    @property
    def image_mask(self) -> bool:
        """Return ``True`` if this is an image mask."""
        return self._metadata('ImageMask', bool, False)

    @property
    def _bpc(self) -> int | None:
        """Bits per component for this image (low-level)."""
        return self._metadata('BitsPerComponent', int, 0)

    @property
    def _colorspaces(self):
        """Colorspace (low-level)."""
        return self._metadata('ColorSpace', _array_str, [])

    @property
    def filters(self):
        """List of names of the filters that we applied to encode this image."""
        return self._metadata('Filter', _array_str, [])

    @property
    def _decode_array(self) -> DecodeArray:
        """Extract the /Decode array, or the default one for the colorspace.

        An explicit /Decode entry of length 2, 6 or 8 is returned as a tuple of
        floats. Otherwise the default for the image's colorspace is returned.

        Raises:
            NotImplementedError: if no default is known for this image.
        """
        decode: list = self._metadata('Decode', _ensure_list, [])
        if decode and len(decode) in (2, 6, 8):
            return cast(DecodeArray, tuple(float(value) for value in decode))

        gray_default = (0.0, 1.0)
        rgb_default = (0.0, 1.0, 0.0, 1.0, 0.0, 1.0)
        cmyk_default = (0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0)

        colorspace = self.colorspace
        if colorspace in ('/DeviceGray', '/CalGray'):
            return gray_default
        if colorspace in ('/DeviceRGB', '/CalRGB'):
            return rgb_default
        if colorspace == '/DeviceCMYK':
            return cmyk_default
        if colorspace == '/ICCBased':
            # Resolve the mode once; it may require opening the ICC profile.
            icc_mode = self._approx_mode_from_icc()
            if icc_mode == 'L':
                return gray_default
            if icc_mode == 'RGB':
                return rgb_default
            if icc_mode == 'CMYK':
                return cmyk_default
        if self.image_mask:
            return gray_default  # Default for image masks; per RM 8.9.6.2

        raise NotImplementedError(
            "Don't know how to retrieve default /Decode array for image "
            + repr(self)
        )

    @property
    def decode_parms(self):
        """List of the /DecodeParms, arguments to filters."""
        return self._metadata('DecodeParms', _ensure_list, [])

    @property
    def colorspace(self) -> str | None:
        """PDF name of the colorspace that best describes this image.

        Returns ``None`` for image masks. For ``/Indexed`` images the base
        colorspace is reported.

        Raises:
            NotImplementedError: if the colorspace is not recognized.
        """
        if self.image_mask:
            return None  # Undefined for image masks
        if self._colorspaces:
            if self._colorspaces[0] in self.MAIN_COLORSPACES:
                return self._colorspaces[0]
            if self._colorspaces[0] == '/Indexed':
                subspace = self._colorspaces[1]
                if isinstance(subspace, str) and subspace in self.MAIN_COLORSPACES:
                    return subspace
                if isinstance(subspace, list) and subspace[0] in (
                    '/ICCBased',
                    '/DeviceN',
                    '/CalGray',
                    '/CalRGB',
                ):
                    return subspace[0]
            if self._colorspaces[0] == '/DeviceN':
                return '/DeviceN'

        raise NotImplementedError(
            "not sure how to get colorspace: " + repr(self._colorspaces)
        )

    @property
    def bits_per_component(self) -> int:
        """Bits per component of this image.

        When the entry is missing or zero, image masks default to 1 and all
        other images to 8.
        """
        if self._bpc is None or self._bpc == 0:
            return 1 if self.image_mask else 8
        return self._bpc

    @property
    @abstractmethod
    def icc(self) -> ImageCmsProfile | None:
        """Return ICC profile for this image if one is defined."""

    @property
    def indexed(self) -> bool:
        """Check if the image has a defined color palette."""
        return '/Indexed' in self._colorspaces

    def _colorspace_has_name(self, name):
        """Return True if *name* is the colorspace or the base of an indexed one."""
        try:
            cs = self._colorspaces
            if cs[0] == '/Indexed' and cs[1][0] == name:
                return True
            if cs[0] == name:
                return True
        except (IndexError, AttributeError, KeyError):
            # A missing or oddly shaped colorspace simply does not match.
            pass
        return False

    @property
    def is_device_n(self) -> bool:
        """Check if image has a /DeviceN (complex printing) colorspace."""
        return self._colorspace_has_name('/DeviceN')

    @property
    def is_separation(self) -> bool:
        """Check if image has a /Separation (complex printing) colorspace."""
        return self._colorspace_has_name('/Separation')

    @property
    def size(self) -> tuple[int, int]:
        """Size of image as (width, height)."""
        return self.width, self.height

    def _approx_mode_from_icc(self):
        """Guess a Pillow mode from the ICC colorspace entry.

        Returns ``'L'`` for single-channel profiles; otherwise ``'RGB'`` or
        ``'CMYK'`` according to the profile's color space, or ``''`` when the
        profile's color space is not one of those.
        """
        if self.indexed:
            icc_profile = self._colorspaces[1][1]
        else:
            icc_profile = self._colorspaces[1]
        icc_profile_nchannels = int(icc_profile['/N'])

        if icc_profile_nchannels == 1:
            return 'L'

        # Multiple channels, need to open the profile and look
        mode_from_xcolor_space = {'RGB ': 'RGB', 'CMYK': 'CMYK'}
        xcolor_space = self.icc.profile.xcolor_space
        return mode_from_xcolor_space.get(xcolor_space, '')

    @property
    def mode(self) -> str:
        """``PIL.Image.mode`` equivalent for this image, where possible.

        If an ICC profile is attached to the image, we still attempt to resolve a Pillow
        mode.

        Raises:
            NotImplementedError: if no mode can be determined.
        """
        m = ''
        if self.is_device_n:
            m = 'DeviceN'
        elif self.is_separation:
            m = 'Separation'
        elif self.indexed:
            m = 'P'
        elif self.colorspace == '/DeviceGray' and self.bits_per_component == 1:
            m = '1'
        elif self.colorspace == '/DeviceGray' and self.bits_per_component > 1:
            m = 'L'
        elif self.colorspace == '/DeviceRGB':
            m = 'RGB'
        elif self.colorspace == '/DeviceCMYK':
            m = 'CMYK'
        elif self.colorspace == '/ICCBased':
            try:
                m = self._approx_mode_from_icc()
            except (ValueError, TypeError) as e:
                raise NotImplementedError(
                    "Not sure how to handle PDF image of this type"
                ) from e
        if m == '':
            raise NotImplementedError(
                "Not sure how to handle PDF image of this type"
            ) from None
        return m

    @property
    def filter_decodeparms(self):
        """Return normalized the Filter and DecodeParms data.

        PDF has a lot of possible data structures concerning /Filter and
        /DecodeParms. /Filter can be absent or a name or an array, /DecodeParms
        can be absent or a dictionary (if /Filter is a name) or an array (if
        /Filter is an array). When both are arrays the lengths match.

        Normalize this into:
        [(/FilterName, {/DecodeParmName: Value, ...}), ...]

        The order of /Filter matters as indicates the encoding/decoding sequence.
        """
        return list(zip_longest(self.filters, self.decode_parms, fillvalue={}))

    @property
    def palette(self) -> PaletteData | None:
        """Retrieve the color palette for this image if applicable.

        Returns ``None`` for images that are not indexed.

        Raises:
            ValueError: if the /Indexed colorspace does not have four entries.
            NotImplementedError: if the base colorspace is not supported.
        """
        if not self.indexed:
            return None
        try:
            _idx, base, _hival, lookup = self._colorspaces
        except ValueError as e:
            raise ValueError('Not sure how to interpret this palette') from e
        if self.icc or self.is_device_n or self.is_separation or isinstance(base, list):
            # Base is itself an array; its first entry names the colorspace.
            base = str(base[0])
        else:
            base = str(base)
        lookup = bytes(lookup)
        if base not in self.MAIN_COLORSPACES and base not in self.PRINT_COLORSPACES:
            raise NotImplementedError(f"not sure how to interpret this palette: {base}")
        if base in ('/DeviceRGB', '/CalRGB'):
            base = 'RGB'
        elif base in ('/DeviceGray', '/CalGray'):
            base = 'L'
        elif base == '/DeviceCMYK':
            base = 'CMYK'
        elif base == '/DeviceN':
            base = 'DeviceN'
        elif base == '/Separation':
            base = 'Separation'
        elif base == '/ICCBased':
            base = self._approx_mode_from_icc()
        else:
            raise NotImplementedError(f"not sure how to interpret this palette: {base}")
        return PaletteData(base, lookup)

    @abstractmethod
    def as_pil_image(self) -> Image.Image:
        """Convert this PDF image to a Python PIL (Pillow) image."""

    def _repr_png_(self) -> bytes:
        """Display hook for IPython/Jupyter."""
        b = BytesIO()
        with self.as_pil_image() as im:
            im.save(b, 'PNG')
        return b.getvalue()

364

365

class PdfImage(PdfImageBase):
    """Support class to provide a consistent API for manipulating PDF images.

    The data structure for images inside PDFs is irregular and complex,
    making it difficult to use without introducing errors for less
    typical cases. This class addresses these difficulties by providing a
    regular, Pythonic API similar in spirit (and convertible to) the Python
    Pillow imaging library.
    """

    obj: Stream
    _icc: ImageCmsProfile | None
    _pdf_source: Pdf | None

    def __new__(cls, obj: Stream):
        """Construct a PdfImage... or a PdfJpxImage if that is what we really are."""
        try:
            # Check if JPXDecode is called for and initialize as PdfJpxImage
            filters = _ensure_list(obj.Filter)
            if Name.JPXDecode in filters:
                return super().__new__(PdfJpxImage)
        except (AttributeError, KeyError):
            # __init__ will deal with any other errors
            pass
        return super().__new__(PdfImage)

    def __init__(self, obj: Stream):
        """Construct a PDF image from a Image XObject inside a PDF.

        ``pim = PdfImage(page.Resources.XObject['/ImageNN'])``

        Args:
            obj: an Image XObject

        Raises:
            TypeError: if *obj* is a stream whose /Subtype is not /Image.
        """
        if isinstance(obj, Stream) and obj.stream_dict.get("/Subtype") != "/Image":
            raise TypeError("can't construct PdfImage from non-image")
        self.obj = obj
        self._icc = None
        # Always define the attribute so it exists even when no owning Pdf
        # has been registered through _set_pdf_source().
        self._pdf_source = None

    def __eq__(self, other):
        if not isinstance(other, PdfImageBase):
            return NotImplemented
        return self.obj == other.obj

    @classmethod
    def _from_pil_image(cls, *, pdf, page, name, image):  # pragma: no cover
        """Insert a PIL image into a PDF (rudimentary).

        Args:
            pdf (pikepdf.Pdf): the PDF to attach the image to
            page (pikepdf.Object): the page to attach the image to
            name (str or pikepdf.Name): the name to set the image
            image (PIL.Image.Image): the image to insert
        """
        data = image.tobytes()

        imstream = Stream(pdf, data)
        imstream.Type = Name('/XObject')
        imstream.Subtype = Name('/Image')
        if image.mode == 'RGB':
            imstream.ColorSpace = Name('/DeviceRGB')
        elif image.mode in ('1', 'L'):
            imstream.ColorSpace = Name('/DeviceGray')
        imstream.BitsPerComponent = 1 if image.mode == '1' else 8
        imstream.Width = image.width
        imstream.Height = image.height

        page.Resources.XObject[name] = imstream

        return cls(imstream)

    def _metadata(self, name, type_, default):
        return _metadata_from_obj(self.obj, name, type_, default)

    @property
    def _iccstream(self):
        """Return the stream holding the ICC profile of an /ICCBased image."""
        if self.colorspace == '/ICCBased':
            if not self.indexed:
                return self._colorspaces[1]
            assert isinstance(self._colorspaces[1], list)
            return self._colorspaces[1][1]
        raise NotImplementedError("Don't know how to find ICC stream for image")

    @property
    def icc(self) -> ImageCmsProfile | None:
        """If an ICC profile is attached, return a Pillow object that describe it.

        Most of the information may be found in ``icc.profile``.

        Raises:
            UnsupportedImageTypeError: if Pillow reports that the profile
                cannot be opened.
        """
        if self.colorspace not in ('/ICCBased', '/Indexed'):
            return None
        if not self._icc:
            iccstream = self._iccstream
            iccbuffer = iccstream.get_stream_buffer()
            iccbytesio = BytesIO(iccbuffer)
            try:
                self._icc = ImageCmsProfile(iccbytesio)
            except OSError as e:
                if str(e) == 'cannot open profile from string':
                    # ICC profile is corrupt
                    raise UnsupportedImageTypeError(
                        "ICC profile corrupt or not readable"
                    ) from e
        return self._icc

    def _remove_simple_filters(self):
        """Remove simple lossless compression where it appears.

        Returns:
            A ``(data, filters)`` pair: the stream data with every filter that
            precedes the first complex filter already decoded, and the list of
            filters that are still applied to that data.

        Raises:
            NotImplementedError: if more than one complex filter is present.
        """
        COMPLEX_FILTERS = {
            '/DCTDecode',
            '/JPXDecode',
            '/JBIG2Decode',
            '/CCITTFaxDecode',
        }
        indices = [n for n, filt in enumerate(self.filters) if filt in COMPLEX_FILTERS]
        if len(indices) > 1:
            raise NotImplementedError(
                f"Object {self.obj.objgen} has compound complex filters: "
                f"{self.filters}. We cannot decompress this."
            )
        if len(indices) == 0:
            # No complex filter indices, so all filters are simple - remove them all
            return self.obj.read_bytes(StreamDecodeLevel.specialized), []

        n = indices[0]
        if n == 0:
            # The first filter is the complex one, so nothing can be removed
            return self.obj.read_raw_bytes(), self.filters

        # Decode only the simple filters that come before the complex one by
        # working on a copy whose filter chain is truncated.
        obj_copy = copy(self.obj)
        obj_copy.Filter = Array([Name(f) for f in self.filters[:n]])
        obj_copy.DecodeParms = Array(self.decode_parms[:n])
        return obj_copy.read_bytes(StreamDecodeLevel.specialized), self.filters[n:]

    def _extract_direct(self, *, stream: BinaryIO) -> str | None:
        """Attempt to extract the image directly to a usable image file.

        The type and format of image generated will vary.

        Args:
            stream: Writable file stream to write data to, e.g. an open file

        Returns:
            The file extension of the data written, or ``None`` if the image
            cannot be extracted without decompressing or transcoding (in which
            case nothing is written).
        """

        def dct_color_transform(default: int):
            # /ColorTransform belongs to the /DCTDecode filter's own
            # parameters. /DCTDecode is not necessarily the first filter
            # (e.g. [/FlateDecode /DCTDecode]), so look up the entry paired
            # with it rather than always reading the first one.
            for filter_name, parms in self.filter_decodeparms:
                if filter_name == '/DCTDecode':
                    if parms is not None:
                        return parms.get('/ColorTransform', default)
                    break
            return default

        def normal_dct_rgb() -> bool:
            # Normal DCTDecode RGB images have the default value of
            # /ColorTransform 1 and are actually in YUV. Such a file can be
            # saved as a standard JPEG. RGB JPEGs without YUV conversion can't
            # be saved as JPEGs, and are probably bugs. Some software in the
            # wild actually produces RGB JPEGs in PDFs (probably a bug).
            DEFAULT_CT_RGB = 1
            ct = dct_color_transform(DEFAULT_CT_RGB)
            return self.mode == 'RGB' and ct == DEFAULT_CT_RGB

        def normal_dct_cmyk() -> bool:
            # Normal DCTDecode CMYKs have /ColorTransform 0 and can be saved.
            # There is a YUVK colorspace but CMYK JPEGs don't generally use it
            DEFAULT_CT_CMYK = 0
            ct = dct_color_transform(DEFAULT_CT_CMYK)
            return self.mode == 'CMYK' and ct == DEFAULT_CT_CMYK

        data, filters = self._remove_simple_filters()

        if filters == ['/CCITTFaxDecode']:
            if self.colorspace == '/ICCBased':
                icc = self._iccstream.read_bytes()
            else:
                icc = None
            stream.write(self._generate_ccitt_header(data, icc=icc))
            stream.write(data)
            return '.tif'
        if filters == ['/DCTDecode'] and (
            self.mode == 'L' or normal_dct_rgb() or normal_dct_cmyk()
        ):
            stream.write(data)
            return '.jpg'

        return None

    def _extract_transcoded_1248bits(self) -> Image.Image:
        """Extract an image when there are 2/4/8 bits packed in byte data.

        Raises:
            InvalidPdfImageError: for any other ``bits_per_component``.
        """
        stride = 0  # tell Pillow to calculate stride from line width
        scale = 0 if self.mode == 'L' else 1
        if self.bits_per_component in (2, 4):
            buffer, stride = _transcoding.unpack_subbyte_pixels(
                self.read_bytes(), self.size, self.bits_per_component, scale
            )
        elif self.bits_per_component == 8:
            buffer = cast(memoryview, self.get_stream_buffer())
        else:
            raise InvalidPdfImageError("BitsPerComponent must be 1, 2, 4, 8, or 16")

        if self.mode == 'P' and self.palette is not None:
            base_mode, palette = self.palette
            im = _transcoding.image_from_buffer_and_palette(
                buffer,
                self.size,
                stride,
                base_mode,
                palette,
            )
        else:
            im = _transcoding.image_from_byte_buffer(buffer, self.size, stride)
        return im

    def _extract_transcoded_1bit(self) -> Image.Image:
        """Extract a 1-bit image, applying its palette if it has one.

        Raises:
            UnsupportedImageTypeError: for 1-bit RGB or CMYK images.
            DependencyError: if the stream could not be decoded and no JBIG2
                decoder is available.
        """
        if not self.image_mask and self.mode in ('RGB', 'CMYK'):
            raise UnsupportedImageTypeError("1-bit RGB and CMYK are not supported")
        try:
            data = self.read_bytes()
        except (RuntimeError, PdfError) as e:
            if (
                'read_bytes called on unfilterable stream' in str(e)
                and not jbig2.get_decoder().available()
            ):
                raise DependencyError(
                    "jbig2dec - not installed or installed version is too old "
                    "(older than version 0.15)"
                ) from None
            raise

        im = Image.frombytes('1', self.size, data)

        if self.palette is not None:
            base_mode, palette = self.palette
            im = _transcoding.fix_1bit_palette_image(im, base_mode, palette)

        return im

    def _extract_transcoded_mask(self) -> Image.Image:
        """Extract an image mask; masks are handled as 1-bit images."""
        return self._extract_transcoded_1bit()

    def _extract_transcoded(self) -> Image.Image:
        """Decode the image data and build a Pillow image from it.

        Raises:
            HifiPrintImageNotTranscodableError: for DeviceN/Separation images.
            UnsupportedImageTypeError: if no decoding path applies.
        """
        if self.image_mask:
            return self._extract_transcoded_mask()

        if self.mode in {'DeviceN', 'Separation'}:
            raise HifiPrintImageNotTranscodableError()

        if self.mode == 'RGB' and self.bits_per_component == 8:
            # Cannot use the zero-copy .get_stream_buffer here, we have 3-byte
            # RGB and Pillow needs RGBX.
            im = Image.frombuffer(
                'RGB', self.size, self.read_bytes(), 'raw', 'RGB', 0, 1
            )
        elif self.mode == 'CMYK' and self.bits_per_component == 8:
            im = Image.frombuffer(
                'CMYK', self.size, self.get_stream_buffer(), 'raw', 'CMYK', 0, 1
            )
        # elif self.mode == '1':
        elif self.bits_per_component == 1:
            im = self._extract_transcoded_1bit()
        elif self.mode in ('L', 'P') and self.bits_per_component <= 8:
            im = self._extract_transcoded_1248bits()
        else:
            raise UnsupportedImageTypeError(repr(self) + ", " + repr(self.obj))

        if self.colorspace == '/ICCBased' and self.icc is not None:
            im.info['icc_profile'] = self.icc.tobytes()

        return im

    def _extract_to_stream(self, *, stream: BinaryIO) -> str:
        """Extract the image to a stream.

        If possible, the compressed data is extracted and inserted into
        a compressed image file format without transcoding the compressed
        content. If this is not possible, the data will be decompressed
        and extracted to an appropriate format.

        Args:
            stream: Writable stream to write data to

        Returns:
            The file format extension.

        Raises:
            UnsupportedImageTypeError: if the image cannot be extracted.
        """
        direct_extraction = self._extract_direct(stream=stream)
        if direct_extraction:
            return direct_extraction

        im = None
        try:
            im = self._extract_transcoded()
            if im.mode == 'CMYK':
                im.save(stream, format='tiff', compression='tiff_adobe_deflate')
                return '.tiff'
            if im:
                im.save(stream, format='png')
                return '.png'
        except PdfError as e:
            if 'called on unfilterable stream' in str(e):
                raise UnsupportedImageTypeError(repr(self)) from e
            raise
        finally:
            # Release the Pillow image whether or not saving succeeded.
            if im:
                im.close()

        raise UnsupportedImageTypeError(repr(self))

    def extract_to(
        self, *, stream: BinaryIO | None = None, fileprefix: str = ''
    ) -> str:
        """Extract the image directly to a usable image file.

        If possible, the compressed data is extracted and inserted into
        a compressed image file format without transcoding the compressed
        content. If this is not possible, the data will be decompressed
        and extracted to an appropriate format.

        Because it is not known until attempted what image format will be
        extracted, users should not assume what format they are getting back.
        When saving the image to a file, use a temporary filename, and then
        rename the file to its final name based on the returned file extension.

        Images might be saved as any of .png, .jpg, or .tiff.

        Examples:
            >>> im.extract_to(stream=bytes_io)  # doctest: +SKIP
            '.png'

            >>> im.extract_to(fileprefix='/tmp/image00')  # doctest: +SKIP
            '/tmp/image00.jpg'

        Args:
            stream: Writable stream to write data to.
            fileprefix (str or Path): The path to write the extracted image to,
                without the file extension.

        Returns:
            If *fileprefix* was provided, then the fileprefix with the
            appropriate extension. If no *fileprefix*, then an extension
            indicating the file type.

        Raises:
            ValueError: unless exactly one of *stream* and *fileprefix* is
                given.
        """
        if stream and fileprefix:
            raise ValueError("Cannot set both stream and fileprefix")
        if not stream and not fileprefix:
            raise ValueError("Must set either stream or fileprefix")
        if stream:
            return self._extract_to_stream(stream=stream)

        # Extract to memory first: the extension, and therefore the final
        # filename, is only known once extraction has succeeded.
        bio = BytesIO()
        extension = self._extract_to_stream(stream=bio)
        bio.seek(0)
        filepath = Path(str(Path(fileprefix)) + extension)
        with filepath.open('wb') as target:
            copyfileobj(bio, target)
        return str(filepath)

    def read_bytes(
        self, decode_level: StreamDecodeLevel = StreamDecodeLevel.specialized
    ) -> bytes:
        """Decompress this image and return it as unencoded bytes."""
        return self.obj.read_bytes(decode_level=decode_level)

    def get_stream_buffer(
        self, decode_level: StreamDecodeLevel = StreamDecodeLevel.specialized
    ) -> Buffer:
        """Access this image with the buffer protocol."""
        return self.obj.get_stream_buffer(decode_level=decode_level)

    def as_pil_image(self) -> Image.Image:
        """Extract the image as a Pillow Image, using decompression as necessary.

        Caller must close the image.
        """
        bio = BytesIO()
        direct_extraction = self._extract_direct(stream=bio)
        if direct_extraction:
            bio.seek(0)
            return Image.open(bio)

        im = self._extract_transcoded()
        if not im:
            raise UnsupportedImageTypeError(repr(self))

        return im

    def _generate_ccitt_header(self, data: bytes, icc: bytes | None = None) -> bytes:
        """Construct a CCITT G3 or G4 header from the PDF metadata.

        Raises:
            ValueError: if the image has no /DecodeParms.
            UnsupportedImageTypeError: if /EncodedByteAlign is set.
        """
        # https://stackoverflow.com/questions/2641770/
        # https://www.itu.int/itudoc/itu-t/com16/tiff-fx/docs/tiff6.pdf

        if not self.decode_parms:
            raise ValueError("/CCITTFaxDecode without /DecodeParms")

        # NOTE(review): the first /DecodeParms entry is used here; this looks
        # like it assumes /CCITTFaxDecode is the first filter - confirm.
        expected_defaults = [
            ("/EncodedByteAlign", False),
        ]
        for name, val in expected_defaults:
            if self.decode_parms[0].get(name, val) != val:
                raise UnsupportedImageTypeError(
                    f"/CCITTFaxDecode with decode parameter {name} not equal {val}"
                )

        k = self.decode_parms[0].get("/K", 0)
        t4_options = None
        if k < 0:
            ccitt_group = 4  # Group 4
        elif k > 0:
            ccitt_group = 3  # Group 3 2-D
            t4_options = 1
        else:
            ccitt_group = 3  # Group 3 1-D
        black_is_one = self.decode_parms[0].get("/BlackIs1", False)
        decode = self._decode_array
        # PDF spec says:
        # BlackIs1: A flag indicating whether 1 bits shall be interpreted as black
        # pixels and 0 bits as white pixels, the reverse of the normal
        # PDF convention for image data. Default value: false.
        # TIFF spec says:
        # use 0 for white_is_zero (=> black is 1) MINISWHITE
        # use 1 for black_is_zero (=> white is 1) MINISBLACK
        photometry = 1 if black_is_one else 0

        # If Decode is [1, 0] then the photometry is inverted
        if len(decode) == 2 and decode == (1.0, 0.0):
            photometry = 1 - photometry

        img_size = len(data)
        if icc is None:
            icc = b''

        return _transcoding.generate_ccitt_header(
            self.size,
            data_length=img_size,
            ccitt_group=ccitt_group,
            t4_options=t4_options,
            photometry=photometry,
            icc=icc,
        )

    def show(self):  # pragma: no cover
        """Show the image however PIL wants to."""
        self.as_pil_image().show()

    def _set_pdf_source(self, pdf: Pdf):
        """Keep a reference to *pdf* on this image."""
        self._pdf_source = pdf

    def __repr__(self):
        try:
            mode = self.mode
        except NotImplementedError:
            mode = '?'
        return (
            f'<pikepdf.PdfImage image mode={mode} '
            f'size={self.width}x{self.height} at {hex(id(self))}>'
        )

819

820

class PdfJpxImage(PdfImage):
    """Support class for JPEG 2000 images. Implements the same API as :class:`PdfImage`.

    If you call PdfImage(object_that_is_actually_jpeg2000_image), pikepdf will return
    this class instead, due to the check in PdfImage.__new__.
    """

    def __init__(self, obj):
        """Initialize a JPEG 2000 image."""
        super().__init__(obj)
        # Decoded once up front; used to derive the colorspace when the image
        # dictionary does not declare one.
        self._jpxpil = self.as_pil_image()

    def __eq__(self, other):
        if not isinstance(other, PdfImageBase):
            return NotImplemented
        return (
            self.obj == other.obj
            and isinstance(other, PdfJpxImage)
            and self._jpxpil == other._jpxpil
        )

    def _extract_direct(self, *, stream: BinaryIO) -> str | None:
        """Write the raw JPEG 2000 data to *stream* if that is all that remains.

        Returns ``'.jp2'`` on success, or ``None`` (writing nothing) when
        filters other than /JPXDecode remain after simple ones are removed.
        """
        data, filters = self._remove_simple_filters()
        if filters != ['/JPXDecode']:
            return None
        stream.write(data)
        return '.jp2'

    def _extract_transcoded(self) -> Image.Image:
        return super()._extract_transcoded()

    @property
    def _colorspaces(self):
        """Return the effective colorspace of a JPEG 2000 image.

        If the ColorSpace dictionary is present, the colorspace embedded in the
        JPEG 2000 data will be ignored, as required by the specification.

        Raises:
            NotImplementedError: if no ColorSpace is declared and the decoded
                image is neither grayscale nor RGB.
        """
        # (PDF 1.7 Table 89) If ColorSpace is present, any colour space
        # specifications in the JPEG2000 data shall be ignored.
        super_colorspaces = super()._colorspaces
        if super_colorspaces:
            return super_colorspaces
        if self._jpxpil.mode == 'L':
            return ['/DeviceGray']
        if self._jpxpil.mode == 'RGB':
            return ['/DeviceRGB']
        raise NotImplementedError('Complex JP2 colorspace')

    @property
    def _bpc(self) -> int:
        """Return 8, since bpc is not meaningful for JPEG 2000 encoding."""
        # (PDF 1.7 Table 89) If the image stream uses the JPXDecode filter, this
        # entry is optional and shall be ignored if present. The bit depth is
        # determined by the conforming reader in the process of decoding the
        # JPEG2000 image.
        return 8

    @property
    def indexed(self) -> bool:
        """Return False, since JPEG 2000 should not be indexed."""
        # Nothing in the spec precludes an Indexed JPXDecode image, except for
        # the fact that doing so is madness. Let's assume it no one is that
        # insane.
        return False

    def __repr__(self):
        # mode raises NotImplementedError for colorspaces we cannot map; a
        # repr must not fail because of that, so fall back to a placeholder.
        try:
            mode = self.mode
        except NotImplementedError:
            mode = '?'
        return (
            f'<pikepdf.PdfJpxImage JPEG2000 image mode={mode} '
            f'size={self.width}x{self.height} at {hex(id(self))}>'
        )

892

893

class PdfInlineImage(PdfImageBase):
    """Wrapper giving inline images the same interface as other PDF images."""

    # Abbreviated names permitted in inline images, mapped to their full
    # spelling. They are expanded on read and re-abbreviated on write.
    ABBREVS = {
        b'/W': b'/Width',
        b'/H': b'/Height',
        b'/BPC': b'/BitsPerComponent',
        b'/IM': b'/ImageMask',
        b'/CS': b'/ColorSpace',
        b'/F': b'/Filter',
        b'/DP': b'/DecodeParms',
        b'/G': b'/DeviceGray',
        b'/RGB': b'/DeviceRGB',
        b'/CMYK': b'/DeviceCMYK',
        b'/I': b'/Indexed',
        b'/AHx': b'/ASCIIHexDecode',
        b'/A85': b'/ASCII85Decode',
        b'/LZW': b'/LZWDecode',
        b'/RL': b'/RunLengthDecode',
        b'/CCF': b'/CCITTFaxDecode',
        b'/DCT': b'/DCTDecode',
    }
    REVERSE_ABBREVS = dict(zip(ABBREVS.values(), ABBREVS.keys()))

    _data: Object
    _image_object: tuple[Object, ...]

    def __init__(self, *, image_data: Object, image_object: tuple):
        """Wrap an inline image taken from a content stream.

        Args:
            image_data: data stream for image, extracted from content stream
            image_object: the metadata for image, also from content stream

        Raises:
            PdfError: if the expanded metadata cannot be parsed as a dictionary.
        """
        self._data = image_data
        self._image_object = image_object

        # Unparse every metadata token with abbreviations expanded, then parse
        # the result as a dictionary so that metadata can be looked up exactly
        # as it would be on an image XObject.
        expanded = [
            self._unparse_obj(token, remap_names=self.ABBREVS)
            for token in image_object
        ]
        reparse = b' '.join(expanded)
        try:
            self.obj = Object.parse(b'<< ' + reparse + b' >>')
        except PdfError as e:
            raise PdfError("parsing inline " + reparse.decode('unicode_escape')) from e

    def __eq__(self, other):
        if not isinstance(other, PdfImageBase):
            return NotImplemented
        same_metadata = self.obj == other.obj
        return (
            same_metadata
            and isinstance(other, PdfInlineImage)
            and (
                self._data._inline_image_raw_bytes()
                == other._data._inline_image_raw_bytes()
            )
        )

    @classmethod
    def _unparse_obj(cls, obj, remap_names):
        """Serialize one metadata token to bytes, remapping names if listed.

        Raises:
            NotImplementedError: for a token of an unsupported type.
        """
        if isinstance(obj, Object):
            raw = obj.unparse(resolved=True)
            if not isinstance(obj, Name):
                return raw
            assert isinstance(raw, bytes)
            return remap_names.get(raw, raw)
        if isinstance(obj, bool):
            # Lower case for PDF spec
            return b'true' if obj else b'false'
        if isinstance(obj, int | Decimal | float):
            return str(obj).encode('ascii')
        raise NotImplementedError(repr(obj))

    def _metadata(self, name, type_, default):
        return _metadata_from_obj(self.obj, name, type_, default)

    def unparse(self) -> bytes:
        """Create the content stream bytes that reproduce this inline image."""
        tokens = []
        for metadata_obj in self._image_object:
            token = self._unparse_obj(metadata_obj, remap_names=self.REVERSE_ABBREVS)
            assert isinstance(token, bytes)
            tokens.append(token)
        header = b' '.join(tokens)
        payload = self._data._inline_image_raw_bytes()
        return b''.join((b'BI\n', header, b'\nID\n', payload, b'EI'))

    @property
    def icc(self):  # pragma: no cover
        """Raise an exception since ICC profiles are not supported on inline images."""
        raise InvalidPdfImageError(
            "Inline images with ICC profiles are not supported in the PDF specification"
        )

    def __repr__(self):
        try:
            mode = self.mode
        except NotImplementedError:
            mode = '?'
        dimensions = f'{self.width}x{self.height}'
        return (
            f'<pikepdf.PdfInlineImage image mode={mode} '
            f'size={dimensions} at {hex(id(self))}>'
        )

    def _convert_to_pdfimage(self) -> PdfImage:
        """Build a :class:`PdfImage` equivalent to this inline image."""
        width, height = self.width, self.height

        # Build a throwaway PDF whose single page draws this inline image...
        tmppdf = Pdf.new()
        tmppdf.add_blank_page(page_size=(width, height))
        page = tmppdf.pages[0]
        page.contents_add(f'{width} 0 0 {height} 0 0 cm'.encode('ascii'), prepend=True)
        page.contents_add(self.unparse())

        # ...turn the inline image into a regular image XObject...
        page.externalize_inline_images()
        raw_img = cast(Stream, next(iter(page.images.values())))

        # ...and hand it to the regular PdfImage API.
        img = PdfImage(raw_img)
        img._set_pdf_source(tmppdf)  # Hold tmppdf open while PdfImage exists
        return img

    def as_pil_image(self) -> Image.Image:
        """Return inline image as a Pillow Image."""
        converted = self._convert_to_pdfimage()
        return converted.as_pil_image()

    def extract_to(self, *, stream: BinaryIO | None = None, fileprefix: str = ''):
        """Extract the inline image directly to a usable image file.

        See:
            :meth:`PdfImage.extract_to`
        """
        converted = self._convert_to_pdfimage()
        return converted.extract_to(stream=stream, fileprefix=fileprefix)

    def read_bytes(self):
        """Return decompressed image bytes."""
        # qpdf does not have an API to return this directly, so convert it.
        converted = self._convert_to_pdfimage()
        return converted.read_bytes()

    def get_stream_buffer(self):
        """Return decompressed stream buffer."""
        # qpdf does not have an API to return this directly, so convert it.
        converted = self._convert_to_pdfimage()
        return converted.get_stream_buffer()

1053

1054

# Public names of this module. Every decode-array alias and every exception
# class defined here is exported, including the NotExtractableError base
# class so callers can catch it without reaching for a non-exported name.
__all__ = [
    'CMYKDecodeArray',
    'DecodeArray',
    'GrayDecodeArray',
    'HifiPrintImageNotTranscodableError',
    'ImageDecompressionError',
    'InvalidPdfImageError',
    'NotExtractableError',
    'PaletteData',
    'PdfImage',
    'PdfImageBase',
    'PdfInlineImage',
    'PdfJpxImage',
    'RGBDecodeArray',
    'UnsupportedImageTypeError',
]

1068]