Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/image.py: 26%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

539 statements  

1# SPDX-FileCopyrightText: 2022 James R. Barlow 

2# SPDX-License-Identifier: MPL-2.0 

3 

4"""Extract images embedded in PDF.""" 

5 

6from __future__ import annotations 

7 

8from abc import ABC, abstractmethod 

9from collections.abc import Callable 

10from copy import copy 

11from decimal import Decimal 

12from io import BytesIO 

13from itertools import zip_longest 

14from pathlib import Path 

15from shutil import copyfileobj 

16from typing import Any, BinaryIO, NamedTuple, TypeVar, cast 

17 

18from PIL import Image 

19from PIL.ImageCms import ImageCmsProfile 

20 

21from pikepdf import jbig2 

22from pikepdf._core import Buffer, Pdf, PdfError, StreamDecodeLevel 

23from pikepdf._exceptions import DependencyError 

24from pikepdf.models import _transcoding 

25from pikepdf.models._transcoding import ImageDecompressionError 

26from pikepdf.objects import ( 

27 Array, 

28 Dictionary, 

29 Name, 

30 Object, 

31 Stream, 

32 String, 

33) 

34 

# Type variable for the value produced by the converter callables that are
# passed to the ``_metadata`` helpers below.
T = TypeVar('T')

# Possible shapes of a /Decode array: two floats per color component
# (three components for RGB, one for gray, four for CMYK).
RGBDecodeArray = tuple[float, float, float, float, float, float]
GrayDecodeArray = tuple[float, float]
CMYKDecodeArray = tuple[float, float, float, float, float, float, float, float]
DecodeArray = RGBDecodeArray | GrayDecodeArray | CMYKDecodeArray

41 

42 

class UnsupportedImageTypeError(Exception):
    """This image is formatted in a way pikepdf does not support."""

45 

46 

class NotExtractableError(Exception):
    """Indicates that an image cannot be directly extracted.

    Base class of :class:`HifiPrintImageNotTranscodableError`.
    """

49 

50 

class HifiPrintImageNotTranscodableError(NotExtractableError):
    """Image contains high fidelity printing information and cannot be extracted.

    Raised when transcoding is attempted on an image whose mode is ``DeviceN``
    or ``Separation``.
    """

53 

54 

class InvalidPdfImageError(Exception):
    """This image is not valid according to the PDF 1.7 specification.

    Raised, for example, when /BitsPerComponent has a value the transcoder
    does not accept.
    """

57 

58 

def _array_str(value: Object | str | list):
    """Reduce a pikepdf object (or nested array of them) to a plain list.

    Names and strings become ``str``, pikepdf ``String`` becomes ``bytes``,
    arrays become Python lists (recursively), while streams, dictionaries,
    ``bytes`` and integers are passed through untouched. A scalar input is
    wrapped in a one-element list so the caller always receives a list.

    Raises:
        NotImplementedError: if an item of an unexpected type is encountered.
    """

    def _simplify(node):
        # Containers are walked recursively and rebuilt as plain lists.
        if isinstance(node, (list, Array)):
            return list(map(_simplify, node))
        # These are kept exactly as they are.
        if isinstance(node, (Stream, Dictionary, bytes, int)):
            return node
        if isinstance(node, (Name, str)):
            return str(node)
        if isinstance(node, String):
            return bytes(node)
        raise NotImplementedError(value)

    simplified = _simplify(value)
    return simplified if isinstance(simplified, list) else [simplified]

77 

78 

79def _ensure_list(value: list[Object] | Dictionary | Array | Object) -> list[Object]: 

80 """Ensure value is a list of pikepdf.Object, if it was not already. 

81 

82 To support DecodeParms which can be present as either an array of dicts or a single 

83 dict. It's easier to convert to an array of one dict. 

84 """ 

85 if isinstance(value, list): 

86 return value 

87 return list(value.wrap_in_array().as_list()) 

88 

89 

90def _metadata_from_obj( 

91 obj: Dictionary | Stream, name: str, type_: Callable[[Any], T], default: T 

92) -> T | None: 

93 """Retrieve metadata from a dictionary or stream and wrangle types.""" 

94 val = getattr(obj, name, default) 

95 try: 

96 return type_(val) 

97 except TypeError: 

98 if val is None: 

99 return None 

100 raise NotImplementedError('Metadata access for ' + name) 

101 

102 

class PaletteData(NamedTuple):
    """The color space and binary representation of a palette.

    ``base_colorspace`` is typically ``"RGB"`` or ``"L"`` (for grayscale).

    ``palette`` is typically 256 or 256*3=768 bytes, for grayscale and RGB color
    respectively, with each unit/triplet being the grayscale/RGB triplet values.
    """

    # Mode-style name of the palette's base colorspace (e.g. "RGB", "L").
    base_colorspace: str
    # Raw bytes of the palette lookup table.
    palette: bytes

114 

115 

class PdfImageBase(ABC):
    """Abstract base class for images.

    Subclasses provide :meth:`_metadata`, :attr:`icc` and :meth:`as_pil_image`;
    the remaining properties are derived from the entries that ``_metadata``
    exposes.
    """

    SIMPLE_COLORSPACES = {'/DeviceRGB', '/DeviceGray', '/CalRGB', '/CalGray'}
    MAIN_COLORSPACES = SIMPLE_COLORSPACES | {'/DeviceCMYK', '/CalCMYK', '/ICCBased'}
    PRINT_COLORSPACES = {'/Separation', '/DeviceN'}

    @abstractmethod
    def _metadata(self, name: str, type_: Callable[[Any], T], default: T) -> T:
        """Get metadata for this image type."""

    @property
    def width(self) -> int:
        """Width of the image data in pixels."""
        return self._metadata('Width', int, 0)

    @property
    def height(self) -> int:
        """Height of the image data in pixels."""
        return self._metadata('Height', int, 0)

    @property
    def image_mask(self) -> bool:
        """Return ``True`` if this is an image mask."""
        return self._metadata('ImageMask', bool, False)

    @property
    def _bpc(self) -> int | None:
        """Bits per component for this image (low-level)."""
        return self._metadata('BitsPerComponent', int, 0)

    @property
    def _colorspaces(self):
        """Colorspace (low-level)."""
        return self._metadata('ColorSpace', _array_str, [])

    @property
    def filters(self):
        """List of names of the filters that we applied to encode this image."""
        return self._metadata('Filter', _array_str, [])

    @property
    def _decode_array(self) -> DecodeArray:
        """Extract the /Decode array, or the default one for the colorspace.

        An explicit /Decode entry of length 2, 6 or 8 is returned as a tuple
        of floats. Otherwise a default is chosen from the colorspace (for
        /ICCBased, from the profile's approximate mode) or, for image masks,
        the two-element gray default.

        Raises:
            NotImplementedError: if no default can be determined.
        """
        decode: list = self._metadata('Decode', _ensure_list, [])
        if decode and len(decode) in (2, 6, 8):
            return cast(DecodeArray, tuple(float(value) for value in decode))

        gray = (0.0, 1.0)
        rgb = (0.0, 1.0, 0.0, 1.0, 0.0, 1.0)
        cmyk = (0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0)

        colorspace = self.colorspace
        if colorspace in ('/DeviceGray', '/CalGray'):
            return gray
        if colorspace in ('/DeviceRGB', '/CalRGB'):
            return rgb
        if colorspace == '/DeviceCMYK':
            return cmyk
        if colorspace == '/ICCBased':
            # Resolve the profile mode once; a CMYK profile gets the same
            # default as /DeviceCMYK instead of falling through to the error.
            icc_defaults = {'L': gray, 'RGB': rgb, 'CMYK': cmyk}
            icc_mode = self._approx_mode_from_icc()
            if icc_mode in icc_defaults:
                return icc_defaults[icc_mode]
        if self.image_mask:
            return gray  # Default for image masks; per RM 8.9.6.2

        raise NotImplementedError(
            "Don't know how to retrieve default /Decode array for image "
            + repr(self)
        )

    @property
    def decode_parms(self):
        """List of the /DecodeParms, arguments to filters."""
        return self._metadata('DecodeParms', _ensure_list, [])

    @property
    def colorspace(self) -> str | None:
        """PDF name of the colorspace that best describes this image.

        Returns ``None`` for image masks. For /Indexed images the name of the
        base colorspace is returned.

        Raises:
            NotImplementedError: if the colorspace layout is not recognized.
        """
        if self.image_mask:
            return None  # Undefined for image masks
        if self._colorspaces:
            if self._colorspaces[0] in self.MAIN_COLORSPACES:
                return self._colorspaces[0]
            if self._colorspaces[0] == '/Indexed':
                subspace = self._colorspaces[1]
                if isinstance(subspace, str) and subspace in self.MAIN_COLORSPACES:
                    return subspace
                if isinstance(subspace, list) and subspace[0] in (
                    '/ICCBased',
                    '/DeviceN',
                    '/CalGray',
                    '/CalRGB',
                ):
                    return subspace[0]
            if self._colorspaces[0] == '/DeviceN':
                return '/DeviceN'

        raise NotImplementedError(
            "not sure how to get colorspace: " + repr(self._colorspaces)
        )

    @property
    def bits_per_component(self) -> int:
        """Bits per component of this image.

        When the value is absent or zero, image masks default to 1 and all
        other images to 8.
        """
        if self._bpc is None or self._bpc == 0:
            return 1 if self.image_mask else 8
        return self._bpc

    @property
    @abstractmethod
    def icc(self) -> ImageCmsProfile | None:
        """Return ICC profile for this image if one is defined."""

    @property
    def indexed(self) -> bool:
        """Check if the image has a defined color palette."""
        return '/Indexed' in self._colorspaces

    def _colorspace_has_name(self, name):
        """Return True if *name* is the colorspace or the base of an /Indexed one."""
        try:
            cs = self._colorspaces
            if cs[0] == '/Indexed' and cs[1][0] == name:
                return True
            if cs[0] == name:
                return True
        except (IndexError, AttributeError, KeyError):
            # Missing or oddly shaped colorspace entries simply do not match.
            pass
        return False

    @property
    def is_device_n(self) -> bool:
        """Check if image has a /DeviceN (complex printing) colorspace."""
        return self._colorspace_has_name('/DeviceN')

    @property
    def is_separation(self) -> bool:
        """Check if image has a /Separation (complex printing) colorspace."""
        return self._colorspace_has_name('/Separation')

    @property
    def size(self) -> tuple[int, int]:
        """Size of image as (width, height)."""
        return self.width, self.height

    def _approx_mode_from_icc(self):
        """Guess a Pillow mode from the ICC profile attached to the colorspace.

        Returns ``'L'`` for one-channel profiles; otherwise maps the profile's
        ``xcolor_space`` to ``'RGB'`` or ``'CMYK'``, or ``''`` if unknown.
        """
        if self.indexed:
            icc_profile = self._colorspaces[1][1]
        else:
            icc_profile = self._colorspaces[1]
        icc_profile_nchannels = int(icc_profile['/N'])

        if icc_profile_nchannels == 1:
            return 'L'

        # Multiple channels, need to open the profile and look
        mode_from_xcolor_space = {'RGB ': 'RGB', 'CMYK': 'CMYK'}
        xcolor_space = self.icc.profile.xcolor_space
        return mode_from_xcolor_space.get(xcolor_space, '')

    @property
    def mode(self) -> str:
        """``PIL.Image.mode`` equivalent for this image, where possible.

        If an ICC profile is attached to the image, we still attempt to resolve a Pillow
        mode.

        Raises:
            NotImplementedError: if no mode can be determined.
        """
        m = ''
        if self.is_device_n:
            m = 'DeviceN'
        elif self.is_separation:
            m = 'Separation'
        elif self.indexed:
            m = 'P'
        elif self.colorspace == '/DeviceGray' and self.bits_per_component == 1:
            m = '1'
        elif self.colorspace == '/DeviceGray' and self.bits_per_component > 1:
            m = 'L'
        elif self.colorspace == '/DeviceRGB':
            m = 'RGB'
        elif self.colorspace == '/DeviceCMYK':
            m = 'CMYK'
        elif self.colorspace == '/ICCBased':
            try:
                m = self._approx_mode_from_icc()
            except (ValueError, TypeError) as e:
                raise NotImplementedError(
                    "Not sure how to handle PDF image of this type"
                ) from e
        if m == '':
            raise NotImplementedError(
                "Not sure how to handle PDF image of this type"
            ) from None
        return m

    @property
    def filter_decodeparms(self):
        """Return the normalized /Filter and /DecodeParms data.

        PDF has a lot of possible data structures concerning /Filter and
        /DecodeParms. /Filter can be absent or a name or an array, /DecodeParms
        can be absent or a dictionary (if /Filter is a name) or an array (if
        /Filter is an array). When both are arrays the lengths match.

        Normalize this into:
        [(/FilterName, {/DecodeParmName: Value, ...}), ...]

        The order of /Filter matters as it indicates the encoding/decoding
        sequence.
        """
        return list(zip_longest(self.filters, self.decode_parms, fillvalue={}))

    @property
    def palette(self) -> PaletteData | None:
        """Retrieve the color palette for this image if applicable.

        Returns ``None`` when the image is not indexed.

        Raises:
            ValueError: if the /Indexed colorspace does not have four entries.
            NotImplementedError: if the base colorspace is not recognized.
        """
        if not self.indexed:
            return None
        try:
            _idx, base, _hival, lookup = self._colorspaces
        except ValueError as e:
            raise ValueError('Not sure how to interpret this palette') from e
        if self.icc or self.is_device_n or self.is_separation or isinstance(base, list):
            # Array-style base colorspaces carry their name in the first slot.
            base = str(base[0])
        else:
            base = str(base)
        lookup = bytes(lookup)
        if base not in self.MAIN_COLORSPACES and base not in self.PRINT_COLORSPACES:
            raise NotImplementedError(f"not sure how to interpret this palette: {base}")
        if base in ('/DeviceRGB', '/CalRGB'):
            base = 'RGB'
        elif base in ('/DeviceGray', '/CalGray'):
            base = 'L'
        elif base == '/DeviceCMYK':
            base = 'CMYK'
        elif base == '/DeviceN':
            base = 'DeviceN'
        elif base == '/Separation':
            base = 'Separation'
        elif base == '/ICCBased':
            base = self._approx_mode_from_icc()
        else:
            raise NotImplementedError(f"not sure how to interpret this palette: {base}")
        return PaletteData(base, lookup)

    @abstractmethod
    def as_pil_image(self) -> Image.Image:
        """Convert this PDF image to a Python PIL (Pillow) image."""

    def _repr_png_(self) -> bytes:
        """Display hook for IPython/Jupyter."""
        b = BytesIO()
        with self.as_pil_image() as im:
            im.save(b, 'PNG')
        return b.getvalue()

364 

365 

class PdfImage(PdfImageBase):
    """Support class to provide a consistent API for manipulating PDF images.

    The data structure for images inside PDFs is irregular and complex,
    making it difficult to use without introducing errors for less
    typical cases. This class addresses these difficulties by providing a
    regular, Pythonic API similar in spirit (and convertible to) the Python
    Pillow imaging library.
    """

    obj: Stream
    _icc: ImageCmsProfile | None
    _pdf_source: Pdf | None

    def __new__(cls, obj: Stream):
        """Construct a PdfImage... or a PdfJpxImage if that is what we really are."""
        try:
            # Check if JPXDecode is called for and initialize as PdfJpxImage
            filters = _ensure_list(obj.Filter)
            if Name.JPXDecode in filters:
                return super().__new__(PdfJpxImage)
        except (AttributeError, KeyError):
            # __init__ will deal with any other errors
            pass
        return super().__new__(PdfImage)

    def __init__(self, obj: Stream):
        """Construct a PDF image from a Image XObject inside a PDF.

        ``pim = PdfImage(page.Resources.XObject['/ImageNN'])``

        Args:
            obj: an Image XObject

        Raises:
            TypeError: if *obj* is a stream whose /Subtype is not /Image.
        """
        if isinstance(obj, Stream) and obj.stream_dict.get("/Subtype") != "/Image":
            raise TypeError("can't construct PdfImage from non-image")
        self.obj = obj
        self._icc = None
        # Initialise so the attribute exists before _set_pdf_source is called.
        self._pdf_source = None

    def __eq__(self, other):
        if not isinstance(other, PdfImageBase):
            return NotImplemented
        return self.obj == other.obj

    @classmethod
    def _from_pil_image(cls, *, pdf, page, name, image):  # pragma: no cover
        """Insert a PIL image into a PDF (rudimentary).

        Args:
            pdf (pikepdf.Pdf): the PDF to attach the image to
            page (pikepdf.Object): the page to attach the image to
            name (str or pikepdf.Name): the name to set the image
            image (PIL.Image.Image): the image to insert
        """
        data = image.tobytes()

        imstream = Stream(pdf, data)
        imstream.Type = Name('/XObject')
        imstream.Subtype = Name('/Image')
        if image.mode == 'RGB':
            imstream.ColorSpace = Name('/DeviceRGB')
        elif image.mode in ('1', 'L'):
            imstream.ColorSpace = Name('/DeviceGray')
        imstream.BitsPerComponent = 1 if image.mode == '1' else 8
        imstream.Width = image.width
        imstream.Height = image.height

        page.Resources.XObject[name] = imstream

        return cls(imstream)

    def _metadata(self, name, type_, default):
        """Read image metadata from the underlying stream object."""
        return _metadata_from_obj(self.obj, name, type_, default)

    @property
    def _iccstream(self):
        """Return the stream that holds the ICC profile of an /ICCBased image."""
        if self.colorspace == '/ICCBased':
            if not self.indexed:
                return self._colorspaces[1]
            assert isinstance(self._colorspaces[1], list)
            return self._colorspaces[1][1]
        raise NotImplementedError("Don't know how to find ICC stream for image")

    @property
    def icc(self) -> ImageCmsProfile | None:
        """If an ICC profile is attached, return a Pillow object that describe it.

        Most of the information may be found in ``icc.profile``.

        Raises:
            UnsupportedImageTypeError: if Pillow cannot open the profile.
        """
        if self.colorspace not in ('/ICCBased', '/Indexed'):
            return None
        if not self._icc:
            iccstream = self._iccstream
            iccbuffer = iccstream.get_stream_buffer()
            iccbytesio = BytesIO(iccbuffer)
            try:
                self._icc = ImageCmsProfile(iccbytesio)
            except OSError as e:
                # Any failure to open the profile means it is unusable. Report
                # it instead of silently returning None, which callers would
                # then dereference.
                raise UnsupportedImageTypeError(
                    "ICC profile corrupt or not readable"
                ) from e
        return self._icc

    def _remove_simple_filters(self):
        """Remove simple lossless compression where it appears.

        Returns:
            A tuple ``(data, filters)`` where *data* is the stream content with
            every filter preceding the first complex filter decoded, and
            *filters* is the list of filter names that still apply to *data*.

        Raises:
            NotImplementedError: if more than one complex filter is present.
        """
        COMPLEX_FILTERS = {
            '/DCTDecode',
            '/JPXDecode',
            '/JBIG2Decode',
            '/CCITTFaxDecode',
        }
        indices = [n for n, filt in enumerate(self.filters) if filt in COMPLEX_FILTERS]
        if len(indices) > 1:
            raise NotImplementedError(
                f"Object {self.obj.objgen} has compound complex filters: "
                f"{self.filters}. We cannot decompress this."
            )
        if len(indices) == 0:
            # No complex filter indices, so all filters are simple - remove them all
            return self.obj.read_bytes(StreamDecodeLevel.specialized), []

        n = indices[0]
        if n == 0:
            # The first filter is complex, so nothing can be removed
            return self.obj.read_raw_bytes(), self.filters

        # Decode only the simple filters in front of the complex one, using a
        # copy so the original object keeps its full filter chain.
        obj_copy = copy(self.obj)
        obj_copy.Filter = Array([Name(f) for f in self.filters[:n]])
        obj_copy.DecodeParms = Array(self.decode_parms[:n])
        return obj_copy.read_bytes(StreamDecodeLevel.specialized), self.filters[n:]

    def _extract_direct(self, *, stream: BinaryIO) -> str | None:
        """Attempt to extract the image directly to a usable image file.

        If there is no way to extract the image without decompressing or
        transcoding, return ``None``. The type and format of image
        generated will vary.

        Args:
            stream: Writable file stream to write data to, e.g. an open file

        Returns:
            The file extension of the written data (``'.tif'`` or ``'.jpg'``),
            or ``None`` if nothing was written.
        """

        def dct_color_transform(default: int) -> int:
            # These checks only run when the remaining filter chain is exactly
            # ['/DCTDecode'], so the DCT filter is the LAST entry of the
            # original chain. Read its decode parms, not those of a simple
            # filter that may precede it.
            parms = self.filter_decodeparms[-1][1]
            if parms is None:
                return default
            return parms.get('/ColorTransform', default)

        def normal_dct_rgb() -> bool:
            # Normal DCTDecode RGB images have the default value of
            # /ColorTransform 1 and are actually in YUV. Such a file can be
            # saved as a standard JPEG. RGB JPEGs without YUV conversion can't
            # be saved as JPEGs, and are probably bugs. Some software in the
            # wild actually produces RGB JPEGs in PDFs (probably a bug).
            DEFAULT_CT_RGB = 1
            ct = dct_color_transform(DEFAULT_CT_RGB)
            return self.mode == 'RGB' and ct == DEFAULT_CT_RGB

        def normal_dct_cmyk() -> bool:
            # Normal DCTDecode CMYKs have /ColorTransform 0 and can be saved.
            # There is a YUVK colorspace but CMYK JPEGs don't generally use it
            DEFAULT_CT_CMYK = 0
            ct = dct_color_transform(DEFAULT_CT_CMYK)
            return self.mode == 'CMYK' and ct == DEFAULT_CT_CMYK

        data, filters = self._remove_simple_filters()

        if filters == ['/CCITTFaxDecode']:
            if self.colorspace == '/ICCBased':
                icc = self._iccstream.read_bytes()
            else:
                icc = None
            stream.write(self._generate_ccitt_header(data, icc=icc))
            stream.write(data)
            return '.tif'
        if filters == ['/DCTDecode'] and (
            self.mode == 'L' or normal_dct_rgb() or normal_dct_cmyk()
        ):
            stream.write(data)
            return '.jpg'

        return None

    def _extract_transcoded_1248bits(self) -> Image.Image:
        """Extract an image when there are 1/2/4/8 bits packed in byte data."""
        stride = 0  # tell Pillow to calculate stride from line width
        scale = 0 if self.mode == 'L' else 1
        if self.bits_per_component in (2, 4):
            buffer, stride = _transcoding.unpack_subbyte_pixels(
                self.read_bytes(), self.size, self.bits_per_component, scale
            )
        elif self.bits_per_component == 8:
            buffer = cast(memoryview, self.get_stream_buffer())
        else:
            raise InvalidPdfImageError("BitsPerComponent must be 1, 2, 4, 8, or 16")

        if self.mode == 'P' and self.palette is not None:
            base_mode, palette = self.palette
            im = _transcoding.image_from_buffer_and_palette(
                buffer,
                self.size,
                stride,
                base_mode,
                palette,
            )
        else:
            im = _transcoding.image_from_byte_buffer(buffer, self.size, stride)
        return im

    def _extract_transcoded_1bit(self) -> Image.Image:
        """Extract a 1-bit image, applying its palette if it has one.

        Raises:
            UnsupportedImageTypeError: for 1-bit RGB or CMYK images.
            DependencyError: if the stream cannot be decoded and no JBIG2
                decoder is available.
        """
        if not self.image_mask and self.mode in ('RGB', 'CMYK'):
            raise UnsupportedImageTypeError("1-bit RGB and CMYK are not supported")
        try:
            data = self.read_bytes()
        except (RuntimeError, PdfError) as e:
            if (
                'read_bytes called on unfilterable stream' in str(e)
                and not jbig2.get_decoder().available()
            ):
                raise DependencyError(
                    "jbig2dec - not installed or installed version is too old "
                    "(older than version 0.15)"
                ) from None
            raise

        im = Image.frombytes('1', self.size, data)

        if self.palette is not None:
            base_mode, palette = self.palette
            im = _transcoding.fix_1bit_palette_image(im, base_mode, palette)

        return im

    def _extract_transcoded_mask(self) -> Image.Image:
        """Extract an image mask; handled the same way as a 1-bit image."""
        return self._extract_transcoded_1bit()

    def _extract_transcoded(self) -> Image.Image:
        """Decode the image data and build a Pillow image from it.

        Raises:
            HifiPrintImageNotTranscodableError: for DeviceN/Separation images.
            UnsupportedImageTypeError: if mode and bit depth are not handled.
        """
        if self.image_mask:
            return self._extract_transcoded_mask()

        if self.mode in {'DeviceN', 'Separation'}:
            raise HifiPrintImageNotTranscodableError()

        if self.mode == 'RGB' and self.bits_per_component == 8:
            # Cannot use the zero-copy .get_stream_buffer here, we have 3-byte
            # RGB and Pillow needs RGBX.
            im = Image.frombuffer(
                'RGB', self.size, self.read_bytes(), 'raw', 'RGB', 0, 1
            )
        elif self.mode == 'CMYK' and self.bits_per_component == 8:
            im = Image.frombuffer(
                'CMYK', self.size, self.get_stream_buffer(), 'raw', 'CMYK', 0, 1
            )
        # elif self.mode == '1':
        elif self.bits_per_component == 1:
            im = self._extract_transcoded_1bit()
        elif self.mode in ('L', 'P') and self.bits_per_component <= 8:
            im = self._extract_transcoded_1248bits()
        else:
            raise UnsupportedImageTypeError(repr(self) + ", " + repr(self.obj))

        if self.colorspace == '/ICCBased' and self.icc is not None:
            im.info['icc_profile'] = self.icc.tobytes()

        return im

    def _extract_to_stream(self, *, stream: BinaryIO) -> str:
        """Extract the image to a stream.

        If possible, the compressed data is extracted and inserted into
        a compressed image file format without transcoding the compressed
        content. If this is not possible, the data will be decompressed
        and extracted to an appropriate format.

        Args:
            stream: Writable stream to write data to

        Returns:
            The file format extension.
        """
        direct_extraction = self._extract_direct(stream=stream)
        if direct_extraction:
            return direct_extraction

        im = None
        try:
            im = self._extract_transcoded()
            if im.mode == 'CMYK':
                im.save(stream, format='tiff', compression='tiff_adobe_deflate')
                return '.tiff'
            if im:
                im.save(stream, format='png')
                return '.png'
        except PdfError as e:
            if 'called on unfilterable stream' in str(e):
                raise UnsupportedImageTypeError(repr(self)) from e
            raise
        finally:
            # Always release the transcoded image, whether or not saving worked.
            if im:
                im.close()

        raise UnsupportedImageTypeError(repr(self))

    def extract_to(
        self, *, stream: BinaryIO | None = None, fileprefix: str = ''
    ) -> str:
        """Extract the image directly to a usable image file.

        If possible, the compressed data is extracted and inserted into
        a compressed image file format without transcoding the compressed
        content. If this is not possible, the data will be decompressed
        and extracted to an appropriate format.

        Because it is not known until attempted what image format will be
        extracted, users should not assume what format they are getting back.
        When saving the image to a file, use a temporary filename, and then
        rename the file to its final name based on the returned file extension.

        Images might be saved as any of .png, .jpg, .tif, or .tiff
        (subclasses may produce other formats).

        Examples:
            >>> im.extract_to(stream=bytes_io)  # doctest: +SKIP
            '.png'

            >>> im.extract_to(fileprefix='/tmp/image00')  # doctest: +SKIP
            '/tmp/image00.jpg'

        Args:
            stream: Writable stream to write data to.
            fileprefix (str or Path): The path to write the extracted image to,
                without the file extension.

        Returns:
            If *fileprefix* was provided, then the fileprefix with the
            appropriate extension. If no *fileprefix*, then an extension
            indicating the file type.

        Raises:
            ValueError: if both or neither of *stream* and *fileprefix* are
                given.
        """
        if stream and fileprefix:
            raise ValueError("Cannot set both stream and fileprefix")
        if not stream and not fileprefix:
            raise ValueError("Must set either stream or fileprefix")
        if stream:
            return self._extract_to_stream(stream=stream)

        # Extract to memory first: the extension is only known afterwards.
        bio = BytesIO()
        extension = self._extract_to_stream(stream=bio)
        bio.seek(0)
        filepath = Path(str(Path(fileprefix)) + extension)
        with filepath.open('wb') as target:
            copyfileobj(bio, target)
        return str(filepath)

    def read_bytes(
        self, decode_level: StreamDecodeLevel = StreamDecodeLevel.specialized
    ) -> bytes:
        """Decompress this image and return it as unencoded bytes."""
        return self.obj.read_bytes(decode_level=decode_level)

    def get_stream_buffer(
        self, decode_level: StreamDecodeLevel = StreamDecodeLevel.specialized
    ) -> Buffer:
        """Access this image with the buffer protocol."""
        return self.obj.get_stream_buffer(decode_level=decode_level)

    def as_pil_image(self) -> Image.Image:
        """Extract the image as a Pillow Image, using decompression as necessary.

        Caller must close the image.
        """
        bio = BytesIO()
        direct_extraction = self._extract_direct(stream=bio)
        if direct_extraction:
            bio.seek(0)
            return Image.open(bio)

        im = self._extract_transcoded()
        if not im:
            raise UnsupportedImageTypeError(repr(self))

        return im

    def _generate_ccitt_header(self, data: bytes, icc: bytes | None = None) -> bytes:
        """Construct a CCITT G3 or G4 header from the PDF metadata.

        Raises:
            ValueError: if the image has no /DecodeParms.
            UnsupportedImageTypeError: if /EncodedByteAlign is set.
        """
        # https://stackoverflow.com/questions/2641770/
        # https://www.itu.int/itudoc/itu-t/com16/tiff-fx/docs/tiff6.pdf

        if not self.decode_parms:
            raise ValueError("/CCITTFaxDecode without /DecodeParms")

        expected_defaults = [
            ("/EncodedByteAlign", False),
        ]
        for name, val in expected_defaults:
            if self.decode_parms[0].get(name, val) != val:
                raise UnsupportedImageTypeError(
                    f"/CCITTFaxDecode with decode parameter {name} not equal {val}"
                )

        k = self.decode_parms[0].get("/K", 0)
        t4_options = None
        if k < 0:
            ccitt_group = 4  # Group 4
        elif k > 0:
            ccitt_group = 3  # Group 3 2-D
            t4_options = 1
        else:
            ccitt_group = 3  # Group 3 1-D
        black_is_one = self.decode_parms[0].get("/BlackIs1", False)
        decode = self._decode_array
        # PDF spec says:
        # BlackIs1: A flag indicating whether 1 bits shall be interpreted as black
        # pixels and 0 bits as white pixels, the reverse of the normal
        # PDF convention for image data. Default value: false.
        # TIFF spec says:
        # use 0 for white_is_zero (=> black is 1) MINISWHITE
        # use 1 for black_is_zero (=> white is 1) MINISBLACK
        photometry = 1 if black_is_one else 0

        # If Decode is [1, 0] then the photometry is inverted
        if len(decode) == 2 and decode == (1.0, 0.0):
            photometry = 1 - photometry

        img_size = len(data)
        if icc is None:
            icc = b''

        return _transcoding.generate_ccitt_header(
            self.size,
            data_length=img_size,
            ccitt_group=ccitt_group,
            t4_options=t4_options,
            photometry=photometry,
            icc=icc,
        )

    def show(self):  # pragma: no cover
        """Show the image however PIL wants to."""
        self.as_pil_image().show()

    def _set_pdf_source(self, pdf: Pdf):
        """Remember the Pdf this image came from."""
        self._pdf_source = pdf

    def __repr__(self):
        try:
            mode = self.mode
        except NotImplementedError:
            # Keep repr usable for images whose mode cannot be determined.
            mode = '?'
        return (
            f'<pikepdf.PdfImage image mode={mode} '
            f'size={self.width}x{self.height} at {hex(id(self))}>'
        )

819 

820 

class PdfJpxImage(PdfImage):
    """Support class for JPEG 2000 images. Implements the same API as :class:`PdfImage`.

    If you call PdfImage(object_that_is_actually_jpeg2000_image), pikepdf will return
    this class instead, due to the check in PdfImage.__new__.
    """

    def __init__(self, obj):
        """Initialize a JPEG 2000 image."""
        super().__init__(obj)
        # Decoded once up front; the colorspace fallback and equality use it.
        self._jpxpil = self.as_pil_image()

    def __eq__(self, other):
        if not isinstance(other, PdfImageBase):
            return NotImplemented
        return (
            self.obj == other.obj
            and isinstance(other, PdfJpxImage)
            and self._jpxpil == other._jpxpil
        )

    def _extract_direct(self, *, stream: BinaryIO) -> str | None:
        """Write the raw JPEG 2000 data to *stream* if no other complex filter remains.

        Returns:
            ``'.jp2'`` if data was written, otherwise ``None``.
        """
        data, filters = self._remove_simple_filters()
        if filters != ['/JPXDecode']:
            return None
        stream.write(data)
        return '.jp2'

    def _extract_transcoded(self) -> Image.Image:
        """Transcode exactly as the parent class does."""
        return super()._extract_transcoded()

    @property
    def _colorspaces(self):
        """Return the effective colorspace of a JPEG 2000 image.

        If the ColorSpace dictionary is present, the colorspace embedded in the
        JPEG 2000 data will be ignored, as required by the specification.

        Raises:
            NotImplementedError: if there is no ColorSpace entry and the decoded
                image is neither grayscale nor RGB.
        """
        # (PDF 1.7 Table 89) If ColorSpace is present, any colour space
        # specifications in the JPEG2000 data shall be ignored.
        super_colorspaces = super()._colorspaces
        if super_colorspaces:
            return super_colorspaces
        if self._jpxpil.mode == 'L':
            return ['/DeviceGray']
        if self._jpxpil.mode == 'RGB':
            return ['/DeviceRGB']
        raise NotImplementedError('Complex JP2 colorspace')

    @property
    def _bpc(self) -> int:
        """Return 8, since bpc is not meaningful for JPEG 2000 encoding."""
        # (PDF 1.7 Table 89) If the image stream uses the JPXDecode filter, this
        # entry is optional and shall be ignored if present. The bit depth is
        # determined by the conforming reader in the process of decoding the
        # JPEG2000 image.
        return 8

    @property
    def indexed(self) -> bool:
        """Return False, since JPEG 2000 should not be indexed."""
        # Nothing in the spec precludes an Indexed JPXDecode image, except for
        # the fact that doing so is madness. Let's assume that no one is that
        # insane.
        return False

    def __repr__(self):
        # Mirror PdfImage.__repr__: repr() is used while building error
        # messages, so it must not raise when the mode cannot be determined.
        try:
            mode = self.mode
        except NotImplementedError:
            mode = '?'
        return (
            f'<pikepdf.PdfJpxImage JPEG2000 image mode={mode} '
            f'size={self.width}x{self.height} at {hex(id(self))}>'
        )

892 

893 

class PdfInlineImage(PdfImageBase):
    """Support class for PDF inline images.

    The image metadata arrives as a sequence of tokens taken from a content
    stream. On construction those tokens are serialized, inline-image
    abbreviations are expanded, and the result is re-parsed into ``self.obj``,
    an object equivalent to what an image XObject dictionary would look like.
    Methods that need image data convert the inline image to a temporary
    :class:`PdfImage` and delegate to it.
    """

    # Inline images can contain abbreviations that we write automatically
    ABBREVS = {
        b'/W': b'/Width',
        b'/H': b'/Height',
        b'/BPC': b'/BitsPerComponent',
        b'/IM': b'/ImageMask',
        b'/CS': b'/ColorSpace',
        b'/F': b'/Filter',
        b'/DP': b'/DecodeParms',
        b'/G': b'/DeviceGray',
        b'/RGB': b'/DeviceRGB',
        b'/CMYK': b'/DeviceCMYK',
        b'/I': b'/Indexed',
        b'/AHx': b'/ASCIIHexDecode',
        b'/A85': b'/ASCII85Decode',
        b'/LZW': b'/LZWDecode',
        b'/RL': b'/RunLengthDecode',
        b'/CCF': b'/CCITTFaxDecode',
        b'/DCT': b'/DCTDecode',
    }
    # Full name -> abbreviation, used when writing the image back out.
    REVERSE_ABBREVS = {v: k for k, v in ABBREVS.items()}

    _data: Object
    _image_object: tuple[Object, ...]

    def __init__(self, *, image_data: Object, image_object: tuple):
        """Construct wrapper for inline image.

        Args:
            image_data: data stream for image, extracted from content stream
            image_object: the metadata for image, also from content stream

        Raises:
            PdfError: if the expanded metadata cannot be parsed as a
                dictionary. The original parse error is chained as the cause.
        """
        self._data = image_data
        self._image_object = image_object

        # Convert the sequence of pikepdf.Object from the content stream into
        # a dictionary object by unparsing it (to bytes), eliminating inline
        # image abbreviations, and constructing a bytes string equivalent to
        # what an image XObject would look like. Then retrieve data from there.
        reparse = b' '.join(
            self._unparse_obj(obj, remap_names=self.ABBREVS) for obj in image_object
        )
        try:
            reparsed_obj = Object.parse(b'<< ' + reparse + b' >>')
        except PdfError as e:
            raise PdfError("parsing inline " + reparse.decode('unicode_escape')) from e
        self.obj = reparsed_obj

    def __eq__(self, other):
        """Compare two inline images by metadata and raw image bytes.

        Returns ``NotImplemented`` for objects that are not ``PdfImageBase``,
        and ``False`` for any other kind of ``PdfImageBase``.
        """
        if not isinstance(other, PdfImageBase):
            return NotImplemented
        if not isinstance(other, PdfInlineImage):
            return False
        return self.obj == other.obj and (
            self._data._inline_image_raw_bytes()
            == other._data._inline_image_raw_bytes()
        )

    @classmethod
    def _unparse_obj(cls, obj, remap_names):
        """Serialize one metadata token to PDF syntax bytes.

        Args:
            obj: a pikepdf ``Object`` or a Python ``bool``, ``int``,
                ``Decimal`` or ``float``.
            remap_names: mapping applied to serialized ``Name`` tokens, for
                example ``ABBREVS`` or ``REVERSE_ABBREVS``. Names that are
                not in the mapping pass through unchanged.

        Returns:
            The token as bytes.

        Raises:
            NotImplementedError: if ``obj`` is of an unsupported type.
        """
        if isinstance(obj, Object):
            if isinstance(obj, Name):
                name = obj.unparse(resolved=True)
                assert isinstance(name, bytes)
                return remap_names.get(name, name)
            return obj.unparse(resolved=True)
        # bool must be tested before int, since bool is a subclass of int.
        if isinstance(obj, bool):
            return b'true' if obj else b'false'  # Lower case for PDF spec
        if isinstance(obj, int):
            return str(obj).encode('ascii')
        if isinstance(obj, Decimal | float):
            # PDF real numbers may not use exponent notation, but str() emits
            # it for small or large values (e.g. '1E-7', '1e-05'). Format in
            # fixed-point instead. Floats go through their shortest repr so
            # that 0.1 is written as '0.1', not as its binary expansion.
            number = obj if isinstance(obj, Decimal) else Decimal(repr(float(obj)))
            return format(number, 'f').encode('ascii')
        raise NotImplementedError(repr(obj))

    def _metadata(self, name, type_, default):
        """Look up a metadata entry on the re-parsed image dictionary."""
        return _metadata_from_obj(self.obj, name, type_, default)

    def unparse(self) -> bytes:
        """Create the content stream bytes that reproduce this inline image.

        The metadata tokens are written with full names replaced by their
        inline abbreviations, between the ``BI`` and ``ID`` operators. The
        raw image bytes and the ``EI`` operator follow.
        """

        def metadata_tokens():
            for metadata_obj in self._image_object:
                unparsed = self._unparse_obj(
                    metadata_obj, remap_names=self.REVERSE_ABBREVS
                )
                assert isinstance(unparsed, bytes)
                yield unparsed

        def inline_image_tokens():
            yield b'BI\n'
            yield b' '.join(metadata_tokens())
            yield b'\nID\n'
            yield self._data._inline_image_raw_bytes()
            yield b'EI'

        return b''.join(inline_image_tokens())

    @property
    def icc(self):  # pragma: no cover
        """Raise an exception since ICC profiles are not supported on inline images."""
        raise InvalidPdfImageError(
            "Inline images with ICC profiles are not supported in the PDF specification"
        )

    def __repr__(self):
        """Return a debug string; the mode is shown as '?' when unavailable."""
        try:
            mode = self.mode
        except NotImplementedError:
            mode = '?'
        return (
            f'<pikepdf.PdfInlineImage image mode={mode} '
            f'size={self.width}x{self.height} at {hex(id(self))}>'
        )

    def _convert_to_pdfimage(self) -> PdfImage:
        """Convert this inline image to a ``PdfImage`` backed by a temporary PDF.

        The returned image keeps a reference to the temporary PDF.
        """
        # Construct a temporary PDF that holds this inline image, and...
        tmppdf = Pdf.new()
        tmppdf.add_blank_page(page_size=(self.width, self.height))
        tmppdf.pages[0].contents_add(
            f'{self.width} 0 0 {self.height} 0 0 cm'.encode('ascii'), prepend=True
        )
        tmppdf.pages[0].contents_add(self.unparse())

        # ...externalize it,
        tmppdf.pages[0].externalize_inline_images()
        raw_img = cast(Stream, next(iter(tmppdf.pages[0].images.values())))

        # ...then use the regular PdfImage API to extract it.
        img = PdfImage(raw_img)
        img._set_pdf_source(tmppdf)  # Hold tmppdf open while PdfImage exists
        return img

    def as_pil_image(self) -> Image.Image:
        """Return inline image as a Pillow Image."""
        return self._convert_to_pdfimage().as_pil_image()

    def extract_to(self, *, stream: BinaryIO | None = None, fileprefix: str = ''):
        """Extract the inline image directly to a usable image file.

        See:
            :meth:`PdfImage.extract_to`
        """
        return self._convert_to_pdfimage().extract_to(
            stream=stream, fileprefix=fileprefix
        )

    def read_bytes(self):
        """Return decompressed image bytes."""
        # qpdf does not have an API to return this directly, so convert it.
        return self._convert_to_pdfimage().read_bytes()

    def get_stream_buffer(self):
        """Return decompressed stream buffer."""
        # qpdf does not have an API to return this directly, so convert it.
        return self._convert_to_pdfimage().get_stream_buffer()

1053 

1054 

# Public names of this module, in alphabetical order. GrayDecodeArray and
# NotExtractableError are listed alongside their already-exported siblings
# (the other decode-array aliases and HifiPrintImageNotTranscodableError).
__all__ = [
    'CMYKDecodeArray',
    'DecodeArray',
    'GrayDecodeArray',
    'HifiPrintImageNotTranscodableError',
    'ImageDecompressionError',
    'InvalidPdfImageError',
    'NotExtractableError',
    'PaletteData',
    'PdfImage',
    'PdfImageBase',
    'PdfInlineImage',
    'PdfJpxImage',
    'RGBDecodeArray',
    'UnsupportedImageTypeError',
]