Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/objects.py: 47%

1# SPDX-FileCopyrightText: 2022 James R. Barlow

2# SPDX-License-Identifier: MPL-2.0

4"""Provide classes to stand in for PDF objects.

6The purpose of these is to provide nice-looking classes to allow explicit

7construction of PDF objects and more pythonic idioms and facilitate discovery

8by documentation generators and linters.

10It's also a place to narrow the scope of input types to those more easily

11converted to C++.

13There is some deliberate "smoke and mirrors" here: all of the objects are truly

14instances of ``pikepdf.Object``, which is a variant container object. The

15``__new__`` constructs a ``pikepdf.Object`` in each case, and the rest of the

16class definition is present as an aid for code introspection.

17"""

19from __future__ import annotations

21from collections.abc import Iterable, Mapping

22from decimal import Decimal

24# pylint: disable=unused-import, abstract-method

25from secrets import token_urlsafe

26from typing import TYPE_CHECKING, Any, cast

28from pikepdf import _core

29from pikepdf._core import Matrix, Object, ObjectType, Rectangle

31if TYPE_CHECKING: # pragma: no cover

32 from pikepdf import Pdf

# Out of the box, pikepdf.Object reports its home as pikepdf._core.Object.
# That is an internal name we would rather people did not rely on, so both
# classes are relabelled to appear as members of this module instead
# (i.e. pikepdf.objects.Object).
Object.__module__ = ObjectType.__module__ = __name__

41# type(Object) is the metaclass that pybind11 defines; we wish to extend that

42# pylint cannot see the C++ metaclass definition and is thoroughly confused.

43# pylint: disable=invalid-metaclass

class _ObjectMeta(type(Object)):  # type: ignore
    """Metaclass that makes ``isinstance`` work for the stand-in classes.

    A class using this metaclass declares an ``object_type``. An object is
    reported as an instance of that class when it is exactly an ``Object``
    and its ``_type_code`` equals the declared ``object_type``.
    """

    object_type: ObjectType

    def __instancecheck__(self, instance: Any) -> bool:
        # Because this is a metaclass, ``self`` is the class being tested
        # against. Anything that is not exactly an ``Object`` is rejected
        # before its type code is ever looked at.
        return type(instance) is Object and self.object_type == instance._type_code

class _NameObjectMeta(_ObjectMeta):
    """Support usage pikepdf.Name.Whatever -> Name('/Whatever').

    Attribute access on the ``Name`` class itself is turned into construction
    of a PDF name, attribute assignment is refused, and subscripting raises a
    ``TypeError`` that suggests the supported spellings.
    """

    def __getattr__(self, attr: str) -> Name:
        # Underscore-prefixed names and ``object_type`` are not treated as
        # PDF names; they are looked up on _ObjectMeta instead.
        if attr.startswith('_') or attr == 'object_type':
            return getattr(_ObjectMeta, attr)
        return Name('/' + attr)

    def __setattr__(self, attr: str, value: Any) -> None:
        # No need for a symmetric .startswith('_'). To prevent user error, we
        # simply don't allow mucking with the pikepdf.Name class's attributes.
        # There is no reason to ever assign to them.
        raise AttributeError(
            "Attributes may not be set on pikepdf.Name. Perhaps you meant to "
            "modify a Dictionary rather than a Name?"
        )

    def __getitem__(self, item: str) -> None:
        """Always raise ``TypeError``: ``Name[...]`` is not supported.

        Args:
            item: Whatever was placed between the brackets. It is only used
                to build the suggestion in the error message.

        Raises:
            TypeError: Unconditionally.
        """
        # Only strings have a leading slash to strip. Guarding on the type
        # keeps a non-string subscript (e.g. ``Name[0]``) from failing with
        # an unrelated AttributeError before the intended TypeError is raised.
        if isinstance(item, str) and item.startswith('/'):
            item = item[1:]
        raise TypeError(
            "pikepdf.Name is not subscriptable. You probably meant:\n"
            f" pikepdf.Name.{item}\n"
            "or\n"
            f" pikepdf.Name('/{item}')\n"
        )

class Name(Object, metaclass=_NameObjectMeta):
    """Stand-in class for constructing PDF Name objects.

    Two spellings are available:

        1. ``Name.Resources``

        2. ``Name('/Resources')``

    Both mean the same thing. Use the attribute form for names that are
    normally expected to appear in a PDF, and the call form for dynamic
    names and attributes.
    """

    object_type = ObjectType.name_

    def __new__(cls, name: str | Name) -> Name:
        """Build a PDF Name from a string, or pass an existing Name through.

        Raises:
            TypeError: If *name* is ``bytes``.
        """
        # QPDF_Name::unparse ensures that names are always saved in a UTF-8
        # compatible way, so guarding the input is all that is needed here.
        if isinstance(name, bytes):
            raise TypeError("Name should be str")
        # Names are immutable, so an existing Name can be handed back as-is.
        return name if isinstance(name, Name) else _core._new_name(name)

    @classmethod
    def random(cls, len_: int = 16, prefix: str = '') -> Name:
        """Generate a cryptographically strong, random, valid PDF Name.

        When inserting a new name into a PDF (for example, a name for a new
        image), this produces a random name that is almost certainly not
        already in the PDF and does not collide with other existing names.

        The random part comes from Python's ``secrets.token_urlsafe``, which
        returns a URL-safe encoded random number of the desired length. (The
        encoding is ultimately done with :func:`base64.urlsafe_b64encode`.)
        Serendipitously, URL-safe is also PDF-safe. An optional *prefix* may
        be prepended.

        With the default length of 16 (16 random bytes, or 128 bits), the
        result is probably globally unique and can be treated as never
        colliding with other names.

        The length of the returned string may vary because it is encoded,
        but it will always carry ``8 * len_`` random bits.

        Args:
            len_: The length of the random string.
            prefix: A prefix to prepend to the random string.
        """
        token = token_urlsafe(len_)
        return _core._new_name(f"/{prefix}{token}")

140

141

class Operator(Object, metaclass=_ObjectMeta):
    """Stand-in class for constructing content stream operators.

    An Operator is one of a limited set of commands that can appear in PDF
    content streams (roughly the mini-language that draws objects, lines and
    text on a virtual PDF canvas). :func:`parse_content_stream` creates
    Operators and :func:`unparse_content_stream` expects them, in both cases
    together with their operands.

    pikepdf uses the special Operator "INLINE IMAGE" to denote an inline
    image in a content stream.
    """

    object_type = ObjectType.operator

    def __new__(cls, name: str) -> Operator:
        """Build an operator from its name."""
        operator = _core._new_operator(name)
        return cast('Operator', operator)

160

161

class String(Object, metaclass=_ObjectMeta):
    """Stand-in class for constructing PDF String objects."""

    object_type = ObjectType.string

    def __new__(cls, s: str | bytes) -> String:
        """Build a PDF String.

        Args:
            s: The value to store. A string will be encoded for PDF; bytes
                will be stored without encoding.
        """
        # Binary buffers go to the raw constructor; everything else is sent
        # down the UTF-8 path.
        is_binary = isinstance(s, (bytes, bytearray, memoryview))
        make_string = _core._new_string if is_binary else _core._new_string_utf8
        return make_string(s)

177

178

class Array(Object, metaclass=_ObjectMeta):
    """Stand-in class for constructing PDF Array objects."""

    object_type = ObjectType.array

    def __new__(cls, a: Iterable | Rectangle | Matrix | None = None) -> Array:
        """Build a PDF Array.

        Args:
            a: An iterable of objects. Every item must be a
                `pikepdf.Object` or convertible to `pikepdf.Object`.
                ``None`` yields an empty array.

        Raises:
            TypeError: If *a* is a ``str`` or ``bytes``.
        """
        if isinstance(a, (str, bytes)):
            raise TypeError('Strings cannot be converted to arrays of chars')
        # Rectangle and Matrix provide their own array conversion.
        if isinstance(a, (Rectangle, Matrix)):
            return a.as_array()
        # An existing Array is copied rather than passed through.
        if isinstance(a, Array):
            return cast(Array, a.__copy__())
        return _core._new_array([] if a is None else a)

201

202

class Dictionary(Object, metaclass=_ObjectMeta):
    """Stand-in class for constructing PDF Dictionary objects."""

    object_type = ObjectType.dictionary

    def __new__(cls, d: Mapping | None = None, **kwargs) -> Dictionary:
        """Build a PDF Dictionary from a mapping or from keyword arguments.

        These two examples are equivalent:

        .. code-block:: python

            pikepdf.Dictionary({'/NameOne': 1, '/NameTwo': 'Two'})

            pikepdf.Dictionary(NameOne=1, NameTwo='Two')

        Either way the keys must be strings, and those strings correspond to
        the desired Names in the PDF Dictionary. Every value must be
        convertible to `pikepdf.Object`.

        Raises:
            ValueError: If both a mapping and keyword arguments are given.
            KeyError: If a mapping key is ``'/'`` or lacks a leading ``'/'``.
        """
        if kwargs:
            if d is not None:
                raise ValueError('Cannot use both a mapping object and keyword args')
            # Keyword names carry no slash, so one is added here. This allows
            # Dictionary(MediaBox=(0,0,1,1), Type=Name('/Page')...
            slashed = {('/' + key): value for key, value in kwargs.items()}
            return _core._new_dictionary(slashed)

        if isinstance(d, Dictionary):
            # Already a dictionary: hand back a copy of it.
            return cast(Dictionary, d.__copy__())

        if not d:
            # None or an empty mapping: nothing to validate.
            return _core._new_dictionary({})

        if any(key == '/' or not key.startswith('/') for key in d.keys()):
            raise KeyError("Dictionary created from strings must begin with '/'")
        return _core._new_dictionary(d)

239

240

class Stream(Object, metaclass=_ObjectMeta):
    """Stand-in class for constructing PDF Stream objects."""

    object_type = ObjectType.stream

    def __new__(cls, owner: Pdf, data: bytes | None = None, d=None, **kwargs) -> Stream:
        """Create a new stream object attached to a Pdf.

        A stream stores arbitrary binary data, which may or may not be
        compressed. It also may or may not be the content stream of a page
        or Form XObject.

        A stream dictionary is like a pikepdf.Dictionary or Python dict,
        except that it has a binary payload of data attached. The dictionary
        describes how the data is compressed or encoded.

        The dictionary may be initialized in the same way as
        pikepdf.Dictionary, using a mapping object or keyword arguments.

        Args:
            owner: The Pdf to which this stream shall be attached.
            data: The data bytes for the stream.
            d: An optional mapping object that will be used to construct the
                stream's dictionary.
            kwargs: Keyword arguments that will define the stream dictionary.
                Do not set /Length here as pikepdf will manage this value.
                Set /Filter if the data is already encoded in some format.

        Raises:
            TypeError: If *data* is ``None``.

        Examples:
            Using kwargs:
                >>> pdf = pikepdf.Pdf.new()
                >>> s1 = pikepdf.Stream(
                ...     pdf,
                ...     b"uncompressed image data",
                ...     BitsPerComponent=8,
                ...     ColorSpace=pikepdf.Name.DeviceRGB,
                ... )

            Using dict:
                >>> pdf = pikepdf.Pdf.new()
                >>> d = pikepdf.Dictionary(Key1=1, Key2=2)
                >>> s2 = pikepdf.Stream(
                ...     pdf,
                ...     b"data",
                ...     d
                ... )

        .. versionchanged:: 2.2
            Support creation of ``pikepdf.Stream`` from existing dictionary.

        .. versionchanged:: 3.0
            ``obj`` argument was removed; use ``data``.
        """
        if data is None:
            raise TypeError("Must make Stream from binary data")

        # The dictionary is built before the stream itself, so a problem with
        # d/kwargs surfaces before the stream is created.
        stream_dict = Dictionary(d, **kwargs) if (d or kwargs) else None

        new_stream = _core._new_stream(owner, data)
        if stream_dict:
            new_stream.stream_dict = stream_dict
        return new_stream

303

304

class Integer(Object, metaclass=_ObjectMeta):
    """Stand-in class for PDF integer objects.

    In explicit conversion mode, PDF integers come back as this type rather
    than being converted automatically to Python ``int``.

    Supports ``int()`` conversion, indexing operations (via ``__index__``),
    and arithmetic operations. Arithmetic operations return native Python
    ``int``.

    .. versionadded:: 10.1
    """

    object_type = ObjectType.integer

    def __new__(cls, val: int | Integer) -> Integer:
        """Build a PDF Integer.

        Args:
            val: The integer value. An existing ``Integer`` is returned
                unchanged.
        """
        return val if isinstance(val, Integer) else _core._new_integer(val)  # type: ignore[return-value]

328

329

class Boolean(Object, metaclass=_ObjectMeta):
    """Stand-in class for PDF boolean objects.

    In explicit conversion mode, PDF booleans come back as this type rather
    than being converted automatically to Python ``bool``.

    Supports ``bool()`` conversion via ``__bool__``.

    .. versionadded:: 10.1
    """

    object_type = ObjectType.boolean

    def __new__(cls, val: bool | Boolean) -> Boolean:
        """Build a PDF Boolean.

        Args:
            val: The boolean value. An existing ``Boolean`` is returned
                unchanged.
        """
        return val if isinstance(val, Boolean) else _core._new_boolean(val)  # type: ignore[return-value]

352

353

class Real(Object, metaclass=_ObjectMeta):
    """Stand-in class for PDF real (floating-point) objects.

    In explicit conversion mode, PDF reals come back as this type rather
    than being converted automatically to Python ``Decimal``.

    Supports ``float()`` conversion. Use ``as_decimal()`` for lossless
    conversion.

    .. versionadded:: 10.1
    """

    object_type = ObjectType.real

    def __new__(cls, val: float | Decimal | Real, places: int = 6) -> Real:
        """Build a PDF Real.

        Args:
            val: The real value. Converted to string representation
                internally. An existing ``Real`` is returned unchanged.
            places: Number of decimal places (used when val is float).
        """
        if isinstance(val, Real):
            return val
        # Floats are passed through together with ``places``; anything else
        # is handed over as its ``str()`` form and ``places`` is not used.
        ctor_args = (val, places) if isinstance(val, float) else (str(val),)
        return _core._new_real(*ctor_args)  # type: ignore[return-value]

379

380

381# Note on numbers ABC registration:

382# numbers.Integral.register(Integer) and numbers.Real.register(Real) don't work

383# as expected because of the "smoke and mirrors" design - at runtime all Objects

384# are actually pikepdf.Object instances, not Integer/Real instances.

385# The isinstance(obj, Integer) check uses metaclass magic (_ObjectMeta) that

386# checks the object's _type_code attribute. This doesn't satisfy the numbers ABC

387# registration mechanism which checks the actual type hierarchy.

388

389

class _NamePathMeta(type):
    """Metaclass that gives NamePath its ``NamePath.A.B`` syntax.

    Attribute access, subscripting and calling on the class each produce a
    ``_core._NamePath``.
    """

    def __getattr__(cls, name: str) -> _core._NamePath:
        # Underscore-prefixed lookups are reported as missing attributes;
        # every other name starts a new path.
        if not name.startswith('_'):
            return _core._NamePath()._append_name(name)
        raise AttributeError(name)

    def __getitem__(cls, key: str | int | Name) -> _core._NamePath:
        # NamePath['/A'] or NamePath[0]
        if isinstance(key, str):
            return _core._NamePath()._append_name(key)
        if isinstance(key, int):
            return _core._NamePath()._append_index(key)
        if isinstance(key, Name):
            return _core._NamePath()._append_name(str(key))
        raise TypeError(f"NamePath key must be str, int, or Name, not {type(key)}")

    def __call__(cls, *args) -> _core._NamePath:
        # NamePath() or NamePath('/A', '/B'). Unpacking an empty tuple is the
        # same call as passing no arguments, so one expression covers both.
        return _core._NamePath(*args)

413

414

class NamePath(metaclass=_NamePathMeta):
    """Describe a path to a value nested inside Dictionary/Stream objects.

    A NamePath gives ergonomic access to deeply nested PDF structures in a
    single access operation, with helpful error messages when a key is not
    found.

    Usage examples::

        # Shorthand syntax - most common
        obj[NamePath.Resources.Font.F1]

        # With array indices
        obj[NamePath.Pages.Kids[0].MediaBox]

        # Chained access - supports non Python-identifier names
        NamePath['/A']['/B'].C[0]  # equivalent to NamePath.A.B.C[0]

        # Alternate syntax to support lists
        obj[NamePath(Name.Resources, Name.Font)]

        # Using string objects
        obj[NamePath('/Resources', '/Weird-Name')]

        # Empty path returns the object itself
        obj[NamePath()]

        # Setting nested values (all parents must exist)
        obj[NamePath.Root.Info.Title] = pikepdf.String("Test")

        # With default value
        obj.get(NamePath.Root.Metadata, None)

    When a key is not found, the KeyError message identifies the exact
    failure point, e.g.: "Key /C not found; traversed NamePath.A.B"

    .. versionadded:: 10.1
    """

    # This class is never instantiated: the metaclass intercepts construction
    # and returns _core._NamePath instances instead.