Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/objects.py: 47%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

127 statements  

1# SPDX-FileCopyrightText: 2022 James R. Barlow 

2# SPDX-License-Identifier: MPL-2.0 

3 

4"""Provide classes to stand in for PDF objects. 

5 

6The purpose of these is to provide nice-looking classes to allow explicit 

7construction of PDF objects and more pythonic idioms and facilitate discovery 

8by documentation generators and linters. 

9 

10It's also a place to narrow the scope of input types to those more easily 

11converted to C++. 

12 

13There is some deliberate "smoke and mirrors" here: all of the objects are truly 

14instances of ``pikepdf.Object``, which is a variant container object. The 

15``__new__`` constructs a ``pikepdf.Object`` in each case, and the rest of the 

16class definition is present as an aid for code introspection. 

17""" 

18 

19from __future__ import annotations 

20 

21from collections.abc import Iterable, Mapping 

22from decimal import Decimal 

23 

24# pylint: disable=unused-import, abstract-method 

25from secrets import token_urlsafe 

26from typing import TYPE_CHECKING, Any, cast 

27 

28from pikepdf import _core 

29from pikepdf._core import Matrix, Object, ObjectType, Rectangle 

30 

31if TYPE_CHECKING: # pragma: no cover 

32 from pikepdf import Pdf 

33 

# By default pikepdf.Object will identify itself as pikepdf._core.Object.
# Here we change the reported module to discourage people from using that
# internal name; it will instead be reported under this module's name.
# ObjectType is given the same treatment.
Object.__module__ = __name__
ObjectType.__module__ = __name__

39 

40 

41# type(Object) is the metaclass that pybind11 defines; we wish to extend that 

42# pylint cannot see the C++ metaclass definition and is thoroughly confused. 

43# pylint: disable=invalid-metaclass 

44 

45 

class _ObjectMeta(type(Object)):  # type: ignore
    """Metaclass that makes ``isinstance`` work for the stand-in classes.

    Every class built with this metaclass declares an ``object_type``. A value
    passes the instance check when it is exactly an ``Object`` and its
    ``_type_code`` equals that ``object_type``.
    """

    object_type: ObjectType

    def __instancecheck__(self, instance: Any) -> bool:
        # This is a metaclass, so ``self`` is the class being tested against.
        # The short-circuit ensures ``_type_code`` is only read from values
        # whose type is exactly ``Object``.
        return type(instance) is Object and self.object_type == instance._type_code

56 

57 

class _NameObjectMeta(_ObjectMeta):
    """Support usage pikepdf.Name.Whatever -> Name('/Whatever')."""

    def __getattr__(self, attr: str) -> Name:
        """Turn an unknown class attribute into the corresponding Name.

        Private/dunder names and ``object_type`` are looked up on
        ``_ObjectMeta`` instead of being converted, so a missing one raises
        ``AttributeError`` rather than producing a Name.
        """
        if attr.startswith('_') or attr == 'object_type':
            return getattr(_ObjectMeta, attr)
        return Name('/' + attr)

    def __setattr__(self, attr: str, value: Any) -> None:
        """Refuse every attribute assignment on the class.

        Raises:
            AttributeError: Always.
        """
        # No need for a symmetric .startswith('_'). To prevent user error, we
        # simply don't allow mucking with the pikepdf.Name class's attributes.
        # There is no reason to ever assign to them.
        raise AttributeError(
            "Attributes may not be set on pikepdf.Name. Perhaps you meant to "
            "modify a Dictionary rather than a Name?"
        )

    def __getitem__(self, item: str) -> None:
        """Reject ``Name[...]`` and point the caller at the valid spellings.

        Args:
            item: The attempted subscript.

        Raises:
            TypeError: Always. For a ``str`` subscript the message suggests
                ``pikepdf.Name.<item>`` and ``pikepdf.Name('/<item>')``.
        """
        if not isinstance(item, str):
            # Previously a non-str subscript (e.g. Name[0]) failed inside
            # item.startswith() with an unrelated AttributeError; report the
            # real problem instead.
            raise TypeError(
                "pikepdf.Name is not subscriptable, and names must be str, "
                f"not {type(item).__name__}"
            )
        # Drop a single leading slash so both suggestions are spelled correctly.
        item = item.removeprefix('/')
        raise TypeError(
            "pikepdf.Name is not subscriptable. You probably meant:\n"
            f" pikepdf.Name.{item}\n"
            "or\n"
            f" pikepdf.Name('/{item}')\n"
        )

84 

85 

class Name(Object, metaclass=_NameObjectMeta):
    """Stand-in class for constructing PDF Name objects.

    There are two spellings:

    1. ``Name.Resources``

    2. ``Name('/Resources')``

    Both mean the same thing. Use the attribute form for names that are
    normally expected to appear in a PDF, and the call form for dynamic
    names and attributes.
    """

    object_type = ObjectType.name_

    def __new__(cls, name: str | Name) -> Name:
        """Construct a PDF Name.

        Args:
            name: The name as a string, or an existing Name.

        Raises:
            TypeError: If *name* is ``bytes``.
        """
        # Names are immutable, so an existing Name can be handed back as-is.
        if isinstance(name, Name):
            return name
        # QPDF_Name::unparse ensures that names are always saved in a UTF-8
        # compatible way, so only the input needs guarding.
        if isinstance(name, bytes):
            raise TypeError("Name should be str")
        return _core._new_name(name)

    @classmethod
    def random(cls, len_: int = 16, prefix: str = '') -> Name:
        """Generate a cryptographically strong, random, valid PDF Name.

        When inserting a new name into a PDF (for example, the name of a new
        image), use this function to obtain a cryptographically strong random
        name that is almost certainly not already in the PDF and does not
        collide with other existing names.

        The random part comes from Python's :func:`secrets.token_urlsafe`,
        which returns a URL-safe encoded random number of the desired length.
        (The encoding is ultimately done with
        :func:`base64.urlsafe_b64encode`.) Serendipitously, URL-safe is also
        PDF-safe. An optional *prefix* may be prepended.

        With the default length of 16 (16 random bytes, or 128 bits), the
        result is probably globally unique and can be treated as never
        colliding with other names.

        Because the random bytes are encoded, the length of the returned
        string may vary, but it always carries ``8 * len_`` random bits.

        Args:
            len_: The length of the random string.
            prefix: A prefix to prepend to the random string.
        """
        token = token_urlsafe(len_)
        return _core._new_name(f"/{prefix}{token}")

140 

141 

class Operator(Object, metaclass=_ObjectMeta):
    """Stand-in class for operators used in content streams.

    An Operator is one of a limited set of commands that can appear in PDF
    content streams (roughly the mini-language that draws objects, lines and
    text on a virtual PDF canvas). :func:`parse_content_stream` creates
    Operators and :func:`unparse_content_stream` expects them, in both cases
    together with their operands.

    pikepdf uses the special Operator "INLINE IMAGE" to denote an inline image
    in a content stream.
    """

    object_type = ObjectType.operator

    def __new__(cls, name: str) -> Operator:
        """Construct an operator.

        Args:
            name: The operator's name.
        """
        operator = _core._new_operator(name)
        return cast('Operator', operator)

160 

161 

class String(Object, metaclass=_ObjectMeta):
    """Stand-in class for constructing PDF String objects."""

    object_type = ObjectType.string

    def __new__(cls, s: str | bytes) -> String:
        """Construct a PDF String.

        Args:
            s: The string to use. String will be encoded for
                PDF, bytes will be constructed without encoding.
        """
        # bytes-like input goes to the raw constructor; anything else goes to
        # the UTF-8 constructor.
        is_binary = isinstance(s, (bytes, bytearray, memoryview))
        build = _core._new_string if is_binary else _core._new_string_utf8
        return build(s)

177 

178 

class Array(Object, metaclass=_ObjectMeta):
    """Stand-in class for constructing PDF Array objects."""

    object_type = ObjectType.array

    def __new__(cls, a: Iterable | Rectangle | Matrix | None = None) -> Array:
        """Construct a PDF Array.

        Args:
            a: An iterable of objects. All objects must be either
                `pikepdf.Object` or convertible to `pikepdf.Object`.
                ``None`` produces an empty array, a ``Rectangle`` or
                ``Matrix`` is converted with its ``as_array()``, and an
                existing ``Array`` is copied.

        Raises:
            TypeError: If *a* is ``str`` or ``bytes``.
        """
        # str and bytes are iterable, but splitting them into characters is
        # not a meaningful conversion, so refuse them up front.
        if isinstance(a, (str, bytes)):
            raise TypeError('Strings cannot be converted to arrays of chars')
        if a is None:
            return _core._new_array([])
        if isinstance(a, (Rectangle, Matrix)):
            return a.as_array()
        if isinstance(a, Array):
            return cast(Array, a.__copy__())
        return _core._new_array(a)

201 

202 

class Dictionary(Object, metaclass=_ObjectMeta):
    """Stand-in class for constructing PDF Dictionary objects."""

    object_type = ObjectType.dictionary

    def __new__(cls, d: Mapping | None = None, **kwargs) -> Dictionary:
        """Construct a PDF Dictionary.

        Accepts either a mapping (such as a Python ``dict``) or keyword
        arguments, but not both.

        These two examples are equivalent:

        .. code-block:: python

            pikepdf.Dictionary({'/NameOne': 1, '/NameTwo': 'Two'})

            pikepdf.Dictionary(NameOne=1, NameTwo='Two')

        In either case, the keys must be strings, and the strings
        correspond to the desired Names in the PDF Dictionary. Mapping keys
        must begin with ``'/'`` and be longer than just ``'/'``; keyword
        names get the slash added for them. The values must all be
        convertible to `pikepdf.Object`.

        Raises:
            ValueError: If both a mapping and keyword arguments are given.
            KeyError: If a mapping key is ``'/'`` or lacks the leading ``'/'``.
        """
        if kwargs:
            if d is not None:
                raise ValueError('Cannot use both a mapping object and keyword args')
            # Add leading slash
            # Allows Dictionary(MediaBox=(0,0,1,1), Type=Name('/Page')...
            return _core._new_dictionary({('/' + k): v for k, v in kwargs.items()})

        if isinstance(d, Dictionary):
            # Already a dictionary
            return cast(Dictionary, d.__copy__())

        if not d:
            return _core._new_dictionary({})

        for key in d.keys():
            if key == '/' or not key.startswith('/'):
                raise KeyError("Dictionary created from strings must begin with '/'")
        return _core._new_dictionary(d)

239 

240 

class Stream(Object, metaclass=_ObjectMeta):
    """Stand-in class for constructing PDF Stream objects."""

    object_type = ObjectType.stream

    def __new__(cls, owner: Pdf, data: bytes | None = None, d=None, **kwargs) -> Stream:
        """Create a new stream object.

        Streams store arbitrary binary data and may or may not be compressed.
        A stream may or may not be a page or Form XObject's content stream.

        A stream dictionary is like a pikepdf.Dictionary or Python dict, except
        it has a binary payload of data attached. The dictionary describes
        how the data is compressed or encoded.

        The dictionary may be initialized just like pikepdf.Dictionary is
        initialized, using a mapping object or keyword arguments.

        Args:
            owner: The Pdf to which this stream shall be attached.
            data: The data bytes for the stream.
            d: An optional mapping object that will be used to construct the
                stream's dictionary.
            kwargs: Keyword arguments that will define the stream dictionary.
                Do not set /Length here as pikepdf will manage this value.
                Set /Filter if the data is already encoded in some format.

        Raises:
            TypeError: If *data* is ``None``.

        Examples:
            Using kwargs:
                >>> pdf = pikepdf.Pdf.new()
                >>> s1 = pikepdf.Stream(
                ...     pdf,
                ...     b"uncompressed image data",
                ...     BitsPerComponent=8,
                ...     ColorSpace=pikepdf.Name.DeviceRGB,
                ... )
            Using dict:
                >>> pdf = pikepdf.Pdf.new()
                >>> d = pikepdf.Dictionary(Key1=1, Key2=2)
                >>> s2 = pikepdf.Stream(
                ...     pdf,
                ...     b"data",
                ...     d
                ... )

        .. versionchanged:: 2.2
            Support creation of ``pikepdf.Stream`` from existing dictionary.

        .. versionchanged:: 3.0
            ``obj`` argument was removed; use ``data``.
        """
        if data is None:
            raise TypeError("Must make Stream from binary data")

        # Build the dictionary first so that invalid dictionary input is
        # reported before a stream is created.
        stream_dict = Dictionary(d, **kwargs) if (d or kwargs) else None

        new_stream = _core._new_stream(owner, data)
        if stream_dict:
            new_stream.stream_dict = stream_dict
        return new_stream

303 

304 

class Integer(Object, metaclass=_ObjectMeta):
    """Stand-in class for PDF integer objects.

    In explicit conversion mode, PDF integers are returned as this type
    rather than being automatically converted to Python ``int``.

    Supports ``int()`` conversion, indexing operations (via ``__index__``),
    and arithmetic operations. Arithmetic operations return native Python
    ``int``.

    .. versionadded:: 10.1
    """

    object_type = ObjectType.integer

    def __new__(cls, val: int | Integer) -> Integer:
        """Construct a PDF Integer.

        Args:
            val: The integer value. An existing Integer is returned unchanged.
        """
        if not isinstance(val, Integer):
            return _core._new_integer(val)  # type: ignore[return-value]
        return val

328 

329 

class Boolean(Object, metaclass=_ObjectMeta):
    """Stand-in class for PDF boolean objects.

    In explicit conversion mode, PDF booleans are returned as this type
    rather than being automatically converted to Python ``bool``.

    Supports ``bool()`` conversion via ``__bool__``.

    .. versionadded:: 10.1
    """

    object_type = ObjectType.boolean

    def __new__(cls, val: bool | Boolean) -> Boolean:
        """Construct a PDF Boolean.

        Args:
            val: The boolean value. An existing Boolean is returned unchanged.
        """
        if not isinstance(val, Boolean):
            return _core._new_boolean(val)  # type: ignore[return-value]
        return val

352 

353 

class Real(Object, metaclass=_ObjectMeta):
    """Stand-in class for PDF real (floating-point) objects.

    In explicit conversion mode, PDF reals are returned as this type
    rather than being automatically converted to Python ``Decimal``.

    Supports ``float()`` conversion. Use ``as_decimal()`` for lossless
    conversion.

    .. versionadded:: 10.1
    """

    object_type = ObjectType.real

    def __new__(cls, val: float | Decimal | Real, places: int = 6) -> Real:
        """Construct a PDF Real.

        Args:
            val: The real value. Converted to string representation
                internally. An existing Real is returned unchanged.
            places: Number of decimal places (used when val is float).
        """
        if isinstance(val, Real):
            return val
        # Floats are passed along with the requested number of places; every
        # other value is passed as its string form.
        ctor_args = (val, places) if isinstance(val, float) else (str(val),)
        return _core._new_real(*ctor_args)  # type: ignore[return-value]

379 

380 

381# Note on numbers ABC registration: 

382# numbers.Integral.register(Integer) and numbers.Real.register(Real) don't work 

383# as expected because of the "smoke and mirrors" design - at runtime all Objects 

384# are actually pikepdf.Object instances, not Integer/Real instances. 

385# The isinstance(obj, Integer) check uses metaclass magic (_ObjectMeta) that 

386# checks the object's _type_code attribute. This doesn't satisfy the numbers ABC 

387# registration mechanism which checks the actual type hierarchy. 

388 

389 

class _NamePathMeta(type):
    """Metaclass that gives NamePath its ``NamePath.A.B`` syntax.

    Attribute access, subscripting and calling on the class all produce
    ``_core._NamePath`` objects.
    """

    def __getattr__(cls, name: str) -> _core._NamePath:
        # Underscore-prefixed lookups are treated as ordinary missing
        # attributes rather than path components.
        if not name.startswith('_'):
            return _core._NamePath()._append_name(name)
        raise AttributeError(name)

    def __getitem__(cls, key: str | int | Name) -> _core._NamePath:
        # NamePath['/A'] or NamePath[0]
        if isinstance(key, str):
            return _core._NamePath()._append_name(key)
        if isinstance(key, int):
            return _core._NamePath()._append_index(key)
        if isinstance(key, Name):
            return _core._NamePath()._append_name(str(key))
        raise TypeError(f"NamePath key must be str, int, or Name, not {type(key)}")

    def __call__(cls, *args) -> _core._NamePath:
        # NamePath() or NamePath('/A', '/B'); unpacking an empty tuple is the
        # same as passing no arguments at all.
        return _core._NamePath(*args)

413 

414 

class NamePath(metaclass=_NamePathMeta):
    """Path for reaching values nested inside Dictionary/Stream objects.

    A NamePath lets deeply nested PDF structures be read or written in one
    access operation, and produces helpful error messages when a key along
    the way is not found.

    Usage examples::

        # Shorthand syntax - most common
        obj[NamePath.Resources.Font.F1]

        # With array indices
        obj[NamePath.Pages.Kids[0].MediaBox]

        # Chained access - supports non Python-identifier names
        NamePath['/A']['/B'].C[0]  # equivalent to NamePath.A.B.C[0]

        # Alternate syntax to support lists
        obj[NamePath(Name.Resources, Name.Font)]

        # Using string objects
        obj[NamePath('/Resources', '/Weird-Name')]

        # Empty path returns the object itself
        obj[NamePath()]

        # Setting nested values (all parents must exist)
        obj[NamePath.Root.Info.Title] = pikepdf.String("Test")

        # With default value
        obj.get(NamePath.Root.Metadata, None)

    When a key is not found, the KeyError message identifies the exact failure
    point, e.g.: "Key /C not found; traversed NamePath.A.B"

    .. versionadded:: 10.1
    """

    # This class is never instantiated - the metaclass intercepts construction
    # and returns _core._NamePath instances instead