Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/metadata/_core.py: 28%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

164 statements  

1# SPDX-FileCopyrightText: 2022 James R. Barlow 

2# SPDX-License-Identifier: MPL-2.0 

3 

4"""PdfMetadata - facade for XMP and DocumentInfo metadata.""" 

5 

6from __future__ import annotations 

7 

8import logging 

9from collections.abc import Iterator, MutableMapping 

10from datetime import datetime, timezone 

11from typing import TYPE_CHECKING, Any 

12from warnings import warn 

13 

14from pikepdf._version import __version__ as pikepdf_version 

15from pikepdf.models.metadata._constants import ( 

16 XMP_NS_PDF, 

17 XMP_NS_PDFA_ID, 

18 XMP_NS_PDFX_ID, 

19 XMP_NS_XMP, 

20 clean, 

21) 

22from pikepdf.models.metadata._converters import DOCINFO_MAPPING, DocinfoMapping 

23from pikepdf.models.metadata._docinfo import DocinfoStore 

24from pikepdf.models.metadata._xmp import XmpDocument 

25from pikepdf.objects import Name, Stream 

26 

27if TYPE_CHECKING: # pragma: no cover 

28 from lxml.etree import QName 

29 

30 from pikepdf import Pdf 

31 

32 

33log = logging.getLogger(__name__) 

34 

35 

class PdfMetadata(MutableMapping):
    """Read and edit the metadata associated with a PDF.

    The PDF specification contains two types of metadata, the newer XMP
    (Extensible Metadata Platform, XML-based) and older DocumentInformation
    dictionary. The PDF 2.0 specification removes the DocumentInformation
    dictionary.

    This primarily works with XMP metadata, but includes methods to generate
    XMP from DocumentInformation and will also coordinate updates to
    DocumentInformation so that the two are kept consistent.

    XMP metadata fields may be accessed using the full XML namespace URI or
    the short name. For example ``metadata['dc:description']``
    and ``metadata['{http://purl.org/dc/elements/1.1/}description']``
    both refer to the same field. Several common XML namespaces are registered
    automatically.

    See the XMP specification for details of allowable fields.

    To update metadata, use a with block.

    Example:
        >>> with pdf.open_metadata() as records:
        ...     records['dc:title'] = 'New Title'

    See Also:
        :meth:`pikepdf.Pdf.open_metadata`
    """

    # Keep DOCINFO_MAPPING at class level for backward compatibility
    DOCINFO_MAPPING: list[DocinfoMapping] = DOCINFO_MAPPING

    # Delegate namespace dicts to XmpDocument for backward compatibility
    NS: dict[str, str] = XmpDocument.NS
    REVERSE_NS: dict[str, str] = XmpDocument.REVERSE_NS

    def __init__(
        self,
        pdf: Pdf,
        pikepdf_mark: bool = True,
        sync_docinfo: bool = True,
        overwrite_invalid_xml: bool = True,
    ):
        """Construct PdfMetadata. Use Pdf.open_metadata() instead.

        Arguments:
            pdf: the PDF whose metadata is being read or edited.
            pikepdf_mark: if True, applying changes also stamps
                ``xmp:MetadataDate`` and ``pdf:Producer``.
            sync_docinfo: if True, applying changes also updates the PDF's
                DocumentInfo dictionary from the XMP values.
            overwrite_invalid_xml: forwarded to :class:`XmpDocument` when the
                existing XMP packet is loaded.
        """
        self._pdf = pdf
        self.mark = pikepdf_mark
        self.sync_docinfo = sync_docinfo
        # Mutation is only permitted between __enter__ and __exit__.
        self._updating = False
        self._overwrite_invalid_xml = overwrite_invalid_xml

        # Initialize XmpDocument with PDF's XMP data
        self._xmp_doc = self._load_xmp()

        # Initialize DocinfoStore
        self._docinfo = DocinfoStore(pdf)

    def _load_xmp(self) -> XmpDocument:
        """Load XMP from PDF or create empty XmpDocument."""
        try:
            data = self._pdf.Root.Metadata.read_bytes()
        except AttributeError:
            # Attribute lookup failed (e.g. no /Metadata entry): start from an
            # empty packet.
            data = b''

        return XmpDocument(data, overwrite_invalid_xml=self._overwrite_invalid_xml)

    def load_from_docinfo(
        self, docinfo, delete_missing: bool = False, raise_failure: bool = False
    ) -> None:
        """Populate the XMP metadata object with DocumentInfo.

        Arguments:
            docinfo: a DocumentInfo, e.g pdf.docinfo
            delete_missing: if the entry is not in DocumentInfo, delete the
                equivalent from XMP
            raise_failure: if True, raise any failure to convert docinfo;
                otherwise warn and continue

        Raises:
            ValueError: if ``raise_failure`` is True and an entry cannot be
                converted or has no XMP equivalent.
            RuntimeError: if the metadata is not open for editing.

        A few entries in the deprecated DocumentInfo dictionary are considered
        approximately equivalent to certain XMP records. This method copies
        those entries into the XMP metadata.
        """
        from lxml.etree import QName

        def warn_or_raise(msg, e=None):
            if raise_failure:
                raise ValueError(msg) from e
            warn(msg)

        for uri, shortkey, docinfo_name, converter in self.DOCINFO_MAPPING:
            qname = QName(uri, shortkey)
            # docinfo might be a dict or pikepdf.Dictionary, so lookup keys
            # by str(Name)
            val = docinfo.get(str(docinfo_name))
            if val is None:
                if delete_missing and qname in self:
                    del self[qname]
                continue
            try:
                val = str(val)
                if converter:
                    val = converter.xmp_from_docinfo(val)
                if not val:
                    # Nothing worth writing after conversion.
                    continue
                # applying_mark=True suppresses the "will be overwritten"
                # warning for pdf:Producer and similar marked fields.
                self._setitem(qname, val, True)
            except (ValueError, AttributeError, NotImplementedError) as e:
                warn_or_raise(
                    f"The metadata field {docinfo_name} could not be copied to XMP", e
                )

        # Report DocumentInfo entries that have no mapping to XMP at all.
        valid_docinfo_names = {
            str(docinfo_name) for _, _, docinfo_name, _ in self.DOCINFO_MAPPING
        }
        extra_docinfo_names = {str(k) for k in docinfo.keys()} - valid_docinfo_names
        for extra in extra_docinfo_names:
            warn_or_raise(
                f"The metadata field {extra} with value '{repr(docinfo.get(extra))}' "
                "has no XMP equivalent, so it was discarded",
            )

    def __enter__(self) -> PdfMetadata:
        """Open metadata for editing."""
        self._updating = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Close metadata and apply changes.

        Changes are only written back when the with block exited without an
        exception; the exception itself is never suppressed.
        """
        try:
            if exc_type is None:
                self._apply_changes()
        finally:
            self._updating = False

    def _update_docinfo(self) -> None:
        """Update the PDF's DocumentInfo dictionary to match XMP metadata.

        The standard mapping is described here:
        https://www.pdfa.org/pdfa-metadata-xmp-rdf-dublin-core/

        Raises:
            ValueError: if a converter fails with anything other than
                ``ValueError`` (which only produces a warning).
        """
        from lxml.etree import QName

        # Touch object to ensure it exists
        self._pdf.docinfo  # pylint: disable=pointless-statement
        for uri, element, docinfo_name, converter in self.DOCINFO_MAPPING:
            qname = QName(uri, element)
            try:
                value = self[qname]
            except KeyError:
                # No XMP value: remove any stale DocumentInfo counterpart.
                if docinfo_name in self._pdf.docinfo:
                    del self._pdf.docinfo[docinfo_name]
                continue
            if converter:
                try:
                    value = converter.docinfo_from_xmp(value)
                except ValueError:
                    warn(
                        f"The DocumentInfo field {docinfo_name} could not be "
                        "updated from XMP"
                    )
                    value = None
                except Exception as e:
                    raise ValueError(
                        "An error occurred while updating DocumentInfo field "
                        f"{docinfo_name} from XMP {qname} with value {value}"
                    ) from e
            if value is None:
                if docinfo_name in self._pdf.docinfo:
                    del self._pdf.docinfo[docinfo_name]
                continue
            self._docinfo.set(docinfo_name, clean(value))

    def _apply_changes(self) -> None:
        """Serialize our changes back to the PDF in memory.

        Depending how we are initialized, leave our metadata mark and producer.
        """
        from lxml.etree import QName

        if self.mark:
            # We were asked to mark the file as being edited by pikepdf
            self._setitem(
                QName(XMP_NS_XMP, 'MetadataDate'),
                datetime.now(timezone.utc).isoformat(),
                applying_mark=True,
            )
            self._setitem(
                QName(XMP_NS_PDF, 'Producer'),
                'pikepdf ' + pikepdf_version,
                applying_mark=True,
            )
        xml = self._xmp_doc.to_bytes()
        self._pdf.Root.Metadata = Stream(self._pdf, xml)
        self._pdf.Root.Metadata[Name.Type] = Name.Metadata
        self._pdf.Root.Metadata[Name.Subtype] = Name.XML
        if self.sync_docinfo:
            self._update_docinfo()

    @classmethod
    def _qname(cls, name: QName | str) -> str:
        """Convert name to an XML QName.

        e.g. pdf:Producer -> {http://ns.adobe.com/pdf/1.3/}Producer
        """
        return XmpDocument.qname(name)

    @classmethod
    def register_xml_namespace(cls, uri: str, prefix: str) -> None:
        """Register a new XML/XMP namespace.

        Arguments:
            uri: The long form of the namespace.
            prefix: The alias to use when interpreting XMP.
        """
        XmpDocument.register_xml_namespace(uri, prefix)

    def _prefix_from_uri(self, uriname: str) -> str:
        """Given a fully qualified XML name, find a prefix.

        e.g. {http://ns.adobe.com/pdf/1.3/}Producer -> pdf:Producer
        """
        return self._xmp_doc.prefix_from_uri(uriname)

    def __contains__(self, key: object) -> bool:  # type: ignore[override]
        """Test if XMP key is in metadata.

        Raises:
            TypeError: if ``key`` is neither ``str`` nor ``QName``.
        """
        from lxml.etree import QName

        if not isinstance(key, (str, QName)):
            raise TypeError(f"{key!r} must be str or QName")
        return key in self._xmp_doc

    def __getitem__(self, key: str | QName) -> Any:
        """Retrieve XMP metadata for key."""
        return self._xmp_doc[key]

    def __iter__(self) -> Iterator[str]:
        """Iterate through XMP metadata attributes and nodes."""
        return iter(self._xmp_doc)

    def __len__(self) -> int:
        """Return number of items in metadata."""
        return len(self._xmp_doc)

    def _setitem(
        self,
        key: str | QName,
        val: set[str] | list[str] | str,
        applying_mark: bool = False,
    ) -> None:
        """Set an XMP value after checking that editing is allowed.

        Arguments:
            key: short or fully qualified XMP name.
            val: value to store.
            applying_mark: True when the write is made internally, which
                silences the "will be overwritten" warning.

        Raises:
            RuntimeError: if called outside a with block.
        """
        if not self._updating:
            raise RuntimeError("Metadata not opened for editing, use with block")

        qkey = self._qname(key)
        self._setitem_check_args(key, val, applying_mark, qkey)
        self._xmp_doc.set_value(key, val)

    def _setitem_check_args(
        self, key: str | QName, val: Any, applying_mark: bool, qkey: str
    ) -> None:
        """Log diagnostics about questionable writes; never raises.

        Arguments:
            key: the key as supplied by the caller (used in messages).
            val: the value about to be written.
            applying_mark: True if the write was initiated internally.
            qkey: ``key`` already converted with :meth:`_qname`.
        """
        if (
            self.mark
            and not applying_mark
            and qkey
            in (
                self._qname('xmp:MetadataDate'),
                self._qname('pdf:Producer'),
            )
        ):
            # Complain if user writes self[pdf:Producer] = ... because it will
            # be overwritten on save, unless applying_mark is set, in which
            # case the action was initiated internally
            log.warning(
                f"Update to {key} will be overwritten because metadata was opened "
                "with set_pikepdf_as_editor=True"
            )
        # Compare for equality: the previous `qkey in (name)` had no trailing
        # comma, so it was a substring test against a str, not a tuple lookup.
        if isinstance(val, str) and qkey == self._qname('dc:creator'):
            log.error(f"{key} should be set to a list of strings")

    def __setitem__(self, key: str | QName, val: set[str] | list[str] | str) -> None:
        """Set XMP metadata key to value."""
        self._setitem(key, val, applying_mark=False)

    def __delitem__(self, key: str | QName) -> None:
        """Delete item from XMP metadata.

        Raises:
            RuntimeError: if called outside a with block.
        """
        if not self._updating:
            raise RuntimeError("Metadata not opened for editing, use with block")
        del self._xmp_doc[key]

    @property
    def pdfa_status(self) -> str:
        """Return the PDF/A conformance level claimed by this PDF, or ''.

        A PDF may claim to be PDF/A compliant without this being true. Use an
        independent verifier such as veraPDF to test if a PDF is truly
        conformant.

        Returns:
            The conformance level of the PDF/A, or an empty string if the
            PDF does not claim PDF/A conformance. Possible valid values
            are: 1A, 1B, 2A, 2B, 2U, 3A, 3B, 3U. Note that ISO standard
            typically refers to PDF/A-1b for example, using lower case;
            this function returns the value as it appears in the PDF, which
            is uppercase.
        """
        from lxml.etree import QName

        key_part = QName(XMP_NS_PDFA_ID, 'part')
        key_conformance = QName(XMP_NS_PDFA_ID, 'conformance')
        try:
            return self[key_part] + self[key_conformance]
        except KeyError:
            return ''

    @property
    def pdfx_status(self) -> str:
        """Return the PDF/X conformance level claimed by this PDF, or ''.

        A PDF may claim to be PDF/X compliant without this being true. Use an
        independent verifier such as veraPDF to test if a PDF is truly
        conformant.

        Returns:
            The conformance level of the PDF/X, or an empty string if the
            PDF does not claim PDF/X conformance.
        """
        from lxml.etree import QName

        pdfx_version = QName(XMP_NS_PDFX_ID, 'GTS_PDFXVersion')
        try:
            return self[pdfx_version]
        except KeyError:
            return ''

    def __str__(self) -> str:
        """Convert XMP metadata to XML string."""
        return str(self._xmp_doc)

    # Backward compatibility methods for internal API access
    def _load(self) -> None:
        """No-op for backward compatibility.

        Previously this triggered lazy loading of XMP. Now XMP is loaded
        immediately in __init__.
        """

    def _get_rdf_root(self):
        """Get the rdf:RDF root element.

        Provided for backward compatibility with code that accesses
        internal XMP structure.
        """
        return self._xmp_doc._get_rdf_root()

    def _get_xml_bytes(self, xpacket: bool = True) -> bytes:
        """Serialize XMP to XML bytes.

        Provided for backward compatibility.
        """
        return self._xmp_doc.to_bytes(xpacket=xpacket)

392 """Serialize XMP to XML bytes. 

393 

394 Provided for backward compatibility. 

395 """ 

396 return self._xmp_doc.to_bytes(xpacket=xpacket)