Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/metadata/_core.py: 30%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

159 statements  

1# SPDX-FileCopyrightText: 2022 James R. Barlow 

2# SPDX-License-Identifier: MPL-2.0 

3 

4"""PdfMetadata - facade for XMP and DocumentInfo metadata.""" 

5 

6from __future__ import annotations 

7 

8import logging 

9from collections.abc import Iterator, MutableMapping 

10from datetime import datetime, timezone 

11from typing import TYPE_CHECKING, Any 

12from warnings import warn 

13 

14from lxml.etree import QName 

15 

16from pikepdf._version import __version__ as pikepdf_version 

17from pikepdf.models.metadata._constants import ( 

18 XMP_NS_PDF, 

19 XMP_NS_PDFA_ID, 

20 XMP_NS_PDFX_ID, 

21 XMP_NS_XMP, 

22 clean, 

23) 

24from pikepdf.models.metadata._converters import DOCINFO_MAPPING, DocinfoMapping 

25from pikepdf.models.metadata._docinfo import DocinfoStore 

26from pikepdf.models.metadata._xmp import XmpDocument 

27from pikepdf.objects import Name, Stream 

28 

29if TYPE_CHECKING: # pragma: no cover 

30 from pikepdf import Pdf 

31 

32log = logging.getLogger(__name__) 

33 

34 

35class PdfMetadata(MutableMapping): 

36 """Read and edit the metadata associated with a PDF. 

37 

38 The PDF specification contains two types of metadata, the newer XMP 

39 (Extensible Metadata Platform, XML-based) and older DocumentInformation 

40 dictionary. The PDF 2.0 specification removes the DocumentInformation 

41 dictionary. 

42 

43 This primarily works with XMP metadata, but includes methods to generate 

44 XMP from DocumentInformation and will also coordinate updates to 

45 DocumentInformation so that the two are kept consistent. 

46 

47 XMP metadata fields may be accessed using the full XML namespace URI or 

48 the short name. For example ``metadata['dc:description']`` 

49 and ``metadata['{http://purl.org/dc/elements/1.1/}description']`` 

50 both refer to the same field. Several common XML namespaces are registered 

51 automatically. 

52 

53 See the XMP specification for details of allowable fields. 

54 

55 To update metadata, use a with block. 

56 

57 Example: 

58 >>> with pdf.open_metadata() as records: 

59 ... records['dc:title'] = 'New Title' 

60 

61 See Also: 

62 :meth:`pikepdf.Pdf.open_metadata` 

63 """ 

64 

65 # Keep DOCINFO_MAPPING at class level for backward compatibility 

66 DOCINFO_MAPPING: list[DocinfoMapping] = DOCINFO_MAPPING 

67 

68 # Delegate namespace dicts to XmpDocument for backward compatibility 

69 NS: dict[str, str] = XmpDocument.NS 

70 REVERSE_NS: dict[str, str] = XmpDocument.REVERSE_NS 

71 

def __init__(
    self,
    pdf: Pdf,
    pikepdf_mark: bool = True,
    sync_docinfo: bool = True,
    overwrite_invalid_xml: bool = True,
):
    """Bind this facade to ``pdf``. Prefer Pdf.open_metadata() over calling this.

    Arguments:
        pdf: the document whose metadata is wrapped
        pikepdf_mark: stored as ``mark``; when true, applying changes also
            writes the MetadataDate and Producer records
        sync_docinfo: when true, applying changes also updates DocumentInfo
        overwrite_invalid_xml: forwarded to XmpDocument when the XMP is loaded
    """
    # Option flags
    self.sync_docinfo = sync_docinfo
    self.mark = pikepdf_mark
    self._overwrite_invalid_xml = overwrite_invalid_xml

    # Writes are refused until __enter__ flips this flag
    self._updating = False

    # _load_xmp reads both self._pdf and self._overwrite_invalid_xml, so it
    # has to run after those are assigned
    self._pdf = pdf
    self._xmp_doc = self._load_xmp()

    # Wrapper used when writing DocumentInfo entries
    self._docinfo = DocinfoStore(pdf)

91 

def _load_xmp(self) -> XmpDocument:
    """Build an XmpDocument from the PDF's metadata stream, or from nothing.

    Returns:
        An XmpDocument constructed from the bytes of ``Root.Metadata``; empty
        bytes are used when that lookup raises AttributeError.
    """
    xmp_bytes = b''
    try:
        xmp_bytes = self._pdf.Root.Metadata.read_bytes()
    except AttributeError:
        # Keep the empty default
        pass

    return XmpDocument(xmp_bytes, overwrite_invalid_xml=self._overwrite_invalid_xml)

102 

def load_from_docinfo(
    self, docinfo, delete_missing: bool = False, raise_failure: bool = False
) -> None:
    """Copy DocumentInfo entries into their XMP counterparts.

    Arguments:
        docinfo: a DocumentInfo, e.g. pdf.docinfo
        delete_missing: if an entry is absent from DocumentInfo, delete the
            equivalent record from XMP
        raise_failure: if True, raise ValueError for any entry that cannot
            be converted or has no XMP equivalent; otherwise warn and continue

    A few entries in the deprecated DocumentInfo dictionary are considered
    approximately equivalent to certain XMP records. This method copies
    those entries into the XMP metadata.
    """

    def warn_or_raise(msg, e=None):
        if not raise_failure:
            warn(msg)
            return
        raise ValueError(msg) from e

    for uri, shortkey, docinfo_name, converter in self.DOCINFO_MAPPING:
        qname = QName(uri, shortkey)
        # docinfo may be a plain dict or a pikepdf.Dictionary, so the lookup
        # key is always str(Name)
        raw = docinfo.get(str(docinfo_name))
        if raw is None:
            if delete_missing and qname in self:
                del self[qname]
            continue
        try:
            text = str(raw)
            if converter:
                text = converter.xmp_from_docinfo(text)
            # Empty results are skipped rather than stored
            if text:
                self._setitem(qname, text, True)
        except (ValueError, AttributeError, NotImplementedError) as e:
            warn_or_raise(
                f"The metadata field {docinfo_name} could not be copied to XMP", e
            )

    # Report every DocumentInfo key that the mapping table does not cover
    mapped_names = {str(name) for _uri, _key, name, _conv in self.DOCINFO_MAPPING}
    supplied_names = {str(k) for k in docinfo.keys()}
    for extra in supplied_names - mapped_names:
        warn_or_raise(
            f"The metadata field {extra} with value '{repr(docinfo.get(extra))}' "
            "has no XMP equivalent, so it was discarded",
        )

154 

def __enter__(self):
    """Enter the editing context and return this object."""
    # _setitem and __delitem__ raise RuntimeError unless this flag is set
    self._updating = True
    return self

159 

def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave the editing context, writing changes back only on a clean exit.

    Returns None, so an exception raised inside the with block propagates.
    """
    try:
        # When the with block raised, skip writing the changes back
        if exc_type is None:
            self._apply_changes()
    finally:
        # Editing is closed whether or not the write-back succeeded
        self._updating = False

168 

def _update_docinfo(self):
    """Bring the PDF's DocumentInfo dictionary in line with the XMP metadata.

    The standard mapping is described here:
    https://www.pdfa.org/pdfa-metadata-xmp-rdf-dublin-core/

    Raises:
        ValueError: if a converter fails with anything other than ValueError.
    """
    # Touch object to ensure it exists
    self._pdf.docinfo  # pylint: disable=pointless-statement

    def discard(name):
        # Remove the DocumentInfo entry if it is present
        if name in self._pdf.docinfo:
            del self._pdf.docinfo[name]

    for uri, element, docinfo_name, converter in self.DOCINFO_MAPPING:
        qname = QName(uri, element)
        try:
            value = self[qname]
        except KeyError:
            # No XMP record, so DocumentInfo must not carry one either
            discard(docinfo_name)
            continue

        if converter:
            try:
                value = converter.docinfo_from_xmp(value)
            except ValueError:
                warn(
                    f"The DocumentInfo field {docinfo_name} could not be "
                    "updated from XMP"
                )
                value = None
            except Exception as e:
                raise ValueError(
                    "An error occurred while updating DocumentInfo field "
                    f"{docinfo_name} from XMP {qname} with value {value}"
                ) from e

        if value is not None:
            self._docinfo.set(docinfo_name, clean(value))
        else:
            discard(docinfo_name)

204 

def _apply_changes(self):
    """Write the in-memory XMP back into the PDF object.

    When ``self.mark`` is set, the MetadataDate and Producer records are
    stamped first. When ``self.sync_docinfo`` is set, DocumentInfo is
    updated afterwards.
    """
    if self.mark:
        # We were asked to mark the file as being edited by pikepdf
        stamps = (
            (QName(XMP_NS_XMP, 'MetadataDate'), datetime.now(timezone.utc).isoformat()),
            (QName(XMP_NS_PDF, 'Producer'), 'pikepdf ' + pikepdf_version),
        )
        for stamp_key, stamp_value in stamps:
            self._setitem(stamp_key, stamp_value, applying_mark=True)

    serialized = self._xmp_doc.to_bytes()
    self._pdf.Root.Metadata = Stream(self._pdf, serialized)

    # Tag the freshly attached stream as an XML metadata stream
    metadata_stream = self._pdf.Root.Metadata
    metadata_stream[Name.Type] = Name.Metadata
    metadata_stream[Name.Subtype] = Name.XML

    if self.sync_docinfo:
        self._update_docinfo()

228 

@classmethod
def _qname(cls, name: QName | str) -> str:
    """Resolve ``name`` to an XML QName by delegating to XmpDocument.qname.

    e.g. pdf:Producer -> {http://ns.adobe.com/pdf/1.3/}Producer
    """
    qualified = XmpDocument.qname(name)
    return qualified

236 

@classmethod
def register_xml_namespace(cls, uri: str, prefix: str) -> None:
    """Make an additional XML/XMP namespace known.

    Arguments:
        uri: The long form of the namespace.
        prefix: The alias to use when interpreting XMP.
    """
    # Registration is handled by XmpDocument
    XmpDocument.register_xml_namespace(uri, prefix)
    return None

246 

def _prefix_from_uri(self, uriname: str) -> str:
    """Map a fully qualified XML name to its prefixed short form.

    e.g. {http://ns.adobe.com/pdf/1.3/}Producer -> pdf:Producer
    """
    prefixed = self._xmp_doc.prefix_from_uri(uriname)
    return prefixed

253 

def __contains__(self, key: object) -> bool:  # type: ignore[override]
    """Report whether the XMP metadata holds ``key``.

    Raises:
        TypeError: if ``key`` is neither a str nor a QName.
    """
    if isinstance(key, (str, QName)):
        return key in self._xmp_doc
    raise TypeError(f"{key!r} must be str or QName")

259 

def __getitem__(self, key: str | QName) -> Any:
    """Look up ``key`` in the underlying XMP document and return its value."""
    value = self._xmp_doc[key]
    return value

263 

def __iter__(self) -> Iterator[str]:
    """Return an iterator over the underlying XMP document."""
    xmp_doc = self._xmp_doc
    return iter(xmp_doc)

267 

def __len__(self) -> int:
    """Return the length reported by the underlying XMP document."""
    xmp_doc = self._xmp_doc
    return len(xmp_doc)

271 

def _setitem(
    self,
    key: str | QName,
    val: set[str] | list[str] | str,
    applying_mark: bool = False,
) -> None:
    """Store ``val`` under ``key`` in the XMP document.

    Arguments:
        key: short or fully qualified name of the record
        val: the value to store
        applying_mark: True when pikepdf itself is writing its editor mark

    Raises:
        RuntimeError: if called outside of a ``with`` block.
    """
    if not self._updating:
        raise RuntimeError("Metadata not opened for editing, use with block")

    # The resolved name is only needed for the checks; the document is
    # written using the key exactly as the caller supplied it
    self._setitem_check_args(key, val, applying_mark, self._qname(key))
    self._xmp_doc.set_value(key, val)

284 

def _setitem_check_args(
    self, key: str | QName, val: Any, applying_mark: bool, qkey: str
) -> None:
    """Log diagnostics about questionable assignments; only logs, never raises.

    Arguments:
        key: the key as supplied by the caller, used in the log messages
        val: the value about to be stored
        applying_mark: True when the write was initiated internally to apply
            the pikepdf mark, which suppresses the overwrite warning
        qkey: ``key`` already resolved through ``_qname``
    """
    if (
        self.mark
        and not applying_mark
        and qkey
        in (
            self._qname('xmp:MetadataDate'),
            self._qname('pdf:Producer'),
        )
    ):
        # Complain if the user writes self[pdf:Producer] = ... because it
        # will be overwritten on save, unless applying_mark is set, in which
        # case the action was initiated internally
        log.warning(
            f"Update to {key} will be overwritten because metadata was opened "
            "with set_pikepdf_as_editor=True"
        )
    # Compare with ==: a parenthesised single string is not a tuple, so
    # ``qkey in (name)`` would be a substring test rather than a membership
    # test
    if isinstance(val, str) and qkey == self._qname('dc:creator'):
        log.error(f"{key} should be set to a list of strings")

306 

def __setitem__(self, key: str | QName, val: set[str] | list[str] | str) -> None:
    """Assign ``val`` to the XMP metadata record ``key``."""
    # A user-initiated write, so it is not flagged as applying the mark
    self._setitem(key, val, applying_mark=False)

310 

def __delitem__(self, key: str | QName) -> None:
    """Remove ``key`` from the XMP metadata.

    Raises:
        RuntimeError: if called outside of a ``with`` block.
    """
    if self._updating:
        del self._xmp_doc[key]
        return
    raise RuntimeError("Metadata not opened for editing, use with block")

316 

@property
def pdfa_status(self) -> str:
    """Return the PDF/A conformance level claimed by this PDF, or ''.

    A PDF may claim to be PDF/A compliant without this being true. Use an
    independent verifier such as veraPDF to test if a PDF is truly
    conformant.

    Returns:
        The ``part`` and ``conformance`` records joined together, or an
        empty string if either record is missing. Possible valid values
        are: 1A, 1B, 2A, 2B, 2U, 3A, 3B, 3U. Note that the ISO standard
        typically refers to PDF/A-1b for example, using lower case;
        this property returns the value as it appears in the PDF, which
        is uppercase.
    """
    part_key = QName(XMP_NS_PDFA_ID, 'part')
    conformance_key = QName(XMP_NS_PDFA_ID, 'conformance')
    try:
        part = self[part_key]
        conformance = self[conformance_key]
        return part + conformance
    except KeyError:
        # No PDF/A claim is reported unless both records are present
        return ''

339 

@property
def pdfx_status(self) -> str:
    """Return the PDF/X conformance level claimed by this PDF, or ''.

    A PDF may claim to be PDF/X compliant without this being true. Use an
    independent verifier such as veraPDF to test if a PDF is truly
    conformant.

    Returns:
        The value of the ``GTS_PDFXVersion`` record, or an empty string if
        the PDF does not carry that record.
    """
    version_key = QName(XMP_NS_PDFX_ID, 'GTS_PDFXVersion')
    try:
        claimed = self[version_key]
    except KeyError:
        return ''
    return claimed

357 

def __str__(self) -> str:
    """Return the string form of the underlying XMP document."""
    xmp_doc = self._xmp_doc
    return str(xmp_doc)

361 

362 # Backward compatibility methods for internal API access 

def _load(self) -> None:
    """Do nothing; kept for backward compatibility.

    Previously this triggered lazy loading of XMP. Now XMP is loaded
    immediately in __init__.
    """
    return None

370 

def _get_rdf_root(self):
    """Return the rdf:RDF root element of the XMP document.

    Provided for backward compatibility with code that accesses
    internal XMP structure.
    """
    rdf_root = self._xmp_doc._get_rdf_root()
    return rdf_root

378 

def _get_xml_bytes(self, xpacket: bool = True) -> bytes:
    """Return the XMP document serialized to XML bytes.

    Provided for backward compatibility.

    Arguments:
        xpacket: forwarded unchanged to ``XmpDocument.to_bytes``
    """
    serialized = self._xmp_doc.to_bytes(xpacket=xpacket)
    return serialized