Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/metadata/

1# SPDX-FileCopyrightText: 2022 James R. Barlow

2# SPDX-License-Identifier: MPL-2.0

4"""PdfMetadata - facade for XMP and DocumentInfo metadata."""

6from __future__ import annotations

8import logging

9from collections.abc import Iterator, MutableMapping

10from datetime import datetime, timezone

11from typing import TYPE_CHECKING, Any

12from warnings import warn

14from lxml.etree import QName

16from pikepdf._version import __version__ as pikepdf_version

17from pikepdf.models.metadata._constants import (

18 XMP_NS_PDF,

19 XMP_NS_PDFA_ID,

20 XMP_NS_PDFX_ID,

21 XMP_NS_XMP,

22 clean,

23)

24from pikepdf.models.metadata._converters import DOCINFO_MAPPING, DocinfoMapping

25from pikepdf.models.metadata._docinfo import DocinfoStore

26from pikepdf.models.metadata._xmp import XmpDocument

27from pikepdf.objects import Name, Stream

29if TYPE_CHECKING: # pragma: no cover

30 from pikepdf import Pdf

32log = logging.getLogger(__name__)

class PdfMetadata(MutableMapping):
    """Read and edit the metadata associated with a PDF.

    The PDF specification contains two types of metadata, the newer XMP
    (Extensible Metadata Platform, XML-based) and older DocumentInformation
    dictionary. The PDF 2.0 specification removes the DocumentInformation
    dictionary.

    This primarily works with XMP metadata, but includes methods to generate
    XMP from DocumentInformation and will also coordinate updates to
    DocumentInformation so that the two are kept consistent.

    XMP metadata fields may be accessed using the full XML namespace URI or
    the short name. For example ``metadata['dc:description']``
    and ``metadata['{http://purl.org/dc/elements/1.1/}description']``
    both refer to the same field. Several common XML namespaces are registered
    automatically.

    See the XMP specification for details of allowable fields.

    To update metadata, use a with block. Writes and deletes attempted outside
    a with block raise :class:`RuntimeError`.

    Example:
        >>> with pdf.open_metadata() as records:
        ...     records['dc:title'] = 'New Title'

    See Also:
        :meth:`pikepdf.Pdf.open_metadata`
    """

    # Keep DOCINFO_MAPPING at class level for backward compatibility
    DOCINFO_MAPPING: list[DocinfoMapping] = DOCINFO_MAPPING

    # Delegate namespace dicts to XmpDocument for backward compatibility
    NS: dict[str, str] = XmpDocument.NS
    REVERSE_NS: dict[str, str] = XmpDocument.REVERSE_NS

    def __init__(
        self,
        pdf: Pdf,
        pikepdf_mark: bool = True,
        sync_docinfo: bool = True,
        overwrite_invalid_xml: bool = True,
    ):
        """Construct PdfMetadata. Use Pdf.open_metadata() instead.

        Arguments:
            pdf: the PDF whose metadata is read and, on exit from the with
                block, rewritten.
            pikepdf_mark: if True, ``xmp:MetadataDate`` and ``pdf:Producer``
                are overwritten when changes are applied.
            sync_docinfo: if True, the DocumentInfo dictionary is updated
                from XMP when changes are applied.
            overwrite_invalid_xml: forwarded unchanged to ``XmpDocument``.
        """
        self._pdf = pdf
        self.mark = pikepdf_mark
        self.sync_docinfo = sync_docinfo
        # Only True between __enter__ and __exit__; gates every mutation.
        self._updating = False
        self._overwrite_invalid_xml = overwrite_invalid_xml

        # Initialize XmpDocument with PDF's XMP data
        self._xmp_doc = self._load_xmp()

        # Initialize DocinfoStore
        self._docinfo = DocinfoStore(pdf)

    def _load_xmp(self) -> XmpDocument:
        """Load XMP from PDF or create empty XmpDocument."""
        try:
            data = self._pdf.Root.Metadata.read_bytes()
        except AttributeError:
            # NOTE(review): presumably raised when the document catalog has
            # no /Metadata stream; fall back to an empty XMP packet.
            data = b''

        return XmpDocument(
            data, overwrite_invalid_xml=self._overwrite_invalid_xml
        )

    def load_from_docinfo(
        self, docinfo, delete_missing: bool = False, raise_failure: bool = False
    ) -> None:
        """Populate the XMP metadata object with DocumentInfo.

        Arguments:
            docinfo: a DocumentInfo, e.g pdf.docinfo
            delete_missing: if the entry is not in DocumentInfo, delete the
                equivalent from XMP
            raise_failure: if True, raise any failure to convert docinfo;
                otherwise warn and continue

        Raises:
            ValueError: if ``raise_failure`` is True and an entry cannot be
                converted, or an entry has no XMP equivalent.
            RuntimeError: if called outside a with block.

        A few entries in the deprecated DocumentInfo dictionary are considered
        approximately equivalent to certain XMP records. This method copies
        those entries into the XMP metadata.
        """

        def warn_or_raise(msg, e=None):
            if raise_failure:
                raise ValueError(msg) from e
            warn(msg)

        for uri, shortkey, docinfo_name, converter in self.DOCINFO_MAPPING:
            qname = QName(uri, shortkey)
            # docinfo might be a dict or pikepdf.Dictionary, so lookup keys
            # by str(Name)
            val = docinfo.get(str(docinfo_name))
            if val is None:
                if delete_missing and qname in self:
                    del self[qname]
                continue
            try:
                val = str(val)
                if converter:
                    val = converter.xmp_from_docinfo(val)
                if not val:
                    # Empty values are skipped rather than written to XMP.
                    continue
                self._setitem(qname, val, True)
            except (ValueError, AttributeError, NotImplementedError) as e:
                warn_or_raise(
                    f"The metadata field {docinfo_name} could not be copied to XMP", e
                )
        valid_docinfo_names = {
            str(docinfo_name) for _, _, docinfo_name, _ in self.DOCINFO_MAPPING
        }
        extra_docinfo_names = {str(k) for k in docinfo.keys()} - valid_docinfo_names
        # Sorted so that the order of warnings (and which key raises first
        # when raise_failure is set) does not depend on set iteration order.
        for extra in sorted(extra_docinfo_names):
            warn_or_raise(
                f"The metadata field {extra} with value '{repr(docinfo.get(extra))}' "
                "has no XMP equivalent, so it was discarded",
            )

    def __enter__(self):
        """Open metadata for editing."""
        self._updating = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close metadata and apply changes.

        Changes are written back only when the with block finished without an
        exception; an in-flight exception is never suppressed.
        """
        try:
            if exc_type is not None:
                return
            self._apply_changes()
        finally:
            self._updating = False

    def _update_docinfo(self):
        """Update the PDF's DocumentInfo dictionary to match XMP metadata.

        The standard mapping is described here:
            https://www.pdfa.org/pdfa-metadata-xmp-rdf-dublin-core/

        Raises:
            ValueError: if a converter fails with anything other than
                ValueError (a plain ValueError only produces a warning and
                removes the DocumentInfo entry).
        """
        # Touch object to ensure it exists
        self._pdf.docinfo  # pylint: disable=pointless-statement
        for uri, element, docinfo_name, converter in self.DOCINFO_MAPPING:
            qname = QName(uri, element)
            try:
                value = self[qname]
            except KeyError:
                # No XMP value: drop the stale DocumentInfo entry, if any.
                if docinfo_name in self._pdf.docinfo:
                    del self._pdf.docinfo[docinfo_name]
                continue
            if converter:
                try:
                    value = converter.docinfo_from_xmp(value)
                except ValueError:
                    warn(
                        f"The DocumentInfo field {docinfo_name} could not be "
                        "updated from XMP"
                    )
                    value = None
                except Exception as e:
                    raise ValueError(
                        "An error occurred while updating DocumentInfo field "
                        f"{docinfo_name} from XMP {qname} with value {value}"
                    ) from e
            if value is None:
                if docinfo_name in self._pdf.docinfo:
                    del self._pdf.docinfo[docinfo_name]
                continue
            self._docinfo.set(docinfo_name, clean(value))

    def _apply_changes(self):
        """Serialize our changes back to the PDF in memory.

        Depending how we are initialized, leave our metadata mark and producer.
        """
        if self.mark:
            # We were asked to mark the file as being edited by pikepdf
            self._setitem(
                QName(XMP_NS_XMP, 'MetadataDate'),
                datetime.now(timezone.utc).isoformat(),
                applying_mark=True,
            )
            self._setitem(
                QName(XMP_NS_PDF, 'Producer'),
                'pikepdf ' + pikepdf_version,
                applying_mark=True,
            )
        xml = self._xmp_doc.to_bytes()
        self._pdf.Root.Metadata = Stream(self._pdf, xml)
        self._pdf.Root.Metadata[Name.Type] = Name.Metadata
        self._pdf.Root.Metadata[Name.Subtype] = Name.XML
        if self.sync_docinfo:
            self._update_docinfo()

    @classmethod
    def _qname(cls, name: QName | str) -> str:
        """Convert name to an XML QName.

        e.g. pdf:Producer -> {http://ns.adobe.com/pdf/1.3/}Producer
        """
        return XmpDocument.qname(name)

    @classmethod
    def register_xml_namespace(cls, uri: str, prefix: str) -> None:
        """Register a new XML/XMP namespace.

        Arguments:
            uri: The long form of the namespace.
            prefix: The alias to use when interpreting XMP.
        """
        XmpDocument.register_xml_namespace(uri, prefix)

    def _prefix_from_uri(self, uriname: str) -> str:
        """Given a fully qualified XML name, find a prefix.

        e.g. {http://ns.adobe.com/pdf/1.3/}Producer -> pdf:Producer
        """
        return self._xmp_doc.prefix_from_uri(uriname)

    def __contains__(self, key: object) -> bool:  # type: ignore[override]
        """Test if XMP key is in metadata.

        Raises:
            TypeError: if key is neither a str nor a QName.
        """
        if not isinstance(key, (str, QName)):
            raise TypeError(f"{key!r} must be str or QName")
        return key in self._xmp_doc

    def __getitem__(self, key: str | QName) -> Any:
        """Retrieve XMP metadata for key."""
        return self._xmp_doc[key]

    def __iter__(self) -> Iterator[str]:
        """Iterate through XMP metadata attributes and nodes."""
        return iter(self._xmp_doc)

    def __len__(self) -> int:
        """Return number of items in metadata."""
        return len(self._xmp_doc)

    def _setitem(
        self,
        key: str | QName,
        val: set[str] | list[str] | str,
        applying_mark: bool = False,
    ) -> None:
        """Validate and store an XMP value.

        Arguments:
            key: short or fully qualified name of the field.
            val: the value to store.
            applying_mark: True when the write is made internally, which
                silences the "will be overwritten" warning.

        Raises:
            RuntimeError: if called outside a with block.
        """
        if not self._updating:
            raise RuntimeError("Metadata not opened for editing, use with block")

        qkey = self._qname(key)
        self._setitem_check_args(key, val, applying_mark, qkey)
        self._xmp_doc.set_value(key, val)

    def _setitem_check_args(
        self, key: str | QName, val: Any, applying_mark: bool, qkey: str
    ) -> None:
        """Log diagnostics about questionable writes; never blocks the write.

        Arguments:
            key: the key as supplied by the caller (used in messages).
            val: the value about to be stored.
            applying_mark: True when the write was initiated internally.
            qkey: ``key`` already converted with :meth:`_qname`.
        """
        if (
            self.mark
            and not applying_mark
            and qkey
            in (
                self._qname('xmp:MetadataDate'),
                self._qname('pdf:Producer'),
            )
        ):
            # Complain if user writes self[pdf:Producer] = ... because it will
            # be overwritten on save, unless applying_mark is set, in which
            # case the action was initiated internally
            log.warning(
                f"Update to {key} will be overwritten because metadata was opened "
                "with set_pikepdf_as_editor=True"
            )
        # Compare for equality: a parenthesized string is not a tuple, so
        # ``qkey in ('...')`` would be a substring test.
        if isinstance(val, str) and qkey == self._qname('dc:creator'):
            log.error(f"{key} should be set to a list of strings")

    def __setitem__(self, key: str | QName, val: set[str] | list[str] | str) -> None:
        """Set XMP metadata key to value."""
        return self._setitem(key, val, False)

    def __delitem__(self, key: str | QName) -> None:
        """Delete item from XMP metadata.

        Raises:
            RuntimeError: if called outside a with block.
        """
        if not self._updating:
            raise RuntimeError("Metadata not opened for editing, use with block")
        del self._xmp_doc[key]

    @property
    def pdfa_status(self) -> str:
        """Return the PDF/A conformance level claimed by this PDF, or ''.

        A PDF may claim to be PDF/A compliant without this being true. Use an
        independent verifier such as veraPDF to test if a PDF is truly
        conformant.

        Returns:
            The conformance level of the PDF/A, or an empty string if the
            PDF does not claim PDF/A conformance. Possible valid values
            are: 1A, 1B, 2A, 2B, 2U, 3A, 3B, 3U. Note that ISO standard
            typically refers to PDF/A-1b for example, using lower case;
            this function returns the value as it appears in the PDF, which
            is uppercase.
        """
        key_part = QName(XMP_NS_PDFA_ID, 'part')
        key_conformance = QName(XMP_NS_PDFA_ID, 'conformance')
        try:
            return self[key_part] + self[key_conformance]
        except KeyError:
            # Either field missing means no PDF/A claim is reported.
            return ''

    @property
    def pdfx_status(self) -> str:
        """Return the PDF/X conformance level claimed by this PDF, or ''.

        A PDF may claim to be PDF/X compliant without this being true. Use an
        independent verifier such as veraPDF to test if a PDF is truly
        conformant.

        Returns:
            The conformance level of the PDF/X, or an empty string if the
            PDF does not claim PDF/X conformance.
        """
        pdfx_version = QName(XMP_NS_PDFX_ID, 'GTS_PDFXVersion')
        try:
            return self[pdfx_version]
        except KeyError:
            return ''

    def __str__(self) -> str:
        """Convert XMP metadata to XML string."""
        return str(self._xmp_doc)

    # Backward compatibility methods for internal API access
    def _load(self) -> None:
        """No-op for backward compatibility.

        Previously this triggered lazy loading of XMP. Now XMP is loaded
        immediately in __init__.
        """

    def _get_rdf_root(self):
        """Get the rdf:RDF root element.

        Provided for backward compatibility with code that accesses
        internal XMP structure.
        """
        return self._xmp_doc._get_rdf_root()

    def _get_xml_bytes(self, xpacket: bool = True) -> bytes:
        """Serialize XMP to XML bytes.

        Provided for backward compatibility.
        """
        return self._xmp_doc.to_bytes(xpacket=xpacket)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/metadata/_core.py: 30%

159 statements