Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/metadata/

1# SPDX-FileCopyrightText: 2022 James R. Barlow

2# SPDX-License-Identifier: MPL-2.0

4"""PdfMetadata - facade for XMP and DocumentInfo metadata."""

6from __future__ import annotations

8import logging

9from collections.abc import Iterator, MutableMapping

10from datetime import datetime, timezone

11from typing import TYPE_CHECKING, Any

12from warnings import warn

14from pikepdf._version import __version__ as pikepdf_version

15from pikepdf.models.metadata._constants import (

16 XMP_NS_PDF,

17 XMP_NS_PDFA_ID,

18 XMP_NS_PDFX_ID,

19 XMP_NS_XMP,

20 clean,

21)

22from pikepdf.models.metadata._converters import DOCINFO_MAPPING, DocinfoMapping

23from pikepdf.models.metadata._docinfo import DocinfoStore

24from pikepdf.models.metadata._xmp import XmpDocument

25from pikepdf.objects import Name, Stream

27if TYPE_CHECKING: # pragma: no cover

28 from lxml.etree import QName

30 from pikepdf import Pdf

33log = logging.getLogger(__name__)

36class PdfMetadata(MutableMapping):

37 """Read and edit the metadata associated with a PDF.

39 The PDF specification contains two types of metadata, the newer XMP

40 (Extensible Metadata Platform, XML-based) and older DocumentInformation

41 dictionary. The PDF 2.0 specification removes the DocumentInformation

42 dictionary.

44 This primarily works with XMP metadata, but includes methods to generate

45 XMP from DocumentInformation and will also coordinate updates to

46 DocumentInformation so that the two are kept consistent.

48 XMP metadata fields may be accessed using the full XML namespace URI or

49 the short name. For example ``metadata['dc:description']``

50 and ``metadata['{http://purl.org/dc/elements/1.1/}description']``

51 both refer to the same field. Several common XML namespaces are registered

52 automatically.

54 See the XMP specification for details of allowable fields.

56 To update metadata, use a with block.

58 Example:

59 >>> with pdf.open_metadata() as records:

60 ... records['dc:title'] = 'New Title'

62 See Also:

63 :meth:`pikepdf.Pdf.open_metadata`

64 """

66 # Keep DOCINFO_MAPPING at class level for backward compatibility

67 DOCINFO_MAPPING: list[DocinfoMapping] = DOCINFO_MAPPING

69 # Delegate namespace dicts to XmpDocument for backward compatibility

70 NS: dict[str, str] = XmpDocument.NS

71 REVERSE_NS: dict[str, str] = XmpDocument.REVERSE_NS

def __init__(
    self,
    pdf: Pdf,
    pikepdf_mark: bool = True,
    sync_docinfo: bool = True,
    overwrite_invalid_xml: bool = True,
):
    """Construct PdfMetadata. Use Pdf.open_metadata() instead.

    Arguments:
        pdf: the PDF whose metadata is read and edited.
        pikepdf_mark: stored as ``self.mark``; when true, applying changes
            also writes ``xmp:MetadataDate`` and ``pdf:Producer``.
        sync_docinfo: when true, applying changes also updates the
            DocumentInfo dictionary from the XMP.
        overwrite_invalid_xml: forwarded to ``XmpDocument`` when the PDF's
            existing XMP is loaded.
    """
    # Plain configuration first; editing is only enabled by __enter__.
    self._updating = False
    self.mark = pikepdf_mark
    self.sync_docinfo = sync_docinfo
    self._overwrite_invalid_xml = overwrite_invalid_xml
    self._pdf = pdf

    # _load_xmp() reads self._pdf and self._overwrite_invalid_xml, so it has
    # to run after the assignments above.
    self._xmp_doc = self._load_xmp()

    # Store used when writing DocumentInfo entries.
    self._docinfo = DocinfoStore(pdf)

def _load_xmp(self) -> XmpDocument:
    """Build the XmpDocument for this PDF.

    Reads the bytes of ``Root.Metadata``. If that lookup or read raises
    AttributeError (presumably because the PDF has no metadata stream),
    an empty byte string is used instead.

    Returns:
        An XmpDocument constructed from the bytes that were read.
    """
    raw = b''
    try:
        raw = self._pdf.Root.Metadata.read_bytes()
    except AttributeError:
        # Keep the empty default assigned above.
        pass
    return XmpDocument(raw, overwrite_invalid_xml=self._overwrite_invalid_xml)

103

def load_from_docinfo(
    self, docinfo, delete_missing: bool = False, raise_failure: bool = False
) -> None:
    """Populate the XMP metadata object with DocumentInfo.

    A few entries in the deprecated DocumentInfo dictionary are considered
    approximately equivalent to certain XMP records. This method copies
    those entries into the XMP metadata, using ``DOCINFO_MAPPING`` to pair
    each DocumentInfo name with its XMP key and optional converter.

    Arguments:
        docinfo: a DocumentInfo, e.g. pdf.docinfo
        delete_missing: if an entry is not in DocumentInfo, delete the
            equivalent from XMP
        raise_failure: if True, raise ValueError on any failure to convert
            docinfo; otherwise warn and continue

    Raises:
        ValueError: only when ``raise_failure`` is true, if a field could
            not be copied or has no XMP equivalent.
    """
    from lxml.etree import QName

    def report(message, cause=None):
        # Escalate to an exception only when the caller asked for it.
        if raise_failure:
            raise ValueError(message) from cause
        warn(message)

    known_names = set()
    for uri, shortkey, docinfo_name, converter in self.DOCINFO_MAPPING:
        qname = QName(uri, shortkey)
        # docinfo might be a dict or pikepdf.Dictionary, so lookup keys
        # by str(Name)
        lookup_key = str(docinfo_name)
        known_names.add(lookup_key)
        raw = docinfo.get(lookup_key)
        if raw is None:
            if delete_missing and qname in self:
                del self[qname]
            continue
        try:
            text = str(raw)
            if converter:
                text = converter.xmp_from_docinfo(text)
            # Values that are empty after conversion are skipped.
            if text:
                self._setitem(qname, text, True)
        except (ValueError, AttributeError, NotImplementedError) as e:
            report(
                f"The metadata field {docinfo_name} could not be copied to XMP", e
            )

    # Anything left in docinfo that the mapping does not know about cannot
    # be represented in XMP here, so report each such entry.
    unmapped_names = {str(k) for k in docinfo.keys()} - known_names
    for extra in unmapped_names:
        report(
            f"The metadata field {extra} with value '{docinfo.get(extra)!r}' "
            "has no XMP equivalent, so it was discarded",
        )

156

157 def __enter__(self):

158 """Open metadata for editing."""

159 self._updating = True

160 return self

161

162 def __exit__(self, exc_type, exc_val, exc_tb):

163 """Close metadata and apply changes."""

164 try:

165 if exc_type is not None:

166 return

167 self._apply_changes()

168 finally:

169 self._updating = False

170

def _update_docinfo(self):
    """Update the PDF's DocumentInfo dictionary to match XMP metadata.

    For every entry of ``DOCINFO_MAPPING`` the XMP value is looked up,
    passed through the entry's converter if it has one, and written to
    DocumentInfo. When there is no XMP value, or the converter yields
    None or raises ValueError, the DocumentInfo entry is removed instead.

    The standard mapping is described here:
    https://www.pdfa.org/pdfa-metadata-xmp-rdf-dublin-core/

    Raises:
        ValueError: if a converter fails with anything other than
            ValueError; the original exception is chained as the cause.
    """
    from lxml.etree import QName

    # Touch object to ensure it exists
    self._pdf.docinfo  # pylint: disable=pointless-statement
    for uri, element, docinfo_name, converter in self.DOCINFO_MAPPING:
        qname = QName(uri, element)
        try:
            value = self[qname]
        except KeyError:
            # No XMP record: treated below like a value converted to None.
            value = None
        else:
            if converter:
                try:
                    value = converter.docinfo_from_xmp(value)
                except ValueError:
                    warn(
                        f"The DocumentInfo field {docinfo_name} could not be "
                        "updated from XMP"
                    )
                    value = None
                except Exception as e:
                    raise ValueError(
                        "An error occurred while updating DocumentInfo field "
                        f"{docinfo_name} from XMP {qname} with value {value}"
                    ) from e

        if value is not None:
            self._docinfo.set(docinfo_name, clean(value))
        elif docinfo_name in self._pdf.docinfo:
            # Nothing usable in XMP, so drop the stale DocumentInfo entry.
            del self._pdf.docinfo[docinfo_name]

208

def _apply_changes(self):
    """Serialize our changes back to the PDF in memory.

    Depending how we are initialized, leave our metadata mark and producer:
    when ``self.mark`` is set, ``xmp:MetadataDate`` is set to the current
    UTC time and ``pdf:Producer`` to the pikepdf version. The XMP is then
    stored as a new ``Root.Metadata`` stream, and DocumentInfo is updated
    as well when ``self.sync_docinfo`` is set.
    """
    from lxml.etree import QName

    if self.mark:
        # We were asked to mark the file as being edited by pikepdf
        timestamp = datetime.now(timezone.utc).isoformat()
        self._setitem(
            QName(XMP_NS_XMP, 'MetadataDate'), timestamp, applying_mark=True
        )
        producer = 'pikepdf ' + pikepdf_version
        self._setitem(QName(XMP_NS_PDF, 'Producer'), producer, applying_mark=True)

    xml = self._xmp_doc.to_bytes()
    self._pdf.Root.Metadata = Stream(self._pdf, xml)
    # Label the newly attached stream as XML metadata.
    metadata_stream = self._pdf.Root.Metadata
    metadata_stream[Name.Type] = Name.Metadata
    metadata_stream[Name.Subtype] = Name.XML

    if self.sync_docinfo:
        self._update_docinfo()

234

@classmethod
def _qname(cls, name: QName | str) -> str:
    """Convert name to an XML QName by delegating to ``XmpDocument.qname``.

    e.g. pdf:Producer -> {http://ns.adobe.com/pdf/1.3/}Producer
    """
    qualified = XmpDocument.qname(name)
    return qualified

242

@classmethod
def register_xml_namespace(cls, uri: str, prefix: str) -> None:
    """Register a new XML/XMP namespace.

    The registration is forwarded to ``XmpDocument``.

    Arguments:
        uri: The long form of the namespace.
        prefix: The alias to use when interpreting XMP.
    """
    XmpDocument.register_xml_namespace(uri, prefix)

252

253 def _prefix_from_uri(self, uriname: str) -> str:

254 """Given a fully qualified XML name, find a prefix.

255

256 e.g. {http://ns.adobe.com/pdf/1.3/}Producer -> pdf:Producer

257 """

258 return self._xmp_doc.prefix_from_uri(uriname)

259

def __contains__(self, key: object) -> bool:  # type: ignore[override]
    """Test if XMP key is in metadata.

    Raises:
        TypeError: if ``key`` is neither a str nor an lxml QName.
    """
    from lxml.etree import QName

    if isinstance(key, (str, QName)):
        return key in self._xmp_doc
    raise TypeError(f"{key!r} must be str or QName")

267

268 def __getitem__(self, key: str | QName) -> Any:

269 """Retrieve XMP metadata for key."""

270 return self._xmp_doc[key]

271

272 def __iter__(self) -> Iterator[str]:

273 """Iterate through XMP metadata attributes and nodes."""

274 return iter(self._xmp_doc)

275

276 def __len__(self) -> int:

277 """Return number of items in metadata."""

278 return len(self._xmp_doc)

279

280 def _setitem(

281 self,

282 key: str | QName,

283 val: set[str] | list[str] | str,

284 applying_mark: bool = False,

285 ) -> None:

286 if not self._updating:

287 raise RuntimeError("Metadata not opened for editing, use with block")

288

289 qkey = self._qname(key)

290 self._setitem_check_args(key, val, applying_mark, qkey)

291 self._xmp_doc.set_value(key, val)

292

293 def _setitem_check_args(

294 self, key: str | QName, val: Any, applying_mark: bool, qkey: str

295 ) -> None:

296 if (

297 self.mark

298 and not applying_mark

299 and qkey

300 in (

301 self._qname('xmp:MetadataDate'),

302 self._qname('pdf:Producer'),

303 )

304 ):

305 # Complain if user writes self[pdf:Producer] = ... and because it will

306 # be overwritten on save, unless self._updating_mark, in which case

307 # the action was initiated internally

308 log.warning(

309 f"Update to {key} will be overwritten because metadata was opened "

310 "with set_pikepdf_as_editor=True"

311 )

312 if isinstance(val, str) and qkey in (self._qname('dc:creator')):

313 log.error(f"{key} should be set to a list of strings")

314

315 def __setitem__(self, key: str | QName, val: set[str] | list[str] | str) -> None:

316 """Set XMP metadata key to value."""

317 return self._setitem(key, val, False)

318

319 def __delitem__(self, key: str | QName) -> None:

320 """Delete item from XMP metadata."""

321 if not self._updating:

322 raise RuntimeError("Metadata not opened for editing, use with block")

323 del self._xmp_doc[key]

324

@property
def pdfa_status(self) -> str:
    """Return the PDF/A conformance level claimed by this PDF, or ''.

    A PDF may claim to be PDF/A compliant without this being true. Use an
    independent verifier such as veraPDF to test if a PDF is truly
    conformant.

    Returns:
        The conformance level of the PDF/A, or an empty string if the
        PDF does not claim PDF/A conformance. Possible valid values
        are: 1A, 1B, 2A, 2B, 2U, 3A, 3B, 3U. Note that ISO standard
        typically refers to PDF/A-1b for example, using lower case;
        this function returns the value as it appears in the PDF, which
        is uppercase.
    """
    from lxml.etree import QName

    part_key = QName(XMP_NS_PDFA_ID, 'part')
    conformance_key = QName(XMP_NS_PDFA_ID, 'conformance')
    try:
        part = self[part_key]
        conformance = self[conformance_key]
        return part + conformance
    except KeyError:
        # Either record missing means no PDF/A claim.
        return ''

349

@property
def pdfx_status(self) -> str:
    """Return the PDF/X conformance level claimed by this PDF, or ''.

    A PDF may claim to be PDF/X compliant without this being true. Use an
    independent verifier such as veraPDF to test if a PDF is truly
    conformant.

    Returns:
        The conformance level of the PDF/X, or an empty string if the
        PDF does not claim PDF/X conformance.
    """
    from lxml.etree import QName

    version_key = QName(XMP_NS_PDFX_ID, 'GTS_PDFXVersion')
    try:
        claimed = self[version_key]
    except KeyError:
        claimed = ''
    return claimed

369

370 def __str__(self) -> str:

371 """Convert XMP metadata to XML string."""

372 return str(self._xmp_doc)

373

374 # Backward compatibility methods for internal API access

375 def _load(self) -> None:

376 """No-op for backward compatibility.

377

378 Previously this triggered lazy loading of XMP. Now XMP is loaded

379 immediately in __init__.

380 """

381 pass

382

383 def _get_rdf_root(self):

384 """Get the rdf:RDF root element.

385

386 Provided for backward compatibility with code that accesses

387 internal XMP structure.

388 """

389 return self._xmp_doc._get_rdf_root()

390

391 def _get_xml_bytes(self, xpacket: bool = True) -> bytes:

392 """Serialize XMP to XML bytes.

393

394 Provided for backward compatibility.

395 """

396 return self._xmp_doc.to_bytes(xpacket=xpacket)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/metadata/_core.py: 28%

164 statements