Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/xmp.py: 42%

1"""

2Anything related to Extensible Metadata Platform (XMP) metadata.

4https://en.wikipedia.org/wiki/Extensible_Metadata_Platform

5"""

7import datetime

8import decimal

9import re

10from collections.abc import Iterator

11from typing import (

12 Any,

13 Callable,

14 Optional,

15 TypeVar,

16 Union,

17)

18from xml.dom.minidom import Document, parseString

19from xml.dom.minidom import Element as XmlElement

20from xml.parsers.expat import ExpatError

22from ._protocols import XmpInformationProtocol

23from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement

24from .errors import PdfReadError

25from .generic import ContentStream, PdfObject

# Namespace URIs used to look up RDF containers and the metadata properties
# exposed by this module.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# What is the PDFX namespace, you might ask?
# It's documented here: https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf
# This namespace is used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning.
#
# Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value. The keys
# are transformed into valid XML identifiers by substituting an invalid
# identifier character with \u2182 followed by the unicode hex ID of the
# original character. A key like "my car" is therefore "my\u21820020car".
#
# \u2182 is the unicode character ROMAN NUMERAL TEN THOUSAND.
#
# The pdfx namespace should be avoided.
# A custom data schema and sensible XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP under "Extensibility of
# Schemas".
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# PDF/A
PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/"

# Pattern for dates of the form YYYY[-MM[-DD[Thh:mm[:ss[.fraction]]TZD]]].
# Everything after the year is optional, but a time part is only accepted
# together with a time zone designator ("Z", "+hh:mm" or "-hh:mm").
# The pattern is not anchored at the end, so ``iso8601.match`` ignores any
# trailing text it cannot consume.
iso8601 = re.compile(
    r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """,
    re.VERBOSE,
)

76K = TypeVar("K")

79def _identity(value: K) -> K:

80 return value

def _converter_date(value: str) -> datetime.datetime:
    """
    Convert an ISO 8601 date string to a naive ``datetime`` expressed in UTC.

    Date and time parts missing from ``value`` default to January, the first
    day of the month and 00:00:00. A value without a time zone designator is
    taken to be in UTC already.

    Args:
        value: Date text such as ``"2021-04-28T12:23:34.5+02:00"``.

    Returns:
        A ``datetime.datetime`` without ``tzinfo``, shifted to UTC.

    Raises:
        ValueError: If ``value`` does not start with a date accepted by the
            ``iso8601`` pattern, if its seconds field is not a decimal
            number, or if a component is out of range for
            ``datetime.datetime``.

    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")

    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")

    try:
        second = decimal.Decimal(matches.group("second") or "0")
    except decimal.InvalidOperation as exc:
        # Report a malformed seconds field like any other invalid date.
        raise ValueError(f"Invalid date format: {value}") from exc
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    # datetime takes the sub-second part in microseconds.
    microseconds = int((second - whole_seconds) * 1_000_000)

    dt = datetime.datetime(
        year, month, day, hour, minute, int(whole_seconds), microseconds
    )

    tzd = matches.group("tzd") or "Z"
    if tzd != "Z":
        # Read the sign from the text itself: int("+00") and int("-00") are
        # both 0, so the sign of offsets such as "+00:30" would be lost.
        sign = -1 if tzd[0] == "-" else 1
        tzd_hours, tzd_minutes = (int(part) for part in tzd[1:].split(":"))
        offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
        # local time = UTC + offset, hence UTC = local time - offset
        dt -= sign * offset
    return dt

108

109

def _generic_get(
    element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity
) -> Optional[list[Any]]:
    """
    Collect the items of all RDF containers of one kind below ``element``.

    Args:
        element: The XML element to search in.
        self: The ``XmpInformation`` whose ``_get_text`` extracts item text.
        list_type: Local name of the RDF container element to look for; the
            callers in this module pass ``"Bag"`` or ``"Seq"``.
        converter: Applied to the text of every ``rdf:li`` item.

    Returns:
        The converted item values in document order, or ``None`` if
        ``element`` contains no container of the requested kind.

    """
    containers = element.getElementsByTagNameNS(RDF_NAMESPACE, list_type)
    if not containers:
        return None
    return [
        converter(self._get_text(item))
        for container in containers
        for item in container.getElementsByTagNameNS(RDF_NAMESPACE, "li")
    ]

123

124

125def _getter_bag(

126 namespace: str, name: str

127) -> Callable[["XmpInformation"], Optional[list[str]]]:

128 def get(self: "XmpInformation") -> Optional[list[str]]:

129 cached = self.cache.get(namespace, {}).get(name)

130 if cached:

131 return cached

132 retval: list[str] = []

133 for element in self.get_element("", namespace, name):

134 if (bags := _generic_get(element, self, list_type="Bag")) is not None:

135 retval.extend(bags)

136 else:

137 value = self._get_text(element)

138 retval.append(value)

139 ns_cache = self.cache.setdefault(namespace, {})

140 ns_cache[name] = retval

141 return retval

142

143 return get

144

145

def _getter_seq(
    namespace: str, name: str, converter: Callable[[Any], Any] = _identity
) -> Callable[["XmpInformation"], Optional[list[Any]]]:
    """
    Build a getter for a property stored as an ordered array (``rdf:Seq``).

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.
        converter: Applied to every value that is read.

    Returns:
        A function taking an ``XmpInformation`` and returning the list of
        converted values. The result is stored in ``XmpInformation.cache``
        and reused on later calls.

    """

    def get(self: "XmpInformation") -> Optional[list[Any]]:
        ns_cache = self.cache.setdefault(namespace, {})
        # Test for the key rather than for truthiness so that an empty
        # result is served from the cache as well.
        if name in ns_cache:
            return ns_cache[name]
        retval: list[Any] = []
        for element in self.get_element("", namespace, name):
            if (seqs := _generic_get(element, self, list_type="Seq", converter=converter)) is not None:
                retval.extend(seqs)
            elif (bags := _generic_get(element, self, list_type="Bag", converter=converter)) is not None:
                # See issue at https://github.com/py-pdf/pypdf/issues/3324
                # Some applications violate the XMP metadata standard regarding `dc:creator` which should
                # be an "ordered array" and thus a sequence, but use an unordered array (bag) instead.
                # This seems to stem from the fact that the original Dublin Core specification does indeed
                # use bags or direct values, while PDFs are expected to follow the XMP standard and ignore
                # the plain Dublin Core variant. For this reason, add a fallback here to deal with such
                # issues accordingly.
                # The converter is applied here too, so that all three
                # branches yield values of the same type.
                retval.extend(bags)
            else:
                retval.append(converter(self._get_text(element)))
        ns_cache[name] = retval
        return retval

    return get

174

175

176def _getter_langalt(

177 namespace: str, name: str

178) -> Callable[["XmpInformation"], Optional[dict[Any, Any]]]:

179 def get(self: "XmpInformation") -> Optional[dict[Any, Any]]:

180 cached = self.cache.get(namespace, {}).get(name)

181 if cached:

182 return cached

183 retval = {}

184 for element in self.get_element("", namespace, name):

185 alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")

186 if len(alts):

187 for alt in alts:

188 for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):

189 value = self._get_text(item)

190 retval[item.getAttribute("xml:lang")] = value

191 else:

192 retval["x-default"] = self._get_text(element)

193 ns_cache = self.cache.setdefault(namespace, {})

194 ns_cache[name] = retval

195 return retval

196

197 return get

198

199

def _getter_single(
    namespace: str, name: str, converter: Callable[[str], Any] = _identity
) -> Callable[["XmpInformation"], Optional[Any]]:
    """
    Build a getter for a property that holds a single value.

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.
        converter: Applied to the text of the property if it is present.

    Returns:
        A function taking an ``XmpInformation`` and returning the converted
        value of the first matching attribute or element, or ``None`` if
        there is no match. The result is stored in ``XmpInformation.cache``
        and reused on later calls.

    """

    def get(self: "XmpInformation") -> Optional[Any]:
        ns_cache = self.cache.setdefault(namespace, {})
        # Test for the key rather than for truthiness so that ``None`` and
        # other falsy results are served from the cache as well.
        if name in ns_cache:
            return ns_cache[name]
        value = None
        # Only the first node yielded by get_element is considered.
        element = next(iter(self.get_element("", namespace, name)), None)
        if element is not None:
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._get_text(element)
        if value is not None:
            value = converter(value)
        ns_cache[name] = value
        return value

    return get

221

222

class XmpInformation(XmpInformationProtocol, PdfObject):
    """
    An object that represents Extensible Metadata Platform (XMP) metadata.
    Usually accessed by :py:attr:`xmp_metadata()<pypdf.PdfReader.xmp_metadata>`.

    Raises:
        PdfReadError: if XML is invalid or contains no ``rdf:RDF`` element

    """

    def __init__(self, stream: ContentStream) -> None:
        self.stream = stream
        try:
            data = self.stream.get_data()
            doc_root: Document = parseString(data)  # noqa: S318
        except (AttributeError, ExpatError) as e:
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
        rdf_roots = doc_root.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")
        if not rdf_roots:
            # Well-formed XML without an rdf:RDF element cannot be used as
            # XMP; report it with the documented error type instead of
            # letting an IndexError escape.
            raise PdfReadError(
                "XML in XmpInformation was invalid: no rdf:RDF element found"
            )
        self.rdf_root: XmlElement = rdf_roots[0]
        # Values already read, keyed by namespace URI and then property name
        self.cache: dict[Any, Any] = {}

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the underlying metadata stream to ``stream``.

        Deprecated in favour of ``PdfWriter.xmp_metadata``; the
        ``encryption_key`` parameter is deprecated without replacement.
        """
        deprecate_with_replacement(
            "XmpInformation.write_to_stream",
            "PdfWriter.xmp_metadata",
            "6.0.0"
        )
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        self.stream.write_to_stream(stream)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """
        Yield the nodes that hold the property ``name`` of ``namespace``.

        Every ``rdf:Description`` whose ``rdf:about`` attribute equals
        ``about_uri`` is searched. For each of them the matching attribute
        node, if present, is yielded first, followed by the matching
        descendant elements.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                yield from desc.getElementsByTagNameNS(namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """
        Yield all attributes and direct children belonging to ``namespace``.

        Every ``rdf:Description`` whose ``rdf:about`` attribute equals
        ``about_uri`` is searched; its attributes are yielded before its
        child nodes.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                for attr in desc.attributes.values():
                    if attr and attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _get_text(self, element: XmlElement) -> str:
        """Return the concatenated data of the direct text children of ``element``."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor"))
    """
    Contributors to the resource (other than the authors).

    An unsorted array of names.
    """

    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage"))
    """Text describing the extent or scope of the resource."""

    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator"))
    """A sorted array of names of the authors of the resource, listed in order
    of precedence."""

    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
    """
    A sorted array of dates (datetime.datetime instances) of significance to
    the resource.

    The dates and times are in UTC.
    """

    dc_description = property(_getter_langalt(DC_NAMESPACE, "description"))
    """A language-keyed dictionary of textual descriptions of the content of the
    resource."""

    dc_format = property(_getter_single(DC_NAMESPACE, "format"))
    """The mime-type of the resource."""

    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier"))
    """Unique identifier of the resource."""

    dc_language = property(_getter_bag(DC_NAMESPACE, "language"))
    """An unordered array specifying the languages used in the resource."""

    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher"))
    """An unordered array of publisher names."""

    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation"))
    """An unordered array of text descriptions of relationships to other
    documents."""

    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights"))
    """A language-keyed dictionary of textual descriptions of the rights the
    user has to this resource."""

    dc_source = property(_getter_single(DC_NAMESPACE, "source"))
    """Unique identifier of the work from which this resource was derived."""

    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject"))
    """An unordered array of descriptive phrases or keywords that specify the
    topic of the content of the resource."""

    dc_title = property(_getter_langalt(DC_NAMESPACE, "title"))
    """A language-keyed dictionary of the title of the resource."""

    dc_type = property(_getter_bag(DC_NAMESPACE, "type"))
    """An unordered array of textual descriptions of the document type."""

    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords"))
    """An unformatted text string representing document keywords."""

    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion"))
    """The PDF file version, for example 1.0 or 1.3."""

    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer"))
    """The name of the tool that saved the document as a PDF."""

    xmp_create_date = property(
        _getter_single(XMP_NAMESPACE, "CreateDate", _converter_date)
    )
    """
    The date and time the resource was originally created.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_modify_date = property(
        _getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date)
    )
    """
    The date and time the resource was last modified.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_metadata_date = property(
        _getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date)
    )
    """
    The date and time that any metadata for this resource was last changed.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_creator_tool = property(_getter_single(XMP_NAMESPACE, "CreatorTool"))
    """The name of the first known tool used to create the resource."""

    xmpmm_document_id = property(_getter_single(XMPMM_NAMESPACE, "DocumentID"))
    """The common identifier for all versions and renditions of this resource."""

    xmpmm_instance_id = property(_getter_single(XMPMM_NAMESPACE, "InstanceID"))
    """An identifier for a specific incarnation of a document, updated each
    time a file is saved."""

    pdfaid_part = property(_getter_single(PDFAID_NAMESPACE, "part"))
    """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3)."""

    pdfaid_conformance = property(_getter_single(PDFAID_NAMESPACE, "conformance"))
    """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U')."""

    @property
    def custom_properties(self) -> dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        Keys are decoded by replacing every \\u2182 that is followed by four
        hexadecimal digits with the character those digits denote. A \\u2182
        that is not followed by four hexadecimal digits is kept as is.

        Returns:
            A dictionary of key/value items for custom metadata properties.

        """
        if not hasattr(self, "_custom_properties"):
            # see documentation about PDFX_NAMESPACE earlier in file
            escape = re.compile("\u2182([0-9A-Fa-f]{4})")
            properties: dict[Any, Any] = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                # A single left-to-right pass never looks at a decoded
                # character again, so an escaped \u2182 is not decoded twice.
                key = escape.sub(
                    lambda match: chr(int(match.group(1), base=16)),
                    node.localName,
                )
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                properties[key] = value
            # Store the result only once it is complete, so a failure above
            # cannot leave a partially filled dictionary behind.
            self._custom_properties = properties
        return self._custom_properties