Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/xmp.py: 42%

1"""

2Anything related to Extensible Metadata Platform (XMP) metadata.

4https://en.wikipedia.org/wiki/Extensible_Metadata_Platform

5"""

7import datetime

8import decimal

9import re

10from typing import (

11 Any,

12 Callable,

13 Dict,

14 Iterator,

15 List,

16 Optional,

17 TypeVar,

18 Union,

19)

20from xml.dom.minidom import Document, parseString

21from xml.dom.minidom import Element as XmlElement

22from xml.parsers.expat import ExpatError

24from ._protocols import XmpInformationProtocol

25from ._utils import StreamType, deprecate_no_replacement

26from .errors import PdfReadError

27from .generic import ContentStream, PdfObject

# XML namespace URIs of the metadata schemas looked up by this module.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# What is the PDFX namespace, you might ask?
# It's documented here: https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf
# This namespace is used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning.
#
# Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value. The keys
# are transformed into valid XML identifiers by substituting each invalid
# identifier character with \u2182 followed by the four-digit Unicode hex
# code of the original character. A key like "my car" is therefore
# "my\u21820020car".
#
# \u2182 is the Unicode character ROMAN NUMERAL TEN THOUSAND.
#
# The pdfx namespace should be avoided.
# A custom data schema and sensible XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP under "Extensibility of
# Schemas".
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# PDF/A identification schema
PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/"

# Pattern for the date strings handled by this module:
#   YYYY[-MM[-DD[Thh:mm[:ss[.fraction]]TZD]]]
# where TZD is "Z" or a signed "hh:mm" offset. The pattern is applied with
# ``match``, so it only anchors at the start of the string.
iso8601 = re.compile(
    r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    # The fraction separator must be a literal dot; an
                    # unescaped "." would accept any character here.
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
    """,
    re.VERBOSE,
)

# Generic placeholder so that ``_identity`` is typed as returning exactly
# the type it was given.
K = TypeVar("K")


def _identity(value: K) -> K:
    """Return ``value`` untouched (a no-op converter)."""
    return value

def _converter_date(value: str) -> datetime.datetime:
    """
    Convert a date string matched by ``iso8601`` into a naive datetime in UTC.

    Missing month/day default to 1, missing time components default to 0 and
    a missing time zone designator is treated as ``Z``. A numeric offset is
    removed from the parsed local time so that the result is expressed in UTC.

    Args:
        value: The date string, e.g. ``2021-03-04T05:06:07.5+01:00``.

    Returns:
        A ``datetime.datetime`` without ``tzinfo``, normalized to UTC.

    Raises:
        ValueError: If ``value`` does not start with a recognised date.

    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")
    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")

    # Decimal keeps the fractional part exact before it is scaled to the
    # microseconds expected by datetime.
    second = decimal.Decimal(matches.group("second") or "0")
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    microseconds = int((second - whole_seconds) * 1_000_000)

    dt = datetime.datetime(
        year, month, day, hour, minute, int(whole_seconds), microseconds
    )

    tzd = matches.group("tzd") or "Z"
    if tzd != "Z":
        # Read the sign from the designator itself: deriving it from the
        # parsed hour loses it for offsets such as "+00:30", because
        # int("+00") is 0.
        sign = -1 if tzd[0] == "-" else 1
        tzd_hours, tzd_minutes = (int(part) for part in tzd[1:].split(":"))
        offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
        # Local time = UTC + offset, hence UTC = local time - offset.
        dt -= sign * offset
    return dt

110

111

def _generic_get(
    element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity
) -> Optional[List[str]]:
    """
    Read the ``rdf:li`` items of all RDF containers of one kind below ``element``.

    Args:
        element: The XML element that is searched for containers.
        self: The metadata object whose ``_get_text`` extracts the item text.
        list_type: Local name of the RDF container element, e.g. ``Bag`` or ``Seq``.
        converter: Callable applied to the text of every item.

    Returns:
        The converted item values in document order, or ``None`` if
        ``element`` holds no container of the requested kind.

    """
    containers = element.getElementsByTagNameNS(RDF_NAMESPACE, list_type)
    if not len(containers):
        # Lets the caller distinguish "no container" from "empty container".
        return None
    return [
        converter(self._get_text(item))
        for container in containers
        for item in container.getElementsByTagNameNS(RDF_NAMESPACE, "li")
    ]

125

126

def _getter_bag(
    namespace: str, name: str
) -> Callable[["XmpInformation"], Optional[List[str]]]:
    """
    Build the getter for a property stored as an unordered array (``rdf:Bag``).

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.

    Returns:
        A function taking the metadata object and returning the list of
        values. Elements without a bag contribute their own text instead.
        The result is stored in ``self.cache[namespace][name]``.

    """

    def get(self: "XmpInformation") -> Optional[List[str]]:
        # Test for presence of the key, not truthiness of the value:
        # an empty list is a valid cached result.
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return ns_cache[name]
        retval: List[str] = []
        for element in self.get_element("", namespace, name):
            bags = _generic_get(element, self, list_type="Bag")
            if bags is not None:
                retval.extend(bags)
            else:
                # No container at all: treat the element text as one value.
                retval.append(self._get_text(element))
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    return get

146

147

def _getter_seq(
    namespace: str, name: str, converter: Callable[[Any], Any] = _identity
) -> Callable[["XmpInformation"], Optional[List[Any]]]:
    """
    Build the getter for a property stored as an ordered array (``rdf:Seq``).

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.
        converter: Callable applied to the text of every value.

    Returns:
        A function taking the metadata object and returning the list of
        converted values. The result is stored in
        ``self.cache[namespace][name]``.

    """

    def get(self: "XmpInformation") -> Optional[List[Any]]:
        # Test for presence of the key, not truthiness of the value:
        # an empty list is a valid cached result.
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return ns_cache[name]
        retval: List[Any] = []
        for element in self.get_element("", namespace, name):
            if (seqs := _generic_get(element, self, list_type="Seq", converter=converter)) is not None:
                retval.extend(seqs)
            elif (bags := _generic_get(element, self, list_type="Bag", converter=converter)) is not None:
                # See issue at https://github.com/py-pdf/pypdf/issues/3324
                # Some applications violate the XMP metadata standard regarding `dc:creator` which should
                # be an "ordered array" and thus a sequence, but use an unordered array (bag) instead.
                # This seems to stem from the fact that the original Dublin Core specification does indeed
                # use bags or direct values, while PDFs are expected to follow the XMP standard and ignore
                # the plain Dublin Core variant. For this reason, add a fallback here to deal with such
                # issues accordingly.
                # The converter is applied here as well so that every branch
                # yields values of the same type.
                retval.extend(bags)
            else:
                # No container at all: treat the element text as one value.
                retval.append(converter(self._get_text(element)))
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    return get

176

177

def _getter_langalt(
    namespace: str, name: str
) -> Callable[["XmpInformation"], Optional[Dict[Any, Any]]]:
    """
    Build the getter for a language-alternative property (``rdf:Alt``).

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.

    Returns:
        A function taking the metadata object and returning a dictionary
        that maps the ``xml:lang`` attribute of every ``rdf:li`` item to its
        text. An element without ``rdf:Alt`` is stored under ``x-default``.
        The result is stored in ``self.cache[namespace][name]``.

    """

    def get(self: "XmpInformation") -> Optional[Dict[Any, Any]]:
        # Test for presence of the key, not truthiness of the value:
        # an empty dictionary is a valid cached result.
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return ns_cache[name]
        retval: Dict[Any, Any] = {}
        for element in self.get_element("", namespace, name):
            alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
            if len(alts):
                for alt in alts:
                    for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval[item.getAttribute("xml:lang")] = self._get_text(item)
            else:
                retval["x-default"] = self._get_text(element)
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    return get

200

201

def _getter_single(
    namespace: str, name: str, converter: Callable[[str], Any] = _identity
) -> Callable[["XmpInformation"], Optional[Any]]:
    """
    Build the getter for a property holding one simple value.

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.
        converter: Callable applied to the raw text if the property exists.

    Returns:
        A function taking the metadata object and returning the converted
        value of the first matching attribute or element, or ``None`` if
        there is none. The result is stored in
        ``self.cache[namespace][name]``.

    """

    def get(self: "XmpInformation") -> Optional[Any]:
        # Test for presence of the key, not truthiness of the value:
        # ``None`` and empty strings are valid cached results.
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return ns_cache[name]
        value = None
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._get_text(element)
            # Only the first occurrence is used.
            break
        if value is not None:
            value = converter(value)
        self.cache.setdefault(namespace, {})[name] = value
        return value

    return get

223

224

class XmpInformation(XmpInformationProtocol, PdfObject):
    """
    An object that represents Extensible Metadata Platform (XMP) metadata.
    Usually accessed by :py:attr:`xmp_metadata()<pypdf.PdfReader.xmp_metadata>`.

    Raises:
      PdfReadError: if XML is invalid or contains no ``rdf:RDF`` element

    """

    def __init__(self, stream: ContentStream) -> None:
        """
        Parse the XMP packet held by ``stream``.

        Args:
            stream: The stream whose data is the XML document.

        Raises:
            PdfReadError: If the data cannot be parsed as XML or the
                document has no ``rdf:RDF`` element.

        """
        self.stream = stream
        try:
            data = self.stream.get_data()
            doc_root: Document = parseString(data)  # noqa: S318
        except (AttributeError, ExpatError) as e:
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
        rdf_roots = doc_root.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")
        if not rdf_roots:
            # Well-formed XML without RDF data would otherwise surface as a
            # bare IndexError from the subscript below.
            raise PdfReadError(
                "XML in XmpInformation was invalid: no rdf:RDF element found"
            )
        self.rdf_root: XmlElement = rdf_roots[0]
        # Parsed property values, keyed by namespace URI and then by name.
        self.cache: Dict[Any, Any] = {}

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the underlying metadata stream to ``stream``.

        Args:
            stream: The output stream.
            encryption_key: Deprecated; passing a value only triggers the
                deprecation handling.

        """
        if encryption_key is not None:  # deprecated
            deprecate_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        self.stream.write_to_stream(stream)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """
        Yield every occurrence of one property.

        For each ``rdf:Description`` whose ``rdf:about`` equals ``about_uri``
        the matching attribute node (if any) is yielded first, followed by
        all descendant elements with the given namespace and name.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                yield from desc.getElementsByTagNameNS(namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """
        Yield all attributes and direct children in ``namespace``.

        Only ``rdf:Description`` elements whose ``rdf:about`` equals
        ``about_uri`` are considered; for each of them the attributes are
        yielded before the child nodes.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr and attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _get_text(self, element: XmlElement) -> str:
        """Return the concatenated data of the direct text children of ``element``."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor"))
    """
    Contributors to the resource (other than the authors).

    An unsorted array of names.
    """

    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage"))
    """Text describing the extent or scope of the resource."""

    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator"))
    """A sorted array of names of the authors of the resource, listed in order
    of precedence."""

    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
    """
    A sorted array of dates (datetime.datetime instances) of significance to
    the resource.

    The dates and times are in UTC.
    """

    dc_description = property(_getter_langalt(DC_NAMESPACE, "description"))
    """A language-keyed dictionary of textual descriptions of the content of the
    resource."""

    dc_format = property(_getter_single(DC_NAMESPACE, "format"))
    """The mime-type of the resource."""

    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier"))
    """Unique identifier of the resource."""

    dc_language = property(_getter_bag(DC_NAMESPACE, "language"))
    """An unordered array specifying the languages used in the resource."""

    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher"))
    """An unordered array of publisher names."""

    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation"))
    """An unordered array of text descriptions of relationships to other
    documents."""

    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights"))
    """A language-keyed dictionary of textual descriptions of the rights the
    user has to this resource."""

    dc_source = property(_getter_single(DC_NAMESPACE, "source"))
    """Unique identifier of the work from which this resource was derived."""

    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject"))
    """An unordered array of descriptive phrases or keywords that specify the
    topic of the content of the resource."""

    dc_title = property(_getter_langalt(DC_NAMESPACE, "title"))
    """A language-keyed dictionary of the title of the resource."""

    dc_type = property(_getter_bag(DC_NAMESPACE, "type"))
    """An unordered array of textual descriptions of the document type."""

    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords"))
    """An unformatted text string representing document keywords."""

    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion"))
    """The PDF file version, for example 1.0 or 1.3."""

    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer"))
    """The name of the tool that saved the document as a PDF."""

    xmp_create_date = property(
        _getter_single(XMP_NAMESPACE, "CreateDate", _converter_date)
    )
    """
    The date and time the resource was originally created.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_modify_date = property(
        _getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date)
    )
    """
    The date and time the resource was last modified.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_metadata_date = property(
        _getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date)
    )
    """
    The date and time that any metadata for this resource was last changed.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_creator_tool = property(_getter_single(XMP_NAMESPACE, "CreatorTool"))
    """The name of the first known tool used to create the resource."""

    xmpmm_document_id = property(_getter_single(XMPMM_NAMESPACE, "DocumentID"))
    """The common identifier for all versions and renditions of this resource."""

    xmpmm_instance_id = property(_getter_single(XMPMM_NAMESPACE, "InstanceID"))
    """An identifier for a specific incarnation of a document, updated each
    time a file is saved."""

    pdfaid_part = property(_getter_single(PDFAID_NAMESPACE, "part"))
    """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3)."""

    pdfaid_conformance = property(_getter_single(PDFAID_NAMESPACE, "conformance"))
    """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U')."""

    @property
    def custom_properties(self) -> Dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        Keys are decoded by replacing every ``\\u2182`` followed by four
        hexadecimal digits with the character of that code point. Escapes
        that are not followed by four hexadecimal digits are kept verbatim.
        The dictionary is built on first access and reused afterwards.

        Returns:
            A dictionary of key/value items for custom metadata properties.

        """
        if not hasattr(self, "_custom_properties"):
            properties: Dict[Any, Any] = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                # A single left-to-right pass decodes each escape exactly
                # once, so a decoded "\u2182" is never treated as the start
                # of another escape.
                key = re.sub(
                    "\u2182([0-9A-Fa-f]{4})",
                    lambda match: chr(int(match.group(1), 16)),
                    node.localName,
                )
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                properties[key] = value
            self._custom_properties = properties
        return self._custom_properties