1# SPDX-FileCopyrightText: 2022 James R. Barlow
2# SPDX-License-Identifier: MPL-2.0
3
4"""XMP metadata constants, templates, and utilities."""
5
6from __future__ import annotations
7
8import re
9from collections.abc import Callable, Iterable
10from typing import NamedTuple
11
12from lxml import etree
13from lxml.etree import QName
14
# XMP Namespace URIs
# Each constant holds the namespace URI string of one metadata vocabulary.
# All of them except XMP_NS_XML are paired with a prefix in DEFAULT_NAMESPACES.
XMP_NS_DC = "http://purl.org/dc/elements/1.1/"
XMP_NS_PDF = "http://ns.adobe.com/pdf/1.3/"
XMP_NS_PDFA_ID = "http://www.aiim.org/pdfa/ns/id/"
XMP_NS_PDFA_EXTENSION = "http://www.aiim.org/pdfa/ns/extension/"
XMP_NS_PDFA_PROPERTY = "http://www.aiim.org/pdfa/ns/property#"
XMP_NS_PDFA_SCHEMA = "http://www.aiim.org/pdfa/ns/schema#"
XMP_NS_PDFUA_ID = "http://www.aiim.org/pdfua/ns/id/"
XMP_NS_PDFX_ID = "http://www.npes.org/pdfx/ns/id/"
XMP_NS_PHOTOSHOP = "http://ns.adobe.com/photoshop/1.0/"
# Three versions of the PRISM basic namespace are kept as separate constants.
XMP_NS_PRISM = "http://prismstandard.org/namespaces/basic/1.0/"
XMP_NS_PRISM2 = "http://prismstandard.org/namespaces/basic/2.0/"
XMP_NS_PRISM3 = "http://prismstandard.org/namespaces/basic/3.0/"
XMP_NS_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
XMP_NS_XMP = "http://ns.adobe.com/xap/1.0/"
XMP_NS_XMP_MM = "http://ns.adobe.com/xap/1.0/mm/"
XMP_NS_XMP_RIGHTS = "http://ns.adobe.com/xap/1.0/rights/"

# This one should not be registered with lxml
# (presumably because the ``xml`` prefix is predefined by the XML namespaces
# specification -- confirm before adding it to DEFAULT_NAMESPACES).
XMP_NS_XML = "http://www.w3.org/XML/1998/namespace"
35
# Table of (namespace URI, prefix) pairs. Note the order: the URI comes first
# and the prefix second. Besides the XMP_NS_* constants it lists a few
# namespaces that are only spelled out here as literal URIs.
DEFAULT_NAMESPACES: list[tuple[str, str]] = [
    ('adobe:ns:meta/', 'x'),
    (XMP_NS_DC, 'dc'),
    (XMP_NS_PDF, 'pdf'),
    (XMP_NS_PDFA_ID, 'pdfaid'),
    (XMP_NS_PDFA_EXTENSION, 'pdfaExtension'),
    (XMP_NS_PDFA_PROPERTY, 'pdfaProperty'),
    (XMP_NS_PDFA_SCHEMA, 'pdfaSchema'),
    (XMP_NS_PDFUA_ID, 'pdfuaid'),
    (XMP_NS_PDFX_ID, 'pdfxid'),
    (XMP_NS_PHOTOSHOP, 'photoshop'),
    (XMP_NS_PRISM, 'prism'),
    (XMP_NS_PRISM2, 'prism2'),
    (XMP_NS_PRISM3, 'prism3'),
    (XMP_NS_RDF, 'rdf'),
    (XMP_NS_XMP, 'xmp'),
    (XMP_NS_XMP_MM, 'xmpMM'),
    (XMP_NS_XMP_RIGHTS, 'xmpRights'),
    ('http://crossref.org/crossmark/1.0/', 'crossmark'),
    ('http://www.niso.org/schemas/jav/1.0/', 'jav'),
    ('http://ns.adobe.com/pdfx/1.3/', 'pdfx'),
    ('http://www.niso.org/schemas/ali/1.0/', 'ali'),
]

# Register all namespaces with lxml
# This runs once at import time and affects lxml's process-wide prefix
# registry. The table stores (uri, prefix) but register_namespace() takes
# (prefix, uri), hence the swapped argument order.
for _uri, _prefix in DEFAULT_NAMESPACES:
    etree.register_namespace(_prefix, _uri)
63
# XMP packet wrappers
# XPACKET_BEGIN is the opening ``<?xpacket begin=...?>`` processing
# instruction followed by a newline; its ``begin`` attribute value is the
# three bytes EF BB BF (the UTF-8 byte order mark).
XPACKET_BEGIN = b"""<?xpacket begin="\xef\xbb\xbf" id="W5M0MpCehiHzreSzNTczkc9d"?>\n"""

# Smallest metadata document used as a starting point: an ``x:xmpmeta`` root
# containing one empty ``rdf:RDF`` element. The whitespace inside the literal
# is part of the value and must not be reformatted.
XMP_EMPTY = b"""<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
 <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
 </rdf:RDF>
</x:xmpmeta>
"""

# Closing ``<?xpacket end="w"?>`` processing instruction, surrounded by
# newlines.
XPACKET_END = b"""\n<?xpacket end="w"?>\n"""
74
75
class XmpContainer(NamedTuple):
    """Map XMP container object to suitable Python container.

    Each instance ties the name of an RDF container element to the Python
    type that holds its items and to the callable that adds one item to an
    instance of that type.
    """

    # Name of the RDF container element, e.g. 'Alt', 'Bag' or 'Seq'.
    rdf_type: str
    # Python class used to represent that container.
    py_type: type
    # Callable that inserts one item into a py_type instance; the entries in
    # XMP_CONTAINERS use unbound methods such as list.append and set.add.
    insert_fn: Callable[..., None]
82
83
class AltList(list):
    """XMP AltList container for language alternatives.

    A ``list`` subclass that adds no behavior of its own. XMP_CONTAINERS maps
    the 'Alt' container to this type and the 'Seq' container to plain
    ``list``, presumably so the two can be told apart by type.
    """
86
87
# Supported RDF container kinds, each with the Python type that represents it
# and the unbound method used to add an item to that type.
XMP_CONTAINERS = [
    XmpContainer(rdf_type='Alt', py_type=AltList, insert_fn=AltList.append),
    XmpContainer(rdf_type='Bag', py_type=set, insert_fn=set.add),
    XmpContainer(rdf_type='Seq', py_type=list, insert_fn=list.append),
]
93
# Fully qualified names (as produced by str(QName(...))) of the properties
# listed here as language alternatives.
LANG_ALTS = frozenset(
    str(QName(namespace, local_name))
    for namespace, local_name in (
        (XMP_NS_DC, 'title'),
        (XMP_NS_DC, 'description'),
        (XMP_NS_DC, 'rights'),
        (XMP_NS_XMP_RIGHTS, 'UsageTerms'),
    )
)
102
# These are the illegal characters in XML 1.0. (XML 1.1 is a bit more permissive,
# but we'll be strict to ensure wider compatibility.)
# The pattern matches any single code point that is NOT one of the allowed
# ones: tab, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD or U+10000-U+10FFFF.
re_xml_illegal_chars = re.compile(
    r"(?u)[^\x09\x0A\x0D\x20-\U0000D7FF\U0000E000-\U0000FFFD\U00010000-\U0010FFFF]"
)
# Byte-oriented counterpart for data that is already serialized: matches any
# control byte other than tab, LF and CR, or the literal character reference
# ``&#0;`` (a reference to NUL, which XML 1.0 forbids even in escaped form).
# The reference must be written in plain ASCII because a bytes literal cannot
# contain non-ASCII characters.
re_xml_illegal_bytes = re.compile(rb"[^\x09\x0A\x0D\x20-\xFF]|&#0;")
109
110
def clean(s: str | Iterable[str], joiner: str = '; ') -> str:
    """Return text that can safely be placed in the body of an XML tag.

    A string is returned with every character matched by
    ``re_xml_illegal_chars`` removed. Any other iterable is first merged into
    one string with *joiner* (a warning is issued), because a non-string at
    this point means a new node was requested somewhere the spec does not
    allow it or where it is not supported. Sets are sorted before joining so
    the result does not depend on set iteration order.

    Args:
        s: A string, or an iterable of strings to merge.
        joiner: Separator placed between merged elements.

    Returns:
        The (possibly merged) string with illegal XML characters removed.

    Raises:
        TypeError: If *s* is neither a string nor an iterable.
    """
    from warnings import warn

    # Fast path: plain strings only need the illegal characters stripped.
    if isinstance(s, str):
        return re_xml_illegal_chars.sub('', s)

    if not isinstance(s, Iterable):
        raise TypeError("object must be a string or iterable of strings")

    warn(f"Merging elements of {s}")
    parts = sorted(s) if isinstance(s, set) else s
    merged = joiner.join(parts)
    return re_xml_illegal_chars.sub('', merged)