1# SPDX-FileCopyrightText: 2022 James R. Barlow
2# SPDX-License-Identifier: MPL-2.0
3
4"""XMP metadata constants, templates, and utilities."""
5
6from __future__ import annotations
7
8import re
9from collections.abc import Callable, Iterable
10from typing import Any, NamedTuple
11
# Namespace URIs for the XMP schemas this module knows about.  Each constant
# below (except XMP_NS_XML) is paired with a prefix in DEFAULT_NAMESPACES.
XMP_NS_DC = "http://purl.org/dc/elements/1.1/"
XMP_NS_PDF = "http://ns.adobe.com/pdf/1.3/"
XMP_NS_PDFA_ID = "http://www.aiim.org/pdfa/ns/id/"
XMP_NS_PDFA_EXTENSION = "http://www.aiim.org/pdfa/ns/extension/"
XMP_NS_PDFA_PROPERTY = "http://www.aiim.org/pdfa/ns/property#"
XMP_NS_PDFA_SCHEMA = "http://www.aiim.org/pdfa/ns/schema#"
XMP_NS_PDFUA_ID = "http://www.aiim.org/pdfua/ns/id/"
XMP_NS_PDFX_ID = "http://www.npes.org/pdfx/ns/id/"
XMP_NS_PHOTOSHOP = "http://ns.adobe.com/photoshop/1.0/"
XMP_NS_PRISM = "http://prismstandard.org/namespaces/basic/1.0/"
XMP_NS_PRISM2 = "http://prismstandard.org/namespaces/basic/2.0/"
XMP_NS_PRISM3 = "http://prismstandard.org/namespaces/basic/3.0/"
XMP_NS_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
XMP_NS_XMP = "http://ns.adobe.com/xap/1.0/"
XMP_NS_XMP_MM = "http://ns.adobe.com/xap/1.0/mm/"
XMP_NS_XMP_RIGHTS = "http://ns.adobe.com/xap/1.0/rights/"

# Intentionally left out of DEFAULT_NAMESPACES: the XML namespace should not
# be registered with lxml.
XMP_NS_XML = "http://www.w3.org/XML/1998/namespace"

# Ordered (namespace URI, prefix) pairs.  The module-level __getattr__ hands
# each pair to lxml's register_namespace the first time LANG_ALTS is requested.
DEFAULT_NAMESPACES: list[tuple[str, str]] = [
    ("adobe:ns:meta/", "x"),
    (XMP_NS_DC, "dc"),
    (XMP_NS_PDF, "pdf"),
    (XMP_NS_PDFA_ID, "pdfaid"),
    (XMP_NS_PDFA_EXTENSION, "pdfaExtension"),
    (XMP_NS_PDFA_PROPERTY, "pdfaProperty"),
    (XMP_NS_PDFA_SCHEMA, "pdfaSchema"),
    (XMP_NS_PDFUA_ID, "pdfuaid"),
    (XMP_NS_PDFX_ID, "pdfxid"),
    (XMP_NS_PHOTOSHOP, "photoshop"),
    (XMP_NS_PRISM, "prism"),
    (XMP_NS_PRISM2, "prism2"),
    (XMP_NS_PRISM3, "prism3"),
    (XMP_NS_RDF, "rdf"),
    (XMP_NS_XMP, "xmp"),
    (XMP_NS_XMP_MM, "xmpMM"),
    (XMP_NS_XMP_RIGHTS, "xmpRights"),
    # Schemas that have no dedicated XMP_NS_* constant in this module.
    ("http://crossref.org/crossmark/1.0/", "crossmark"),
    ("http://www.niso.org/schemas/jav/1.0/", "jav"),
    ("http://ns.adobe.com/pdfx/1.3/", "pdfx"),
    ("http://www.niso.org/schemas/ali/1.0/", "ali"),
]
56
57
# XMP packet wrappers, spelled out one physical line per literal so that every
# newline and leading space in the emitted bytes is explicit.

# Opening xpacket processing instruction.  The ``begin`` attribute carries the
# bytes EF BB BF (the UTF-8 encoding of U+FEFF).
XPACKET_BEGIN = b'<?xpacket begin="\xef\xbb\xbf" id="W5M0MpCehiHzreSzNTczkc9d"?>\n'

# Skeleton document: an x:xmpmeta element wrapping an rdf:RDF element that has
# no property descriptions yet.
XMP_EMPTY = (
    b'<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">\n'
    b' <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n'
    b' </rdf:RDF>\n'
    b'</x:xmpmeta>\n'
)

# Closing xpacket processing instruction with end="w".
XPACKET_END = b'\n<?xpacket end="w"?>\n'
68
69
class XmpContainer(NamedTuple):
    """Describe how one RDF container kind is represented in Python.

    Each record ties an RDF container name to the Python collection type used
    to hold its items and to the unbound method that adds a single item to an
    instance of that collection.
    """

    # RDF container element name, e.g. 'Seq'.
    rdf_type: str
    # Python collection class that stands in for the RDF container.
    py_type: type
    # Callable invoked as insert_fn(container, item) to add one item.
    insert_fn: Callable[..., None]
76
77
class AltList(list):
    """List subclass used for XMP language-alternative (Alt) containers.

    It adds no behavior of its own; the distinct type lets code tell an
    alternatives array apart from an ordinary list.
    """
80
81
# One record per RDF container kind: the RDF name, the Python collection that
# represents it, and the unbound method that inserts a single item.
XMP_CONTAINERS = [
    XmpContainer(rdf_type='Alt', py_type=AltList, insert_fn=AltList.append),
    XmpContainer(rdf_type='Bag', py_type=set, insert_fn=set.add),
    XmpContainer(rdf_type='Seq', py_type=list, insert_fn=list.append),
]
87
88
# (namespace URI, local name) pairs that the module-level __getattr__ turns
# into the LANG_ALTS frozenset of qualified names.  They are kept as plain
# tuples here so that nothing at import time needs lxml.
_LANG_ALTS_LAZY = [
    *((XMP_NS_DC, local_name) for local_name in ('title', 'description', 'rights')),
    (XMP_NS_XMP_RIGHTS, 'UsageTerms'),
]

# Set to True by __getattr__ once DEFAULT_NAMESPACES has been registered with
# lxml, so that the registration loop runs at most once.
_LOADED_LXML_NAMESPACES = False
97
98# lxml lazy-loading
def __getattr__(name: str) -> Any:
    """Resolve the lazily built module attribute ``LANG_ALTS``.

    Called by Python when *name* is not found in the module namespace.  The
    lxml imports live inside this function, so they only happen when
    ``LANG_ALTS`` is actually requested.

    Args:
        name: The attribute being looked up on the module.

    Returns:
        For ``'LANG_ALTS'``, a frozenset of ``str(QName(uri, local))`` values
        built from ``_LANG_ALTS_LAZY``.  The value is also stored in the module
        globals, so later lookups find it directly.

    Raises:
        AttributeError: For any other *name*.
    """
    global _LOADED_LXML_NAMESPACES

    # Anything other than the one lazy attribute is a genuine miss.
    if name != 'LANG_ALTS':
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    from lxml.etree import QName

    # Register the default prefixes with lxml once, guarded by the module flag.
    if not _LOADED_LXML_NAMESPACES:
        from lxml import etree

        for uri, prefix in DEFAULT_NAMESPACES:
            etree.register_namespace(prefix, uri)
        _LOADED_LXML_NAMESPACES = True

    lang_alts = frozenset(
        str(QName(namespace, local)) for namespace, local in _LANG_ALTS_LAZY
    )
    # Cache in the module dict so this hook is not consulted for the name again.
    globals()[name] = lang_alts
    return lang_alts
118
119
# These are the illegal characters in XML 1.0. (XML 1.1 is a bit more permissive,
# but we'll be strict to ensure wider compatibility.)
#
# The character class is negated, so the pattern matches every code point that
# is NOT tab, line feed, carriage return, or within U+0020-U+D7FF,
# U+E000-U+FFFD or U+10000-U+10FFFF.
re_xml_illegal_chars = re.compile(
    r"(?u)[^\x09\x0A\x0D\x20-\U0000D7FF\U0000E000-\U0000FFFD\U00010000-\U0010FFFF]"
)
# Bytes counterpart: matches any single byte other than 0x09, 0x0A, 0x0D or
# 0x20-0xFF, and also the literal five-byte character reference ``&#0;``.
# The reference must be written out in ASCII: a bytes literal cannot contain
# non-ASCII characters, so a decoded replacement character here would be a
# SyntaxError.
re_xml_illegal_bytes = re.compile(rb"[^\x09\x0A\x0D\x20-\xFF]|&#0;")
126
127
def clean(s: str | Iterable[str], joiner: str = '; ') -> str:
    """Ensure an object can safely be inserted in a XML tag body.

    If we still have a non-str object at this point, the best option is to
    join it, because it's apparently calling for a new node in a place that
    isn't allowed in the spec or not supported.

    Args:
        s: A string, or an iterable of strings to be merged into one string.
        joiner: Separator placed between the merged elements.

    Returns:
        The (possibly merged) string with every character matched by
        ``re_xml_illegal_chars`` removed.

    Raises:
        TypeError: If *s* is neither a string nor an iterable.  ``str.join``
            also raises TypeError when an element of *s* is not a string.

    Warns:
        UserWarning: Whenever a non-string iterable is merged.
    """
    from warnings import warn

    if not isinstance(s, str):
        if not isinstance(s, Iterable):
            raise TypeError("object must be a string or iterable of strings")
        # stacklevel=2 attributes the warning to the caller that passed the
        # iterable rather than to this helper.
        warn(f"Merging elements of {s}", stacklevel=2)
        # Sets, mutable or frozen, iterate in hash order, which for strings
        # differs between interpreter runs; sort them so the merged text is
        # reproducible.  Other iterables keep their own order.
        parts = sorted(s) if isinstance(s, (set, frozenset)) else s
        s = joiner.join(parts)
    return re_xml_illegal_chars.sub('', s)