1"""The ``lxml.isoschematron`` package implements ISO Schematron support on top
2of the pure-xslt 'skeleton' implementation.
3"""
4
5import sys
6import os.path
7from lxml import etree as _etree # due to validator __init__ signature
8
9
# Compatibility aliases, borrowed from lxml.html.
# This module already requires Python 3 (``Schematron.__init__`` uses the
# zero-argument form of ``super()``), so probing for the Python 2 builtins
# is dead code.  Both names are bound directly to ``str``; they stay defined
# at module level because ``stylesheet_params`` refers to them.
unicode = str
basestring = str
21
22
# Names exported by ``from lxml.isoschematron import *``.
__all__ = [
    'extract_xsd',
    'extract_rng',
    'iso_dsdl_include',
    'iso_abstract_expand',
    'iso_svrl_for_xslt1',
    'svrl_validation_errors',
    'schematron_schema_valid',
    'stylesheet_params',
    'Schematron',
]
27
28
# Namespace URIs of the schema and report vocabularies used in this module.
# FIXME: Maybe lxml should provide a dedicated place for common namespace
# FIXME: definitions?
RELAXNG_NS = 'http://relaxng.org/ns/structure/1.0'
SCHEMATRON_NS = 'http://purl.oclc.org/dsdl/schematron'
SVRL_NS = 'http://purl.oclc.org/dsdl/svrl'
XML_SCHEMA_NS = 'http://www.w3.org/2001/XMLSchema'
36
37
# Helper constants: the directory of the bundled resources (next to this
# file) and the Clark-notation ('{namespace}localname') tags of the
# Schematron and XML Schema root elements.
_resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
_schematron_root = '{' + SCHEMATRON_NS + '}schema'
_xml_schema_root = '{' + XML_SCHEMA_NS + '}schema'
42
43
# The iso-schematron skeleton implementation steps, aka XSL transformations.
# Each one is compiled at import time from a stylesheet bundled below
# ``<resources>/xsl``.
def _load_skeleton_xslt(*path_parts):
    """Parse the stylesheet at ``<resources>/xsl/<path_parts...>`` and return
    it compiled as an ``etree.XSLT`` object.
    """
    stylesheet_path = os.path.join(_resources_dir, 'xsl', *path_parts)
    return _etree.XSLT(_etree.parse(stylesheet_path))


extract_xsd = _load_skeleton_xslt('XSD2Schtrn.xsl')
extract_rng = _load_skeleton_xslt('RNG2Schtrn.xsl')
iso_dsdl_include = _load_skeleton_xslt(
    'iso-schematron-xslt1', 'iso_dsdl_include.xsl')
iso_abstract_expand = _load_skeleton_xslt(
    'iso-schematron-xslt1', 'iso_abstract_expand.xsl')
iso_svrl_for_xslt1 = _load_skeleton_xslt(
    'iso-schematron-xslt1', 'iso_svrl_for_xslt1.xsl')
58
59
# SVRL result accessor: an XPath selecting every ``svrl:failed-assert``
# element of the document it is applied to.
svrl_validation_errors = _etree.XPath(
    '//svrl:failed-assert',
    namespaces=dict(svrl=SVRL_NS),
)
63
# RelaxNG validator for schematron schemas.
# ``schematron_schema_valid_supported`` records whether the bundled grammar
# could be loaded; it is the default of ``Schematron(validate_schema=...)``.
try:
    schematron_schema_valid = _etree.RelaxNG(
        file=os.path.join(_resources_dir, 'rng', 'iso-schematron.rng'))
except _etree.RelaxNGParseError:
    # Some distributions delete the file due to licensing issues.
    schematron_schema_valid_supported = False

    def schematron_schema_valid(arg):
        """Stand-in used when the grammar is unavailable; always raises."""
        raise NotImplementedError("Validating the ISO schematron requires iso-schematron.rng")
else:
    schematron_schema_valid_supported = True
74
75
def stylesheet_params(**kwargs):
    """Convert keyword args to a dictionary of stylesheet parameters.

    XSL stylesheet parameters must be XPath expressions, i.e.:

    * string expressions, like "'5'"
    * simple (number) expressions, like "5"
    * valid XPath expressions, like "/a/b/text()"

    Each keyword argument is converted according to its Python type:

    * a string is wrapped with ``XSLT.strparam()``
    * an ``etree.XPath`` object is passed through unchanged
    * ``None`` is rejected with a ``TypeError``
    * any other value is converted to its string representation

    Returns a new dict mapping each keyword name to its converted value.
    """
    params = {}
    for name, value in kwargs.items():
        if value is None:
            raise TypeError('None not allowed as a stylesheet parameter')
        if isinstance(value, basestring):
            params[name] = _etree.XSLT.strparam(value)
        elif isinstance(value, _etree.XPath):
            params[name] = value
        else:
            params[name] = unicode(value)
    return params
101
102
# helper function for use in Schematron __init__
def _stylesheet_param_dict(paramsDict, kwargsDict):
    """Merge kwargsDict into a copy of paramsDict and return the result
    converted by ``stylesheet_params()``.

    Entries of kwargsDict whose value is None are skipped, so they never
    override an entry of paramsDict.  Neither input dict is modified.
    """
    # work on a copy: paramsDict may be a shared (default argument) dict
    merged = dict(paramsDict)
    merged.update(
        (name, value) for name, value in kwargsDict.items()
        if value is not None)
    return stylesheet_params(**merged)
116
117
class Schematron(_etree._Validator):
    """An ISO Schematron validator.

    Pass a root Element or an ElementTree to turn it into a validator.
    Alternatively, pass a filename as keyword argument 'file' to parse from
    the file system.

    Schematron is a less well known, but very powerful schema language.
    The main idea is to use the capabilities of XPath to put restrictions on
    the structure and the content of XML documents.

    The standard behaviour is to fail on ``failed-assert`` findings only
    (``ASSERTS_ONLY``).  To change this, you can either pass a report filter
    function to the ``error_finder`` parameter (e.g. ``ASSERTS_AND_REPORTS``
    or a custom ``XPath`` object), or subclass isoschematron.Schematron for
    complete control of the validation process.

    Built on the Schematron language 'reference' skeleton pure-xslt
    implementation, the validator is created as an XSLT 1.0 stylesheet using
    these steps:

     0) (Extract from XML Schema or RelaxNG schema)
     1) Process inclusions
     2) Process abstract patterns
     3) Compile the schematron schema to XSLT

    The ``include`` and ``expand`` keyword arguments can be used to switch off
    steps 1) and 2).
    To set parameters for steps 1), 2) and 3) hand parameter dictionaries to the
    keyword arguments ``include_params``, ``expand_params`` or
    ``compile_params``.
    For convenience, the compile-step parameter ``phase`` is also exposed as a
    keyword argument ``phase``. This takes precedence if the parameter is also
    given in the parameter dictionary.

    If ``store_schematron`` is set to True, the (included-and-expanded)
    schematron document tree is stored and available through the ``schematron``
    property.
    If ``store_xslt`` is set to True, the validation XSLT document tree will be
    stored and can be retrieved through the ``validator_xslt`` property.
    With ``store_report`` set to True (default: False), the resulting validation
    report document gets stored and can be accessed as the ``validation_report``
    property.

    If ``validate_schema`` is set to False, the validation of the schema file
    itself is disabled.  Validation happens by default after building the full
    schema, unless the schema validation file cannot be found at import time,
    in which case the validation gets disabled.  Some lxml distributions exclude
    this file due to licensing issues.  ISO-Schematron validation can then still
    be used normally, but the schemas themselves cannot be validated.

    Here is a usage example::

      >>> from lxml import etree
      >>> from lxml.isoschematron import Schematron

      >>> schematron = Schematron(etree.XML('''
      ... <schema xmlns="http://purl.oclc.org/dsdl/schematron" >
      ...   <pattern id="id_only_attribute">
      ...     <title>id is the only permitted attribute name</title>
      ...     <rule context="*">
      ...       <report test="@*[not(name()='id')]">Attribute
      ...         <name path="@*[not(name()='id')]"/> is forbidden<name/>
      ...       </report>
      ...     </rule>
      ...   </pattern>
      ... </schema>'''),
      ... error_finder=Schematron.ASSERTS_AND_REPORTS)

      >>> xml = etree.XML('''
      ... <AAA name="aaa">
      ...   <BBB id="bbb"/>
      ...   <CCC color="ccc"/>
      ... </AAA>
      ... ''')

      >>> schematron.validate(xml)
      False

      >>> xml = etree.XML('''
      ... <AAA id="aaa">
      ...   <BBB id="bbb"/>
      ...   <CCC/>
      ... </AAA>
      ... ''')

      >>> schematron.validate(xml)
      True
    """

    # libxml2 error categorization for validation errors
    _domain = _etree.ErrorDomains.SCHEMATRONV
    _level = _etree.ErrorLevels.ERROR
    _error_type = _etree.ErrorTypes.SCHEMATRONV_ASSERT

    # convenience definitions for common behaviours
    ASSERTS_ONLY = svrl_validation_errors  # Default
    ASSERTS_AND_REPORTS = _etree.XPath(
        '//svrl:failed-assert | //svrl:successful-report',
        namespaces={'svrl': SVRL_NS})

    def _extract(self, element):
        """Extract embedded schematron schema from non-schematron host schema.

        This method will only be called by __init__ if the given schema document
        is not a schematron schema by itself.
        Must return a schematron schema document tree or None.
        """
        schematron = None
        if element.tag == _xml_schema_root:
            schematron = self._extract_xsd(element)
        elif element.nsmap.get(element.prefix) == RELAXNG_NS:
            # RelaxNG does not have a single unique root element
            schematron = self._extract_rng(element)
        return schematron

    # customization points
    # etree.XSLT objects that provide the extract, include, expand, compile
    # steps
    _extract_xsd = extract_xsd
    _extract_rng = extract_rng
    _include = iso_dsdl_include
    _expand = iso_abstract_expand
    _compile = iso_svrl_for_xslt1

    # etree.xpath object that determines input document validity when applied to
    # the svrl result report; must return a list of result elements (empty if
    # valid)
    _validation_errors = ASSERTS_ONLY

    def __init__(self, etree=None, file=None, include=True, expand=True,
                 include_params={}, expand_params={}, compile_params={},
                 store_schematron=False, store_xslt=False, store_report=False,
                 phase=None, error_finder=ASSERTS_ONLY,
                 validate_schema=schematron_schema_valid_supported):
        """Build the validating XSLT from the given schema.

        See the class docstring for the meaning of the keyword arguments.

        Raises ``etree.SchematronParseError`` if reading the tree or file
        fails, if the document is neither a schematron schema nor one that
        ``_extract()`` can handle, or if ``validate_schema`` is true and the
        schema does not validate.
        Raises ``ValueError`` if no root element could be obtained, which
        includes the case that neither ``etree`` nor ``file`` was given.
        """
        # NOTE: the ``{}`` defaults are shared between calls but never
        # mutated here: include/expand params are only unpacked with ``**``
        # and compile_params is copied by _stylesheet_param_dict().
        super().__init__()

        self._store_report = store_report
        self._schematron = None
        self._validator_xslt = None
        self._validation_report = None
        # only shadow the class-level default when a different finder is
        # requested, so that subclasses overriding ``_validation_errors``
        # keep their own default
        if error_finder is not self.ASSERTS_ONLY:
            self._validation_errors = error_finder

        # parse schema document, may be a schematron schema or an XML Schema or
        # a RelaxNG schema with embedded schematron rules
        root = None
        try:
            if etree is not None:
                if _etree.iselement(etree):
                    root = etree
                else:
                    root = etree.getroot()
            elif file is not None:
                root = _etree.parse(file).getroot()
        except Exception as exc:
            # chain explicitly so the underlying failure stays attached as
            # ``__cause__`` of the SchematronParseError
            raise _etree.SchematronParseError(
                "No tree or file given: %s" % exc) from exc
        if root is None:
            raise ValueError("Empty tree")

        if root.tag == _schematron_root:
            schematron = root
        else:
            schematron = self._extract(root)
        if schematron is None:
            raise _etree.SchematronParseError(
                "Document is not a schematron schema or schematron-extractable")

        # perform the iso-schematron skeleton implementation steps to get a
        # validating xslt
        if include:
            schematron = self._include(schematron, **include_params)
        if expand:
            schematron = self._expand(schematron, **expand_params)
        if validate_schema and not schematron_schema_valid(schematron):
            raise _etree.SchematronParseError(
                "invalid schematron schema: %s" %
                schematron_schema_valid.error_log)
        if store_schematron:
            self._schematron = schematron

        # add new compile keyword args here if exposing them
        compile_kwargs = {'phase': phase}
        compile_params = _stylesheet_param_dict(compile_params, compile_kwargs)
        validator_xslt = self._compile(schematron, **compile_params)
        if store_xslt:
            self._validator_xslt = validator_xslt
        self._validator = _etree.XSLT(validator_xslt)

    def __call__(self, etree):
        """Validate doc using Schematron.

        Returns true if document is valid, false if not.

        The error log is cleared first.  Every element found by the
        configured error finder in the validation report is appended to the
        error log, serialised as text.  The report itself is kept if the
        validator was created with ``store_report=True``.
        """
        self._clear_error_log()
        result = self._validator(etree)
        if self._store_report:
            self._validation_report = result
        errors = self._validation_errors(result)
        if not errors:
            return True

        # the input is either an element or a tree
        if _etree.iselement(etree):
            fname = etree.getroottree().docinfo.URL or '<file>'
        else:
            fname = etree.docinfo.URL or '<file>'
        for error in errors:
            # Does svrl report the line number, anywhere? Don't think so.
            self._append_log_message(
                domain=self._domain, type=self._error_type,
                level=self._level, line=0,
                message=_etree.tostring(error, encoding='unicode'),
                filename=fname)
        return False

    @property
    def schematron(self):
        """ISO-schematron schema document (None if object has been initialized
        with store_schematron=False).
        """
        return self._schematron

    @property
    def validator_xslt(self):
        """ISO-schematron skeleton implementation XSLT validator document (None
        if object has been initialized with store_xslt=False).
        """
        return self._validator_xslt

    @property
    def validation_report(self):
        """ISO-schematron validation result report (None if result-storing has
        been turned off).
        """
        return self._validation_report