1"""Notebook format validators."""
2
3# Copyright (c) IPython Development Team.
4# Distributed under the terms of the Modified BSD License.
5from __future__ import annotations
6
7import json
8import pprint
9import warnings
10from copy import deepcopy
11from pathlib import Path
12from textwrap import dedent
13from typing import Any, Optional
14
15from ._imports import import_item
16from .corpus.words import generate_corpus_id
17from .json_compat import ValidationError, _validator_for_name, get_current_validator
18from .reader import get_version
19from .warnings import DuplicateCellId, MissingIDFieldWarning
20
# Cache of instantiated validators, filled in by `get_validator` and keyed by
# the tuple (validator name, version, version_minor).
validators = {}
# Sentinel used as the default of the deprecated keyword arguments of
# `validate`, so that an explicitly passed value can be told apart from "not given".
_deprecated = object()


# Public names of this module.
__all__ = [
    "ValidationError",
    "get_validator",
    "isvalid",
    "NotebookValidationError",
    "better_validation_error",
    "normalize",
    "validate",
    "iter_validate",
]
35
36
def _relax_additional_properties(obj):
    """Set every ``additionalProperties`` entry in a schema fragment to True.

    Dicts and lists are walked recursively and modified in place; any other
    value is returned untouched. The (mutated) input object is returned.
    """
    if isinstance(obj, list):
        for position, item in enumerate(obj):
            obj[position] = _relax_additional_properties(item)
    elif isinstance(obj, dict):
        for name, child in obj.items():
            if name == "additionalProperties":
                # whatever restriction was declared here is replaced wholesale
                obj[name] = True
            else:
                obj[name] = _relax_additional_properties(child)
    return obj
49
50
def _allow_undefined(schema):
    """Add the "unrecognized" fallbacks to the cell and output ``oneOf`` lists.

    The schema is modified in place and returned.
    """
    definitions = schema["definitions"]
    fallbacks = (
        ("cell", "#/definitions/unrecognized_cell"),
        ("output", "#/definitions/unrecognized_output"),
    )
    for definition_name, target in fallbacks:
        definitions[definition_name]["oneOf"].append({"$ref": target})
    return schema
55
56
def get_validator(version=None, version_minor=None, relax_add_props=False, name=None):
    """Build (or fetch from the module cache) a validator for a notebook format.

    Parameters
    ----------
    version : int, optional
        Major format version; defaults to ``current_nbformat``.
    version_minor : int, optional
        Minor format version; defaults to the ``nbformat_minor`` attribute of
        the imported ``nbformat.v<version>`` module (0 when it has none).
    relax_add_props : bool
        When True, a validator is rebuilt from a schema in which every
        ``additionalProperties`` entry has been set to True.
    name : str, optional
        Validator backend name; when falsy the current validator is used.

    Returns
    -------
    The validator instance, or None when no schema could be located.

    Notes
    -----
    Validators are cached in ``validators`` under
    ``(validator name, version, version_minor)``. The relaxed validator is
    stored under that same key, replacing whatever was cached before.
    """
    if version is None:
        from . import current_nbformat

        version = current_nbformat

    format_module = import_item("nbformat.v%s" % version)
    newest_minor = getattr(format_module, "nbformat_minor", 0)
    if version_minor is None:
        version_minor = newest_minor

    if name:
        validator_cls = _validator_for_name(name)
    else:
        validator_cls = get_current_validator()

    cache_key = (validator_cls.name, version, version_minor)

    if cache_key not in validators:
        try:
            schema = _get_schema_json(format_module, version=version, version_minor=version_minor)
        except AttributeError:
            return None

        if version_minor > newest_minor:
            # The notebook comes from a newer minor version than this package
            # knows: drop the `additionalProperties: False` restrictions and
            # accept unknown cell and output types.
            schema = _allow_undefined(_relax_additional_properties(schema))

        validators[cache_key] = validator_cls(schema)

    if relax_add_props:
        try:
            relaxed_schema = _get_schema_json(
                format_module, version=version, version_minor=version_minor
            )
        except AttributeError:
            return None

        # Extra properties are tolerated (useful for intermediate
        # representations) while every other kind of error is still reported.
        relaxed_schema = _relax_additional_properties(relaxed_schema)
        validators[cache_key] = validator_cls(relaxed_schema)

    return validators[cache_key]
99
100
def _get_schema_json(v, version=None, version_minor=None):
    """Load the JSON schema shipped with the format module ``v``.

    The file registered under ``(version, version_minor)`` in
    ``v.nbformat_schema`` is used when present. Otherwise, if the requested
    minor version is newer than ``v.nbformat_minor``, the entry registered
    under ``(None, None)`` is loaded instead.

    Raises
    ------
    AttributeError
        When neither of the two cases above applies.
    """
    requested = (version, version_minor)
    if requested in v.nbformat_schema:
        schema_key = requested
    elif version_minor > v.nbformat_minor:
        # newer than anything known here: fall back to the latest schema
        schema_key = (None, None)
    else:
        msg = "Cannot find appropriate nbformat schema file."
        raise AttributeError(msg)

    # schema files live next to the format module itself
    schema_file = Path(v.__file__).parent / v.nbformat_schema[schema_key]
    with schema_file.open(encoding="utf8") as handle:
        return json.load(handle)
116
117
def isvalid(nbjson, ref=None, version=None, version_minor=None):
    """Tell whether the notebook JSON conforms to the notebook format schema.

    Returns True when `validate` raises no ValidationError and False when it
    does. Use `validate` directly to see the individual errors.

    An AssertionError is raised if the notebook was modified while validating.
    """
    snapshot = deepcopy(nbjson)
    try:
        with warnings.catch_warnings():
            for silenced in (DeprecationWarning, MissingIDFieldWarning):
                warnings.filterwarnings("ignore", category=silenced)
            validate(
                nbjson,
                ref=ref,
                version=version,
                version_minor=version_minor,
                repair_duplicate_cell_ids=False,
            )
    except ValidationError:
        outcome = False
    else:
        outcome = True
    finally:
        # validation must leave the caller's notebook untouched
        if nbjson != snapshot:
            raise AssertionError
    return outcome
139
140
def _format_as_index(indices):
    """
    (adapted from jsonschema._utils.format_as_index, copied to avoid relying on private API)

    Render a sequence of indices as chained subscript operations.

    For example, [1, 2, "foo"] -> [1][2]["foo"]; a falsy input gives "".
    """
    if indices:
        return "[" + "][".join(map(repr, indices)) + "]"
    return ""
153
154
# Maximum number of list items / dict keys kept by `_truncate_obj`.
_ITEM_LIMIT = 16
# Maximum number of characters kept from a string by `_truncate_obj`.
_STR_LIMIT = 64


def _truncate_obj(obj):
    """Return a shortened copy of ``obj`` for display in validation tracebacks.

    Strings are cut to ``_STR_LIMIT`` characters, lists and dicts to
    ``_ITEM_LIMIT`` entries (recursively), and list-valued ``"cells"`` /
    ``"outputs"`` entries of a dict are collapsed to a one-line summary.
    Any other value is returned unchanged.
    """
    if isinstance(obj, str):
        head = obj[:_STR_LIMIT]
        return head + "..." if len(obj) > _STR_LIMIT else head

    if isinstance(obj, list):
        shortened_list = [_truncate_obj(entry) for entry in obj[:_ITEM_LIMIT]]
        dropped = len(obj) - _ITEM_LIMIT
        if dropped > 0:
            shortened_list.append("...%i items truncated..." % dropped)
        return shortened_list

    if isinstance(obj, dict):
        kept_items = list(obj.items())[:_ITEM_LIMIT]
        shortened_dict = {name: _truncate_obj(value) for name, value in kept_items}
        summaries = (("cells", "...%i cells..."), ("outputs", "...%i outputs..."))
        for field, template in summaries:
            # only fields that survived truncation and hold a list are squashed
            if isinstance(shortened_dict.get(field), list):
                shortened_dict[field] = [template % len(obj[field])]

        dropped = len(obj) - _ITEM_LIMIT
        if dropped > 0:
            shortened_dict["..."] = "%i keys truncated" % dropped
        return shortened_dict

    return obj
185
186
class NotebookValidationError(ValidationError):  # type:ignore[misc]
    """Wrapper around a schema ValidationError with a compact string form.

    The string representation shows a truncated view of the failing instance
    instead of the complete notebook and schema, keeping tracebacks readable.
    """

    def __init__(self, original, ref=None):
        """Wrap ``original``; ``ref`` is used unless the original already has one."""
        self.original = original
        self.ref = getattr(original, "ref", ref)
        self.message = original.message

    def __getattr__(self, key):
        """Fall back to the wrapped error for attributes not set on this object."""
        return getattr(self.original, key)

    def __unicode__(self):
        """Build the compact, multi-line description of the failure.

        Avoids dumping the full schema and notebook to logs.
        """
        wrapped = self.original
        shortened_instance = _truncate_obj(wrapped.instance)

        lines = [wrapped.message, ""]
        lines.append(
            "Failed validating {!r} in {}{}:".format(
                wrapped.validator,
                self.ref or "notebook",
                # the last schema path element is the failing keyword itself
                _format_as_index(list(wrapped.relative_schema_path)[:-1]),
            )
        )
        lines.append("")
        lines.append("On instance%s:" % _format_as_index(wrapped.relative_path))
        lines.append(pprint.pformat(shortened_instance, width=78))
        return "\n".join(lines)

    __str__ = __unicode__
227
228
def better_validation_error(error, version, version_minor):
    """Get a more informative ValidationError for ``*Of`` failures.

    ``oneOf``-style errors are not informative on their own. When the failing
    instance is a dict carrying a string ``cell_type`` or ``output_type``, the
    instance is validated again directly against that specific definition so
    that the reported error points at the actual problem.

    Parameters
    ----------
    error : ValidationError
        The error produced by the schema validator.
    version : int
    version_minor : int
        Notebook format version used for the nested validation.

    Returns
    -------
    ``error`` itself when its schema path is empty, otherwise a
    `NotebookValidationError` (for the nested error when one was found, else
    wrapping ``error``).
    """
    if not error.schema_path:
        return error
    key = error.schema_path[-1]
    ref = None
    if key.endswith("Of") and isinstance(error.instance, dict):
        # Only string type names can designate a schema definition; the
        # instance is invalid by construction, so these fields may hold
        # anything and must not be concatenated or forwarded blindly.
        if "cell_type" in error.instance:
            cell_type = error.instance["cell_type"]
            if isinstance(cell_type, str):
                ref = cell_type + "_cell"
        elif "output_type" in error.instance:
            output_type = error.instance["output_type"]
            if isinstance(output_type, str):
                ref = output_type

    if ref:
        try:
            validate(
                error.instance,
                ref,
                version=version,
                version_minor=version_minor,
            )
        except ValidationError as sub_error:
            # keep extending relative path
            error.relative_path.extend(sub_error.relative_path)
            sub_error.relative_path = error.relative_path
            better = better_validation_error(sub_error, version, version_minor)
            if better.ref is None:
                better.ref = ref
            return better
        except Exception:  # noqa: S110
            # Deliberate best effort: if the nested validation fails for any
            # other reason, let the original error through.
            pass
    return NotebookValidationError(error, ref)
268
269
def normalize(
    nbdict: Any,
    version: Optional[int] = None,
    version_minor: Optional[int] = None,
    *,
    relax_add_props: bool = False,
    strip_invalid_metadata: bool = False,
) -> tuple[int, Any]:
    """
    Normalise a notebook prior to validation.

    A couple of normalisation steps are applied to a deep copy of the
    notebook in order to standardise it and make validation easier.

    In general you should not rely on this function: notebooks reaching
    nbformat should already be in normal form. If they are not, you likely
    have a bug and may have security issues.

    Parameters
    ----------
    nbdict : dict
        notebook document
    version : int
        Major format version; taken from the notebook when None.
    version_minor : int
        Minor format version; taken from the notebook when None.
    relax_add_props : bool
        Whether to allow extra properties in the JSON schema validating the
        notebook.
    strip_invalid_metadata : bool
        Whether to strip metadata that does not exist in the JSON schema when
        validating the notebook.

    Returns
    -------
    changes : int
        number of changes made to the notebook
    notebook : dict
        deep copy of the original object with the relevant changes applied.

    """
    working_copy = deepcopy(nbdict)
    detected_major, detected_minor = get_version(working_copy)
    return _normalize(
        working_copy,
        detected_major if version is None else version,
        detected_minor if version_minor is None else version_minor,
        True,  # always repair missing / duplicate cell ids
        relax_add_props=relax_add_props,
        strip_invalid_metadata=strip_invalid_metadata,
    )
323
324
def _normalize(
    nbdict: Any,
    version: int,
    version_minor: int,
    repair_duplicate_cell_ids: bool,
    relax_add_props: bool,
    strip_invalid_metadata: bool,
) -> tuple[int, Any]:
    """
    Private normalisation routine; mutates ``nbdict`` in place.

    For format versions 4.5 and later, cells lacking an ``id`` trigger a
    `MissingIDFieldWarning` and duplicated ids are detected. With
    ``repair_duplicate_cell_ids`` set, missing and duplicated ids are replaced
    by freshly generated ones; without it, a duplicated id raises
    `ValidationError`. Optionally, invalid metadata is stripped afterwards.

    The function mutates its argument because it is shared (for historical
    reasons) between `validate()` and the public `normalize()`. Once
    `validate()` stops calling it, the mutation may be dropped.

    Returns
    -------
    A ``(number of changes, nbdict)`` tuple.
    """
    n_changes = 0

    if (version, version_minor) >= (4, 5):
        # Cell ids are part of the format from 4.5 on: make sure each cell has one.
        for cell in nbdict["cells"]:
            if "id" in cell:
                continue
            warnings.warn(
                "Cell is missing an id field, this will become"
                " a hard error in future nbformat versions. You may want"
                " to use `normalize()` on your notebooks before validations"
                " (available since nbformat 5.1.4). Previous versions of nbformat"
                " are fixing this issue transparently, and will stop doing so"
                " in the future.",
                MissingIDFieldWarning,
                stacklevel=3,
            )
            if repair_duplicate_cell_ids:
                # fill in the missing id
                cell["id"] = generate_corpus_id()
                n_changes += 1

        # Ids must also be unique across the whole notebook.
        known_ids = set()
        for cell in nbdict["cells"]:
            if "id" not in cell:
                continue
            current_id = cell["id"]
            if current_id not in known_ids:
                known_ids.add(current_id)
                continue
            if not repair_duplicate_cell_ids:
                msg = f"Non-unique cell id '{current_id}' detected."
                raise ValidationError(msg)
            # Best effort repair of the duplicate.
            replacement_id = generate_corpus_id()
            cell["id"] = replacement_id
            n_changes += 1
            warnings.warn(
                f"Non-unique cell id {current_id!r} detected. Corrected to {replacement_id!r}.",
                DuplicateCellId,
                stacklevel=3,
            )

    if strip_invalid_metadata:
        n_changes += _strip_invalida_metadata(
            nbdict, version, version_minor, relax_add_props=relax_add_props
        )
    return n_changes, nbdict
392
393
def _dep_warn(field):
    """Emit the DeprecationWarning for a deprecated keyword of `validate`.

    Parameters
    ----------
    field : str
        Name of the deprecated keyword argument, interpolated into the message.
    """
    # The template starts on its own (indented) line so that `dedent` finds a
    # common margin to strip; it is formatted only afterwards so the
    # interpolated value cannot influence the dedenting.
    template = dedent(
        """\
        `{field}` kwargs of validate has been deprecated for security
        reasons, and will be removed soon.

        Please explicitly use the `n_changes, new_notebook = nbformat.validator.normalize(old_notebook, ...)` if you wish to
        normalise your notebook. `normalize` is available since nbformat 5.5.0

        """
    )
    # stacklevel=3 attributes the warning to the caller of the function that
    # called `_dep_warn`.
    warnings.warn(template.format(field=field), DeprecationWarning, stacklevel=3)
408
409
def validate(
    nbdict: Any = None,
    ref: Optional[str] = None,
    version: Optional[int] = None,
    version_minor: Optional[int] = None,
    relax_add_props: bool = False,
    nbjson: Any = None,
    repair_duplicate_cell_ids: bool = _deprecated,  # type: ignore[assignment]
    strip_invalid_metadata: bool = _deprecated,  # type: ignore[assignment]
) -> None:
    """Check that a notebook dict-like object conforms to the relevant
    notebook format schema.

    Parameters
    ----------
    nbdict : dict
        notebook document
    ref : optional, str
        reference to the subset of the schema to validate against,
        for example ``"markdown_cell"``, ``"code_cell"`` ....
    version : int
    version_minor : int
    relax_add_props : bool
        Whether to allow extra properties in the JSON schema validating the notebook.
        When True, all known fields are validated, but unknown fields are ignored.
    nbjson
        Backwards-compatible alias of ``nbdict``, used when ``nbdict`` is None.
    repair_duplicate_cell_ids : bool
        Deprecated since 5.5.0 - will be removed in the future.
    strip_invalid_metadata : bool
        Deprecated since 5.5.0 - will be removed in the future.

    Returns
    -------
    None

    Raises
    ------
    ValidationError if not valid.
    TypeError if neither ``nbdict`` nor ``nbjson`` is given.

    Notes
    -----
    Prior to Nbformat 5.5.0 the `validate` and `isvalid` method would silently
    try to fix invalid notebook and mutate arguments. This behavior is deprecated
    and will be removed in a near future.

    Please explicitly call `normalize` if you need to normalize notebooks.
    """
    assert ref is None or isinstance(ref, str)

    # Deprecated keywords: warn when given explicitly, else use the legacy default.
    if strip_invalid_metadata is not _deprecated:
        _dep_warn("strip_invalid_metadata")
    else:
        strip_invalid_metadata = False

    if repair_duplicate_cell_ids is not _deprecated:
        _dep_warn("repair_duplicate_cell_ids")
    else:
        repair_duplicate_cell_ids = True

    # backwards compatibility for nbjson argument
    if nbdict is None:
        if nbjson is None:
            msg = "validate() missing 1 required argument: 'nbdict'"
            raise TypeError(msg)
        nbdict = nbjson

    if ref is None:
        # No ref means a whole notebook, so the version can be read from it.
        found_major, found_minor = get_version(nbdict)
        if version is None:
            version = found_major
        if version_minor is None:
            version_minor = found_minor

        assert isinstance(version, int)
        assert isinstance(version_minor, int)
        _normalize(
            nbdict,
            version,
            version_minor,
            repair_duplicate_cell_ids,
            relax_add_props=relax_add_props,
            strip_invalid_metadata=strip_invalid_metadata,
        )
    elif version is None:
        # With a ref but no version number, assume validation against 1.0.
        version, version_minor = 1, 0

    # Only the first reported error is raised.
    first_error = next(
        iter_validate(
            nbdict,
            ref=ref,
            version=version,
            version_minor=version_minor,
            relax_add_props=relax_add_props,
            strip_invalid_metadata=strip_invalid_metadata,
        ),
        None,
    )
    if first_error is not None:
        raise first_error
510
511
def _get_errors(
    nbdict: Any, version: int, version_minor: int, relax_add_props: bool, *args: Any
) -> Any:
    """Return an iterator over the validation errors of ``nbdict``.

    Extra positional ``args`` are forwarded to the validator's ``iter_errors``.
    When the active validator is not jsonschema and it reports errors, the
    document is validated again with jsonschema and those errors are returned.

    Raises
    ------
    ValidationError
        When no validator is available for the requested version.
    """
    validator = get_validator(version, version_minor, relax_add_props=relax_add_props)
    if not validator:
        msg = f"No schema for validating v{version}.{version_minor} notebooks"
        raise ValidationError(msg)

    found = list(validator.iter_errors(nbdict, *args))
    if not found or validator.name == "jsonschema":
        return iter(found)

    # jsonschema gives the best error messages.
    jsonschema_validator = get_validator(
        version=version,
        version_minor=version_minor,
        relax_add_props=relax_add_props,
        name="jsonschema",
    )
    return jsonschema_validator.iter_errors(nbdict, *args)
531
532
def _strip_invalida_metadata(
    nbdict: Any, version: int, version_minor: int, relax_add_props: bool
) -> int:
    """
    Extract metadata errors from the validator and fix them if necessary.

    This mostly means stripping the offending keys from the notebook-level
    metadata and from the metadata of individual cells. ``nbdict`` is modified
    in place. Cells that are not dicts, or whose ``cell_type`` does not match
    one of the schema's cell definitions, are left untouched so that regular
    validation reports them.

    Parameters
    ----------
    nbdict : dict
        notebook document
    version : int
    version_minor : int
    relax_add_props : bool
        Whether to allow extra properties in the JSON schema validating the
        notebook.

    Returns
    -------
    int
        number of modifications

    Raises
    ------
    ValidationError
        When no jsonschema validator is available for the requested version.
    """
    errors = _get_errors(nbdict, version, version_minor, relax_add_props)
    changes = 0
    # `_get_errors` hands back an iterator; materialise it to know whether
    # anything failed at all.
    if not list(errors):
        return changes

    # jsonschema gives a better error tree.
    validator = get_validator(
        version=version,
        version_minor=version_minor,
        relax_add_props=relax_add_props,
        name="jsonschema",
    )
    if not validator:
        msg = f"No jsonschema for validating v{version}.{version_minor} notebooks"
        raise ValidationError(msg)
    error_tree = validator.error_tree(validator.iter_errors(nbdict))

    if "metadata" in error_tree:
        for key in error_tree["metadata"]:
            nbdict["metadata"].pop(key, None)
            changes += 1

    if "cells" not in error_tree:
        return changes

    cells = nbdict.get("cells")
    if not isinstance(cells, list):
        # A malformed `cells` entry has no per-cell metadata to repair.
        return changes

    for cell_idx, cell in enumerate(cells):
        # Cells don't report individual metadata keys as having failed validation.
        # Instead each cell reports that it failed to validate against every
        # cell-type definition, so the reasons behind those failures have to be
        # inspected to find the misbehaving metadata keys.
        cell_errors = error_tree["cells"][cell_idx].errors
        if "oneOf" not in cell_errors:
            continue
        if not isinstance(cell, dict):
            # e.g. a string where a cell was expected: nothing to strip
            continue

        one_of_error = cell_errors["oneOf"]
        # A missing cell_type yields a name that matches no definition below.
        intended_cell_type = cell.get("cell_type")
        schemas_by_index = [ref["$ref"] for ref in one_of_error.schema["oneOf"]]
        cell_type_definition_name = f"#/definitions/{intended_cell_type}_cell"
        if cell_type_definition_name not in schemas_by_index:
            continue

        schema_index = schemas_by_index.index(cell_type_definition_name)
        for error in one_of_error.context:
            rel_path = error.relative_path
            # only errors against the definition the cell claims to follow count
            error_for_intended_schema = error.schema_path[0] == schema_index
            is_top_level_metadata_key = len(rel_path) == 2 and rel_path[0] == "metadata"
            if error_for_intended_schema and is_top_level_metadata_key:
                cell["metadata"].pop(rel_path[1], None)
                changes += 1

    return changes
604
605
def iter_validate(
    nbdict=None,
    ref=None,
    version=None,
    version_minor=None,
    relax_add_props=False,
    nbjson=None,
    strip_invalid_metadata=False,
):
    """Check that a notebook dict-like object conforms to the relevant
    notebook format schema.

    This is a generator yielding every ValidationError found; nothing is
    yielded for a valid document. ``nbjson`` is a backwards-compatible alias
    of ``nbdict``; a TypeError is raised when neither is given.

    Notes
    -----
    To fix: For security reasons, this function should *never* mutate its `nbdict` argument, and
    should *never* try to validate a mutated or modified version of its notebook.

    """
    # backwards compatibility for nbjson argument
    if nbdict is None:
        if nbjson is None:
            msg = "iter_validate() missing 1 required argument: 'nbdict'"
            raise TypeError(msg)
        nbdict = nbjson

    if version is None:
        version, version_minor = get_version(nbdict)

    if ref:
        # validate against a single definition of the schema only
        schema_args = ({"$ref": "#/definitions/%s" % ref},)
    else:
        schema_args = ()
        if strip_invalid_metadata:
            _strip_invalida_metadata(nbdict, version, version_minor, relax_add_props)
        # The validation below runs after the stripping, which ensures that
        # removing metadata did not cause another complex validation issue and
        # that higher-level errors produced by individual metadata failures
        # are gone.

    try:
        errors = _get_errors(nbdict, version, version_minor, relax_add_props, *schema_args)
    except ValidationError as e:
        yield e
        return

    for error in errors:
        yield better_validation_error(error, version, version_minor)