1"""Notebook format validators."""
2
3# Copyright (c) IPython Development Team.
4# Distributed under the terms of the Modified BSD License.
5from __future__ import annotations
6
7import json
8import pprint
9import time
10import warnings
11from copy import deepcopy
12from pathlib import Path
13from textwrap import dedent
14from typing import Any
15
16from ._imports import import_item
17from .corpus.words import generate_corpus_id
18from .json_compat import ValidationError, _validator_for_name, get_current_validator
19from .reader import get_version
20from .warnings import DuplicateCellId, MissingIDFieldWarning
21
# Cache of instantiated schema validators, filled lazily by `get_validator`.
# Keys are tuples that start with
# (validator name, nbformat version, nbformat minor version).
validators = {}
# Sentinel used as the default value of the deprecated `validate()` keyword
# arguments, so "argument not passed" can be told apart from an explicit value.
_deprecated = object()


# Names exported by `from <this module> import *`.
__all__ = [
    "NotebookValidationError",
    "ValidationError",
    "better_validation_error",
    "get_validator",
    "isvalid",
    "iter_validate",
    "normalize",
    "validate",
]
36
37
38def _relax_additional_properties(obj):
39 """relax any `additionalProperties`"""
40 if isinstance(obj, dict):
41 for key, value in obj.items():
42 value = ( # noqa: PLW2901
43 True if key == "additionalProperties" else _relax_additional_properties(value)
44 )
45 obj[key] = value
46 elif isinstance(obj, list):
47 for i, value in enumerate(obj):
48 obj[i] = _relax_additional_properties(value)
49 return obj
50
51
52def _allow_undefined(schema):
53 schema["definitions"]["cell"]["oneOf"].append({"$ref": "#/definitions/unrecognized_cell"})
54 schema["definitions"]["output"]["oneOf"].append({"$ref": "#/definitions/unrecognized_output"})
55 return schema
56
57
def get_validator(version=None, version_minor=None, relax_add_props=False, name=None):
    """Load the JSON schema for a notebook version into a validator.

    Validators are cached in the module-level ``validators`` dict.

    Parameters
    ----------
    version : int, optional
        Major notebook format version; defaults to ``current_nbformat``.
    version_minor : int, optional
        Minor notebook format version; defaults to the ``nbformat_minor`` of
        the ``nbformat.v<version>`` module (``0`` if it has none).
    relax_add_props : bool
        When true, every ``additionalProperties`` restriction in the schema is
        lifted, so unknown fields are tolerated.
    name : str, optional
        Name of the validator backend to use; defaults to the current one.

    Returns
    -------
    The validator instance, or ``None`` when no schema could be loaded for the
    requested version.
    """
    if version is None:
        from . import current_nbformat  # noqa:PLC0415

        version = current_nbformat

    v = import_item("nbformat.v%s" % version)
    current_minor = getattr(v, "nbformat_minor", 0)
    if version_minor is None:
        version_minor = current_minor

    current_validator = _validator_for_name(name) if name else get_current_validator()

    version_tuple = (current_validator.name, version, version_minor)
    # A relaxed validator must never share a cache slot with the strict one:
    # otherwise one `relax_add_props=True` call would make every later strict
    # lookup return the relaxed validator.
    cache_key = (*version_tuple, "relax_add_props") if relax_add_props else version_tuple

    if cache_key not in validators:
        try:
            schema_json = _get_schema_json(v, version=version, version_minor=version_minor)
        except AttributeError:
            return None

        if relax_add_props:
            # this allows properties to be added for intermediate
            # representations while validating for all other kinds of errors
            schema_json = _relax_additional_properties(schema_json)
        elif current_minor < version_minor:
            # notebook from the future, relax all `additionalProperties: False` requirements
            schema_json = _relax_additional_properties(schema_json)
            # and allow undefined cell types and outputs
            schema_json = _allow_undefined(schema_json)

        validators[cache_key] = current_validator(schema_json)

    return validators[cache_key]
100
101
102def _get_schema_json(v, version=None, version_minor=None):
103 """
104 Gets the json schema from a given imported library and nbformat version.
105 """
106 if (version, version_minor) in v.nbformat_schema:
107 schema_path = str(Path(v.__file__).parent / v.nbformat_schema[(version, version_minor)])
108 elif version_minor > v.nbformat_minor:
109 # load the latest schema
110 schema_path = str(Path(v.__file__).parent / v.nbformat_schema[(None, None)])
111 else:
112 msg = "Cannot find appropriate nbformat schema file."
113 raise AttributeError(msg)
114 with Path(schema_path).open(encoding="utf8") as f:
115 schema_json = json.load(f)
116 return schema_json # noqa: RET504
117
118
def isvalid(nbjson, ref=None, version=None, version_minor=None):
    """Return whether the notebook JSON conforms to the notebook format schema.

    Returns True when `validate` accepts the notebook and False when it raises
    a ``ValidationError``. Use `validate` directly to see the individual
    errors.

    `validate` is called with ``repair_duplicate_cell_ids=False``, and the
    ``DeprecationWarning`` / ``MissingIDFieldWarning`` it may emit are
    silenced. An ``AssertionError`` is raised if the notebook was modified
    during validation.
    """
    snapshot = deepcopy(nbjson)
    valid = True
    try:
        with warnings.catch_warnings():
            for category in (DeprecationWarning, MissingIDFieldWarning):
                warnings.filterwarnings("ignore", category=category)
            validate(
                nbjson,
                ref=ref,
                version=version,
                version_minor=version_minor,
                repair_duplicate_cell_ids=False,
            )
    except ValidationError:
        valid = False
    finally:
        # Checking must never have changed the caller's notebook.
        if nbjson != snapshot:
            raise AssertionError
    return valid
140
141
142def _format_as_index(indices):
143 """
144 (from jsonschema._utils.format_as_index, copied to avoid relying on private API)
145
146 Construct a single string containing indexing operations for the indices.
147
148 For example, [1, 2, "foo"] -> [1][2]["foo"]
149 """
150
151 if not indices:
152 return ""
153 return "[%s]" % "][".join(repr(index) for index in indices)
154
155
156_ITEM_LIMIT = 16
157_STR_LIMIT = 64
158
159
160def _truncate_obj(obj):
161 """Truncate objects for use in validation tracebacks
162
163 Cell and output lists are squashed, as are long strings, lists, and dicts.
164 """
165 if isinstance(obj, dict):
166 truncated_dict = {k: _truncate_obj(v) for k, v in list(obj.items())[:_ITEM_LIMIT]}
167 if isinstance(truncated_dict.get("cells"), list):
168 truncated_dict["cells"] = ["...%i cells..." % len(obj["cells"])]
169 if isinstance(truncated_dict.get("outputs"), list):
170 truncated_dict["outputs"] = ["...%i outputs..." % len(obj["outputs"])]
171
172 if len(obj) > _ITEM_LIMIT:
173 truncated_dict["..."] = "%i keys truncated" % (len(obj) - _ITEM_LIMIT)
174 return truncated_dict
175 if isinstance(obj, list):
176 truncated_list = [_truncate_obj(item) for item in obj[:_ITEM_LIMIT]]
177 if len(obj) > _ITEM_LIMIT:
178 truncated_list.append("...%i items truncated..." % (len(obj) - _ITEM_LIMIT))
179 return truncated_list
180 if isinstance(obj, str):
181 truncated_str = obj[:_STR_LIMIT]
182 if len(obj) > _STR_LIMIT:
183 truncated_str += "..."
184 return truncated_str
185 return obj
186
187
class NotebookValidationError(ValidationError):  # type:ignore[misc]
    """Wrapper around a schema ValidationError with a compact string form.

    Keeps tracebacks short by printing a truncated view of the failing
    instance instead of the full notebook and schema.
    """

    def __init__(self, original, ref=None):
        """Wrap ``original``; ``ref`` is used unless ``original`` has its own ``ref``."""
        self.original = original
        self.ref = getattr(original, "ref", ref)
        self.message = original.message

    def __getattr__(self, key):
        """Fall back to the wrapped error for attributes not set on this object."""
        return getattr(self.original, key)

    def __unicode__(self):
        """Build the short, human-readable description of the error.

        Avoids dumping the full schema and notebook to logs.
        """
        wrapped = self.original
        shown_instance = _truncate_obj(wrapped.instance)
        # The last schema path element is dropped from the displayed location.
        schema_location = _format_as_index(list(wrapped.relative_schema_path)[:-1])
        instance_location = _format_as_index(wrapped.relative_path)

        lines = [
            wrapped.message,
            "",
            f"Failed validating {wrapped.validator!r} in {self.ref or 'notebook'}{schema_location}:",
            "",
            f"On instance{instance_location}:",
            pprint.pformat(shown_instance, width=78),
        ]
        return "\n".join(lines)

    __str__ = __unicode__
228
229
def better_validation_error(error, version, version_minor):
    """Get better ValidationError on oneOf failures

    ``oneOf``/``anyOf``/``allOf`` errors aren't informative. If the failing
    instance is a dict carrying a ``cell_type`` or ``output_type``, it is
    validated again directly against that type's definition to obtain a more
    specific error.

    Parameters
    ----------
    error : ValidationError
        The error reported by the schema validator.
    version : int
    version_minor : int
        Notebook format version used for the targeted re-validation.

    Returns
    -------
    ``error`` itself when it has an empty ``schema_path``; otherwise a
    ``NotebookValidationError`` (wrapping the more specific error when one was
    found).
    """
    if not error.schema_path:
        return error
    key = error.schema_path[-1]
    ref = None
    instance = error.instance
    if isinstance(key, str) and key.endswith("Of") and isinstance(instance, dict):
        # The instance is already known to be invalid, so its type fields may
        # hold anything. Only strings can name a schema definition; other
        # values are ignored rather than crashing while building the message.
        if "cell_type" in instance:
            cell_type = instance["cell_type"]
            if isinstance(cell_type, str):
                ref = cell_type + "_cell"
        elif "output_type" in instance:
            output_type = instance["output_type"]
            if isinstance(output_type, str):
                ref = output_type

    if ref:
        try:
            validate(
                instance,
                ref,
                version=version,
                version_minor=version_minor,
            )
        except ValidationError as sub_error:
            # keep extending relative path
            error.relative_path.extend(sub_error.relative_path)
            sub_error.relative_path = error.relative_path
            better = better_validation_error(sub_error, version, version_minor)
            # `better` may be an error object that has no `ref` attribute yet.
            if getattr(better, "ref", None) is None:
                better.ref = ref
            return better
        except Exception:  # noqa: S110
            # if it fails for some reason,
            # let the original error through
            pass
    return NotebookValidationError(error, ref)
269
270
def normalize(
    nbdict: Any,
    version: int | None = None,
    version_minor: int | None = None,
    *,
    relax_add_props: bool = False,
    strip_invalid_metadata: bool = False,
) -> tuple[int, Any]:
    """
    Normalise a notebook prior to validation.

    A deep copy of the notebook is normalised, so the argument itself is left
    untouched. Missing cell ids are generated and duplicate cell ids are
    replaced; invalid metadata is optionally stripped.

    You should in general not rely on this function and make sure the notebooks
    that reach nbformat are already in a normal form. If not you likely have a bug,
    and may have security issues.

    Parameters
    ----------
    nbdict : dict
        notebook document
    version : int
        Major format version; read from the notebook when omitted.
    version_minor : int
        Minor format version; read from the notebook when omitted.
    relax_add_props : bool
        Whether to allow extra property in the Json schema validating the
        notebook.
    strip_invalid_metadata : bool
        Whether to strip metadata that does not exist in the Json schema when
        validating the notebook.

    Returns
    -------
    changes : int
        number of changes in the notebooks
    notebook : dict
        deep-copy of the original object with relevant changes.

    """
    working_copy = deepcopy(nbdict)
    detected_major, detected_minor = get_version(working_copy)
    return _normalize(
        working_copy,
        detected_major if version is None else version,
        detected_minor if version_minor is None else version_minor,
        True,  # repair_duplicate_cell_ids
        relax_add_props=relax_add_props,
        strip_invalid_metadata=strip_invalid_metadata,
    )
324
325
326def _normalize(
327 nbdict: Any,
328 version: int,
329 version_minor: int,
330 repair_duplicate_cell_ids: bool,
331 relax_add_props: bool,
332 strip_invalid_metadata: bool,
333) -> tuple[int, Any]:
334 """
335 Private normalisation routine.
336
337 This function attempts to normalize the `nbdict` passed to it.
338
339 As `_normalize()` is currently used both in `validate()` (for
340 historical reasons), and in the `normalize()` public function,
341 `_normalize()` does currently mutate `nbdict`.
342 Ideally, once `validate()` stops calling `_normalize()`, `_normalize()`
343 may stop mutating `nbdict`.
344
345 """
346 changes = 0
347
348 if (version, version_minor) >= (4, 5):
349 # if we support cell ids ensure default ids are provided
350 for cell in nbdict["cells"]:
351 if "id" not in cell:
352 warnings.warn(
353 "Cell is missing an id field, this will become"
354 " a hard error in future nbformat versions. You may want"
355 " to use `normalize()` on your notebooks before validations"
356 " (available since nbformat 5.1.4). Previous versions of nbformat"
357 " are fixing this issue transparently, and will stop doing so"
358 " in the future.",
359 MissingIDFieldWarning,
360 stacklevel=3,
361 )
362 # Generate cell ids if any are missing
363 if repair_duplicate_cell_ids:
364 cell["id"] = generate_corpus_id()
365 changes += 1
366
367 # if we support cell ids check for uniqueness when validating the whole notebook
368 seen_ids = set()
369 for cell in nbdict["cells"]:
370 if "id" not in cell:
371 continue
372 cell_id = cell["id"]
373 if cell_id in seen_ids:
374 # Best effort to repair if we find a duplicate id
375 if repair_duplicate_cell_ids:
376 new_id = generate_corpus_id()
377 cell["id"] = new_id
378 changes += 1
379 warnings.warn(
380 f"Non-unique cell id {cell_id!r} detected. Corrected to {new_id!r}.",
381 DuplicateCellId,
382 stacklevel=3,
383 )
384 else:
385 msg = f"Non-unique cell id '{cell_id}' detected."
386 raise ValidationError(msg)
387 seen_ids.add(cell_id)
388 if strip_invalid_metadata:
389 changes += _strip_invalida_metadata(
390 nbdict, version, version_minor, relax_add_props=relax_add_props
391 )
392 return changes, nbdict
393
394
395def _dep_warn(field):
396 # Deprecated since 2023 and security issue start to annoy people.
397 time.sleep(2)
398 # regularly bump this by 1 sec.
399
400 warnings.warn(
401 dedent(
402 f"""`{field}` kwargs of validate has been deprecated for security
403 reasons, and will be removed soon.
404
405 Please explicitly use the `n_changes, new_notebook = nbformat.validator.normalize(old_notebook, ...)` if you wish to
406 normalise your notebook. `normalize` is available since nbformat 5.5.0
407
408 """
409 ),
410 DeprecationWarning,
411 stacklevel=3,
412 )
413
414
def validate(
    nbdict: Any = None,
    ref: str | None = None,
    version: int | None = None,
    version_minor: int | None = None,
    relax_add_props: bool = False,
    nbjson: Any = None,
    repair_duplicate_cell_ids: bool = _deprecated,  # type: ignore[assignment]
    strip_invalid_metadata: bool = _deprecated,  # type: ignore[assignment]
) -> None:
    """Check a notebook dict-like object against the relevant notebook format
    schema and raise the first validation error found.

    Parameters
    ----------
    nbdict : dict
        notebook document
    ref : optional, str
        reference to the subset of the schema we want to validate against.
        for example ``"markdown_cell"``, `"code_cell"` ....
    version : int
        Major format version. Read from the notebook when validating a whole
        notebook; assumed to be 1 (with minor 0) when only ``ref`` is given.
    version_minor : int
        Minor format version; read from the notebook when validating a whole
        notebook.
    relax_add_props : bool
        Whether to allow extra properties in the JSON schema validating the notebook.
        When True, all known fields are validated, but unknown fields are ignored.
    nbjson
        Backwards-compatible alias for ``nbdict``, used only when ``nbdict``
        is None.
    repair_duplicate_cell_ids : bool
        Deprecated since 5.5.0 - will be removed in the future.
    strip_invalid_metadata : bool
        Deprecated since 5.5.0 - will be removed in the future.

    Returns
    -------
    None

    Raises
    ------
    ValidationError if not valid.
    TypeError if neither ``nbdict`` nor ``nbjson`` is given.

    Notes
    -----
    Prior to Nbformat 5.5.0 the `validate` and `isvalid` method would silently
    try to fix invalid notebook and mutate arguments. This behavior is deprecated
    and will be removed in a near future.

    Please explicitly call `normalize` if you need to normalize notebooks.
    """
    assert isinstance(ref, str) or ref is None

    # Passing either deprecated keyword explicitly triggers the deprecation
    # warning; when omitted, the historical defaults apply.
    if strip_invalid_metadata is not _deprecated:
        _dep_warn("strip_invalid_metadata")
    else:
        strip_invalid_metadata = False

    if repair_duplicate_cell_ids is not _deprecated:
        _dep_warn("repair_duplicate_cell_ids")
    else:
        repair_duplicate_cell_ids = True

    # backwards compatibility for nbjson argument
    if nbdict is None:
        if nbjson is None:
            msg = "validate() missing 1 required argument: 'nbdict'"
            raise TypeError(msg)
        nbdict = nbjson

    if ref is None:
        # no ref means a whole notebook, which carries its own version
        detected_major, detected_minor = get_version(nbdict)
        if version is None:
            version = detected_major
        if version_minor is None:
            version_minor = detected_minor

        assert isinstance(version, int)
        assert isinstance(version_minor, int)
        # Historical behaviour: the caller's notebook is normalised in place.
        _normalize(
            nbdict,
            version,
            version_minor,
            repair_duplicate_cell_ids,
            relax_add_props=relax_add_props,
            strip_invalid_metadata=strip_invalid_metadata,
        )
    elif version is None:
        # a ref without a version number is validated against 1.0
        version, version_minor = 1, 0

    first_error = next(
        iter_validate(
            nbdict,
            ref=ref,
            version=version,
            version_minor=version_minor,
            relax_add_props=relax_add_props,
            strip_invalid_metadata=strip_invalid_metadata,
        ),
        None,
    )
    if first_error is not None:
        raise first_error
515
516
def _get_errors(
    nbdict: Any, version: int, version_minor: int, relax_add_props: bool, *args: Any
) -> Any:
    """Return an iterator over the schema errors of ``nbdict``.

    Extra positional ``args`` are forwarded to the validator's
    ``iter_errors``. When the current validator reports errors and is not
    ``jsonschema``, validation is run again with the ``jsonschema`` validator
    and its errors are returned instead.

    Raises
    ------
    ValidationError
        If no schema is available for the requested version.
    """
    validator = get_validator(version, version_minor, relax_add_props=relax_add_props)
    if not validator:
        msg = f"No schema for validating v{version}.{version_minor} notebooks"
        raise ValidationError(msg)

    found = list(validator.iter_errors(nbdict, *args))
    if not found or validator.name == "jsonschema":
        return iter(found)

    # jsonschema gives the best error messages.
    reference_validator = get_validator(
        version=version,
        version_minor=version_minor,
        relax_add_props=relax_add_props,
        name="jsonschema",
    )
    return reference_validator.iter_errors(nbdict, *args)
536
537
def _strip_invalida_metadata(
    nbdict: Any, version: int, version_minor: int, relax_add_props: bool
) -> int:
    """
    This function tries to extract metadata errors from the validator and fix
    them if necessary. This mostly mean stripping unknown keys from metadata
    fields, or removing metadata fields altogether.

    ``nbdict`` is modified in place. Cells that cannot be inspected (not a
    dict, or without a usable ``cell_type``) are left alone so that validation
    can report them.

    Parameters
    ----------
    nbdict : dict
        notebook document
    version : int
    version_minor : int
    relax_add_props : bool
        Whether to allow extra property in the Json schema validating the
        notebook.

    Returns
    -------
    int
        number of modifications

    Raises
    ------
    ValidationError
        If no schema is available for the requested version.

    """
    errors = _get_errors(nbdict, version, version_minor, relax_add_props)
    # Only the presence of an error matters here.
    if not any(True for _ in errors):
        return 0

    # jsonschema gives a better error tree.
    validator = get_validator(
        version=version,
        version_minor=version_minor,
        relax_add_props=relax_add_props,
        name="jsonschema",
    )
    if not validator:
        msg = f"No jsonschema for validating v{version}.{version_minor} notebooks"
        raise ValidationError(msg)
    error_tree = validator.error_tree(validator.iter_errors(nbdict))

    changes = 0
    if "metadata" in error_tree:
        for key in error_tree["metadata"]:
            nbdict["metadata"].pop(key, None)
            changes += 1

    if "cells" in error_tree:
        cells = nbdict.get("cells")
        if not isinstance(cells, list):
            # Nothing to walk; the malformed "cells" value is reported by validation.
            cells = []
        for cell_idx, cell in enumerate(cells):
            # Cells don't report individual metadata keys as having failed validation
            # Instead it reports that it failed to validate against each cell-type definition.
            # We have to delve into why those definitions failed to uncover which metadata
            # keys are misbehaving.
            cell_errors = error_tree["cells"][cell_idx].errors
            if "oneOf" not in cell_errors or not isinstance(cell, dict):
                continue
            one_of_error = cell_errors["oneOf"]
            # A cell failing `oneOf` may lack `cell_type` altogether; `.get`
            # then yields a name that matches no definition below.
            intended_cell_type = cell.get("cell_type")
            schemas_by_index = [ref["$ref"] for ref in one_of_error.schema["oneOf"]]
            cell_type_definition_name = f"#/definitions/{intended_cell_type}_cell"
            if cell_type_definition_name not in schemas_by_index:
                continue
            schema_index = schemas_by_index.index(cell_type_definition_name)
            for error in one_of_error.context:
                rel_path = error.relative_path
                error_for_intended_schema = error.schema_path[0] == schema_index
                is_top_level_metadata_key = len(rel_path) == 2 and rel_path[0] == "metadata"
                if error_for_intended_schema and is_top_level_metadata_key:
                    cell["metadata"].pop(rel_path[1], None)
                    changes += 1

    return changes
609
610
def iter_validate(
    nbdict=None,
    ref=None,
    version=None,
    version_minor=None,
    relax_add_props=False,
    nbjson=None,
    strip_invalid_metadata=False,
):
    """Check a notebook dict-like object against the relevant notebook format
    schema, yielding every ValidationError found.

    ``nbjson`` is a backwards-compatible alias used only when ``nbdict`` is
    None. When ``ref`` is given, validation is restricted to
    ``#/definitions/<ref>``. When ``version`` is omitted, both version numbers
    are read from the notebook.

    Notes
    -----
    To fix: For security reasons, this function should *never* mutate its `nbdict` argument, and
    should *never* try to validate a mutated or modified version of its notebook.

    """
    # backwards compatibility for nbjson argument
    if nbdict is None:
        if nbjson is None:
            msg = "iter_validate() missing 1 required argument: 'nbdict'"
            raise TypeError(msg)
        nbdict = nbjson

    if version is None:
        version, version_minor = get_version(nbdict)

    extra_args = ()
    if ref:
        extra_args = ({"$ref": "#/definitions/%s" % ref},)
    elif strip_invalid_metadata:
        # Validation below then runs on the stripped notebook, which ensures
        # that removing metadata didn't cause another validation issue and
        # that higher-level errors caused by the removed metadata are gone.
        _strip_invalida_metadata(nbdict, version, version_minor, relax_add_props)

    try:
        errors = _get_errors(nbdict, version, version_minor, relax_add_props, *extra_args)
    except ValidationError as e:
        yield e
        return

    for error in errors:
        yield better_validation_error(error, version, version_minor)