Merge pull request dandi#16 from candleindark/linkml-error-categorization

Improve categorization of errors from the LinkML validator
candleindark · Oct 23, 2024 · 6ba722e · 6ba722e
2 parents 706cfa6 + 97476f5
commit 6ba722e
Show file tree

Hide file tree

Showing 4 changed files with 213 additions and 14 deletions.
diff --git a/src/dandisets_linkml_status_tools/cli/models.py b/src/dandisets_linkml_status_tools/cli/models.py
@@ -1,6 +1,6 @@
 from collections.abc import Sequence
 from datetime import datetime
-from typing import Annotated, Any, Union
+from typing import Annotated, Any, NamedTuple, Union
 
 from dandi.dandiapi import VersionStatus
 from jsonschema.exceptions import ValidationError
@@ -160,3 +160,26 @@ def dandiset_schema_version(self) -> str:
 
     # Errors encountered in validation against the dandiset metadata model in LinkML
     linkml_validation_errs: LinkmlValidationErrsType = []
+
+
+class JsonschemaValidationErrorType(NamedTuple):
+    """
+    A named tuple for representing types of `jsonschema.exceptions.ValidationError`
+    objects.
+
+    The type of a `jsonschema.exceptions.ValidationError` is decided by the value of its
+    `validator` field and the value of its `validator_value` field. The values
+    of these fields are bundled in an instance of this named tuple to represent a type
+    of `jsonschema.exceptions.ValidationError` objects.
+
+    Two instances are equal only if their `validator` values are equal and their
+    `validator_value` values are equal *and* of exactly the same Python type, so
+    a `validator_value` of `1` is distinct from one of `1.0` or `"1"`. The `!=`
+    operator is always the negation of the `==` operator.
+    """
+
+    validator: str
+    validator_value: Any
+
+    def __eq__(self, other: object) -> bool:
+        """
+        Determine whether this object represents the same error type as `other`
+
+        :param other: The object to compare with
+        :return: `True` if `other` is a `JsonschemaValidationErrorType` with an equal
+            `validator` and a `validator_value` that is both of the same exact type
+            and equal; `False` otherwise
+        """
+        return (
+            isinstance(other, JsonschemaValidationErrorType)
+            and self.validator == other.validator
+            and type(self.validator_value) is type(other.validator_value)  # noqa E721
+            and self.validator_value == other.validator_value
+        )
+
+    def __ne__(self, other: object) -> bool:
+        """
+        Determine whether this object represents a different error type than `other`
+
+        :param other: The object to compare with
+        :return: The negation of the result of comparing with `==`
+        """
+        # `tuple` defines its own `__ne__`, which would otherwise be inherited and
+        # compare the fields without the strict type check done in `__eq__`. That
+        # makes both `==` and `!=` evaluate to `False` for values such as `1` and
+        # `1.0`, so `!=` is defined explicitly in terms of `==`.
+        return not self == other
diff --git a/src/dandisets_linkml_status_tools/cli/tools.py b/src/dandisets_linkml_status_tools/cli/tools.py
@@ -1,13 +1,13 @@
 import json
 import logging
-import re
 from collections import Counter
 from collections.abc import Iterable
 from copy import deepcopy
 from functools import partial
+from itertools import chain
 from pathlib import Path
 from shutil import rmtree
-from typing import Any, Optional
+from typing import Any, NamedTuple, Optional
 
 from dandi.dandiapi import RemoteDandiset
 from dandischema.models import Dandiset
@@ -29,6 +29,7 @@
 
 from dandisets_linkml_status_tools.cli.models import (
     DandisetValidationReport,
+    JsonschemaValidationErrorType,
     LinkmlValidationErrsType,
     PydanticValidationErrsType,
     dandiset_metadata_adapter,
@@ -288,7 +289,7 @@ def output_reports(reports: list[DandisetValidationReport], output_path: Path) -
                     # For the linkml column
                     (
                         f"[{len(r.linkml_validation_errs)} "
-                        f"({' + '.join(str(v) for v in linkml_err_counts.values())})]"
+                        f"({' + '.join(str(c) for _, c in linkml_err_counts)})]"
                         f"({version_dir}/linkml_validation_errs.yaml)"
                         if r.linkml_validation_errs
                         else "0"
@@ -360,16 +361,69 @@ def get_pydantic_err_counts(errs: PydanticValidationErrsType) -> Counter[str]:
     return Counter(isorted(e["type"] for e in errs))
 
 
-def get_linkml_err_counts(errs: LinkmlValidationErrsType) -> Counter[str]:
+class _JsonschemaValidationErrorCounts(NamedTuple):
+    """
+    A record of the counts of individual types of JSON schema validation error
+
+    The `types` and `counts` fields are parallel lists: `counts[i]` is the count of
+    the errors of the type `types[i]`.
     """
-    Get a `Counter` object that counts the LinkML validation errors by type
-    :param errs: The list of LinkML validation errors to be counted
-    :return: The `Counter` object

-    Notes: The determination of the type of a LinkML validation error is rather
-        rudimentary at this point.
+    types: list[JsonschemaValidationErrorType]
     """
-    linkml_err_types = [
-        re.sub(r".*(is .*) in \S.*", r"\1", e.message, count=1) for e in errs
-    ]
-    return Counter(isorted(linkml_err_types))
+    The unique types of JSON schema validation errors
+    """
+
+    counts: list[int]
+    """
+    The corresponding counts, by index, of the types of JSON schema validation errors
+    """
+
+
+def get_linkml_err_counts(
+    errs: LinkmlValidationErrsType,
+) -> list[tuple[JsonschemaValidationErrorType, int]]:
+    """
+    Tally the given LinkML validation errors by type
+
+    The type of an error is the `JsonschemaValidationErrorType` built from the
+    `validator` and `validator_value` of the error's `source`.
+
+    :param errs: The LinkML validation errors to tally
+    :return: A list of `(error type, count)` tuples, one per distinct error type,
+        sorted by `validator` and then by descending count
+    """
+    # Tallies keyed by `validator`. Each value holds the distinct error types seen
+    # for that `validator` together with their counts in a parallel list.
+    tallies: dict[str, _JsonschemaValidationErrorCounts] = {}
+
+    for err in errs:
+        source = err.source
+        validator = source.validator
+        err_type = JsonschemaValidationErrorType(validator, source.validator_value)
+
+        tally = tallies.get(validator)
+        if tally is None:
+            # First error encountered for this `validator`
+            tallies[validator] = _JsonschemaValidationErrorCounts(
+                types=[err_type], counts=[1]
+            )
+            continue
+
+        # Look for an already recorded type that compares equal with `==`
+        for idx, known_type in enumerate(tally.types):
+            if known_type == err_type:
+                tally.counts[idx] += 1
+                break
+        else:
+            # No recorded type matched, so start a new tally for this type
+            tally.types.append(err_type)
+            tally.counts.append(1)
+
+    # Flatten the per-validator tallies into `(error type, count)` pairs
+    pairs = list(
+        chain.from_iterable(
+            zip(tally.types, tally.counts) for tally in tallies.values()
+        )
+    )
+    # The sort is stable, so pairs with the same `validator` and count keep the
+    # order in which their types were first encountered
+    pairs.sort(key=lambda pair: (pair[0].validator, -pair[1]))
+    return pairs
diff --git a/tests/test_cli/test_models.py b/tests/test_cli/test_models.py
@@ -0,0 +1,41 @@
+import pytest
+
+from dandisets_linkml_status_tools.cli.models import JsonschemaValidationErrorType
+
+
+@pytest.mark.parametrize(
+    ("op1", "op2", "expected_result"),
+    [
+        # Compared with an object that is not a `JsonschemaValidationErrorType`
+        pytest.param(JsonschemaValidationErrorType("integer", [1, 2]), "hello", False),
+        # Different `validator`, same `validator_value`
+        pytest.param(
+            JsonschemaValidationErrorType("integer", 1),
+            JsonschemaValidationErrorType("string", 1),
+            False,
+        ),
+        # Same `validator`, `validator_value` of different types
+        pytest.param(
+            JsonschemaValidationErrorType("integer", 1),
+            JsonschemaValidationErrorType("integer", "1"),
+            False,
+        ),
+        # Same `validator`, `validator_value` of the same type but unequal
+        pytest.param(
+            JsonschemaValidationErrorType("integer", 1),
+            JsonschemaValidationErrorType("integer", 2),
+            False,
+        ),
+        # Same `validator` and equal scalar `validator_value`
+        pytest.param(
+            JsonschemaValidationErrorType("integer", 42),
+            JsonschemaValidationErrorType("integer", 42),
+            True,
+        ),
+        # Same `validator` and equal list `validator_value`
+        pytest.param(
+            JsonschemaValidationErrorType("integer", [1, 2, 3]),
+            JsonschemaValidationErrorType("integer", [1, 2, 3]),
+            True,
+        ),
+    ],
+)
+def test_jsonschema_validation_error_type_equality(op1, op2, expected_result):
+    """
+    Verify the result of comparing two operands with `==` where the left operand is
+    a `JsonschemaValidationErrorType` object
+    """
+    are_equal = op1 == op2
+    assert are_equal == expected_result
diff --git a/tests/test_cli/test_tools.py b/tests/test_cli/test_tools.py
@@ -0,0 +1,81 @@
+import pytest
+from jsonschema.exceptions import ValidationError
+from linkml.validator.report import Severity, ValidationResult
+
+from dandisets_linkml_status_tools.cli.models import JsonschemaValidationErrorType
+from dandisets_linkml_status_tools.cli.tools import get_linkml_err_counts
+
+
+@pytest.mark.parametrize(
+    ("error_types", "expected_counts"),
+    [
+        # No errors at all
+        pytest.param([], []),
+        # Every error is of a distinct type
+        pytest.param(
+            [
+                JsonschemaValidationErrorType("integer", 1),
+                JsonschemaValidationErrorType("integer", 2),
+                JsonschemaValidationErrorType("string", "hello"),
+            ],
+            [
+                (JsonschemaValidationErrorType("integer", 1), 1),
+                (JsonschemaValidationErrorType("integer", 2), 1),
+                (JsonschemaValidationErrorType("string", "hello"), 1),
+            ],
+        ),
+        # Every error is of the same type
+        pytest.param(
+            [
+                JsonschemaValidationErrorType("integer", 1),
+                JsonschemaValidationErrorType("integer", 1),
+                JsonschemaValidationErrorType("integer", 1),
+            ],
+            [(JsonschemaValidationErrorType("integer", 1), 3)],
+        ),
+        # A mix of repeated and distinct types, including a list and a tuple
+        # `validator_value` with the same elements
+        pytest.param(
+            [
+                JsonschemaValidationErrorType("integer", 1),
+                JsonschemaValidationErrorType("string", "hello"),
+                JsonschemaValidationErrorType("string", "hello"),
+                JsonschemaValidationErrorType("integer", 2),
+                JsonschemaValidationErrorType("integer", 1),
+                JsonschemaValidationErrorType("array", [1, 2, 3]),
+                JsonschemaValidationErrorType("array", (1, 2, 3)),
+            ],
+            [
+                (JsonschemaValidationErrorType("array", [1, 2, 3]), 1),
+                (JsonschemaValidationErrorType("array", (1, 2, 3)), 1),
+                (JsonschemaValidationErrorType("integer", 1), 2),
+                (JsonschemaValidationErrorType("integer", 2), 1),
+                (JsonschemaValidationErrorType("string", "hello"), 2),
+            ],
+        ),
+    ],
+)
+def test_get_linkml_err_counts(
+    error_types: list[JsonschemaValidationErrorType],
+    expected_counts: list[tuple[JsonschemaValidationErrorType, int]],
+):
+    """
+    Verify the counts produced by the `get_linkml_err_counts` function
+
+    :param error_types: The JSON schema validation error types of the errors to be
+        fed, in order, to the function
+    :param expected_counts: The expected return value of the function, a list of
+        tuples of JSON schema validation error types and their counts
+    """
+    # Wrap an artificial JSON schema validation error of each given type in
+    # a LinkML validation result
+    # noinspection PyTypeChecker
+    errs = [
+        ValidationResult(
+            type="jsonschema",
+            severity=Severity.ERROR,
+            message="What need to be fixed",
+            source=ValidationError(
+                message="An artificial error",
+                validator=err_type.validator,
+                validator_value=err_type.validator_value,
+            ),
+        )
+        for err_type in error_types
+    ]
+
+    assert get_linkml_err_counts(errs) == expected_counts