Skip to content

Commit

Permalink
Merge pull request dandi#16 from candleindark/linkml-error-categorization
Browse files Browse the repository at this point in the history

Improve categorization of errors from the LinkML validator
  • Loading branch information
candleindark authored Oct 23, 2024
2 parents 706cfa6 + 97476f5 commit 6ba722e
Show file tree
Hide file tree
Showing 4 changed files with 213 additions and 14 deletions.
25 changes: 24 additions & 1 deletion src/dandisets_linkml_status_tools/cli/models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections.abc import Sequence
from datetime import datetime
from typing import Annotated, Any, Union
from typing import Annotated, Any, NamedTuple, Union

from dandi.dandiapi import VersionStatus
from jsonschema.exceptions import ValidationError
Expand Down Expand Up @@ -160,3 +160,26 @@ def dandiset_schema_version(self) -> str:

# Errors encountered in validation against the dandiset metadata model in LinkML
linkml_validation_errs: LinkmlValidationErrsType = []


class JsonschemaValidationErrorType(NamedTuple):
    """
    A named tuple for representing types of `jsonschema.exceptions.ValidationError`
    objects.

    The type of a `jsonschema.exceptions.ValidationError` is decided by the value of
    its `validator` field and the value of its `validator_value` field. The values
    of these fields are bundled in an instance of this named tuple to represent a
    type of `jsonschema.exceptions.ValidationError` objects.

    Equality is stricter than plain tuple equality: two instances are equal only if
    their `validator` values are equal and their `validator_value` values are equal
    **and** of exactly the same type. For example, a `validator_value` of `1` is not
    considered equal to a `validator_value` of `True` or `1.0`.
    """

    validator: str
    validator_value: Any

    def __eq__(self, other: object) -> bool:
        """
        Tell whether `other` represents the same type of validation error

        :param other: The object to compare with
        :return: `True` if `other` is a `JsonschemaValidationErrorType` with an equal
            `validator` and a `validator_value` that is equal and of exactly the same
            type; `False` otherwise
        """
        return (
            isinstance(other, JsonschemaValidationErrorType)
            and self.validator == other.validator
            # Exact type match is intended; `isinstance` would accept subclasses
            and type(self.validator_value) is type(other.validator_value)  # noqa E721
            and self.validator_value == other.validator_value
        )

    def __ne__(self, other: object) -> bool:
        """
        Tell whether `other` does not represent the same type of validation error

        `tuple` defines its own `__ne__`, which would otherwise be inherited and
        disagree with the stricter `__eq__` defined in this class. This method keeps
        `!=` as the exact negation of `==`.

        :param other: The object to compare with
        :return: `True` if `self == other` is false; `False` otherwise
        """
        return not self.__eq__(other)
80 changes: 67 additions & 13 deletions src/dandisets_linkml_status_tools/cli/tools.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import json
import logging
import re
from collections import Counter
from collections.abc import Iterable
from copy import deepcopy
from functools import partial
from itertools import chain
from pathlib import Path
from shutil import rmtree
from typing import Any, Optional
from typing import Any, NamedTuple, Optional

from dandi.dandiapi import RemoteDandiset
from dandischema.models import Dandiset
Expand All @@ -29,6 +29,7 @@

from dandisets_linkml_status_tools.cli.models import (
DandisetValidationReport,
JsonschemaValidationErrorType,
LinkmlValidationErrsType,
PydanticValidationErrsType,
dandiset_metadata_adapter,
Expand Down Expand Up @@ -288,7 +289,7 @@ def output_reports(reports: list[DandisetValidationReport], output_path: Path) -
# For the linkml column
(
f"[{len(r.linkml_validation_errs)} "
f"({' + '.join(str(v) for v in linkml_err_counts.values())})]"
f"({' + '.join(str(c) for _, c in linkml_err_counts)})]"
f"({version_dir}/linkml_validation_errs.yaml)"
if r.linkml_validation_errs
else "0"
Expand Down Expand Up @@ -360,16 +361,69 @@ def get_pydantic_err_counts(errs: PydanticValidationErrsType) -> Counter[str]:
return Counter(isorted(e["type"] for e in errs))


def get_linkml_err_counts(errs: LinkmlValidationErrsType) -> Counter[str]:
class _JsonschemaValidationErrorCounts(NamedTuple):
"""
A record of the counts of individual types of JSON schema validation error
"""
Get a `Counter` object that counts the LinkML validation errors by type
:param errs: The list of LinkML validation errors to be counted
:return: The `Counter` object

Notes: The determination of the type of a LinkML validation error is rather
rudimentary at this point.
types: list[JsonschemaValidationErrorType]
"""
linkml_err_types = [
re.sub(r".*(is .*) in \S.*", r"\1", e.message, count=1) for e in errs
]
return Counter(isorted(linkml_err_types))
The unique types of JSON schema validation errors
"""

counts: list[int]
"""
The corresponding counts, by index, of the types of JSON schema validation errors
"""


def get_linkml_err_counts(
    errs: LinkmlValidationErrsType,
) -> list[tuple[JsonschemaValidationErrorType, int]]:
    """
    Tally the given LinkML validation errors by JSON schema validation error type

    :param errs: The LinkML validation errors to tally
    :return: A list of `(error type, count)` tuples, one per distinct
        `JsonschemaValidationErrorType` found in `errs`, sorted by the `validator`
        of the error type and then by descending count
    """
    # Running tallies grouped by `validator`. Within a group, error types are
    # matched with `==` through a linear scan rather than used as dictionary keys.
    tallies: dict[str, _JsonschemaValidationErrorCounts] = {}

    for err in errs:
        source = err.source
        validator = source.validator
        err_type = JsonschemaValidationErrorType(validator, source.validator_value)

        group = tallies.get(validator)
        if group is None:
            # First error seen for this `validator`
            tallies[validator] = _JsonschemaValidationErrorCounts(
                types=[err_type], counts=[1]
            )
            continue

        known_idx = next(
            (i for i, known in enumerate(group.types) if known == err_type), None
        )
        if known_idx is None:
            # A new error type under an already seen `validator`
            group.types.append(err_type)
            group.counts.append(1)
        else:
            group.counts[known_idx] += 1

    pairs = [
        pair
        for group in tallies.values()
        for pair in zip(group.types, group.counts)
    ]

    # `sorted` is stable, so pairs that tie on both keys keep their first-seen order
    return sorted(pairs, key=lambda pair: (pair[0].validator, -pair[1]))
41 changes: 41 additions & 0 deletions tests/test_cli/test_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pytest

from dandisets_linkml_status_tools.cli.models import JsonschemaValidationErrorType


@pytest.mark.parametrize(
    ("op1", "op2", "expected_result"),
    [
        # Compared against an object of an unrelated type
        (JsonschemaValidationErrorType("integer", [1, 2]), "hello", False),
        # Different `validator`, same `validator_value`
        (
            JsonschemaValidationErrorType("integer", 1),
            JsonschemaValidationErrorType("string", 1),
            False,
        ),
        # Same `validator`, `validator_value` of different types
        (
            JsonschemaValidationErrorType("integer", 1),
            JsonschemaValidationErrorType("integer", "1"),
            False,
        ),
        # Same `validator`, different `validator_value` of the same type
        (
            JsonschemaValidationErrorType("integer", 1),
            JsonschemaValidationErrorType("integer", 2),
            False,
        ),
        # Identical scalar `validator_value`
        (
            JsonschemaValidationErrorType("integer", 42),
            JsonschemaValidationErrorType("integer", 42),
            True,
        ),
        # Identical list `validator_value`
        (
            JsonschemaValidationErrorType("integer", [1, 2, 3]),
            JsonschemaValidationErrorType("integer", [1, 2, 3]),
            True,
        ),
    ],
)
def test_jsonschema_validation_error_type_equality(op1, op2, expected_result):
    """
    Check that `==` on `JsonschemaValidationErrorType` gives the expected outcome
    """
    if expected_result:
        assert op1 == op2
    else:
        assert not (op1 == op2)
81 changes: 81 additions & 0 deletions tests/test_cli/test_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import pytest
from jsonschema.exceptions import ValidationError
from linkml.validator.report import Severity, ValidationResult

from dandisets_linkml_status_tools.cli.models import JsonschemaValidationErrorType
from dandisets_linkml_status_tools.cli.tools import get_linkml_err_counts


@pytest.mark.parametrize(
    ("error_types", "expected_counts"),
    [
        # No errors at all
        ([], []),
        # All distinct error types
        (
            [
                JsonschemaValidationErrorType("integer", 1),
                JsonschemaValidationErrorType("integer", 2),
                JsonschemaValidationErrorType("string", "hello"),
            ],
            [
                (JsonschemaValidationErrorType("integer", 1), 1),
                (JsonschemaValidationErrorType("integer", 2), 1),
                (JsonschemaValidationErrorType("string", "hello"), 1),
            ],
        ),
        # One error type repeated
        (
            [
                JsonschemaValidationErrorType("integer", 1),
                JsonschemaValidationErrorType("integer", 1),
                JsonschemaValidationErrorType("integer", 1),
            ],
            [(JsonschemaValidationErrorType("integer", 1), 3)],
        ),
        # Mixed and unordered input, including unhashable `validator_value`s
        (
            [
                JsonschemaValidationErrorType("integer", 1),
                JsonschemaValidationErrorType("string", "hello"),
                JsonschemaValidationErrorType("string", "hello"),
                JsonschemaValidationErrorType("integer", 2),
                JsonschemaValidationErrorType("integer", 1),
                JsonschemaValidationErrorType("array", [1, 2, 3]),
                JsonschemaValidationErrorType("array", (1, 2, 3)),
            ],
            [
                (JsonschemaValidationErrorType("array", [1, 2, 3]), 1),
                (JsonschemaValidationErrorType("array", (1, 2, 3)), 1),
                (JsonschemaValidationErrorType("integer", 1), 2),
                (JsonschemaValidationErrorType("integer", 2), 1),
                (JsonschemaValidationErrorType("string", "hello"), 2),
            ],
        ),
    ],
)
def test_get_linkml_err_counts(
    error_types: list[JsonschemaValidationErrorType],
    expected_counts: list[tuple[JsonschemaValidationErrorType, int]],
):
    """
    Check the tallies produced by the `get_linkml_err_counts` function

    :param error_types: The JSON schema validation error types from which artificial
        LinkML validation errors are built, one error per listed type
    :param expected_counts: The `(error type, count)` tuples that
        `get_linkml_err_counts` is expected to return for those errors
    """
    # Wrap an artificial `ValidationError` of each requested type in
    # a `ValidationResult`, which is the form `get_linkml_err_counts` takes
    # noinspection PyTypeChecker
    errs = [
        ValidationResult(
            type="jsonschema",
            severity=Severity.ERROR,
            message="What need to be fixed",
            source=ValidationError(
                message="An artificial error",
                validator=err_type.validator,
                validator_value=err_type.validator_value,
            ),
        )
        for err_type in error_types
    ]

    assert get_linkml_err_counts(errs) == expected_counts

0 comments on commit 6ba722e

Please sign in to comment.