From 529b0cfce47303b221e22576e864ce24a8d21f48 Mon Sep 17 00:00:00 2001 From: martonvago <57952344+martonvago@users.noreply.github.com> Date: Wed, 5 Nov 2025 13:56:36 +0000 Subject: [PATCH 1/2] feat: :sparkles: handle grouped errors under resource fields (#175) # Description This PR adds a function for handling grouped errors under `$.resources[x].schema.fields[x]`. Here, the problem comes from the fact that each field type has its own sub-JSON-schema, and each one of these sub-schemas flags issues when the type of a field is not its own type. So, if a field has `type="number"` and there is something wrong with the field, then the sub-schemas for `year`, `string`, etc. will also flag issues. The goal is to flag issues only for `number`. Part of #15 Needs an in-depth review. ## Checklist - [x] Formatted Markdown - [x] Ran `just run-all` --- src/check_datapackage/check.py | 64 ++++++++++++++++++++++++++++-- src/check_datapackage/constants.py | 18 +++++++++ tests/test_check.py | 59 +++++++++++++++++++++++++++ 3 files changed, 138 insertions(+), 3 deletions(-) diff --git a/src/check_datapackage/check.py b/src/check_datapackage/check.py index 0689b96c..c0fd1994 100644 --- a/src/check_datapackage/check.py +++ b/src/check_datapackage/check.py @@ -8,7 +8,11 @@ from jsonschema import Draft7Validator, FormatChecker, ValidationError from check_datapackage.config import Config -from check_datapackage.constants import DATA_PACKAGE_SCHEMA_PATH, GROUP_ERRORS +from check_datapackage.constants import ( + DATA_PACKAGE_SCHEMA_PATH, + FIELD_TYPES, + GROUP_ERRORS, +) from check_datapackage.exclusion import exclude from check_datapackage.extensions import apply_extensions from check_datapackage.internals import ( @@ -151,6 +155,7 @@ class SchemaError: schema_path (str): The path to the violated check in the JSON schema. Path components are separated by '/'. jsonpath (str): The JSON path to the field that violates the check. + instance (Any): The part of the object that failed the check. parent (Optional[SchemaError]): The error group the error belongs to, if any. """ @@ -158,6 +163,7 @@ class SchemaError: type: str schema_path: str jsonpath: str + instance: Any parent: Optional["SchemaError"] = None @@ -193,7 +199,7 @@ def _handle_S_resources_x( ) -> SchemaErrorEdits: """Do not flag missing `path` and `data` separately.""" edits = SchemaErrorEdits() - errors_in_group = _filter(schema_errors, lambda error: error.parent == parent_error) + errors_in_group = _get_errors_in_group(schema_errors, parent_error) # If the parent error is caused by other errors, remove it if errors_in_group: edits.remove.append(parent_error) @@ -212,6 +218,7 @@ def _handle_S_resources_x( type="required", jsonpath=parent_error.jsonpath, schema_path=parent_error.schema_path, + instance=parent_error.instance, ) ) @@ -230,7 +237,7 @@ def _handle_S_resources_x_path( If `path` is an array, flag errors for the array-based schema. """ edits = SchemaErrorEdits() - errors_in_group = _filter(schema_errors, lambda error: error.parent == parent_error) + errors_in_group = _get_errors_in_group(schema_errors, parent_error) type_errors = _filter(errors_in_group, _is_path_type_error) only_type_errors = len(errors_in_group) == len(type_errors) @@ -246,6 +253,7 @@ def _handle_S_resources_x_path( type="type", jsonpath=type_errors[0].jsonpath, schema_path=type_errors[0].schema_path, + instance=parent_error.instance, ) ) @@ -254,11 +262,54 @@ def _handle_S_resources_x_path( return edits +def _handle_S_resources_x_schema_fields_x( + parent_error: SchemaError, + schema_errors: list[SchemaError], +) -> SchemaErrorEdits: + """Only flag errors for the relevant field type. + + E.g., if the field type is `string`, flag errors for the string-based schema only. + """ + edits = SchemaErrorEdits() + errors_in_group = _get_errors_in_group(schema_errors, parent_error) + edits.remove.append(parent_error) + + field_type: str = parent_error.instance.get("type", "string") + + # The field's type is unknown + if field_type not in FIELD_TYPES: + unknown_field_error = SchemaError( + message=( + "The type property in this resource schema field is incorrect. " + f"The value can only be one of these types: {', '.join(FIELD_TYPES)}." + ), + type="enum", + jsonpath=f"{parent_error.jsonpath}.type", + schema_path=parent_error.schema_path, + instance=parent_error.instance, + ) + # Replace all errors with an unknown field error + edits.add.append(unknown_field_error) + edits.remove.extend(errors_in_group) + return edits + + # The field's type is known; keep only errors for this field type + schema_index = FIELD_TYPES.index(field_type) + + errors_for_other_types = _filter( + errors_in_group, + lambda error: f"fields/items/oneOf/{schema_index}/" not in error.schema_path, + ) + edits.remove.extend(errors_for_other_types) + return edits + + _schema_path_to_handler: list[ tuple[str, Callable[[SchemaError, list[SchemaError]], SchemaErrorEdits]] ] = [ ("resources/items/oneOf", _handle_S_resources_x), ("resources/items/properties/path/oneOf", _handle_S_resources_x_path), + ("fields/items/oneOf", _handle_S_resources_x_schema_fields_x), ] @@ -330,6 +381,7 @@ def _create_schema_error(error: ValidationError) -> SchemaError: type=str(error.validator), jsonpath=_get_full_json_path_from_error(error), schema_path="/".join(_map(error.absolute_schema_path, str)), + instance=error.instance, parent=_create_schema_error(error.parent) if error.parent else None, # type: ignore[arg-type] ) @@ -348,3 +400,9 @@ def _create_issue(error: SchemaError) -> Issue: jsonpath=error.jsonpath, type=error.type, ) + + +def _get_errors_in_group( + schema_errors: list[SchemaError], parent_error: SchemaError +) -> list[SchemaError]: + return _filter(schema_errors, lambda error: error.parent == parent_error) diff --git a/src/check_datapackage/constants.py b/src/check_datapackage/constants.py index a2150c8e..59ce6603 100644 --- a/src/check_datapackage/constants.py +++ b/src/check_datapackage/constants.py @@ -6,3 +6,21 @@ DATA_PACKAGE_SCHEMA_PATH = Path( str(files("check_datapackage.schemas").joinpath("data-package-2-0.json")) ) + +FIELD_TYPES = [ + "string", + "number", + "integer", + "date", + "time", + "datetime", + "year", + "yearmonth", + "boolean", + "object", + "geopoint", + "geojson", + "array", + "duration", + "any", +] diff --git a/tests/test_check.py b/tests/test_check.py index d2e3021d..82156784 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -4,6 +4,7 @@ from check_datapackage.check import DataPackageError, check from check_datapackage.config import Config +from check_datapackage.constants import FIELD_TYPES from check_datapackage.examples import ( example_package_properties, example_resource_properties, @@ -274,6 +275,64 @@ def test_fail_with_bad_resource_path(path, location, type): assert issues[0].jsonpath == location +def test_fail_empty_field(): + properties = example_package_properties() + properties["resources"][0]["schema"]["fields"][0] = {} + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].type == "required" + assert issues[0].jsonpath == "$.resources[0].schema.fields[0].name" + + +def test_fail_unknown_field(): + properties = example_package_properties() + properties["resources"][0]["schema"]["fields"][0]["type"] = "unknown" + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].type == "enum" + assert issues[0].jsonpath == "$.resources[0].schema.fields[0].type" + + +@mark.parametrize("type", FIELD_TYPES) +def test_fail_field_with_bad_property(type): + properties = example_package_properties() + properties["resources"][0]["schema"]["fields"][0]["type"] = type + properties["resources"][0]["schema"]["fields"][0]["title"] = 4 + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].type == "type" + assert issues[0].jsonpath == "$.resources[0].schema.fields[0].title" + + +def test_fail_field_with_bad_format(): + properties = example_package_properties() + properties["resources"][0]["schema"]["fields"][0]["format"] = 4 + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].type == "enum" + assert issues[0].jsonpath == "$.resources[0].schema.fields[0].format" + + +def test_fail_unknown_field_with_bad_property(): + properties = example_package_properties() + properties["resources"][0]["schema"]["fields"][0]["title"] = 4 + properties["resources"][0]["schema"]["fields"][0]["type"] = "unknown" + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].type == "enum" + assert issues[0].jsonpath == "$.resources[0].schema.fields[0].type" + + def test_error_as_true(): properties = { "name": 123, From cdf83bfab56b8bec6ad54f0abfcbe8456dce14d2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 5 Nov 2025 13:57:08 +0000 Subject: [PATCH 2/2] build(version): :bookmark: update version from 0.13.0 to 0.14.0 --- CHANGELOG.md | 6 ++++++ pyproject.toml | 2 +- uv.lock | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d8910d8..0eaf6136 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,12 @@ often, sometimes several in a day. It also means any individual release will not have many changes within it. Below is a list of releases along with what was changed within it. +## 0.14.0 (2025-11-05) + +### Feat + +- :sparkles: handle grouped errors under resource fields (#175) + ## 0.13.0 (2025-11-04) ### Feat diff --git a/pyproject.toml b/pyproject.toml index 9cefb9fd..9802bfc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "check-datapackage" -version = "0.13.0" +version = "0.14.0" # TODO: Add a description of the package. description = "" authors = [ diff --git a/uv.lock b/uv.lock index fa885b12..70c95aea 100644 --- a/uv.lock +++ b/uv.lock @@ -352,7 +352,7 @@ wheels = [ [[package]] name = "check-datapackage" -version = "0.13.0" +version = "0.14.0" source = { editable = "." } dependencies = [ { name = "jsonschema" },