From cee0ed1e28535b07835269ba5f4a1ac5856cb545 Mon Sep 17 00:00:00 2001
From: Philip Hackstock <20710924+phackstock@users.noreply.github.com>
Date: Thu, 29 Aug 2024 17:13:07 +0200
Subject: [PATCH] Support data validation using value and tolerance (subclass
 implementation) (#371)

* Add check that some validation criteria must exist
* Add more tests
* Make black
* No reason not to allow both rtol and atol
* Fix failing tests
* Make bounds-fail-validation more generic
* Implement validation by value and tolerance
* Make ruff
* Update docstring
* Don't allow extra arguments to IamcDataFilter
* Make field optional
* Make black
* Separate DataValidationCriteria into separate classes
* Sort imports
* Use pydantic functionality for model_dump
* Clean up DataValidationCriteria classes
* Add explicit criteria check
* Use property
* Update error messages
* Apply suggestions from code review

Co-authored-by: Daniel Huppmann

* Apply black
* Disable Windows tests for now as there's a GH issue

---------

Co-authored-by: Daniel Huppmann
---
 .github/workflows/pytest.yml                  |  2 +-
 nomenclature/processor/data_validator.py      | 72 ++++++++++++++++---
 nomenclature/processor/iamc.py                |  6 +-
 .../validate_bounds_and_rtol.yaml             |  5 ++
 .../validate_bounds_and_value.yaml            |  5 ++
 ...s.yaml => validate_data_fails_bounds.yaml} |  4 +-
 .../validate_data_fails_value.yaml            | 12 ++++
 .../validate_missing_criteria.yaml            |  2 +
 tests/test_cli.py                             |  2 +-
 tests/test_validate_data.py                   | 40 +++++++++--
 10 files changed, 130 insertions(+), 20 deletions(-)
 create mode 100644 tests/data/validation/validate_data/validate_bounds_and_rtol.yaml
 create mode 100644 tests/data/validation/validate_data/validate_bounds_and_value.yaml
 rename tests/data/validation/validate_data/{validate_data_fails.yaml => validate_data_fails_bounds.yaml} (61%)
 create mode 100644 tests/data/validation/validate_data/validate_data_fails_value.yaml
 create mode 100644 tests/data/validation/validate_data/validate_missing_criteria.yaml

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index 0a788026..3b1f9f38 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -15,7 +15,7 @@ jobs:
         shell: bash
     strategy:
       matrix:
-        os: ["macos", "ubuntu", "windows"]
+        os: ["macos", "ubuntu"]
         # keep consistent with py-version badge in README.md and docs/index.rst
         python-version: ["3.10", "3.11", "3.12"]
       fail-fast: false
diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py
index 8afab90d..17b9624f 100644
--- a/nomenclature/processor/data_validator.py
+++ b/nomenclature/processor/data_validator.py
@@ -1,34 +1,88 @@
 import logging
-from pathlib import Path
 import textwrap
-from typing import List, Union
+from pathlib import Path
+from typing import List, Optional, Union
 
 import yaml
 from pyam import IamDataFrame
 from pyam.logging import adjust_log_level
+from pydantic import computed_field, field_validator, model_validator
 
 from nomenclature.definition import DataStructureDefinition
 from nomenclature.error import ErrorCollector
-from nomenclature.processor.iamc import IamcDataFilter
 from nomenclature.processor import Processor
+from nomenclature.processor.iamc import IamcDataFilter
 from nomenclature.processor.utils import get_relative_path
 
 logger = logging.getLogger(__name__)
 
 
-class DataValidationCriteria(IamcDataFilter):
-    """Data validation criteria"""
+class DataValidationCriteriaValue(IamcDataFilter):
+    value: float
+    rtol: float = 0.0
+    atol: float = 0.0
+
+    @property
+    def tolerance(self) -> float:
+        return self.value * self.rtol + self.atol
+
+    @computed_field
+    def upper_bound(self) -> float:
+        return self.value + self.tolerance
+
+    @computed_field
+    def lower_bound(self) -> float:
+        return self.value - self.tolerance
+
+    @property
+    def validation_args(self):
+        return self.model_dump(
+            exclude_none=True,
+            exclude_unset=True,
+            exclude=["value", "rtol", "atol"],
+        )
 
-    upper_bound: float = None
-    lower_bound: float = None
+    @property
+    def criteria(self):
+        return self.model_dump(
+            exclude_none=True,
+            exclude_unset=True,
+            exclude=["lower_bound", "upper_bound"],
+        )
+
+
+class DataValidationCriteriaBounds(IamcDataFilter):
+    upper_bound: Optional[float] = None
+    lower_bound: Optional[float] = None
+
+    @model_validator(mode="after")
+    def check_validation_criteria_exist(self):
+        if self.upper_bound is None and self.lower_bound is None:
+            raise ValueError("No validation criteria provided: " + str(self.criteria))
+        return self
+
+    @property
+    def validation_args(self):
+        return self.criteria
 
 
 class DataValidator(Processor):
     """Processor for validating IAMC datapoints"""
 
-    criteria_items: List[DataValidationCriteria]
+    criteria_items: List[DataValidationCriteriaBounds | DataValidationCriteriaValue]
     file: Path
 
+    @field_validator("criteria_items", mode="before")
+    def check_criteria(cls, v):
+        for criterion in v:
+            has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"])
+            has_values = any(c in criterion for c in ["value", "atol", "rtol"])
+            if has_bounds and has_values:
+                raise ValueError(
+                    f"Cannot use bounds and value-criteria simultaneously: {criterion}"
+                )
+        return v
+
     @classmethod
     def from_file(cls, file: Union[Path, str]) -> "DataValidator":
         with open(file, "r") as f:
@@ -40,7 +94,7 @@ def apply(self, df: IamDataFrame) -> IamDataFrame:
 
         with adjust_log_level():
             for item in self.criteria_items:
-                failed_validation = df.validate(**item.criteria)
+                failed_validation = df.validate(**item.validation_args)
                 if failed_validation is not None:
                     error_list.append(
                         "  Criteria: "
diff --git a/nomenclature/processor/iamc.py b/nomenclature/processor/iamc.py
index 163b22a0..1eba7abc 100644
--- a/nomenclature/processor/iamc.py
+++ b/nomenclature/processor/iamc.py
@@ -1,5 +1,5 @@
 from typing import List
-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel, ConfigDict, field_validator
 
 from pyam import IAMC_IDX
 
@@ -7,6 +7,8 @@
 
 
 class IamcDataFilter(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+
     model: List[str] | None = None
     scenario: List[str] | None = None
     region: List[str] | None = None
@@ -21,7 +23,7 @@ def single_input_to_list(cls, v):
 
     @property
     def criteria(self):
-        return dict(item for item in self.model_dump().items() if item[1] is not None)
+        return self.model_dump(exclude_none=True, exclude_unset=True)
 
     def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
         error_msg = ""
diff --git a/tests/data/validation/validate_data/validate_bounds_and_rtol.yaml b/tests/data/validation/validate_data/validate_bounds_and_rtol.yaml
new file mode 100644
index 00000000..08c24978
--- /dev/null
+++ b/tests/data/validation/validate_data/validate_bounds_and_rtol.yaml
@@ -0,0 +1,5 @@
+  - variable: Final Energy
+    year: 2010
+    upper_bound: 2.5
+    lower_bound: 1
+    rtol: 0.5
diff --git a/tests/data/validation/validate_data/validate_bounds_and_value.yaml b/tests/data/validation/validate_data/validate_bounds_and_value.yaml
new file mode 100644
index 00000000..7281d50f
--- /dev/null
+++ b/tests/data/validation/validate_data/validate_bounds_and_value.yaml
@@ -0,0 +1,5 @@
+  - variable: Final Energy
+    year: 2010
+    upper_bound: 2.5
+    lower_bound: 1
+    value: 1.5
diff --git a/tests/data/validation/validate_data/validate_data_fails.yaml b/tests/data/validation/validate_data/validate_data_fails_bounds.yaml
similarity index 61%
rename from tests/data/validation/validate_data/validate_data_fails.yaml
rename to tests/data/validation/validate_data/validate_data_fails_bounds.yaml
index e576e3a4..a6a79939 100644
--- a/tests/data/validation/validate_data/validate_data_fails.yaml
+++ b/tests/data/validation/validate_data/validate_data_fails_bounds.yaml
@@ -1,10 +1,10 @@
-  # 2005 value passes the validation, but the 2010 value does not
+  # 2005 value passes the validation, fails validation for 2010 for both scenarios
   - variable: Primary Energy
     upper_bound: 5.
 # variable exists only for 'scen_a'
   - variable: Primary Energy|Coal
     lower_bound: 2
-# both upper and lower bound are triggered
+# both upper and lower bound fail for both scenarios
   - variable: Primary Energy
     year: 2005
     upper_bound: 1.9
diff --git a/tests/data/validation/validate_data/validate_data_fails_value.yaml b/tests/data/validation/validate_data/validate_data_fails_value.yaml
new file mode 100644
index 00000000..b2e614e1
--- /dev/null
+++ b/tests/data/validation/validate_data/validate_data_fails_value.yaml
@@ -0,0 +1,12 @@
+  # 2005 value passes the validation, fails validation for 2010 for both scenarios
+  - variable: Primary Energy
+    value: 2.
+    atol: 1.
+# variable exists only for 'scen_a', fails validation for 2005
+  - variable: Primary Energy|Coal
+    value: 3
+# both upper and lower bound fail for both scenarios
+  - variable: Primary Energy
+    year: 2005
+    value: 1.5
+    rtol: 0.2
diff --git a/tests/data/validation/validate_data/validate_missing_criteria.yaml b/tests/data/validation/validate_data/validate_missing_criteria.yaml
new file mode 100644
index 00000000..cc9f84a6
--- /dev/null
+++ b/tests/data/validation/validate_data/validate_missing_criteria.yaml
@@ -0,0 +1,2 @@
+  - variable: Final Energy
+    year: 2010
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 6687140a..86c5b8d9 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -266,7 +266,7 @@ def test_cli_validate_data_fails():
     )
     assert cli_result.exit_code == 1
 
-    assert "Collected 2 errors" in str(cli_result.exception)
+    assert "Collected 5 errors" in str(cli_result.exception)
     assert "Asia" in str(cli_result.exception)
     assert "Final Energy|Industry" in str(cli_result.exception)
diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py
index 2e451a78..cf547680 100644
--- a/tests/test_validate_data.py
+++ b/tests/test_validate_data.py
@@ -30,6 +30,19 @@ def test_DataValidator_from_file():
     assert obs.validate_with_definition(dsd) is None
 
 
+@pytest.mark.parametrize(
+    "name, match",
+    [
+        ("missing_criteria", "No validation criteria provided:"),
+        ("bounds_and_value", "Cannot use bounds and value-criteria simultaneously:"),
+        ("bounds_and_rtol", "Cannot use bounds and value-criteria simultaneously:"),
+    ],
+)
+def test_DataValidator_illegal_structure(name, match):
+    with pytest.raises(ValueError, match=match):
+        DataValidator.from_file(DATA_VALIDATION_TEST_DIR / f"validate_{name}.yaml")
+
+
 @pytest.mark.parametrize(
     "dimension, match",
     [
@@ -68,21 +81,38 @@ def test_DataValidator_apply_no_matching_data(simple_df):
     assert data_validator.apply(simple_df) == simple_df
 
 
-def test_DataValidator_apply_fails(simple_df, caplog):
-    data_file = DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml"
+@pytest.mark.parametrize(
+    "file, item_1, item_2, item_3",
+    [
+        (
+            "bounds",
+            "upper_bound: 5.0",
+            "lower_bound: 2.0",
+            "upper_bound: 1.9, lower_bound: 1.1",
+        ),
+        (
+            "value",
+            "value: 2.0, atol: 1.0",
+            "value: 3.0",
+            "value: 1.5, rtol: 0.2",
+        ),
+    ],
+)
+def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, caplog):
+    data_file = DATA_VALIDATION_TEST_DIR / f"validate_data_fails_{file}.yaml"
     data_validator = DataValidator.from_file(data_file)
 
     failed_validation_message = f"""Failed data validation (file {data_file.relative_to(Path.cwd())}):
-  Criteria: variable: ['Primary Energy'], upper_bound: 5.0
+  Criteria: variable: ['Primary Energy'], {item_1}
       model scenario region        variable   unit  year  value
 0   model_a   scen_a  World  Primary Energy  EJ/yr  2010    6.0
 1   model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0
 
-  Criteria: variable: ['Primary Energy|Coal'], lower_bound: 2.0
+  Criteria: variable: ['Primary Energy|Coal'], {item_2}
       model scenario region             variable   unit  year  value
 0   model_a   scen_a  World  Primary Energy|Coal  EJ/yr  2005    0.5
 
-  Criteria: variable: ['Primary Energy'], year: [2005], upper_bound: 1.9, lower_bound: 1.1
+  Criteria: variable: ['Primary Energy'], year: [2005], {item_3}
       model scenario region        variable   unit  year  value
 0   model_a   scen_a  World  Primary Energy  EJ/yr  2005    1.0
 1   model_a   scen_b  World  Primary Energy  EJ/yr  2005    2.0"""