
Commit

Support data validation using value and tolerance (subclass implementation) (#371)

* Add check that some validation criteria must exist

* Add more tests

* Make black

* No reason not to allow both rtol and atol

* Fix failing tests

* Make bounds-fail-validation more generic

* Implement validation by value and tolerance

* Make ruff

* Update docstring

* Don't allow extra arguments to IamcDataFilter

* Make field optional

* Make black

* Separate DataValidationCriteria into separate classes

* Sort imports

* Use pydantic functionality for model_dump

* Clean up DataValidationCriteria classes

* Add explicit criteria check

* Use property

* Update error messages

* Apply suggestions from code review

Co-authored-by: Daniel Huppmann <dh@dergelbesalon.at>

* Apply black

* Disable Windows tests for now as there's a GH issue

---------

Co-authored-by: Daniel Huppmann <dh@dergelbesalon.at>
phackstock and danielhuppmann authored Aug 29, 2024
1 parent 18c0b12 commit cee0ed1
Showing 10 changed files with 130 additions and 20 deletions.
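
In short, a criteria item can now be given either as explicit bounds (upper_bound / lower_bound) or as a value with tolerances (value plus optional rtol / atol); the latter is translated into bounds internally. A minimal standalone sketch of that translation, mirroring the tolerance logic added in data_validator.py below (illustration only, not part of the diff):

def value_to_bounds(value: float, rtol: float = 0.0, atol: float = 0.0) -> tuple[float, float]:
    # tolerance as implemented in DataValidationCriteriaValue.tolerance
    tolerance = value * rtol + atol
    return value - tolerance, value + tolerance

print(value_to_bounds(1.5, rtol=0.2))  # (1.2, 1.8)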
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
@@ -15,7 +15,7 @@ jobs:
        shell: bash
    strategy:
      matrix:
        os: ["macos", "ubuntu", "windows"]
        os: ["macos", "ubuntu"]
        # keep consistent with py-version badge in README.md and docs/index.rst
        python-version: ["3.10", "3.11", "3.12"]
      fail-fast: false
72 changes: 63 additions & 9 deletions nomenclature/processor/data_validator.py
@@ -1,34 +1,88 @@
import logging
from pathlib import Path
import textwrap
from typing import List, Union
from pathlib import Path
from typing import List, Optional, Union

import yaml
from pyam import IamDataFrame
from pyam.logging import adjust_log_level
from pydantic import computed_field, field_validator, model_validator

from nomenclature.definition import DataStructureDefinition
from nomenclature.error import ErrorCollector
from nomenclature.processor.iamc import IamcDataFilter
from nomenclature.processor import Processor
from nomenclature.processor.iamc import IamcDataFilter
from nomenclature.processor.utils import get_relative_path

logger = logging.getLogger(__name__)


class DataValidationCriteria(IamcDataFilter):
    """Data validation criteria"""
class DataValidationCriteriaValue(IamcDataFilter):
    value: float
    rtol: float = 0.0
    atol: float = 0.0

    @property
    def tolerance(self) -> float:
        return self.value * self.rtol + self.atol

    @computed_field
    def upper_bound(self) -> float:
        return self.value + self.tolerance

    @computed_field
    def lower_bound(self) -> float:
        return self.value - self.tolerance

    @property
    def validation_args(self):
        return self.model_dump(
            exclude_none=True,
            exclude_unset=True,
            exclude=["value", "rtol", "atol"],
        )

    upper_bound: float = None
    lower_bound: float = None
    @property
    def criteria(self):
        return self.model_dump(
            exclude_none=True,
            exclude_unset=True,
            exclude=["lower_bound", "upper_bound"],
        )


class DataValidationCriteriaBounds(IamcDataFilter):
    upper_bound: Optional[float] = None
    lower_bound: Optional[float] = None

    @model_validator(mode="after")
    def check_validation_criteria_exist(self):
        if self.upper_bound is None and self.lower_bound is None:
            raise ValueError("No validation criteria provided: " + str(self.criteria))
        return self

    @property
    def validation_args(self):
        return self.criteria


class DataValidator(Processor):
    """Processor for validating IAMC datapoints"""

    criteria_items: List[DataValidationCriteria]
    criteria_items: List[DataValidationCriteriaBounds | DataValidationCriteriaValue]
    file: Path

    @field_validator("criteria_items", mode="before")
    def check_criteria(cls, v):
        for criterion in v:
            has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"])
            has_values = any(c in criterion for c in ["value", "atol", "rtol"])
            if has_bounds and has_values:
                raise ValueError(
                    f"Cannot use bounds and value-criteria simultaneously: {criterion}"
                )
        return v

    @classmethod
    def from_file(cls, file: Union[Path, str]) -> "DataValidator":
        with open(file, "r") as f:
@@ -40,7 +94,7 @@ def apply(self, df: IamDataFrame) -> IamDataFrame:

        with adjust_log_level():
            for item in self.criteria_items:
                failed_validation = df.validate(**item.criteria)
                failed_validation = df.validate(**item.validation_args)
                if failed_validation is not None:
                    error_list.append(
                        " Criteria: "
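
For reference, a minimal usage sketch of the resulting processor (file names and input data below are placeholders, not taken from this PR):

from pyam import IamDataFrame
from nomenclature.processor.data_validator import DataValidator

df = IamDataFrame("scenario_data.xlsx")  # placeholder scenario data
validator = DataValidator.from_file("validate_data.yaml")  # criteria items like the yaml files below
validator.apply(df)  # failing datapoints are collected and reported per criteria item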
6 changes: 4 additions & 2 deletions nomenclature/processor/iamc.py
@@ -1,12 +1,14 @@
from typing import List
from pydantic import BaseModel, field_validator
from pydantic import BaseModel, ConfigDict, field_validator

from pyam import IAMC_IDX

from nomenclature.definition import DataStructureDefinition


class IamcDataFilter(BaseModel):
    model_config = ConfigDict(extra="forbid")

    model: List[str] | None = None
    scenario: List[str] | None = None
    region: List[str] | None = None
@@ -21,7 +23,7 @@ def single_input_to_list(cls, v):

    @property
    def criteria(self):
        return dict(item for item in self.model_dump().items() if item[1] is not None)
        return self.model_dump(exclude_none=True, exclude_unset=True)

    def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
        error_msg = ""
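
The practical effect of these two changes: misspelled or unknown filter fields are now rejected, and criteria only returns fields that were actually set. A small illustration (filter values are hypothetical):

from nomenclature.processor.iamc import IamcDataFilter

f = IamcDataFilter(variable="Primary Energy", year=2005)
print(f.criteria)  # {'variable': ['Primary Energy'], 'year': [2005]}; unset fields are dropped

IamcDataFilter(variabel="Primary Energy")  # typo in field name now raises a ValidationError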
@@ -0,0 +1,5 @@
- variable: Final Energy
  year: 2010
  upper_bound: 2.5
  lower_bound: 1
  rtol: 0.5
@@ -0,0 +1,5 @@
- variable: Final Energy
  year: 2010
  upper_bound: 2.5
  lower_bound: 1
  value: 1.5
@@ -1,10 +1,10 @@
# 2005 value passes the validation, but the 2010 value does not
# 2005 value passes the validation, fails validation for 2010 for both scenarios
- variable: Primary Energy
  upper_bound: 5.
# variable exists only for 'scen_a'
- variable: Primary Energy|Coal
  lower_bound: 2
# both upper and lower bound are triggered
# both upper and lower bound fail for both scenarios
- variable: Primary Energy
  year: 2005
  upper_bound: 1.9
12 changes: 12 additions & 0 deletions tests/data/validation/validate_data/validate_data_fails_value.yaml
@@ -0,0 +1,12 @@
# 2005 value passes the validation, fails validation for 2010 for both scenarios
- variable: Primary Energy
  value: 2.
  atol: 1.
# variable exists only for 'scen_a', fails validation for 2005
- variable: Primary Energy|Coal
  value: 3
# both upper and lower bound fail for both scenarios
- variable: Primary Energy
  year: 2005
  value: 1.5
  rtol: 0.2
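
For orientation, with tolerance = value * rtol + atol the value-based entries above correspond to the following bounds:

value: 2., atol: 1.    ->  lower_bound 1.0, upper_bound 3.0
value: 3               ->  lower_bound 3.0, upper_bound 3.0 (exact match required)
value: 1.5, rtol: 0.2  ->  lower_bound 1.2, upper_bound 1.8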
@@ -0,0 +1,2 @@
- variable: Final Energy
  year: 2010
2 changes: 1 addition & 1 deletion tests/test_cli.py
@@ -266,7 +266,7 @@ def test_cli_validate_data_fails():
    )

    assert cli_result.exit_code == 1
    assert "Collected 2 errors" in str(cli_result.exception)
    assert "Collected 5 errors" in str(cli_result.exception)
    assert "Asia" in str(cli_result.exception)
    assert "Final Energy|Industry" in str(cli_result.exception)

40 changes: 35 additions & 5 deletions tests/test_validate_data.py
@@ -30,6 +30,19 @@ def test_DataValidator_from_file():
    assert obs.validate_with_definition(dsd) is None


@pytest.mark.parametrize(
    "name, match",
    [
        ("missing_criteria", "No validation criteria provided:"),
        ("bounds_and_value", "Cannot use bounds and value-criteria simultaneously:"),
        ("bounds_and_rtol", "Cannot use bounds and value-criteria simultaneously:"),
    ],
)
def test_DataValidator_illegal_structure(name, match):
    with pytest.raises(ValueError, match=match):
        DataValidator.from_file(DATA_VALIDATION_TEST_DIR / f"validate_{name}.yaml")


@pytest.mark.parametrize(
    "dimension, match",
    [
@@ -68,21 +81,38 @@ def test_DataValidator_apply_no_matching_data(simple_df):
    assert data_validator.apply(simple_df) == simple_df


def test_DataValidator_apply_fails(simple_df, caplog):
    data_file = DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml"
@pytest.mark.parametrize(
    "file, item_1, item_2, item_3",
    [
        (
            "bounds",
            "upper_bound: 5.0",
            "lower_bound: 2.0",
            "upper_bound: 1.9, lower_bound: 1.1",
        ),
        (
            "value",
            "value: 2.0, atol: 1.0",
            "value: 3.0",
            "value: 1.5, rtol: 0.2",
        ),
    ],
)
def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, caplog):
    data_file = DATA_VALIDATION_TEST_DIR / f"validate_data_fails_{file}.yaml"
    data_validator = DataValidator.from_file(data_file)

    failed_validation_message = f"""Failed data validation (file {data_file.relative_to(Path.cwd())}):
Criteria: variable: ['Primary Energy'], upper_bound: 5.0
Criteria: variable: ['Primary Energy'], {item_1}
model scenario region variable unit year value
0 model_a scen_a World Primary Energy EJ/yr 2010 6.0
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0
Criteria: variable: ['Primary Energy|Coal'], lower_bound: 2.0
Criteria: variable: ['Primary Energy|Coal'], {item_2}
model scenario region variable unit year value
0 model_a scen_a World Primary Energy|Coal EJ/yr 2005 0.5
Criteria: variable: ['Primary Energy'], year: [2005], upper_bound: 1.9, lower_bound: 1.1
Criteria: variable: ['Primary Energy'], year: [2005], {item_3}
model scenario region variable unit year value
0 model_a scen_a World Primary Energy EJ/yr 2005 1.0
1 model_a scen_b World Primary Energy EJ/yr 2005 2.0"""
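
To run the tests touched by this PR locally, something along these lines should work (illustrative, assumes a development install of the package):

pytest tests/test_validate_data.py tests/test_cli.py::test_cli_validate_data_fails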
