From cee0ed1e28535b07835269ba5f4a1ac5856cb545 Mon Sep 17 00:00:00 2001
From: Philip Hackstock <20710924+phackstock@users.noreply.github.com>
Date: Thu, 29 Aug 2024 17:13:07 +0200
Subject: [PATCH] Support data validation using value and tolerance (subclass
 implementation) (#371)

* Add check that some validation criteria must exist
* Add more tests
* Make black
* No reason not to allow both rtol and atol
* Fix failing tests
* Make bounds-fail-validation more generic
* Implement validation by value and tolerance
* Make ruff
* Update docstring
* Don't allow extra arguments to IamcDataFilter
* Make field optional
* Make black
* Separate DataValidationCriteria into separate classes
* Sort imports
* Use pydantic functionality for model_dump
* Clean up DataValidationCriteria classes
* Add explicit criteria check
* Use property
* Update error messages
* Apply suggestions from code review

Co-authored-by: Daniel Huppmann

* Apply black
* Disable Windows tests for now as there's a GH issue

---------

Co-authored-by: Daniel Huppmann
---
 .github/workflows/pytest.yml                  |  2 +-
 nomenclature/processor/data_validator.py      | 72 ++++++++++++++++---
 nomenclature/processor/iamc.py                |  6 +-
 .../validate_bounds_and_rtol.yaml             |  5 ++
 .../validate_bounds_and_value.yaml            |  5 ++
 ...s.yaml => validate_data_fails_bounds.yaml} |  4 +-
 .../validate_data_fails_value.yaml            | 12 ++++
 .../validate_missing_criteria.yaml            |  2 +
 tests/test_cli.py                             |  2 +-
 tests/test_validate_data.py                   | 40 +++++++++--
 10 files changed, 130 insertions(+), 20 deletions(-)
 create mode 100644 tests/data/validation/validate_data/validate_bounds_and_rtol.yaml
 create mode 100644 tests/data/validation/validate_data/validate_bounds_and_value.yaml
 rename tests/data/validation/validate_data/{validate_data_fails.yaml => validate_data_fails_bounds.yaml} (61%)
 create mode 100644 tests/data/validation/validate_data/validate_data_fails_value.yaml
 create mode 100644 tests/data/validation/validate_data/validate_missing_criteria.yaml

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index 0a788026..3b1f9f38 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -15,7 +15,7 @@ jobs:
         shell: bash
     strategy:
       matrix:
-        os: ["macos", "ubuntu", "windows"]
+        os: ["macos", "ubuntu"]
         # keep consistent with py-version badge in README.md and docs/index.rst
         python-version: ["3.10", "3.11", "3.12"]
       fail-fast: false
diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py
index 8afab90d..17b9624f 100644
--- a/nomenclature/processor/data_validator.py
+++ b/nomenclature/processor/data_validator.py
@@ -1,34 +1,88 @@
 import logging
-from pathlib import Path
 import textwrap
-from typing import List, Union
+from pathlib import Path
+from typing import List, Optional, Union
 
 import yaml
 from pyam import IamDataFrame
 from pyam.logging import adjust_log_level
+from pydantic import computed_field, field_validator, model_validator
 
 from nomenclature.definition import DataStructureDefinition
 from nomenclature.error import ErrorCollector
-from nomenclature.processor.iamc import IamcDataFilter
 from nomenclature.processor import Processor
+from nomenclature.processor.iamc import IamcDataFilter
 from nomenclature.processor.utils import get_relative_path
 
 logger = logging.getLogger(__name__)
 
 
-class DataValidationCriteria(IamcDataFilter):
-    """Data validation criteria"""
+class DataValidationCriteriaValue(IamcDataFilter):
+    value: float
+    rtol: float = 0.0
+    atol: float = 0.0
+
+    @property
+    def tolerance(self) -> float:
+        return self.value * self.rtol + self.atol
+
+    @computed_field
+    def upper_bound(self) -> float:
+        return self.value + self.tolerance
+
+    @computed_field
+    def lower_bound(self) -> float:
+        return self.value - self.tolerance
+
+    @property
+    def validation_args(self):
+        return self.model_dump(
+            exclude_none=True,
+            exclude_unset=True,
+            exclude=["value", "rtol", "atol"],
+        )
 
-    upper_bound: float = None
-    lower_bound: float = None
+    @property
+    def criteria(self):
+        return self.model_dump(
+            exclude_none=True,
+            exclude_unset=True,
+            exclude=["lower_bound", "upper_bound"],
+        )
+
+
+class DataValidationCriteriaBounds(IamcDataFilter):
+    upper_bound: Optional[float] = None
+    lower_bound: Optional[float] = None
+
+    @model_validator(mode="after")
+    def check_validation_criteria_exist(self):
+        if self.upper_bound is None and self.lower_bound is None:
+            raise ValueError("No validation criteria provided: " + str(self.criteria))
+        return self
+
+    @property
+    def validation_args(self):
+        return self.criteria
 
 
 class DataValidator(Processor):
     """Processor for validating IAMC datapoints"""
 
-    criteria_items: List[DataValidationCriteria]
+    criteria_items: List[DataValidationCriteriaBounds | DataValidationCriteriaValue]
     file: Path
 
+    @field_validator("criteria_items", mode="before")
+    def check_criteria(cls, v):
+        for criterion in v:
+            has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"])
+            has_values = any(c in criterion for c in ["value", "atol", "rtol"])
+            if has_bounds and has_values:
+                raise ValueError(
+                    f"Cannot use bounds and value-criteria simultaneously: {criterion}"
+                )
+        return v
+
     @classmethod
     def from_file(cls, file: Union[Path, str]) -> "DataValidator":
         with open(file, "r") as f:
@@ -40,7 +94,7 @@ def apply(self, df: IamDataFrame) -> IamDataFrame:
 
         with adjust_log_level():
             for item in self.criteria_items:
-                failed_validation = df.validate(**item.criteria)
+                failed_validation = df.validate(**item.validation_args)
                 if failed_validation is not None:
                     error_list.append(
                         "  Criteria: "
diff --git a/nomenclature/processor/iamc.py b/nomenclature/processor/iamc.py
index 163b22a0..1eba7abc 100644
--- a/nomenclature/processor/iamc.py
+++ b/nomenclature/processor/iamc.py
@@ -1,5 +1,5 @@
 from typing import List
-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel, ConfigDict, field_validator
 
 from pyam import IAMC_IDX
 
@@ -7,6 +7,8 @@
 
 
 class IamcDataFilter(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+
     model: List[str] | None = None
     scenario: List[str] | None = None
     region: List[str] | None = None
@@ -21,7 +23,7 @@ def single_input_to_list(cls, v):
 
     @property
     def criteria(self):
-        return dict(item for item in self.model_dump().items() if item[1] is not None)
+        return self.model_dump(exclude_none=True, exclude_unset=True)
 
     def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
         error_msg = ""
diff --git a/tests/data/validation/validate_data/validate_bounds_and_rtol.yaml b/tests/data/validation/validate_data/validate_bounds_and_rtol.yaml
new file mode 100644
index 00000000..08c24978
--- /dev/null
+++ b/tests/data/validation/validate_data/validate_bounds_and_rtol.yaml
@@ -0,0 +1,5 @@
+  - variable: Final Energy
+    year: 2010
+    upper_bound: 2.5
+    lower_bound: 1
+    rtol: 0.5
diff --git a/tests/data/validation/validate_data/validate_bounds_and_value.yaml b/tests/data/validation/validate_data/validate_bounds_and_value.yaml
new file mode 100644
index 00000000..7281d50f
--- /dev/null
+++ b/tests/data/validation/validate_data/validate_bounds_and_value.yaml
@@ -0,0 +1,5 @@
+  - variable: Final Energy
+    year: 2010
+    upper_bound: 2.5
+    lower_bound: 1
+    value: 1.5
diff --git a/tests/data/validation/validate_data/validate_data_fails.yaml b/tests/data/validation/validate_data/validate_data_fails_bounds.yaml
similarity index 61%
rename from tests/data/validation/validate_data/validate_data_fails.yaml
rename to tests/data/validation/validate_data/validate_data_fails_bounds.yaml
index e576e3a4..a6a79939 100644
--- a/tests/data/validation/validate_data/validate_data_fails.yaml
+++ b/tests/data/validation/validate_data/validate_data_fails_bounds.yaml
@@ -1,10 +1,10 @@
-  # 2005 value passes the validation, but the 2010 value does not
+  # 2005 value passes the validation, fails validation for 2010 for both scenarios
   - variable: Primary Energy
     upper_bound: 5.
 # variable exists only for 'scen_a'
   - variable: Primary Energy|Coal
     lower_bound: 2
-# both upper and lower bound are triggered
+# both upper and lower bound fail for both scenarios
   - variable: Primary Energy
     year: 2005
     upper_bound: 1.9
diff --git a/tests/data/validation/validate_data/validate_data_fails_value.yaml b/tests/data/validation/validate_data/validate_data_fails_value.yaml
new file mode 100644
index 00000000..b2e614e1
--- /dev/null
+++ b/tests/data/validation/validate_data/validate_data_fails_value.yaml
@@ -0,0 +1,12 @@
+  # 2005 value passes the validation, fails validation for 2010 for both scenarios
+  - variable: Primary Energy
+    value: 2.
+    atol: 1.
+# variable exists only for 'scen_a', fails validation for 2005
+  - variable: Primary Energy|Coal
+    value: 3
+# both upper and lower bound fail for both scenarios
+  - variable: Primary Energy
+    year: 2005
+    value: 1.5
+    rtol: 0.2
diff --git a/tests/data/validation/validate_data/validate_missing_criteria.yaml b/tests/data/validation/validate_data/validate_missing_criteria.yaml
new file mode 100644
index 00000000..cc9f84a6
--- /dev/null
+++ b/tests/data/validation/validate_data/validate_missing_criteria.yaml
@@ -0,0 +1,2 @@
+  - variable: Final Energy
+    year: 2010
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 6687140a..86c5b8d9 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -266,7 +266,7 @@ def test_cli_validate_data_fails():
     )
     assert cli_result.exit_code == 1
 
-    assert "Collected 2 errors" in str(cli_result.exception)
+    assert "Collected 5 errors" in str(cli_result.exception)
     assert "Asia" in str(cli_result.exception)
     assert "Final Energy|Industry" in str(cli_result.exception)
diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py
index 2e451a78..cf547680 100644
--- a/tests/test_validate_data.py
+++ b/tests/test_validate_data.py
@@ -30,6 +30,19 @@ def test_DataValidator_from_file():
     assert obs.validate_with_definition(dsd) is None
 
 
+@pytest.mark.parametrize(
+    "name, match",
+    [
+        ("missing_criteria", "No validation criteria provided:"),
+        ("bounds_and_value", "Cannot use bounds and value-criteria simultaneously:"),
+        ("bounds_and_rtol", "Cannot use bounds and value-criteria simultaneously:"),
+    ],
+)
+def test_DataValidator_illegal_structure(name, match):
+    with pytest.raises(ValueError, match=match):
+        DataValidator.from_file(DATA_VALIDATION_TEST_DIR / f"validate_{name}.yaml")
+
+
 @pytest.mark.parametrize(
     "dimension, match",
     [
@@ -68,21 +81,38 @@ def test_DataValidator_apply_no_matching_data(simple_df):
     assert data_validator.apply(simple_df) == simple_df
 
 
-def test_DataValidator_apply_fails(simple_df, caplog):
-    data_file = DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml"
+@pytest.mark.parametrize(
+    "file, item_1, item_2, item_3",
+    [
+        (
+            "bounds",
+            "upper_bound: 5.0",
+            "lower_bound: 2.0",
+            "upper_bound: 1.9, lower_bound: 1.1",
+        ),
+        (
+            "value",
+            "value: 2.0, atol: 1.0",
+            "value: 3.0",
+            "value: 1.5, rtol: 0.2",
+        ),
+    ],
+)
+def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, caplog):
+    data_file = DATA_VALIDATION_TEST_DIR / f"validate_data_fails_{file}.yaml"
     data_validator = DataValidator.from_file(data_file)
 
     failed_validation_message = f"""Failed data validation (file {data_file.relative_to(Path.cwd())}):
-  Criteria: variable: ['Primary Energy'], upper_bound: 5.0
+  Criteria: variable: ['Primary Energy'], {item_1}
       model scenario region        variable   unit  year  value
 0   model_a   scen_a  World  Primary Energy  EJ/yr  2010    6.0
 1   model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0
 
-  Criteria: variable: ['Primary Energy|Coal'], lower_bound: 2.0
+  Criteria: variable: ['Primary Energy|Coal'], {item_2}
       model scenario region             variable   unit  year  value
 0   model_a   scen_a  World  Primary Energy|Coal  EJ/yr  2005    0.5
 
-  Criteria: variable: ['Primary Energy'], year: [2005], upper_bound: 1.9, lower_bound: 1.1
+  Criteria: variable: ['Primary Energy'], year: [2005], {item_3}
       model scenario region        variable   unit  year  value
 0   model_a   scen_a  World  Primary Energy  EJ/yr  2005    1.0
 1   model_a   scen_b  World  Primary Energy  EJ/yr  2005    2.0"""