
Commit

Support data validation using value and tolerance (subclass implementation) (#371)

* Add check that some validation criteria must exist

* Add more tests

* Make black

* No reason not to allow both rtol and atol

* Fix failing tests

* Make bounds-fail-validation more generic

* Implement validation by value and tolerance

* Make ruff

* Update docstring

* Don't allow extra arguments to IamcDataFilter

* Make field optional

* Make black

* Separate DataValidationCriteria into separate classes

* Sort imports

* Use pydantic functionality for model_dump

* Clean up DataValidationCriteria classes

* Add explicit criteria check

* Use property

* Update error messages

* Apply suggestions from code review

Co-authored-by: Daniel Huppmann <dh@dergelbesalon.at>

* Apply black

* Disable Windows tests for now as there's a GH issue

---------

Co-authored-by: Daniel Huppmann <dh@dergelbesalon.at>
phackstock and danielhuppmann authored Aug 29, 2024
1 parent 18c0b12 commit cee0ed1
Showing 10 changed files with 130 additions and 20 deletions.
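
In short, a criteria item can now be given either as explicit bounds (upper_bound / lower_bound) or as a value with tolerances (value plus optional rtol / atol); the latter is translated into bounds internally. A minimal standalone sketch of that translation, mirroring the tolerance logic added in data_validator.py below (illustration only, not part of the diff):

def value_to_bounds(value: float, rtol: float = 0.0, atol: float = 0.0) -> tuple[float, float]:
    # tolerance as implemented in DataValidationCriteriaValue.tolerance
    tolerance = value * rtol + atol
    return value - tolerance, value + tolerance

print(value_to_bounds(1.5, rtol=0.2))  # (1.2, 1.8)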
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
@@ -15,7 +15,7 @@ jobs:
        shell: bash
    strategy:
      matrix:
        os: ["macos", "ubuntu", "windows"]
        os: ["macos", "ubuntu"]
        # keep consistent with py-version badge in README.md and docs/index.rst
        python-version: ["3.10", "3.11", "3.12"]
      fail-fast: false
72 changes: 63 additions & 9 deletions nomenclature/processor/data_validator.py
@@ -1,34 +1,88 @@
import logging
from pathlib import Path
import textwrap
from typing import List, Union
from pathlib import Path
from typing import List, Optional, Union

import yaml
from pyam import IamDataFrame
from pyam.logging import adjust_log_level
from pydantic import computed_field, field_validator, model_validator

from nomenclature.definition import DataStructureDefinition
from nomenclature.error import ErrorCollector
from nomenclature.processor.iamc import IamcDataFilter
from nomenclature.processor import Processor
from nomenclature.processor.iamc import IamcDataFilter
from nomenclature.processor.utils import get_relative_path

logger = logging.getLogger(__name__)


class DataValidationCriteria(IamcDataFilter):
    """Data validation criteria"""
class DataValidationCriteriaValue(IamcDataFilter):
    value: float
    rtol: float = 0.0
    atol: float = 0.0

    @property
    def tolerance(self) -> float:
        return self.value * self.rtol + self.atol

    @computed_field
    def upper_bound(self) -> float:
        return self.value + self.tolerance

    @computed_field
    def lower_bound(self) -> float:
        return self.value - self.tolerance

    @property
    def validation_args(self):
        return self.model_dump(
            exclude_none=True,
            exclude_unset=True,
            exclude=["value", "rtol", "atol"],
        )

    upper_bound: float = None
    lower_bound: float = None
    @property
    def criteria(self):
        return self.model_dump(
            exclude_none=True,
            exclude_unset=True,
            exclude=["lower_bound", "upper_bound"],
        )


class DataValidationCriteriaBounds(IamcDataFilter):
    upper_bound: Optional[float] = None
    lower_bound: Optional[float] = None

    @model_validator(mode="after")
    def check_validation_criteria_exist(self):
        if self.upper_bound is None and self.lower_bound is None:
            raise ValueError("No validation criteria provided: " + str(self.criteria))
        return self

    @property
    def validation_args(self):
        return self.criteria


class DataValidator(Processor):
    """Processor for validating IAMC datapoints"""

    criteria_items: List[DataValidationCriteria]
    criteria_items: List[DataValidationCriteriaBounds | DataValidationCriteriaValue]
    file: Path

    @field_validator("criteria_items", mode="before")
    def check_criteria(cls, v):
        for criterion in v:
            has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"])
            has_values = any(c in criterion for c in ["value", "atol", "rtol"])
            if has_bounds and has_values:
                raise ValueError(
                    f"Cannot use bounds and value-criteria simultaneously: {criterion}"
                )
        return v

    @classmethod
    def from_file(cls, file: Union[Path, str]) -> "DataValidator":
        with open(file, "r") as f:
@@ -40,7 +94,7 @@ def apply(self, df: IamDataFrame) -> IamDataFrame:

        with adjust_log_level():
            for item in self.criteria_items:
                failed_validation = df.validate(**item.criteria)
                failed_validation = df.validate(**item.validation_args)
                if failed_validation is not None:
                    error_list.append(
                        " Criteria: "
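
For reference, a minimal usage sketch of the resulting processor (file names and input data below are placeholders, not taken from this PR):

from pyam import IamDataFrame
from nomenclature.processor.data_validator import DataValidator

df = IamDataFrame("scenario_data.xlsx")  # placeholder scenario data
validator = DataValidator.from_file("validate_data.yaml")  # criteria items like the yaml files below
validator.apply(df)  # failing datapoints are collected and reported per criteria item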
6 changes: 4 additions & 2 deletions nomenclature/processor/iamc.py
@@ -1,12 +1,14 @@
from typing import List
from pydantic import BaseModel, field_validator
from pydantic import BaseModel, ConfigDict, field_validator

from pyam import IAMC_IDX

from nomenclature.definition import DataStructureDefinition


class IamcDataFilter(BaseModel):
    model_config = ConfigDict(extra="forbid")

    model: List[str] | None = None
    scenario: List[str] | None = None
    region: List[str] | None = None
@@ -21,7 +23,7 @@ def single_input_to_list(cls, v):

    @property
    def criteria(self):
        return dict(item for item in self.model_dump().items() if item[1] is not None)
        return self.model_dump(exclude_none=True, exclude_unset=True)

    def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
        error_msg = ""
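
The practical effect of these two changes: misspelled or unknown filter fields are now rejected, and criteria only returns fields that were actually set. A small illustration (filter values are hypothetical):

from nomenclature.processor.iamc import IamcDataFilter

f = IamcDataFilter(variable="Primary Energy", year=2005)
print(f.criteria)  # {'variable': ['Primary Energy'], 'year': [2005]}; unset fields are dropped

IamcDataFilter(variabel="Primary Energy")  # typo in field name now raises a ValidationError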
@@ -0,0 +1,5 @@
- variable: Final Energy
  year: 2010
  upper_bound: 2.5
  lower_bound: 1
  rtol: 0.5
@@ -0,0 +1,5 @@
- variable: Final Energy
  year: 2010
  upper_bound: 2.5
  lower_bound: 1
  value: 1.5
@@ -1,10 +1,10 @@
# 2005 value passes the validation, but the 2010 value does not
# 2005 value passes the validation, fails validation for 2010 for both scenarios
- variable: Primary Energy
  upper_bound: 5.
# variable exists only for 'scen_a'
- variable: Primary Energy|Coal
  lower_bound: 2
# both upper and lower bound are triggered
# both upper and lower bound fail for both scenarios
- variable: Primary Energy
  year: 2005
  upper_bound: 1.9
12 changes: 12 additions & 0 deletions tests/data/validation/validate_data/validate_data_fails_value.yaml
@@ -0,0 +1,12 @@
# 2005 value passes the validation, fails validation for 2010 for both scenarios
- variable: Primary Energy
  value: 2.
  atol: 1.
# variable exists only for 'scen_a', fails validation for 2005
- variable: Primary Energy|Coal
  value: 3
# both upper and lower bound fail for both scenarios
- variable: Primary Energy
  year: 2005
  value: 1.5
  rtol: 0.2
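
For orientation, with tolerance = value * rtol + atol the value-based entries above correspond to the following bounds:

value: 2., atol: 1.    ->  lower_bound 1.0, upper_bound 3.0
value: 3               ->  lower_bound 3.0, upper_bound 3.0 (exact match required)
value: 1.5, rtol: 0.2  ->  lower_bound 1.2, upper_bound 1.8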
@@ -0,0 +1,2 @@
- variable: Final Energy
  year: 2010
2 changes: 1 addition & 1 deletion tests/test_cli.py
@@ -266,7 +266,7 @@ def test_cli_validate_data_fails():
    )

    assert cli_result.exit_code == 1
    assert "Collected 2 errors" in str(cli_result.exception)
    assert "Collected 5 errors" in str(cli_result.exception)
    assert "Asia" in str(cli_result.exception)
    assert "Final Energy|Industry" in str(cli_result.exception)

40 changes: 35 additions & 5 deletions tests/test_validate_data.py
@@ -30,6 +30,19 @@ def test_DataValidator_from_file():
    assert obs.validate_with_definition(dsd) is None


@pytest.mark.parametrize(
    "name, match",
    [
        ("missing_criteria", "No validation criteria provided:"),
        ("bounds_and_value", "Cannot use bounds and value-criteria simultaneously:"),
        ("bounds_and_rtol", "Cannot use bounds and value-criteria simultaneously:"),
    ],
)
def test_DataValidator_illegal_structure(name, match):
    with pytest.raises(ValueError, match=match):
        DataValidator.from_file(DATA_VALIDATION_TEST_DIR / f"validate_{name}.yaml")


@pytest.mark.parametrize(
    "dimension, match",
    [
@@ -68,21 +81,38 @@ def test_DataValidator_apply_no_matching_data(simple_df):
    assert data_validator.apply(simple_df) == simple_df


def test_DataValidator_apply_fails(simple_df, caplog):
    data_file = DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml"
@pytest.mark.parametrize(
    "file, item_1, item_2, item_3",
    [
        (
            "bounds",
            "upper_bound: 5.0",
            "lower_bound: 2.0",
            "upper_bound: 1.9, lower_bound: 1.1",
        ),
        (
            "value",
            "value: 2.0, atol: 1.0",
            "value: 3.0",
            "value: 1.5, rtol: 0.2",
        ),
    ],
)
def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, caplog):
    data_file = DATA_VALIDATION_TEST_DIR / f"validate_data_fails_{file}.yaml"
    data_validator = DataValidator.from_file(data_file)

    failed_validation_message = f"""Failed data validation (file {data_file.relative_to(Path.cwd())}):
Criteria: variable: ['Primary Energy'], upper_bound: 5.0
Criteria: variable: ['Primary Energy'], {item_1}
model scenario region variable unit year value
0 model_a scen_a World Primary Energy EJ/yr 2010 6.0
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0
Criteria: variable: ['Primary Energy|Coal'], lower_bound: 2.0
Criteria: variable: ['Primary Energy|Coal'], {item_2}
model scenario region variable unit year value
0 model_a scen_a World Primary Energy|Coal EJ/yr 2005 0.5
Criteria: variable: ['Primary Energy'], year: [2005], upper_bound: 1.9, lower_bound: 1.1
Criteria: variable: ['Primary Energy'], year: [2005], {item_3}
model scenario region variable unit year value
0 model_a scen_a World Primary Energy EJ/yr 2005 1.0
1 model_a scen_b World Primary Energy EJ/yr 2005 2.0"""
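
To run the tests touched by this PR locally, something along these lines should work (illustrative, assumes a development install of the package):

pytest tests/test_validate_data.py tests/test_cli.py::test_cli_validate_data_fails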
