Skip to content

Commit

Permalink
Implement DataValidator.apply() (#368)
Browse files Browse the repository at this point in the history
Co-authored-by: Philip Hackstock <20710924+phackstock@users.noreply.github.com>
  • Loading branch information
danielhuppmann and phackstock authored Aug 20, 2024
1 parent 6efc8d9 commit 18c0b12
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 11 deletions.
39 changes: 35 additions & 4 deletions nomenclature/processor/data_validator.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import logging
from pathlib import Path
import textwrap
from typing import List, Union

import yaml
from pyam import IamDataFrame
from pyam.logging import adjust_log_level

from nomenclature.definition import DataStructureDefinition
from nomenclature.error import ErrorCollector
from nomenclature.processor.iamc import IamcDataFilter
from nomenclature.processor import Processor
from nomenclature.processor.utils import get_relative_path

logger = logging.getLogger(__name__)


class DataValidationCriteria(IamcDataFilter):
"""Data validation criteria"""
Expand All @@ -29,14 +35,39 @@ def from_file(cls, file: Union[Path, str]) -> "DataValidator":
content = yaml.safe_load(f)
return cls(file=file, criteria_items=content)

def apply(self):
pass
def apply(self, df: IamDataFrame) -> IamDataFrame:
    """Check `df` against every criteria item and return it unchanged.

    Each item's `criteria` mapping is passed as keyword arguments to
    `df.validate()`; any result other than `None` is treated as a failed
    validation and collected into a report.

    Parameters
    ----------
    df : IamDataFrame
        The data to be validated.

    Returns
    -------
    IamDataFrame
        The same object that was passed in, if no criteria item failed.

    Raises
    ------
    ValueError
        If at least one criteria item failed; the details are written to the
        module logger before raising.
    """
    report = []

    # NOTE(review): presumably `adjust_log_level()` quiets pyam's own log
    # output while validating, so only the summary below is emitted - confirm.
    with adjust_log_level():
        for item in self.criteria_items:
            offending_data = df.validate(**item.criteria)
            if offending_data is None:
                continue
            # first the criteria that were violated, then the offending rows
            criteria_text = ", ".join(
                f"{key}: {value}" for key, value in item.criteria.items()
            )
            report.append(" Criteria: " + criteria_text)
            report.append(textwrap.indent(str(offending_data), prefix=" ") + "\n")

    if not report:
        return df

    logger.error(
        "Failed data validation (file %s):\n%s",
        get_relative_path(self.file),
        "\n".join(report),
    )
    raise ValueError("Data validation failed. Please check the log for details.")

def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
errors = ErrorCollector(description=f"in file '{self.file}'")
for data in self.criteria_items:
for criterion in self.criteria_items:
try:
data.validate_with_definition(dsd)
criterion.validate_with_definition(dsd)
except ValueError as value_error:
errors.append(value_error)
if errors:
Expand Down
11 changes: 8 additions & 3 deletions nomenclature/processor/iamc.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Tuple, Any
from typing import List
from pydantic import BaseModel, field_validator

from pyam import IAMC_IDX
Expand All @@ -19,15 +19,20 @@ class IamcDataFilter(BaseModel):
def single_input_to_list(cls, v):
return v if isinstance(v, list) else [v]

@property
def criteria(self):
    """Return the dumped model fields, leaving out every field that is `None`.

    Only `None` is dropped; other falsy values (e.g. `0`) are kept.
    """
    return {
        name: value
        for name, value in self.model_dump().items()
        if value is not None
    }

def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
error_msg = ""

# check for filter-items that are not defined in the codelists
for dimension in IAMC_IDX:
codelist = getattr(dsd, dimension, None)
if codelist is None:
# no validation if codelist is not defined or filter-item is None
if codelist is None or getattr(self, dimension) is None:
continue
if invalid := codelist.validate_items(getattr(self, dimension, [])):
if invalid := codelist.validate_items(getattr(self, dimension)):
error_msg += (
f"The following {dimension}s are not defined in the "
f"DataStructureDefinition:\n {', '.join(invalid)}\n"
Expand Down
2 changes: 2 additions & 0 deletions tests/data/validation/definitions/variable/variable.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
unit: EJ/yr
- Primary Energy:
unit: EJ/yr
- Primary Energy|Coal:
unit: EJ/yr
- Emissions|CO2:
unit: Mt CO2/yr
- Emissions|CH4:
Expand Down
4 changes: 1 addition & 3 deletions tests/data/validation/validate_data/simple_validation.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
- region: World
variable: Final Energy
- variable: Final Energy
year: 2010
upper_bound: 2.5
lower_bound: 1

11 changes: 11 additions & 0 deletions tests/data/validation/validate_data/validate_data_fails.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# 2005 value passes the validation, but the 2010 value does not
- variable: Primary Energy
upper_bound: 5.
# variable exists only for 'scen_a'
- variable: Primary Energy|Coal
lower_bound: 2
# both upper and lower bound are triggered
- variable: Primary Energy
year: 2005
upper_bound: 1.9
lower_bound: 1.1
37 changes: 36 additions & 1 deletion tests/test_validate_data.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path

import pytest
from conftest import TEST_DATA_DIR

Expand All @@ -12,7 +14,6 @@ def test_DataValidator_from_file():
**{
"criteria_items": [
{
"region": ["World"],
"variable": "Final Energy",
"year": [2010],
"upper_bound": 2.5,
Expand Down Expand Up @@ -57,3 +58,37 @@ def test_DataValidator_validate_with_definition_raises(dimension, match):
dimensions=[dim for dim in ["region", "variable"] if dim != dimension],
)
assert data_validator.validate_with_definition(dsd) is None


def test_DataValidator_apply_no_matching_data(simple_df):
    """`apply()` hands back its input when no criteria item fails."""
    criteria_file = DATA_VALIDATION_TEST_DIR / "simple_validation.yaml"
    validator = DataValidator.from_file(criteria_file)

    # no data matches validation criteria, so nothing can fail: `apply()` must
    # pass and return the unchanged object
    returned = validator.apply(simple_df)
    assert returned == simple_df


# `apply()` must raise a ValueError and log a report naming each failed
# criteria item together with the data rows that violated it.
# NOTE(review): `simple_df` is presumably a fixture from conftest; `caplog`
# is used here to read the captured log text.
def test_DataValidator_apply_fails(simple_df, caplog):
data_file = DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml"
data_validator = DataValidator.from_file(data_file)

# expected log text: one "Criteria" line per failed item, followed by the
# offending rows; the file path is expressed relative to the working directory
failed_validation_message = f"""Failed data validation (file {data_file.relative_to(Path.cwd())}):
Criteria: variable: ['Primary Energy'], upper_bound: 5.0
model scenario region variable unit year value
0 model_a scen_a World Primary Energy EJ/yr 2010 6.0
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0
Criteria: variable: ['Primary Energy|Coal'], lower_bound: 2.0
model scenario region variable unit year value
0 model_a scen_a World Primary Energy|Coal EJ/yr 2005 0.5
Criteria: variable: ['Primary Energy'], year: [2005], upper_bound: 1.9, lower_bound: 1.1
model scenario region variable unit year value
0 model_a scen_a World Primary Energy EJ/yr 2005 1.0
1 model_a scen_b World Primary Energy EJ/yr 2005 2.0"""

# the exception message is generic; the details are only available in the log
with pytest.raises(ValueError, match="Data validation failed"):
data_validator.apply(simple_df)

# check if the log message contains the correct information
assert failed_validation_message in caplog.text

0 comments on commit 18c0b12

Please sign in to comment.