diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index 6bd38df5..debf0e1a 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -1,5 +1,6 @@ import logging import textwrap +from enum import Enum from pathlib import Path import yaml @@ -16,7 +17,18 @@ logger = logging.getLogger(__name__) -class DataValidationCriteriaValue(IamcDataFilter): +class WarningEnum(str, Enum): + high = "high" + medium = "medium" + low = "low" + error = "error" + + +class DataValidationCriteria(IamcDataFilter): + warning_level: WarningEnum = WarningEnum.error + + +class DataValidationCriteriaValue(DataValidationCriteria): value: float rtol: float = 0.0 atol: float = 0.0 @@ -38,7 +50,7 @@ def validation_args(self): return self.model_dump( exclude_none=True, exclude_unset=True, - exclude=["value", "rtol", "atol"], + exclude=["warning_level", "value", "rtol", "atol"], ) @property @@ -46,11 +58,11 @@ def criteria(self): return self.model_dump( exclude_none=True, exclude_unset=True, - exclude=["lower_bound", "upper_bound"], + exclude=["warning_level", "lower_bound", "upper_bound"], ) -class DataValidationCriteriaBounds(IamcDataFilter): +class DataValidationCriteriaBounds(DataValidationCriteria): upper_bound: float | None = None lower_bound: float | None = None @@ -64,6 +76,14 @@ def check_validation_criteria_exist(self): def validation_args(self): return self.criteria + @property + def criteria(self): + return self.model_dump( + exclude_none=True, + exclude_unset=True, + exclude=["warning_level"], + ) + class DataValidator(Processor): """Processor for validating IAMC datapoints""" @@ -89,31 +109,39 @@ def from_file(cls, file: Path | str) -> "DataValidator": return cls(file=file, criteria_items=content) def apply(self, df: IamDataFrame) -> IamDataFrame: - error_list = [] + fail_list = [] + error = False with adjust_log_level(): for item in self.criteria_items: failed_validation = df.validate(**item.validation_args) if failed_validation is not None: - error_list.append( - " Criteria: " - + ", ".join( - [f"{key}: {value}" for key, value in item.criteria.items()] - ) + criteria_msg = " Criteria: " + ", ".join( + [f"{key}: {value}" for key, value in item.criteria.items()] ) - error_list.append( + failed_validation["warning_level"] = item.warning_level.value + if item.warning_level == WarningEnum.error: + error = True + fail_list.append(criteria_msg) + fail_list.append( textwrap.indent(str(failed_validation), prefix=" ") + "\n" ) - - if error_list: - logger.error( - "Failed data validation (file %s):\n%s", - get_relative_path(self.file), - "\n".join(error_list), + fail_msg = "(file %s):\n" % get_relative_path(self.file) + if error: + fail_msg = ( + "Data validation with error(s)/warning(s) " + + fail_msg + + "\n".join(fail_list) ) + logger.error(fail_msg) raise ValueError( "Data validation failed. Please check the log for details." ) + if fail_list: + fail_msg = ( + "Data validation with warning(s) " + fail_msg + "\n".join(fail_list) + ) + logger.warning(fail_msg) return df def validate_with_definition(self, dsd: DataStructureDefinition) -> None: diff --git a/tests/data/validation/validate_data/validate_warning.yaml b/tests/data/validation/validate_data/validate_warning.yaml new file mode 100644 index 00000000..3482305a --- /dev/null +++ b/tests/data/validation/validate_data/validate_warning.yaml @@ -0,0 +1,13 @@ + - variable: Primary Energy + year: 2010 + upper_bound: 2.5 + lower_bound: 1 + warning_level: low + - variable: Primary Energy + year: 2010 + upper_bound: 5 + lower_bound: 1 + - variable: Primary Energy|Coal + year: 2010 + upper_bound: 5 + lower_bound: 1 diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index cf547680..8d0dbf0a 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -102,23 +102,53 @@ def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, capl data_file = DATA_VALIDATION_TEST_DIR / f"validate_data_fails_{file}.yaml" data_validator = DataValidator.from_file(data_file) - failed_validation_message = f"""Failed data validation (file {data_file.relative_to(Path.cwd())}): + failed_validation_message = ( + "Data validation with error(s)/warning(s) " + f"""(file {data_file.relative_to(Path.cwd())}): Criteria: variable: ['Primary Energy'], {item_1} - model scenario region variable unit year value - 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 - 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 + model scenario region variable unit year value warning_level + 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 error + 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error Criteria: variable: ['Primary Energy|Coal'], {item_2} - model scenario region variable unit year value - 0 model_a scen_a World Primary Energy|Coal EJ/yr 2005 0.5 + model scenario region ... year value warning_level + 0 model_a scen_a World ... 2005 0.5 error + + [1 rows x 8 columns] Criteria: variable: ['Primary Energy'], year: [2005], {item_3} - model scenario region variable unit year value - 0 model_a scen_a World Primary Energy EJ/yr 2005 1.0 - 1 model_a scen_b World Primary Energy EJ/yr 2005 2.0""" + model scenario region variable unit year value warning_level + 0 model_a scen_a World Primary Energy EJ/yr 2005 1.0 error + 1 model_a scen_b World Primary Energy EJ/yr 2005 2.0 error""" + ) with pytest.raises(ValueError, match="Data validation failed"): data_validator.apply(simple_df) # check if the log message contains the correct information assert failed_validation_message in caplog.text + + +def test_DataValidator_validate_with_warning(simple_df, caplog): + data_validator = DataValidator.from_file( + DATA_VALIDATION_TEST_DIR / "validate_warning.yaml" + ) + with pytest.raises(ValueError, match="Data validation failed"): + data_validator.apply(simple_df) + + failed_validation_message = ( + "Data validation with error(s)/warning(s) " + f"""(file {(DATA_VALIDATION_TEST_DIR / "validate_warning.yaml").relative_to(Path.cwd())}): + Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0 + model scenario region variable unit year value warning_level + 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 low + 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 low + + Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0 + model scenario region variable unit year value warning_level + 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 error + 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error""" + ) + + # only prints two of three criteria in df to be validated + assert failed_validation_message in caplog.text