From 2db6a18d6082d5df148b03a5f54e8259c9658e4f Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 16 Aug 2024 12:14:25 +0200 Subject: [PATCH 01/19] Remove empty line at end of file --- tests/data/validation/validate_data/simple_validation.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/data/validation/validate_data/simple_validation.yaml b/tests/data/validation/validate_data/simple_validation.yaml index 2694b20f..354621e3 100644 --- a/tests/data/validation/validate_data/simple_validation.yaml +++ b/tests/data/validation/validate_data/simple_validation.yaml @@ -3,4 +3,3 @@ year: 2010 upper_bound: 2.5 lower_bound: 1 - From 17017e8678fb82b15081cc3d9351caabcbd561a2 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 16 Aug 2024 13:00:41 +0200 Subject: [PATCH 02/19] Make sure that validation-with-codelist passes if no criteria are given --- nomenclature/processor/iamc.py | 5 +++-- tests/data/validation/validate_data/simple_validation.yaml | 3 +-- tests/test_validate_data.py | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/nomenclature/processor/iamc.py b/nomenclature/processor/iamc.py index 8dc3936a..dc3549a2 100644 --- a/nomenclature/processor/iamc.py +++ b/nomenclature/processor/iamc.py @@ -25,9 +25,10 @@ def validate_with_definition(self, dsd: DataStructureDefinition) -> None: # check for filter-items that are not defined in the codelists for dimension in IAMC_IDX: codelist = getattr(dsd, dimension, None) - if codelist is None: + # no validation if codelist is not defined or filter-item is None + if codelist is None or getattr(self, dimension) is None: continue - if invalid := codelist.validate_items(getattr(self, dimension, [])): + if invalid := codelist.validate_items(getattr(self, dimension)): error_msg += ( f"The following {dimension}s are not defined in the " f"DataStructureDefinition:\n {', '.join(invalid)}\n" diff --git a/tests/data/validation/validate_data/simple_validation.yaml b/tests/data/validation/validate_data/simple_validation.yaml index 354621e3..a7ec5ee9 100644 --- a/tests/data/validation/validate_data/simple_validation.yaml +++ b/tests/data/validation/validate_data/simple_validation.yaml @@ -1,5 +1,4 @@ - - region: World - variable: Final Energy + - variable: Final Energy year: 2010 upper_bound: 2.5 lower_bound: 1 diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index 4d339750..67dec617 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -12,7 +12,6 @@ def test_DataValidator_from_file(): **{ "criteria_items": [ { - "region": ["World"], "variable": "Final Energy", "year": [2010], "upper_bound": 2.5, From 31f7baaa89e7b672e4ffbb76995c551e04ee1deb Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 16 Aug 2024 13:01:08 +0200 Subject: [PATCH 03/19] Harmonize notation --- nomenclature/processor/data_validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index dfc5158c..1c9936f7 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -34,9 +34,9 @@ def apply(self): def validate_with_definition(self, dsd: DataStructureDefinition) -> None: errors = ErrorCollector(description=f"in file '{self.file}'") - for data in self.criteria_items: + for item in self.criteria_items: try: - data.validate_with_definition(dsd) + item.validate_with_definition(dsd) except ValueError as value_error: errors.append(value_error) if errors: From 402817d530f8033b8af2bfb3495ef44668c40501 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 16 Aug 2024 13:05:32 +0200 Subject: [PATCH 04/19] Add `criteria` attribute --- nomenclature/processor/iamc.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nomenclature/processor/iamc.py b/nomenclature/processor/iamc.py index dc3549a2..163b22a0 100644 --- a/nomenclature/processor/iamc.py +++ b/nomenclature/processor/iamc.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Any +from typing import List from pydantic import BaseModel, field_validator from pyam import IAMC_IDX @@ -19,6 +19,10 @@ class IamcDataFilter(BaseModel): def single_input_to_list(cls, v): return v if isinstance(v, list) else [v] + @property + def criteria(self): + return dict(item for item in self.model_dump().items() if item[1] is not None) + def validate_with_definition(self, dsd: DataStructureDefinition) -> None: error_msg = "" From 433415d2d2e3f9a9eedd1b798d6ef563400a0316 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 16 Aug 2024 13:12:38 +0200 Subject: [PATCH 05/19] Add initial `apply` implementation --- nomenclature/processor/data_validator.py | 34 +++++++++++++++++-- .../definitions/variable/variable.yaml | 2 ++ tests/test_validate_data.py | 8 +++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index 1c9936f7..3920c31d 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -1,7 +1,11 @@ +import logging from pathlib import Path from typing import List, Union +import pandas as pd import yaml +from pyam import IamDataFrame +from pyam.logging import adjust_log_level from nomenclature.definition import DataStructureDefinition from nomenclature.error import ErrorCollector @@ -9,6 +13,8 @@ from nomenclature.processor import Processor from nomenclature.processor.utils import get_relative_path +logger = logging.getLogger(__name__) + class DataValidationCriteria(IamcDataFilter): """Data validation criteria""" @@ -29,8 +35,32 @@ def from_file(cls, file: Union[Path, str]) -> "DataValidator": content = yaml.safe_load(f) return cls(file=file, criteria_items=content) - def apply(self): - pass + def apply(self, df: IamDataFrame) -> IamDataFrame: + failed_validation_list = [] + error = False + + with adjust_log_level(): + for item in self.criteria_items: + failed_validation = df.validate(**item.criteria) + if failed_validation is not None: + for direction in ["upper_bound", "lower_bound"]: + if getattr(item, direction) is not None: + failed_validation[direction] = getattr(item, direction) + failed_validation_list.append(failed_validation) + + if failed_validation_list: + logger.error( + "Failed data validation (file %s):\n%s", + get_relative_path(self.file), + pd.concat(failed_validation_list), + ) + error = True + + if error: + raise ValueError( + "Data validation failed. Please check the log for details." + ) + return df def validate_with_definition(self, dsd: DataStructureDefinition) -> None: errors = ErrorCollector(description=f"in file '{self.file}'") diff --git a/tests/data/validation/definitions/variable/variable.yaml b/tests/data/validation/definitions/variable/variable.yaml index 59c7edf8..0d6094ea 100644 --- a/tests/data/validation/definitions/variable/variable.yaml +++ b/tests/data/validation/definitions/variable/variable.yaml @@ -2,6 +2,8 @@ unit: EJ/yr - Primary Energy: unit: EJ/yr +- Primary Energy|Coal: + unit: EJ/yr - Emissions|CO2: unit: Mt CO2/yr - Emissions|CH4: diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index 67dec617..7cf2973b 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -56,3 +56,11 @@ def test_DataValidator_validate_with_definition_raises(dimension, match): dimensions=[dim for dim in ["region", "variable"] if dim != dimension], ) assert data_validator.validate_with_definition(dsd) is None + + +def test_DataValidator_apply_no_matching_data(simple_df): + data_validator = DataValidator.from_file( + DATA_VALIDATION_TEST_DIR / "simple_validation.yaml" + ) + # no data matches validation criteria, `apply()` passes and returns unchanged object + assert data_validator.apply(simple_df) == simple_df From 1dcfa11a894ecd37e04de91154e19eddb3b5823f Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 16 Aug 2024 13:20:18 +0200 Subject: [PATCH 06/19] Add a test for showing how to fail validation --- .../validate_data/validate_data_failing.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 tests/data/validation/validate_data/validate_data_failing.yaml diff --git a/tests/data/validation/validate_data/validate_data_failing.yaml b/tests/data/validation/validate_data/validate_data_failing.yaml new file mode 100644 index 00000000..e576e3a4 --- /dev/null +++ b/tests/data/validation/validate_data/validate_data_failing.yaml @@ -0,0 +1,11 @@ + # 2005 value passes the validation, but the 2010 value does not + - variable: Primary Energy + upper_bound: 5. +# variable exists only for 'scen_a' + - variable: Primary Energy|Coal + lower_bound: 2 +# both upper and lower bound are triggered + - variable: Primary Energy + year: 2005 + upper_bound: 1.9 + lower_bound: 1.1 From cb37ff37d84c0c7057deff338c14b5ff6333272f Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 16 Aug 2024 13:24:57 +0200 Subject: [PATCH 07/19] Add a test for showing how to fail validation --- ...alidate_data_failing.yaml => validate_data_fails.yaml} | 0 tests/test_validate_data.py | 8 ++++++++ 2 files changed, 8 insertions(+) rename tests/data/validation/validate_data/{validate_data_failing.yaml => validate_data_fails.yaml} (100%) diff --git a/tests/data/validation/validate_data/validate_data_failing.yaml b/tests/data/validation/validate_data/validate_data_fails.yaml similarity index 100% rename from tests/data/validation/validate_data/validate_data_failing.yaml rename to tests/data/validation/validate_data/validate_data_fails.yaml diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index 7cf2973b..073e9ba5 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -64,3 +64,11 @@ def test_DataValidator_apply_no_matching_data(simple_df): ) # no data matches validation criteria, `apply()` passes and returns unchanged object assert data_validator.apply(simple_df) == simple_df + + +def test_DataValidator_apply_fails(simple_df): + data_validator = DataValidator.from_file( + DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml" + ) + with pytest.raises(ValueError, match="Data validation failed"): + data_validator.apply(simple_df) \ No newline at end of file From 8d3d8223a5cdc2d43212318928144c0c67d59f0b Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 16 Aug 2024 13:27:06 +0200 Subject: [PATCH 08/19] Make black --- tests/test_validate_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index 073e9ba5..59dfaf28 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -71,4 +71,4 @@ def test_DataValidator_apply_fails(simple_df): DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml" ) with pytest.raises(ValueError, match="Data validation failed"): - data_validator.apply(simple_df) \ No newline at end of file + data_validator.apply(simple_df) From 68accb8749a997c31d315cf70ff7d73a74a1b2ae Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 19 Aug 2024 10:13:36 +0200 Subject: [PATCH 09/19] Write failing validation for each item to log with criteria --- nomenclature/processor/data_validator.py | 7 +- tests/data/validation/Untitled.ipynb | 347 +++++++++++++++++++++++ 2 files changed, 351 insertions(+), 3 deletions(-) create mode 100644 tests/data/validation/Untitled.ipynb diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index 3920c31d..0b96635e 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -2,7 +2,6 @@ from pathlib import Path from typing import List, Union -import pandas as pd import yaml from pyam import IamDataFrame from pyam.logging import adjust_log_level @@ -46,13 +45,15 @@ def apply(self, df: IamDataFrame) -> IamDataFrame: for direction in ["upper_bound", "lower_bound"]: if getattr(item, direction) is not None: failed_validation[direction] = getattr(item, direction) - failed_validation_list.append(failed_validation) + failed_validation_list.append( + f"Criteria: {item.criteria}\n{failed_validation}\n" + ) if failed_validation_list: logger.error( "Failed data validation (file %s):\n%s", get_relative_path(self.file), - pd.concat(failed_validation_list), + "\n".join(failed_validation_list), ) error = True diff --git a/tests/data/validation/Untitled.ipynb b/tests/data/validation/Untitled.ipynb new file mode 100644 index 00000000..fd381df3 --- /dev/null +++ b/tests/data/validation/Untitled.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "e7b2b9b1-3e81-480f-823b-32e5d6e52f2a", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import pyam\n", + "import nomenclature" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3b021ffb-743f-48c1-8bc4-447f18f0e6da", + "metadata": {}, + "outputs": [], + "source": [ + "import logging" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3e640186-93b2-40e2-8a91-b094f9e053d2", + "metadata": {}, + "outputs": [], + "source": [ + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a672ea04-cf92-4018-bf75-88c01e3de790", + "metadata": {}, + "outputs": [], + "source": [ + "from nomenclature.processor.utils import get_relative_path" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "be399c88-0e2b-4079-85f5-6780e49d68bf", + "metadata": {}, + "outputs": [], + "source": [ + "TEST_DF = pd.DataFrame(\n", + " [\n", + " [\"model_a\", \"scen_a\", \"World\", \"Primary Energy\", \"EJ/yr\", 1, 6.0],\n", + " [\"model_a\", \"scen_a\", \"World\", \"Primary Energy|Coal\", \"EJ/yr\", 0.5, 3],\n", + " [\"model_a\", \"scen_b\", \"World\", \"Primary Energy\", \"EJ/yr\", 2, 7],\n", + " ],\n", + " columns=pyam.IAMC_IDX + [2005, 2010],\n", + ")\n", + "df = pyam.IamDataFrame(TEST_DF)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "35d319e2-5607-4ebd-941e-e53522f4d599", + "metadata": {}, + "outputs": [], + "source": [ + "dsd = nomenclature.DataStructureDefinition(\"definitions/\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "edc41204-b82a-4e58-a0d5-138a581c97fb", + "metadata": {}, + "outputs": [], + "source": [ + "processor = nomenclature.processor.DataValidator.from_file(\n", + " \"validate_data/validate_data_fails.yaml\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "0326423a-32c0-4b25-b974-a2597ca0ded5", + "metadata": {}, + "outputs": [], + "source": [ + "processor.validate_with_definition(dsd)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "4cff6cf3-58f2-423a-8dba-4b84b083da6d", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option(\"display.width\", 180)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "80814f71-fe8e-47bb-8389-a54e7ff9041f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-08-19 10:10:33 ERROR Failed data validation (file validate_data/validate_data_fails.yaml):\n", + "Criteria: {'variable': ['Primary Energy'], 'upper_bound': 5.0}\n", + " model scenario region variable unit year value upper_bound\n", + "0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 5.0\n", + "1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 5.0\n", + "\n", + "Criteria: {'variable': ['Primary Energy|Coal'], 'lower_bound': 2.0}\n", + " model scenario region variable unit year value lower_bound\n", + "0 model_a scen_a World Primary Energy|Coal EJ/yr 2005 0.5 2.0\n", + "\n", + "Criteria: {'variable': ['Primary Energy'], 'year': [2005], 'upper_bound': 1.9, 'lower_bound': 1.1}\n", + " model scenario region variable unit year value upper_bound lower_bound\n", + "0 model_a scen_a World Primary Energy EJ/yr 2005 1.0 1.9 1.1\n", + "1 model_a scen_b World Primary Energy EJ/yr 2005 2.0 1.9 1.1\n", + "\n" + ] + }, + { + "ename": "ValueError", + "evalue": "Data validation failed. Please check the log for details.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mprocessor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/GitHub/nomenclature/nomenclature/processor/data_validator.py:61\u001b[0m, in \u001b[0;36mDataValidator.apply\u001b[0;34m(self, df)\u001b[0m\n\u001b[1;32m 58\u001b[0m error \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error:\n\u001b[0;32m---> 61\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 62\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mData validation failed. Please check the log for details.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 63\u001b[0m )\n\u001b[1;32m 64\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df\n", + "\u001b[0;31mValueError\u001b[0m: Data validation failed. Please check the log for details." + ] + } + ], + "source": [ + "processor.apply(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d910ec0-44a6-47da-9e2d-d4d5390d40b5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00cee2ef-c1c1-46e9-9e04-5e662e827904", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac27a3ed-9e85-4426-92ff-9b6eeccc79aa", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4482a0e-2743-4b73-9782-16648189c4e1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b8047fa-58a9-4fef-a670-ad4f7f3134a5", + "metadata": {}, + "outputs": [], + "source": [ + "x.dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba8368e4-c66b-47ca-bd2d-4dcb79d3dd24", + "metadata": {}, + "outputs": [], + "source": [ + "y = df.validate(year=2010, upper_bound=2.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cac9da3-6151-44c4-82b3-21209e983c6e", + "metadata": {}, + "outputs": [], + "source": [ + "y[\"upper_bound\"] = 2.5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cdb1852-86ec-40c9-ab5b-396155481290", + "metadata": {}, + "outputs": [], + "source": [ + "y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c96c95a7-7d76-445b-bcf2-1311918f4669", + "metadata": {}, + "outputs": [], + "source": [ + "df.validate(**x.dict())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d36ceae3-470b-43de-85e8-57a3e2348802", + "metadata": {}, + "outputs": [], + "source": [ + "x = processor.criteria_items[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1fddc65-2f9d-4bf6-9b30-ad8d03251cae", + "metadata": {}, + "outputs": [], + "source": [ + "dict(item for item in x.model_dump().items() if item[1] is not None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9c67a4a-ad77-4c3f-a916-8b0b473086d5", + "metadata": {}, + "outputs": [], + "source": [ + "x.get_filters()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54d69a12-ce0e-4976-ba92-42d41b939f29", + "metadata": {}, + "outputs": [], + "source": [ + "failed_validation_data = df.validate(**item.dict())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6fb4fc3-b4f9-48bb-b105-9d64244513fd", + "metadata": {}, + "outputs": [], + "source": [ + "?pyam.logging.adjust_log_level" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1f37a28-4a48-4570-afaf-8b83e0979bd2", + "metadata": {}, + "outputs": [], + "source": [ + "item.dict()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90987d27-347a-4f37-a4b9-48634a7f11b2", + "metadata": {}, + "outputs": [], + "source": [ + "self = processor\n", + "\n", + "error = False\n", + "\n", + "for item in self.criteria_items:\n", + " with pyam.logging.adjust_log_level():\n", + " failed_validation_data = df.validate(**item.dict())\n", + " if not failed_validation_data.empty:\n", + " logger.error(\n", + " \"Failed data validation.\\nFile: %s\\n\\n%s\",\n", + " get_relative_path(self.file),\n", + " failed_validation_data,\n", + " )\n", + " error = True\n", + "\n", + "if error:\n", + " raise ValueError(\"Failed data validation. Please check the log for details.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4b7b493-8315-47f7-9965-8ca9731f911e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 710497bdfc46e7576a5417c2bfdcfc425e939ad7 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 19 Aug 2024 10:38:44 +0200 Subject: [PATCH 10/19] Don't add upper/lower bound columns explicitly --- nomenclature/processor/data_validator.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index 0b96635e..cfe0b733 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -42,9 +42,6 @@ def apply(self, df: IamDataFrame) -> IamDataFrame: for item in self.criteria_items: failed_validation = df.validate(**item.criteria) if failed_validation is not None: - for direction in ["upper_bound", "lower_bound"]: - if getattr(item, direction) is not None: - failed_validation[direction] = getattr(item, direction) failed_validation_list.append( f"Criteria: {item.criteria}\n{failed_validation}\n" ) From b3288b98a8cb263fed1b17d1dd500616b1ff73d5 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 19 Aug 2024 11:16:20 +0200 Subject: [PATCH 11/19] Make more concise log error messages --- nomenclature/processor/data_validator.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index cfe0b733..a1096962 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -1,5 +1,6 @@ import logging from pathlib import Path +import textwrap from typing import List, Union import yaml @@ -35,26 +36,28 @@ def from_file(cls, file: Union[Path, str]) -> "DataValidator": return cls(file=file, criteria_items=content) def apply(self, df: IamDataFrame) -> IamDataFrame: - failed_validation_list = [] - error = False + error_list = [] with adjust_log_level(): for item in self.criteria_items: failed_validation = df.validate(**item.criteria) if failed_validation is not None: - failed_validation_list.append( - f"Criteria: {item.criteria}\n{failed_validation}\n" + error_list.append( + " Criteria: " + + ", ".join( + [f"{key}: {value}" for key, value in item.criteria.items()] + ) + ) + error_list.append( + textwrap.indent(str(failed_validation), prefix=" ") + "\n" ) - if failed_validation_list: + if error_list: logger.error( "Failed data validation (file %s):\n%s", get_relative_path(self.file), - "\n".join(failed_validation_list), + "\n".join(error_list), ) - error = True - - if error: raise ValueError( "Data validation failed. Please check the log for details." ) From b190f96d20a1bbb4dd479e73a540ab7c88a8bb5a Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 19 Aug 2024 11:16:32 +0200 Subject: [PATCH 12/19] Add a test --- tests/test_validate_data.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index 59dfaf28..e07bb1b7 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -66,9 +66,31 @@ def test_DataValidator_apply_no_matching_data(simple_df): assert data_validator.apply(simple_df) == simple_df -def test_DataValidator_apply_fails(simple_df): +def test_DataValidator_apply_fails(simple_df, caplog): data_validator = DataValidator.from_file( DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml" ) + + failed_validation_message = [ + "Failed data validation (file data/validation/validate_data/validate_data_fails.yaml):" + """ + Criteria: variable: ['Primary Energy'], upper_bound: 5.0 + model scenario region variable unit year value + 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 + 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 + + Criteria: variable: ['Primary Energy|Coal'], lower_bound: 2.0 + model scenario region variable unit year value + 0 model_a scen_a World Primary Energy|Coal EJ/yr 2005 0.5 + + Criteria: variable: ['Primary Energy'], year: [2005], upper_bound: 1.9, lower_bound: 1.1 + model scenario region variable unit year value + 0 model_a scen_a World Primary Energy EJ/yr 2005 1.0 + 1 model_a scen_b World Primary Energy EJ/yr 2005 2.0""", + ] + with pytest.raises(ValueError, match="Data validation failed"): data_validator.apply(simple_df) + + # check if the log message contains the correct information + assert all(x in caplog.text for x in failed_validation_message) From 856a412d3a5d9615689d8c266589893f6ac5a4a9 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 19 Aug 2024 11:34:49 +0200 Subject: [PATCH 13/19] Fix failing test --- tests/test_validate_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index e07bb1b7..5bd332d0 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -72,7 +72,7 @@ def test_DataValidator_apply_fails(simple_df, caplog): ) failed_validation_message = [ - "Failed data validation (file data/validation/validate_data/validate_data_fails.yaml):" + "Failed data validation (file data/validation/validate_data/validate_data_fails.yaml):", """ Criteria: variable: ['Primary Energy'], upper_bound: 5.0 model scenario region variable unit year value From 766787e642efe8d62245d7582c7460889cd61720 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 19 Aug 2024 11:46:35 +0200 Subject: [PATCH 14/19] Simplify test to one assertion --- tests/test_validate_data.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index 5bd332d0..fc0beced 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -71,9 +71,8 @@ def test_DataValidator_apply_fails(simple_df, caplog): DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml" ) - failed_validation_message = [ - "Failed data validation (file data/validation/validate_data/validate_data_fails.yaml):", - """ + failed_validation_message = \ + """Failed data validation (file data/validation/validate_data/validate_data_fails.yaml): Criteria: variable: ['Primary Energy'], upper_bound: 5.0 model scenario region variable unit year value 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 @@ -86,11 +85,10 @@ def test_DataValidator_apply_fails(simple_df, caplog): Criteria: variable: ['Primary Energy'], year: [2005], upper_bound: 1.9, lower_bound: 1.1 model scenario region variable unit year value 0 model_a scen_a World Primary Energy EJ/yr 2005 1.0 - 1 model_a scen_b World Primary Energy EJ/yr 2005 2.0""", - ] + 1 model_a scen_b World Primary Energy EJ/yr 2005 2.0""" with pytest.raises(ValueError, match="Data validation failed"): data_validator.apply(simple_df) # check if the log message contains the correct information - assert all(x in caplog.text for x in failed_validation_message) + assert failed_validation_message in caplog.text From 37f689af9c0609e88fa73a6f23f5352d3ebec6cb Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 19 Aug 2024 11:57:04 +0200 Subject: [PATCH 15/19] Check if console-with is causing the problems --- tests/test_validate_data.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index fc0beced..638c7211 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -1,3 +1,4 @@ +import pandas as pd import pytest from conftest import TEST_DATA_DIR @@ -71,8 +72,10 @@ def test_DataValidator_apply_fails(simple_df, caplog): DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml" ) - failed_validation_message = \ - """Failed data validation (file data/validation/validate_data/validate_data_fails.yaml): + # TODO implement a utility function to display pandas nicely + pd.set_option("display.width", 180) + + failed_validation_message = """Failed data validation (file data/validation/validate_data/validate_data_fails.yaml): Criteria: variable: ['Primary Energy'], upper_bound: 5.0 model scenario region variable unit year value 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 From a61b7c07a7e1e101caeacf6054a83719fb12afda Mon Sep 17 00:00:00 2001 From: Philip Hackstock <20710924+phackstock@users.noreply.github.com> Date: Mon, 19 Aug 2024 13:03:22 +0200 Subject: [PATCH 16/19] Fix validate data path --- tests/test_validate_data.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index 638c7211..51e47f5b 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -1,3 +1,5 @@ +from pathlib import Path + import pandas as pd import pytest from conftest import TEST_DATA_DIR @@ -68,14 +70,13 @@ def test_DataValidator_apply_no_matching_data(simple_df): def test_DataValidator_apply_fails(simple_df, caplog): - data_validator = DataValidator.from_file( - DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml" - ) + data_file = DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml" + data_validator = DataValidator.from_file(data_file) # TODO implement a utility function to display pandas nicely pd.set_option("display.width", 180) - failed_validation_message = """Failed data validation (file data/validation/validate_data/validate_data_fails.yaml): + failed_validation_message = f"""Failed data validation (file {data_file.relative_to(Path.cwd())}): Criteria: variable: ['Primary Energy'], upper_bound: 5.0 model scenario region variable unit year value 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 From 7868fd529abf70a775c5ded1a033b2f9842adb55 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 19 Aug 2024 13:28:54 +0200 Subject: [PATCH 17/19] Remove dev notebook --- tests/data/validation/Untitled.ipynb | 347 --------------------------- 1 file changed, 347 deletions(-) delete mode 100644 tests/data/validation/Untitled.ipynb diff --git a/tests/data/validation/Untitled.ipynb b/tests/data/validation/Untitled.ipynb deleted file mode 100644 index fd381df3..00000000 --- a/tests/data/validation/Untitled.ipynb +++ /dev/null @@ -1,347 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "e7b2b9b1-3e81-480f-823b-32e5d6e52f2a", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import pyam\n", - "import nomenclature" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3b021ffb-743f-48c1-8bc4-447f18f0e6da", - "metadata": {}, - "outputs": [], - "source": [ - "import logging" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "3e640186-93b2-40e2-8a91-b094f9e053d2", - "metadata": {}, - "outputs": [], - "source": [ - "logger = logging.getLogger(__name__)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "a672ea04-cf92-4018-bf75-88c01e3de790", - "metadata": {}, - "outputs": [], - "source": [ - "from nomenclature.processor.utils import get_relative_path" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "be399c88-0e2b-4079-85f5-6780e49d68bf", - "metadata": {}, - "outputs": [], - "source": [ - "TEST_DF = pd.DataFrame(\n", - " [\n", - " [\"model_a\", \"scen_a\", \"World\", \"Primary Energy\", \"EJ/yr\", 1, 6.0],\n", - " [\"model_a\", \"scen_a\", \"World\", \"Primary Energy|Coal\", \"EJ/yr\", 0.5, 3],\n", - " [\"model_a\", \"scen_b\", \"World\", \"Primary Energy\", \"EJ/yr\", 2, 7],\n", - " ],\n", - " columns=pyam.IAMC_IDX + [2005, 2010],\n", - ")\n", - "df = pyam.IamDataFrame(TEST_DF)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "35d319e2-5607-4ebd-941e-e53522f4d599", - "metadata": {}, - "outputs": [], - "source": [ - "dsd = nomenclature.DataStructureDefinition(\"definitions/\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "edc41204-b82a-4e58-a0d5-138a581c97fb", - "metadata": {}, - "outputs": [], - "source": [ - "processor = nomenclature.processor.DataValidator.from_file(\n", - " \"validate_data/validate_data_fails.yaml\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "0326423a-32c0-4b25-b974-a2597ca0ded5", - "metadata": {}, - "outputs": [], - "source": [ - "processor.validate_with_definition(dsd)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "4cff6cf3-58f2-423a-8dba-4b84b083da6d", - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option(\"display.width\", 180)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "80814f71-fe8e-47bb-8389-a54e7ff9041f", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-08-19 10:10:33 ERROR Failed data validation (file validate_data/validate_data_fails.yaml):\n", - "Criteria: {'variable': ['Primary Energy'], 'upper_bound': 5.0}\n", - " model scenario region variable unit year value upper_bound\n", - "0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 5.0\n", - "1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 5.0\n", - "\n", - "Criteria: {'variable': ['Primary Energy|Coal'], 'lower_bound': 2.0}\n", - " model scenario region variable unit year value lower_bound\n", - "0 model_a scen_a World Primary Energy|Coal EJ/yr 2005 0.5 2.0\n", - "\n", - "Criteria: {'variable': ['Primary Energy'], 'year': [2005], 'upper_bound': 1.9, 'lower_bound': 1.1}\n", - " model scenario region variable unit year value upper_bound lower_bound\n", - "0 model_a scen_a World Primary Energy EJ/yr 2005 1.0 1.9 1.1\n", - "1 model_a scen_b World Primary Energy EJ/yr 2005 2.0 1.9 1.1\n", - "\n" - ] - }, - { - "ename": "ValueError", - "evalue": "Data validation failed. Please check the log for details.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mprocessor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/GitHub/nomenclature/nomenclature/processor/data_validator.py:61\u001b[0m, in \u001b[0;36mDataValidator.apply\u001b[0;34m(self, df)\u001b[0m\n\u001b[1;32m 58\u001b[0m error \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error:\n\u001b[0;32m---> 61\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 62\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mData validation failed. Please check the log for details.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 63\u001b[0m )\n\u001b[1;32m 64\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df\n", - "\u001b[0;31mValueError\u001b[0m: Data validation failed. Please check the log for details." - ] - } - ], - "source": [ - "processor.apply(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2d910ec0-44a6-47da-9e2d-d4d5390d40b5", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "00cee2ef-c1c1-46e9-9e04-5e662e827904", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ac27a3ed-9e85-4426-92ff-9b6eeccc79aa", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4482a0e-2743-4b73-9782-16648189c4e1", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b8047fa-58a9-4fef-a670-ad4f7f3134a5", - "metadata": {}, - "outputs": [], - "source": [ - "x.dict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba8368e4-c66b-47ca-bd2d-4dcb79d3dd24", - "metadata": {}, - "outputs": [], - "source": [ - "y = df.validate(year=2010, upper_bound=2.5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3cac9da3-6151-44c4-82b3-21209e983c6e", - "metadata": {}, - "outputs": [], - "source": [ - "y[\"upper_bound\"] = 2.5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2cdb1852-86ec-40c9-ab5b-396155481290", - "metadata": {}, - "outputs": [], - "source": [ - "y" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c96c95a7-7d76-445b-bcf2-1311918f4669", - "metadata": {}, - "outputs": [], - "source": [ - "df.validate(**x.dict())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d36ceae3-470b-43de-85e8-57a3e2348802", - "metadata": {}, - "outputs": [], - "source": [ - "x = processor.criteria_items[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e1fddc65-2f9d-4bf6-9b30-ad8d03251cae", - "metadata": {}, - "outputs": [], - "source": [ - "dict(item for item in x.model_dump().items() if item[1] is not None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9c67a4a-ad77-4c3f-a916-8b0b473086d5", - "metadata": {}, - "outputs": [], - "source": [ - "x.get_filters()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "54d69a12-ce0e-4976-ba92-42d41b939f29", - "metadata": {}, - "outputs": [], - "source": [ - "failed_validation_data = df.validate(**item.dict())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a6fb4fc3-b4f9-48bb-b105-9d64244513fd", - "metadata": {}, - "outputs": [], - "source": [ - "?pyam.logging.adjust_log_level" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1f37a28-4a48-4570-afaf-8b83e0979bd2", - "metadata": {}, - "outputs": [], - "source": [ - "item.dict()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90987d27-347a-4f37-a4b9-48634a7f11b2", - "metadata": {}, - "outputs": [], - "source": [ - "self = processor\n", - "\n", - "error = False\n", - "\n", - "for item in self.criteria_items:\n", - " with pyam.logging.adjust_log_level():\n", - " failed_validation_data = df.validate(**item.dict())\n", - " if not failed_validation_data.empty:\n", - " logger.error(\n", - " \"Failed data validation.\\nFile: %s\\n\\n%s\",\n", - " get_relative_path(self.file),\n", - " failed_validation_data,\n", - " )\n", - " error = True\n", - "\n", - "if error:\n", - " raise ValueError(\"Failed data validation. Please check the log for details.\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d4b7b493-8315-47f7-9965-8ca9731f911e", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 0f19b56353bc8fcda010c4d99d7f49b038da3bad Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 19 Aug 2024 13:32:58 +0200 Subject: [PATCH 18/19] Implement review suggestion by @phackstock --- nomenclature/processor/data_validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index a1096962..8afab90d 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -65,9 +65,9 @@ def apply(self, df: IamDataFrame) -> IamDataFrame: def validate_with_definition(self, dsd: DataStructureDefinition) -> None: errors = ErrorCollector(description=f"in file '{self.file}'") - for item in self.criteria_items: + for criterion in self.criteria_items: try: - item.validate_with_definition(dsd) + criterion.validate_with_definition(dsd) except ValueError as value_error: errors.append(value_error) if errors: From 9ae85dc5ca48464ed30cbaa34184239209e05827 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 19 Aug 2024 13:59:33 +0200 Subject: [PATCH 19/19] Remove unnecessary todo --- tests/test_validate_data.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index 51e47f5b..2e451a78 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -1,6 +1,5 @@ from pathlib import Path -import pandas as pd import pytest from conftest import TEST_DATA_DIR @@ -73,9 +72,6 @@ def test_DataValidator_apply_fails(simple_df, caplog): data_file = DATA_VALIDATION_TEST_DIR / "validate_data_fails.yaml" data_validator = DataValidator.from_file(data_file) - # TODO implement a utility function to display pandas nicely - pd.set_option("display.width", 180) - failed_validation_message = f"""Failed data validation (file {data_file.relative_to(Path.cwd())}): Criteria: variable: ['Primary Energy'], upper_bound: 5.0 model scenario region variable unit year value