From 87311de5a6bcab2d0f64c6682c5b4c8c4e039634 Mon Sep 17 00:00:00 2001 From: Philip Hackstock <20710924+phackstock@users.noreply.github.com> Date: Tue, 23 Jan 2024 13:32:45 +0100 Subject: [PATCH] Use multiple external repos for definitions and mappings (#311) * Add output_directory option to parse_model_registration * Update config to use multiple repositories * Update codelist to use multiple external repos * Update RegionProcessor to use multiple external repos * Remove unnecessary repository_dimension_path * Update test for multiple external repos * Fix repository set for definitions and mappings * Add tests for config * Add test for multiple external repos * Revert to previous suggestion from @danielhuppmann * Apply suggestions from code review Co-authored-by: Daniel Huppmann * Finish renaming in repo in loop * Switch cast to set order to clarity * Allow errors for rmtree for windows debugging * Attempt fix for failing Windows test * Remove all clean-up for Windows fix * Change repo permissions on Windows to delete * Re-introduce external repo cleanup * Add explicit errors for debugging * Try IWRITE for Windows permission change * Try onerror for rmtree * Get explicit removing errors * Update docs for usage of multiple repos --------- Co-authored-by: Daniel Huppmann --- docs/user_guide/config.rst | 14 +++-- nomenclature/codelist.py | 29 +++++---- nomenclature/config.py | 60 +++++++++++++------ nomenclature/processor/region.py | 16 ++--- tests/conftest.py | 17 ++++-- .../general-config-only/nomenclature.yaml | 1 - tests/data/general-config/nomenclature.yaml | 1 - .../hash_and_release.yaml | 1 - .../multiple_repos_for_mapping.yaml | 9 +++ .../multiple_repos_per_dimension.yaml | 10 ++++ .../nomenclature_configs/variable/.gitkeep | 0 tests/test_codelist.py | 20 ++++++- tests/test_config.py | 38 +++++++++++- 13 files changed, 158 insertions(+), 58 deletions(-) create mode 100644 tests/data/nomenclature_configs/multiple_repos_for_mapping.yaml create mode 100644 tests/data/nomenclature_configs/multiple_repos_per_dimension.yaml create mode 100644 tests/data/nomenclature_configs/variable/.gitkeep diff --git a/docs/user_guide/config.rst b/docs/user_guide/config.rst index 2bce39d8..9da5cd70 100644 --- a/docs/user_guide/config.rst +++ b/docs/user_guide/config.rst @@ -8,7 +8,7 @@ General project configuration The following features can be accessed via a general project configuration file: * Import codelists and mappings from public GitHub repositories -* Add all countries to the region codelist (details: :ref:`countries`) +* Add all countries to the region codelist (details: :ref:`countries`) Configuration file format ------------------------- @@ -24,19 +24,23 @@ Importing from an external repository In order to import from an external repository, the configuration file must define the repository and the repositories key. -The repository has a **name** (in the example below *common-definitions*) and a **url**: +The repository has a **name** (in the example below *common-definitions*) and a **url**. Multiple repositories can be used in a single configuration file: .. code:: yaml repositories: common-definitions: url: https://github.com/IAMconsortium/common-definitions.git/ + legacy-definitions: + url: https://github.com/IAMconsortium/legacy-definitions.git/ In order for the import to work the url must be given in the above format, i.e. with the leading *https://* and the trailing *.git/*. Information from an external repository can either be used for codelists ("definitions") -or model mappings, or both. +or model mappings, or both. For each definition dimension, i.e. *region* or *variable* +multiple external repositories can be used as the example below illustrates for +*variable*: .. code:: yaml @@ -47,7 +51,9 @@ or model mappings, or both. region: repository: common-definitions variable: - repository: common-definitions + repositories: + - common-definitions + - legacy-definitions mappings: repository: common-definitions diff --git a/nomenclature/codelist.py b/nomenclature/codelist.py index 37da4965..7cb39948 100644 --- a/nomenclature/codelist.py +++ b/nomenclature/codelist.py @@ -202,17 +202,17 @@ def from_directory( with suppress(AttributeError): dimension = path.name codelistconfig = getattr(config.definitions, dimension) - repo_path = ( - config.repositories[codelistconfig.repository].local_path - / codelistconfig.repository_dimension_path - ) - code_list = ( - cls._parse_codelist_dir( - repo_path, - file_glob_pattern, + for repo in codelistconfig.repositories: + repo_path = ( + config.repositories[repo].local_path / "definitions" / dimension + ) + code_list = ( + cls._parse_codelist_dir( + repo_path, + file_glob_pattern, + ) + + code_list ) - + code_list - ) mapping: Dict[str, Code] = {} for code in code_list: @@ -615,17 +615,16 @@ def from_directory( code_list.append(RegionCode(name=c.name, hierarchy="Country")) # importing from an external repository - if config.definitions.region.repository: + for repo in config.definitions.region.repositories: repo_path = ( - config.repositories[config.definitions.region.repository].local_path - / config.definitions.region.repository_dimension_path + config.repositories[repo].local_path / "definitions" / "region" ) code_list = cls._parse_region_code_dir( code_list, repo_path, file_glob_pattern, - repository=config.definitions.region.repository, + repository=config.definitions.region.repositories, ) code_list = cls._parse_and_replace_tags( code_list, repo_path, file_glob_pattern @@ -639,7 +638,7 @@ def from_directory( mapping: Dict[str, RegionCode] = {} for code in code_list: if code.name in mapping: - raise DuplicateCodeError(name=name, code=code.name) + raise ValueError(f"Trying to set a duplicate code {code.name}") mapping[code.name] = code return cls(name=name, mapping=mapping) diff --git a/nomenclature/config.py b/nomenclature/config.py index 92aea787..3971c1d8 100644 --- a/nomenclature/config.py +++ b/nomenclature/config.py @@ -1,22 +1,41 @@ from pathlib import Path -from typing import Dict, Optional +from typing import Annotated, Optional import yaml from git import Repo -from pydantic import BaseModel, Field, ValidationInfo, field_validator, model_validator +from pydantic import ( + BaseModel, + Field, + ValidationInfo, + field_validator, + model_validator, + ConfigDict, + BeforeValidator, +) + + +def convert_to_set(v: str | list[str] | set[str]) -> set[str]: + match v: + case set(v): + return v + case list(v): + return set(v) + case str(v): + return {v} + case _: + raise TypeError("`repositories` must be of type str, list or set.") class CodeListConfig(BaseModel): dimension: str - repository: str | None = None - repository_dimension_path: Path | None = None + repositories: Annotated[set[str] | None, BeforeValidator(convert_to_set)] = Field( + None, alias="repository" + ) + model_config = ConfigDict(populate_by_name=True) - @model_validator(mode="after") - @classmethod - def set_repository_dimension_path(cls, v: "CodeListConfig") -> "CodeListConfig": - if v.repository is not None and v.repository_dimension_path is None: - v.repository_dimension_path = f"definitions/{v.dimension}" - return v + @property + def repository_dimension_path(self) -> str: + return f"definitions/{self.dimension}" class RegionCodeListConfig(CodeListConfig): @@ -84,20 +103,23 @@ def add_dimension(cls, v, info: ValidationInfo): return {"dimension": info.field_name, **v} @property - def repos(self) -> Dict[str, str]: + def repos(self) -> dict[str, str]: return { - dimension: getattr(self, dimension).repository + dimension: getattr(self, dimension).repositories for dimension in ("region", "variable") - if getattr(self, dimension) and getattr(self, dimension).repository + if getattr(self, dimension) and getattr(self, dimension).repositories } class RegionMappingConfig(BaseModel): - repository: str + repositories: Annotated[set[str], BeforeValidator(convert_to_set)] = Field( + ..., alias="repository" + ) + model_config = ConfigDict(populate_by_name=True) class NomenclatureConfig(BaseModel): - repositories: Dict[str, Repository] = {} + repositories: dict[str, Repository] = {} definitions: Optional[DataStructureConfig] = None mappings: Optional[RegionMappingConfig] = None @@ -107,11 +129,11 @@ def check_definitions_repository( cls, v: "NomenclatureConfig" ) -> "NomenclatureConfig": definitions_repos = v.definitions.repos if v.definitions else {} - mapping_repos = {"mappings": v.mappings.repository} if v.mappings else {} + mapping_repos = {"mappings": v.mappings.repositories} if v.mappings else {} repos = {**definitions_repos, **mapping_repos} - for use, repository in repos.items(): - if repository not in v.repositories: - raise ValueError((f"Unknown repository '{repository}' in {use}.")) + for use, repositories in repos.items(): + if repositories - v.repositories.keys(): + raise ValueError((f"Unknown repository {repositories} in '{use}'.")) return v def fetch_repos(self, target_folder: Path): diff --git a/nomenclature/processor/region.py b/nomenclature/processor/region.py index 7df675c1..d2964650 100644 --- a/nomenclature/processor/region.py +++ b/nomenclature/processor/region.py @@ -482,14 +482,14 @@ def from_directory(cls, path: DirectoryPath, dsd: DataStructureDefinition): mapping_files = [f for f in path.glob("**/*") if f.suffix in {".yaml", ".yml"}] if dsd.config and dsd.config.mappings: - mapping_files = [ - f - for f in ( - dsd.config.repositories[dsd.config.mappings.repository].local_path - / "mappings" - ).glob("**/*") - if f.suffix in {".yaml", ".yml"} - ] + mapping_files + for repository in dsd.config.mappings.repositories: + mapping_files.extend( + f + for f in ( + dsd.config.repositories[repository].local_path / "mappings" + ).glob("**/*") + if f.suffix in {".yaml", ".yml"} + ) for file in mapping_files: try: diff --git a/tests/conftest.py b/tests/conftest.py index d3c2e368..ecc90ce4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,15 @@ -from pathlib import Path -import pytest import shutil +import sys +import os +import stat +from pathlib import Path + import pandas as pd +import pytest from pyam import IamDataFrame from pyam.utils import IAMC_IDX -from nomenclature import DataStructureDefinition +from nomenclature import DataStructureDefinition here = Path(__file__).parent TEST_DATA_DIR = here / "data" @@ -48,8 +52,13 @@ def add_meta(df): df.set_meta(["foo", "bar"], "string") +def remove_readonly(func, path, excinfo): + os.chmod(path, stat.S_IWRITE) + func(path) + + def clean_up_external_repos(repos): # clean up the external repo for repository in repos.values(): if repository.local_path.exists(): - shutil.rmtree(repository.local_path, ignore_errors=True) + shutil.rmtree(repository.local_path, onerror=remove_readonly) diff --git a/tests/data/general-config-only/nomenclature.yaml b/tests/data/general-config-only/nomenclature.yaml index e9515d9e..1c7e59e9 100644 --- a/tests/data/general-config-only/nomenclature.yaml +++ b/tests/data/general-config-only/nomenclature.yaml @@ -7,4 +7,3 @@ definitions: country: true variable: repository: common-definitions - repository_dimension_path: definitions/variable diff --git a/tests/data/general-config/nomenclature.yaml b/tests/data/general-config/nomenclature.yaml index e9515d9e..1c7e59e9 100644 --- a/tests/data/general-config/nomenclature.yaml +++ b/tests/data/general-config/nomenclature.yaml @@ -7,4 +7,3 @@ definitions: country: true variable: repository: common-definitions - repository_dimension_path: definitions/variable diff --git a/tests/data/nomenclature_configs/hash_and_release.yaml b/tests/data/nomenclature_configs/hash_and_release.yaml index 8e8a0fe5..502d18e4 100644 --- a/tests/data/nomenclature_configs/hash_and_release.yaml +++ b/tests/data/nomenclature_configs/hash_and_release.yaml @@ -9,4 +9,3 @@ definitions: country: true variable: repository: common-definitions - repository_dimension_path: definitions/variable diff --git a/tests/data/nomenclature_configs/multiple_repos_for_mapping.yaml b/tests/data/nomenclature_configs/multiple_repos_for_mapping.yaml new file mode 100644 index 00000000..ff2bd784 --- /dev/null +++ b/tests/data/nomenclature_configs/multiple_repos_for_mapping.yaml @@ -0,0 +1,9 @@ +repositories: + common-definitions: + url: https://github.com/IAMconsortium/common-definitions.git/ + legacy-definitions: + url: https://github.com/IAMconsortium/legacy-definitions.git/ +mappings: + repositories: + - common-definitions + - legacy-definitions diff --git a/tests/data/nomenclature_configs/multiple_repos_per_dimension.yaml b/tests/data/nomenclature_configs/multiple_repos_per_dimension.yaml new file mode 100644 index 00000000..6e510cd2 --- /dev/null +++ b/tests/data/nomenclature_configs/multiple_repos_per_dimension.yaml @@ -0,0 +1,10 @@ +repositories: + common-definitions: + url: https://github.com/IAMconsortium/common-definitions.git/ + legacy-definitions: + url: https://github.com/IAMconsortium/legacy-definitions.git/ +definitions: + variable: + repository: + - common-definitions + - legacy-definitions diff --git a/tests/data/nomenclature_configs/variable/.gitkeep b/tests/data/nomenclature_configs/variable/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_codelist.py b/tests/test_codelist.py index 4fd0a5c0..5668bc65 100644 --- a/tests/test_codelist.py +++ b/tests/test_codelist.py @@ -3,15 +3,16 @@ import pandas.testing as pdt import logging -from nomenclature.code import Code, RegionCode, MetaCode, VariableCode +from nomenclature.code import Code, RegionCode, MetaCode from nomenclature.codelist import ( CodeList, VariableCodeList, RegionCodeList, MetaCodeList, ) +from nomenclature.config import NomenclatureConfig -from conftest import TEST_DATA_DIR +from conftest import TEST_DATA_DIR, clean_up_external_repos def test_simple_codelist(): @@ -323,3 +324,18 @@ def test_MetaCodeList_from_directory(): } exp = MetaCodeList(name="Meta", mapping=mapping) assert obs == exp + + +def test_multiple_external_repos(): + nomenclature_config = NomenclatureConfig.from_file( + TEST_DATA_DIR / "nomenclature_configs" / "multiple_repos_per_dimension.yaml" + ) + try: + with raises(ValueError, match="Duplicate"): + variable_code_list = VariableCodeList.from_directory( + "variable", + TEST_DATA_DIR / "nomenclature_configs" / "variable", + nomenclature_config, + ) + finally: + clean_up_external_repos(nomenclature_config.repositories) diff --git a/tests/test_config.py b/tests/test_config.py index 3e72cb72..b03fd3e2 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,9 +1,9 @@ from pathlib import Path from pytest import raises -from nomenclature.config import Repository, NomenclatureConfig +from nomenclature.config import Repository, NomenclatureConfig, CodeListConfig -from conftest import TEST_DATA_DIR +from conftest import TEST_DATA_DIR, clean_up_external_repos def test_hash_and_release_raises(): @@ -19,7 +19,39 @@ def test_setting_local_path_raises(): def test_unknown_repo_raises(): - with raises(ValueError, match="Unknown repository 'common-definitions'"): + with raises( + ValueError, match="Unknown repository {'common-definitions'} in 'region'" + ): NomenclatureConfig.from_file( TEST_DATA_DIR / "nomenclature_configs" / "unknown_repo.yaml" ) + + +def test_multiple_definition_repos(): + nomenclature_config = NomenclatureConfig.from_file( + TEST_DATA_DIR / "nomenclature_configs" / "multiple_repos_per_dimension.yaml" + ) + try: + exp_repos = {"common-definitions", "legacy-definitions"} + assert nomenclature_config.repositories.keys() == exp_repos + assert nomenclature_config.definitions.variable.repositories == exp_repos + finally: + clean_up_external_repos(nomenclature_config.repositories) + + +def test_codelist_config_set_input(): + exp_repos = {"repo1", "repo2"} + code_list_config = CodeListConfig(dimension="variable", repositories=exp_repos) + assert code_list_config.repositories == exp_repos + + +def test_multiple_mapping_repos(): + nomenclature_config = NomenclatureConfig.from_file( + TEST_DATA_DIR / "nomenclature_configs" / "multiple_repos_for_mapping.yaml" + ) + try: + exp_repos = {"common-definitions", "legacy-definitions"} + assert nomenclature_config.mappings.repositories == exp_repos + assert nomenclature_config.repositories.keys() == exp_repos + finally: + clean_up_external_repos(nomenclature_config.repositories)