diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3cc8b7c6..858a8def 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -44,7 +44,7 @@ jobs: python -m build # Install the built wheel file to imitiate users installing from PyPI: pip uninstall --yes xl2times - pip install --find-links=dist xl2times + pip install --no-index --find-links=dist xl2times - name: Check code formatting working-directory: xl2times diff --git a/tests/test_transforms.py b/tests/test_transforms.py index da33a98d..a9fcacbe 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -7,8 +7,6 @@ _match_wildcards, _process_comm_groups_vectorised, commodity_map, - get_matching_commodities, - get_matching_processes, process_map, ) @@ -49,12 +47,11 @@ def test_uc_wildcards(self): dictionary = pickle.load(f) df = df_in.copy() - df = _match_wildcards( - df, process_map, dictionary, get_matching_processes, "process" - ) - df = _match_wildcards( - df, commodity_map, dictionary, get_matching_commodities, "commodity" - ) + for result_col, item_map in { + "process": process_map, + "commodity": commodity_map, + }.items(): + df = _match_wildcards(df, item_map, dictionary, result_col) # unit tests assert df is not None and not df.empty diff --git a/xl2times/__main__.py b/xl2times/__main__.py index 1b48d3b7..64360f2a 100644 --- a/xl2times/__main__.py +++ b/xl2times/__main__.py @@ -220,14 +220,14 @@ def compare( f" {sum(df.shape[0] for _, df in ground_truth.items())} rows" ) - missing = set(ground_truth.keys()) - set(data.keys()) + missing = set(ground_truth.keys()).difference(data.keys()) missing_str = ", ".join( [f"{x} ({ground_truth[x].shape[0]})" for x in sorted(missing)] ) if len(missing) > 0: logger.warning(f"Missing {len(missing)} tables: {missing_str}") - additional_tables = set(data.keys()) - set(ground_truth.keys()) + additional_tables = set(data.keys()).difference(ground_truth.keys()) additional_str = ", ".join( [f"{x} ({data[x].shape[0]})" for x 
in sorted(additional_tables)] ) @@ -258,9 +258,9 @@ def compare( data_rows = set(str(row).lower() for row in data_table.to_numpy().tolist()) total_gt_rows += len(gt_rows) total_correct_rows += len(gt_rows.intersection(data_rows)) - additional = data_rows - gt_rows + additional = data_rows.difference(gt_rows) total_additional_rows += len(additional) - missing = gt_rows - data_rows + missing = gt_rows.difference(data_rows) if len(additional) != 0 or len(missing) != 0: logger.warning( f"Table {table_name} ({data_table.shape[0]} rows," @@ -320,7 +320,7 @@ def produce_times_tables( if "techgroup" in mapping.xl_cols: df["techgroup"] = df["techname"] if not all(c in df.columns for c in mapping.xl_cols): - missing = set(mapping.xl_cols) - set(df.columns) + missing = set(mapping.xl_cols).difference(df.columns) logger.warning( f"Cannot produce table {mapping.times_name} because" f" {mapping.xl_name} does not contain the required columns" @@ -347,7 +347,7 @@ def produce_times_tables( continue result[mapping.times_name] = df - unused_tables = set(input.keys()) - used_tables + unused_tables = set(input.keys()).difference(used_tables) if len(unused_tables) > 0: logger.warning( f"{len(unused_tables)} unused tables: {', '.join(sorted(unused_tables))}" diff --git a/xl2times/config/times-info.json b/xl2times/config/times-info.json index 3ce803d9..b59cecbe 100644 --- a/xl2times/config/times-info.json +++ b/xl2times/config/times-info.json @@ -1619,7 +1619,7 @@ "YEAR", "PRC", "CG", - "CG", + "CG2", "TS" ], "mapping": [ @@ -1627,7 +1627,7 @@ "year", "process", "other_indexes", - "other_indexes", + "commodity", "timeslice" ] }, @@ -2017,14 +2017,14 @@ "indexes": [ "ALL_R", "COM", - "ALL_R", - "COM" + "REG2", + "COM2" ], "mapping": [ "region", "commodity", - "region", - "commodity" + "region2", + "commodity2" ] }, { @@ -2035,18 +2035,18 @@ "YEAR", "PRC", "COM", - "ALL_R", - "COM", - "TS" + "REG2", + "COM2", + "TS2" ], "mapping": [ "region", "year", "process", "commodity", - "region", - 
"commodity", - "timeslice" + "region2", + "commodity2", + "timeslice2" ] }, { @@ -2059,7 +2059,7 @@ "COM", "TS", "IE", - "COM", + "COM2", "IO" ], "mapping": [ @@ -2069,7 +2069,7 @@ "commodity", "timeslice", "other_indexes", - "commodity", + "commodity2", "other_indexes" ] }, @@ -3295,11 +3295,11 @@ "indexes": [ "REG", "PRC", - "PRC" + "PRC2" ], "mapping": [ "region", - "process", + "other_indexes", "process" ] }, diff --git a/xl2times/config/veda-attr-defaults.json b/xl2times/config/veda-attr-defaults.json index 6367c75e..8153f5b6 100644 --- a/xl2times/config/veda-attr-defaults.json +++ b/xl2times/config/veda-attr-defaults.json @@ -646,6 +646,12 @@ }, "FLO_FUNC": { "defaults": { + "commodity": [ + "commodity-in", + "commodity-out", + "commodity-in-aux", + "commodity-out-aux" + ], "ts-level": "ANNUAL" } }, @@ -883,7 +889,16 @@ }, "times-attribute": "NCAP_PKCNT" }, + "PEAK(CON)": { + "defaults": { + "ts-level": "ANNUAL" + }, + "times-attribute": "NCAP_PKCNT" + }, "PKCNT": { + "defaults": { + "ts-level": "ANNUAL" + }, "times-attribute": "NCAP_PKCNT" }, "PKCOI": { diff --git a/xl2times/datatypes.py b/xl2times/datatypes.py index bc886ad8..f1e525c1 100644 --- a/xl2times/datatypes.py +++ b/xl2times/datatypes.py @@ -318,6 +318,8 @@ class Config: times_sets: dict[str, list[str]] # Switch to prevent overwriting of I/E settings in BASE and SubRES ie_override_in_syssettings: bool = False + # Switch to include dummy imports in the model + include_dummy_imports: bool = True def __init__( self, diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 92de3712..7a039050 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -1,7 +1,6 @@ import re import time from collections import defaultdict -from collections.abc import Callable from concurrent.futures import ProcessPoolExecutor from dataclasses import replace from functools import reduce @@ -76,7 +75,7 @@ def remove_comment_rows( def _remove_df_comment_rows( - df: pd.DataFrame, + df: DataFrame, 
comment_chars: dict[str, list], ) -> None: """Modify a dataframe in-place by deleting rows with cells starting with symbols @@ -261,10 +260,10 @@ def revalidate_input_tables( for table in tables: tag = Tag(table.tag) required_cols = config.required_columns[tag] - unique_table_cols = set(table.dataframe.columns) if required_cols: + df = table.dataframe # Drop table if any column in required columns is missing - missing_cols = required_cols - unique_table_cols + missing_cols = required_cols.difference(df.columns) if missing_cols: logger.warning( f"Dropping {tag.value} table withing range {table.range} on sheet {table.sheetname}" @@ -274,7 +273,6 @@ def revalidate_input_tables( continue # Check whether any of the required columns is empty else: - df = table.dataframe empty_required_cols = {c for c in required_cols if all(df[c].isna())} if empty_required_cols: logger.warning( @@ -406,7 +404,7 @@ def merge_tables( # VEDA appears to support merging tables where come columns are optional, e.g. ctslvl and ctype from ~FI_COMM. # So just print detailed warning if we find tables with fewer columns than the concat'ed table. 
concat_cols = set(df.columns) - missing_cols = [concat_cols - set(t.dataframe.columns) for t in group] + missing_cols = [concat_cols.difference(t.dataframe.columns) for t in group] if any([len(m) for m in missing_cols]): err = ( @@ -535,7 +533,7 @@ def process_flexible_import_table( if any(index): for attr in df["attribute"][index].unique(): i = index & (df["attribute"] == attr) - parts = attr.split("~") + parts = [part.strip() for part in attr.split("~")] for value in parts: colname, typed_value = _get_colname(value, legal_values) if colname is None: @@ -694,7 +692,7 @@ def process_user_constraint_table( for attr in df["attribute"].unique(): if "~" in attr: i = df["attribute"] == attr - parts = attr.split("~") + parts = [part.strip() for part in attr.split("~")] for value in parts: colname, typed_value = _get_colname(value, legal_values) if colname is None: @@ -885,6 +883,7 @@ def fill_in_missing_values( def fill_in_missing_values_table(table): df = table.dataframe.copy() default_values = config.column_default_value.get(table.tag, {}) + mapping_to_defaults = {"limtype": "limtype", "timeslice": "tslvl"} for colname in df.columns: # TODO make this more declarative @@ -900,30 +899,18 @@ def fill_in_missing_values_table(table): ismat = df["csets"] == "MAT" df.loc[isna & ismat, colname] = "FX" df.loc[isna & ~ismat, colname] = "LO" - elif ( - colname == "limtype" - and (table.tag == Tag.fi_t or table.tag.startswith("~TFM")) - and len(df) > 0 - ): + elif colname in {"limtype", "timeslice"} and "attribute" in df.columns: isna = df[colname].isna() - for lim in config.veda_attr_defaults["limtype"].keys(): - df.loc[ - isna - & df["attribute"] - .str.upper() - .isin(config.veda_attr_defaults["limtype"][lim]), - colname, - ] = lim - elif colname == "timeslice" and len(df) > 0 and "attribute" in df.columns: - isna = df[colname].isna() - for timeslice in config.veda_attr_defaults["tslvl"].keys(): - df.loc[ - isna - & df["attribute"] - .str.upper() - 
.isin(config.veda_attr_defaults["tslvl"][timeslice]), - colname, - ] = timeslice + if any(isna): + key = mapping_to_defaults[colname] + for value in config.veda_attr_defaults[key].keys(): + df.loc[ + isna + & df["attribute"].isin( + config.veda_attr_defaults[key][value] + ), + colname, + ] = value elif ( colname == "tslvl" and table.tag == Tag.fi_process ): # or colname == "CTSLvl" or colname == "PeakTS": @@ -996,7 +983,7 @@ def expand_rows( """ # Exclude columns that have patterns exclude_cols = set(process_map.keys()).union(set(commodity_map.keys())) - lists_columns = lists_columns - exclude_cols + lists_columns = lists_columns.difference(exclude_cols) df = table.dataframe.copy() c = df.map(_has_comma) cols_to_make_lists = [ @@ -1340,7 +1327,7 @@ def generate_commodity_groups( # Add columns for the number of IN/OUT commodities of each type _count_comm_group_vectorised(comm_groups) - def name_comm_group(df): + def name_comm_group(df: pd.Series) -> str | None: """Generate the name of a commodity group based on the member count.""" if df["commoditygroup"] > 1: return df["process"] + "_" + df["csets"] + df["io"][:1] @@ -1385,7 +1372,7 @@ def name_comm_group(df): return tables -def _count_comm_group_vectorised(comm_groups: pd.DataFrame) -> None: +def _count_comm_group_vectorised(comm_groups: DataFrame) -> None: """Store the number of IN/OUT commodities of the same type per Region and Process in CommodityGroup. `comm_groups` is modified in-place. @@ -1404,8 +1391,8 @@ def _count_comm_group_vectorised(comm_groups: pd.DataFrame) -> None: def _process_comm_groups_vectorised( - comm_groups: pd.DataFrame, csets_ordered_for_pcg: list[str] -) -> pd.DataFrame: + comm_groups: DataFrame, csets_ordered_for_pcg: list[str] +) -> DataFrame: """Sets the first commodity group in the list of csets_ordered_for_pcg as the default pcg for each region/process/io combination, but setting the io="OUT" subset as default before "IN". 
@@ -1827,7 +1814,6 @@ def generate_dummy_processes( config: Config, tables: list[EmbeddedXlTable], model: TimesModel, - include_dummy_processes=True, ) -> list[EmbeddedXlTable]: """Define dummy processes and specify default cost data for them to ensure that a TIMES model can always be solved. @@ -1836,7 +1822,7 @@ def generate_dummy_processes( Significant cost is usually associated with the activity of these processes to ensure that they are used as a last resort """ - if include_dummy_processes: + if config.include_dummy_imports: # TODO: Activity units below are arbitrary. Suggest Veda devs not to have any. dummy_processes = [ ["IMP", "IMPNRGZ", "Dummy Import of NRG", "PJ", "", "NRG"], @@ -1861,9 +1847,8 @@ def generate_dummy_processes( ) process_data_specs = process_declarations[["process", "description"]].copy() - # Use this as default activity cost for dummy processes - # TODO: Should this be included in settings instead? - process_data_specs["ACTCOST"] = 1111 + # Provide an empty value in case an upd table is used to provide data + process_data_specs["ACTCOST"] = "" tables.append( EmbeddedXlTable( @@ -1969,6 +1954,7 @@ def has_no_wildcards(list): and "*" not in x and "," not in x and "?" 
not in x + and "_" not in x ) ) @@ -2028,6 +2014,8 @@ def is_year(col_name): value_name="value", ignore_index=False, ) + # Convert the attribute column to uppercase + df["attribute"] = df["attribute"].str.upper() result.append( replace(table, dataframe=df, tag=Tag(tag.value.split("-")[0])) ) @@ -2163,51 +2151,43 @@ def process_transform_availability( return result -def filter_by_pattern(df: pd.DataFrame, pattern: str) -> pd.DataFrame: - """Filter dataframe index by a regex pattern.""" - # Duplicates can be created when a process has multiple commodities that match the pattern - df = df.filter(regex=utils.create_regexp(pattern), axis="index").drop_duplicates() - exclude = df.filter(regex=utils.create_negative_regexp(pattern), axis="index").index - - return df.drop(exclude) - +def filter_by_pattern(df: DataFrame, pattern: str) -> set[str]: + """Filter dataframe index by a pattern specifying which items to include and/or exclude. + Return a set of corresponding items from the first (and only) column in the dataframe. 
+ """ + map = {"include": utils.create_regexp, "exclude": utils.create_negative_regexp} + sets = dict() + for action, regex_maker in map.items(): + sets[action] = set( + df.filter(regex=regex_maker(pattern), axis="index").iloc[:, 0] + ) -def intersect(acc, df): - if acc is None: - return df - return acc.merge(df) + return sets["include"].difference(sets["exclude"]) -def get_matching_processes( - row: pd.Series, topology: dict[str, DataFrame] -) -> pd.Series | None: - matching_processes = None - for col, key in process_map.items(): - if col in row.index and row[col] not in {None, ""}: - proc_set = topology[key] +def get_matching_items( + row: pd.Series, topology: dict[str, DataFrame], item_map: dict[str, str] +) -> list[str] | None: + """Return a list of items that match conditions in the given row.""" + matching_items = None + for col, key in item_map.items(): + if col in row.index and pd.notna(row[col]): + item_set = topology[key] pattern = row[col].upper() - filtered = filter_by_pattern(proc_set, pattern) - matching_processes = intersect(matching_processes, filtered) - - if matching_processes is not None and any(matching_processes.duplicated()): - raise ValueError("duplicated") - - return matching_processes - - -def get_matching_commodities(row: pd.Series, topology: dict[str, DataFrame]): - matching_commodities = None - for col, key in commodity_map.items(): - if col in row.index and row[col] not in {None, ""}: - matching_commodities = intersect( - matching_commodities, - filter_by_pattern(topology[key], row[col].upper()), + filtered = filter_by_pattern(item_set, pattern) + matching_items = ( + matching_items.intersection(filtered) + if matching_items is not None + else filtered ) - return matching_commodities + if matching_items is not None: + matching_items = list(matching_items) if len(matching_items) > 0 else None + return matching_items -def df_indexed_by_col(df, col): - # Set df index using an existing column; make index is uppercase + +def 
df_indexed_by_col(df: DataFrame, col: str) -> DataFrame: + """Set df index using an existing column; make index uppercase.""" df = df.dropna().drop_duplicates() index = df[col].str.upper() df = df.set_index(index).rename_axis("index") @@ -2272,6 +2252,7 @@ def process_wildcards( tables: dict[str, DataFrame], model: TimesModel, ) -> dict[str, DataFrame]: + """Process wildcards in the tables.""" tags = [ Tag.tfm_ava, Tag.tfm_comgrp, @@ -2281,54 +2262,46 @@ def process_wildcards( Tag.tfm_upd, Tag.uc_t, ] - dictionary = generate_topology_dictionary(tables, model) + item_maps = { + "process": process_map, + "commodity": commodity_map, + } for tag in tags: - if tag in tqdm(tables, desc=f"Processing wildcards in {tag.value} tables"): + start_time = time.time() df = tables[tag] - if set(df.columns).intersection(set(process_map.keys())): - df = _match_wildcards( - df, - process_map, - dictionary, - get_matching_processes, - "process", - explode=False, - ) - if set(df.columns).intersection(set(commodity_map.keys())): - df = _match_wildcards( - df, - commodity_map, - dictionary, - get_matching_commodities, - "commodity", - explode=False, - ) + for item_type in ["process", "commodity"]: + item_map = item_maps[item_type] + if set(df.columns).intersection(set(item_map.keys())): + df = _match_wildcards( + df, + item_map, + dictionary, + item_type, + explode=False, + ) tables[tag] = df # TODO: Should the tool alert about the following? # logger.warning("a row matched no processes or commodities") - logger.info( f" process_wildcards: {tag} took {time.time() - start_time:.2f} seconds for {len(df)} rows" ) - return tables def _match_wildcards( - df: pd.DataFrame, + df: DataFrame, col_map: dict[str, str], - dictionary: dict[str, pd.DataFrame], - matcher: Callable, + dictionary: dict[str, DataFrame], result_col: str, explode: bool = False, -) -> pd.DataFrame: +) -> DataFrame: """Match wildcards in the given table using the given process map and dictionary. 
Parameters @@ -2339,8 +2312,6 @@ def _match_wildcards( Mapping of column names to sets. dictionary Dictionary of process sets to match against. - matcher - Matching function to use, e.g. get_matching_processes or get_matching_commodities. result_col Name of the column to store the matched results in. explode @@ -2358,34 +2329,20 @@ def _match_wildcards( unique_filters = df[wild_cols].drop_duplicates().dropna(axis=0, how="all") # match all the wildcards columns against the dictionary names - matches = unique_filters.apply(lambda row: matcher(row, dictionary), axis=1) - - # we occasionally get a Dataframe back from the matchers. convert these to Series. - matches = ( - matches.iloc[:, 0].to_list() - if isinstance(matches, pd.DataFrame) - else matches.to_list() - ) - matches = [ - df.iloc[:, 0].to_list() if df is not None and len(df) != 0 else None - for df in matches - ] - matches = pd.DataFrame({result_col: matches}) - - # then join with the wildcard cols to their list of matched names so we can join them back into the table df. - filter_matches = unique_filters.reset_index(drop=True).merge( - matches, left_index=True, right_index=True + unique_filters[result_col] = unique_filters.apply( + lambda row: get_matching_items(row, dictionary, col_map), axis=1 ) # Finally we merge the matches back into the original table. # This join re-duplicates the duplicate filters dropped above for speed. df = ( - df.merge(filter_matches, on=wild_cols, how="left", suffixes=("_old", "")) + df.merge(unique_filters, on=wild_cols, how="left", suffixes=("_old", "")) .reset_index(drop=True) .drop(columns=wild_cols) ) - # TODO TFM_UPD has existing (but empty) 'process' and 'commodity' columns. Is it ok to drop existing columns here? + # Pre-existing 'process' and 'commodity' are handled during renaming. + # The below should not be necessary, but is left just in case. 
if f"{result_col}_old" in df.columns: if not df[f"{result_col}_old"].isna().all(): logger.warning( @@ -2436,11 +2393,11 @@ def query( def is_missing(field): return pd.isna(field) if not isinstance(field, list) else pd.isna(field).all() - qs = [] - - for k, v in query_fields.items(): - if not is_missing(v): - qs.append(f"{k} in {v if isinstance(v, list) else [v]}") + qs = [ + f"{k} in {v if isinstance(v, list) else [v]}" + for k, v in query_fields.items() + if not is_missing(v) + ] query_str = " and ".join(qs) row_idx = table.query(query_str).index @@ -2452,6 +2409,11 @@ def eval_and_update(table: DataFrame, rows_to_update: pd.Index, new_value: str) which can be a update formula like `*2.3`. """ if isinstance(new_value, str) and new_value[0] in {"*", "+", "-", "/"}: + # Do not perform arithmetic operations on rows with i/e options + if "year" in table.columns: + rows_to_update = rows_to_update.intersection( + table.index[table["year"] != 0] + ) old_values = table.loc[rows_to_update, "value"] updated = old_values.astype(float).map(lambda x: eval("x" + new_value)) table.loc[rows_to_update, "value"] = updated @@ -2692,8 +2654,8 @@ def apply_transform_tables( new_rows = table.loc[rows_to_update].copy() # Modify values in all '*2' columns for c, v in row.items(): - if c.endswith("2") and v is not None: - new_rows.loc[:, c[:-1]] = v + if str(c).endswith("2") and v is not None: + new_rows.loc[:, str(c)[:-1]] = v # Evaluate 'value' column based on existing values eval_and_update(new_rows, rows_to_update, row["value"]) # In case more than one data module is present in the table, select the one with the highest index @@ -2792,7 +2754,7 @@ def timeslices_table( # Ensure that all timeslice levels are uppercase timeslices = { - col.upper(): list(values.unique()) + str(col).upper(): list(values.unique()) for col, values in table.dataframe.items() } @@ -3089,6 +3051,14 @@ def fix_topology( if Tag.tfm_ava in tables: modules_with_ava = 
list(tables[Tag.tfm_ava]["module_name"].unique()) updates = tables[Tag.tfm_ava].explode("process", ignore_index=True) + # Ensure valid combinations of process / module_name + updates = updates.merge( + model.processes[["process", "module_name"]].drop_duplicates(), + how="inner", + on=["process", "module_name"], + ) + # Update tfm_ava + tables[Tag.tfm_ava] = updates # Overwrite with the last value for each process/region pair updates = updates.drop_duplicates( subset=[col for col in updates.columns if col != "value"], keep="last" ) @@ -3236,7 +3206,7 @@ def apply_final_fixup( ].isin(cost_mapping.keys()) ire_processes = set(veda_process_sets["process"][sets_index].unique()) - other_processes = processes - ire_processes + other_processes = processes.difference(ire_processes) if other_processes: logger.warning( diff --git a/xl2times/utils.py b/xl2times/utils.py index b548aff0..3e3da1b8 100644 --- a/xl2times/utils.py +++ b/xl2times/utils.py @@ -261,9 +261,11 @@ def create_regexp(pattern: str, combined: bool = True) -> str: pattern = pattern.replace(",", r"$|^") if len(pattern) == 0: return r".*" # matches everything - # Handle substite VEDA wildcards with regex patterns - for substition in (("*", ".*"), ("?", ".")): - old, new = substition + # Substitute VEDA wildcards with regex patterns; escape metacharacters. + # ("_", ".") and ("[.]", "_") are meant to apply one after another to handle + # the usage of "_" equivalent to "?" and "[_]" as literal "_". + substitute = [(".", "\\."), ("_", "."), ("[.]", "_"), ("*", ".*"), ("?", ".")] + for old, new in substitute: pattern = pattern.replace(old, new) # Do not match substrings pattern = rf"^{pattern}$"