From e54e518b7fe0744178111482572a9760e8cd44ea Mon Sep 17 00:00:00 2001
From: Siddharth Krishna
Date: Fri, 6 Oct 2023 18:37:50 +0200
Subject: [PATCH] Reduce TFM_INS-AT and TFM_INS-TS to TFM_INS (#122)

Contributes to #120
---
 times_reader/__main__.py   |   1 +
 times_reader/datatypes.py  |  23 ++++-
 times_reader/transforms.py | 177 +++++++++++++++++++++++--------------
 3 files changed, 133 insertions(+), 68 deletions(-)

diff --git a/times_reader/__main__.py b/times_reader/__main__.py
index 2cfe92c9..fbb8ec1e 100644
--- a/times_reader/__main__.py
+++ b/times_reader/__main__.py
@@ -61,6 +61,7 @@ def convert_xl_to_times(
         lambda config, tables: [transforms.remove_comment_rows(t) for t in tables],
         lambda config, tables: [transforms.remove_comment_cols(t) for t in tables],
         transforms.remove_tables_with_formulas,  # slow
+        transforms.process_transform_insert_variants,
         transforms.process_transform_insert,
         transforms.process_processes,
         transforms.process_topology,
diff --git a/times_reader/datatypes.py b/times_reader/datatypes.py
index b42c6f86..f4039cbb 100644
--- a/times_reader/datatypes.py
+++ b/times_reader/datatypes.py
@@ -4,7 +4,7 @@
 from itertools import chain
 import json
 import re
-from typing import Dict, Iterable, List
+from typing import Any, Dict, Iterable, List, Set, Tuple
 from enum import Enum
 
 from pandas.core.frame import DataFrame
@@ -86,14 +86,19 @@ class Config:
 
     times_xl_maps: List[TimesXlMap]
     dd_table_order: Iterable[str]
+    all_attributes: Set[str]
+    # TODO perhaps have a datatype to represent these tag_infos?
+    veda_tags_info: List[Any]
 
     def __init__(self, mapping_file: str, times_info_file: str, veda_tags_file: str):
         self.times_xl_maps = Config._read_mappings(mapping_file)
-        self.dd_table_order = Config._compute_dd_table_order(times_info_file)
+        self.dd_table_order, self.all_attributes = Config._process_times_info(
+            times_info_file
+        )
         self.veda_tags_info = Config._read_veda_tags_info(veda_tags_file)
 
     @staticmethod
-    def _compute_dd_table_order(times_info_file: str) -> Iterable[str]:
+    def _process_times_info(times_info_file: str) -> Tuple[Iterable[str], Set[str]]:
         # Read times_info_file and compute dd_table_order:
         # We output tables in order by categories: set, subset, subsubset, md-set, and parameter
         with resources.open_text("times_reader.config", times_info_file) as f:
@@ -105,7 +110,17 @@
         unknown_cats = {item["gams-cat"] for item in table_info} - set(categories)
         if unknown_cats:
             print(f"WARNING: Unknown categories in times-info.json: {unknown_cats}")
-        return chain.from_iterable((sorted(cat_to_tables[c]) for c in categories))
+        dd_table_order = chain.from_iterable(
+            (sorted(cat_to_tables[c]) for c in categories)
+        )
+
+        # Compute the set of all attributes, i.e. all entities with category = parameter
+        attributes = {
+            item["name"].lower()
+            for item in table_info
+            if item["gams-cat"] == "parameter"
+        }
+        return dd_table_order, attributes
 
     @staticmethod
     def _read_mappings(filename: str) -> List[TimesXlMap]:
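
Note on the datatypes.py change: Config.all_attributes is just the lower-cased name of every times-info.json entry whose gams-cat is "parameter". A minimal sketch of the idea, using a hypothetical excerpt of table_info (the entries here are illustrative, not the full times-info.json contents):

    # Hypothetical table_info entries, as loaded from times-info.json:
    table_info = [
        {"name": "ACT_BND", "gams-cat": "parameter"},
        {"name": "ACT_EFF", "gams-cat": "parameter"},
        {"name": "PRC", "gams-cat": "set"},
    ]
    # Same comprehension as in _process_times_info:
    attributes = {
        item["name"].lower()
        for item in table_info
        if item["gams-cat"] == "parameter"
    }
    assert attributes == {"act_bnd", "act_eff"}

process_transform_insert_variants (below) uses this set to decide which TFM_INS-AT columns hold attribute values.
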
diff --git a/times_reader/transforms.py b/times_reader/transforms.py
index f9522ada..1e531e93 100644
--- a/times_reader/transforms.py
+++ b/times_reader/transforms.py
@@ -976,7 +976,6 @@ def remove_invalid_values(
 def process_units(
     config: datatypes.Config, tables: Dict[str, DataFrame]
 ) -> Dict[str, DataFrame]:
-
     all_units = set()
 
     tags = {
@@ -1666,6 +1665,91 @@ def generate_dummy_processes(
     return tables
 
 
+def process_transform_insert_variants(
+    config: datatypes.Config,
+    tables: List[datatypes.EmbeddedXlTable],
+) -> List[datatypes.EmbeddedXlTable]:
+    """Reduces variants of TFM_INS like TFM_INS-TS to TFM_INS."""
+
+    def has_no_wildcards(list):
+        return all(
+            list.apply(
+                lambda x: x is not None
+                and x[0] != "-"
+                and "*" not in x
+                and "," not in x
+                and "?" not in x
+            )
+        )
+
+    def is_year(col_name):
+        """A column name is a year if it is an int >= 0"""
+        return col_name.isdigit() and int(col_name) >= 0
+
+    result = []
+    for table in tables:
+        if table.tag == datatypes.Tag.tfm_ins_ts:
+            # ~TFM_INS-TS: Gather columns whose names are years into a single "Year" column:
+            df = table.dataframe
+            if "year" in df.columns:
+                raise ValueError(f"TFM_INS-TS table already has Year column: {table}")
+            # TODO can we remove this hacky shortcut?
+            if (
+                table.tag == datatypes.Tag.tfm_ins_ts
+                and set(df.columns) & query_columns == {"cset_cn"}
+                and has_no_wildcards(df["cset_cn"])
+            ):
+                df.rename(columns={"cset_cn": "commname"}, inplace=True)
+                result.append(replace(table, dataframe=df, tag=datatypes.Tag.fi_t))
+                continue
+            elif (
+                table.tag == datatypes.Tag.tfm_ins_ts
+                and set(df.columns) & query_columns == {"pset_pn"}
+                and has_no_wildcards(df["pset_pn"])
+            ):
+                df.rename(columns={"pset_pn": "techname"}, inplace=True)
+                result.append(replace(table, dataframe=df, tag=datatypes.Tag.fi_t))
+                continue
+
+            other_columns = [
+                col_name for col_name in df.columns if not is_year(col_name)
+            ]
+            df = pd.melt(
+                df,
+                id_vars=other_columns,
+                var_name="year",
+                value_name="value",
+                ignore_index=False,
+            )
+            # Convert the year column to integer
+            df["year"] = df["year"].astype("int")
+            result.append(replace(table, dataframe=df, tag=datatypes.Tag.tfm_ins))
+        elif table.tag == datatypes.Tag.tfm_ins_at:
+            # ~TFM_INS-AT: Gather columns with attribute names into a single "Attribute" column
+            df = table.dataframe
+            if "attribute" in df.columns:
+                raise ValueError(
+                    f"TFM_INS-AT table already has Attribute column: {table}"
+                )
+            other_columns = [
+                col_name
+                for col_name in df.columns
+                if col_name not in config.all_attributes
+            ]
+            df = pd.melt(
+                df,
+                id_vars=other_columns,
+                var_name="attribute",
+                value_name="value",
+                ignore_index=False,
+            )
+            result.append(replace(table, dataframe=df, tag=datatypes.Tag.tfm_ins))
+        else:
+            result.append(table)
+
+    return result
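
Note on process_transform_insert_variants: both branches use pandas.melt to gather wide columns into a single key/value pair of columns. A minimal sketch of the TFM_INS-TS case on a toy DataFrame (the column names and values here are made up for illustration):

    import pandas as pd

    df = pd.DataFrame(
        {
            "techname": ["T1", "T2"],
            "attribute": ["ACT_BND", "ACT_BND"],
            "2020": [1.0, 2.0],
            "2030": [3.0, 4.0],
        }
    )
    # Non-year columns become id_vars, mirroring the is_year() check above:
    other_columns = [c for c in df.columns if not c.isdigit()]
    df = pd.melt(df, id_vars=other_columns, var_name="year", value_name="value")
    df["year"] = df["year"].astype("int")
    # df now holds one row per (techname, attribute, year), with a "value" column.

The TFM_INS-AT branch is the same melt, except that the melted columns are those whose names appear in config.all_attributes and the key column is "attribute".
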
+
+
+# TODO: should we rename this to something more general, since it takes care of more than tfm_ins?
 def process_transform_insert(
     config: datatypes.Config,
@@ -1690,15 +1774,14 @@ def process_transform_insert(
         elif table.tag in [
             datatypes.Tag.tfm_ins,
             datatypes.Tag.tfm_ins_txt,
-            datatypes.Tag.tfm_ins_ts,
             datatypes.Tag.tfm_upd,
             datatypes.Tag.tfm_comgrp,
         ]:
             df = table.dataframe.copy()
-            nrows = df.shape[0]
 
             # Standardize column names
             # TODO: Include other valid column names
+            # TODO should this go in datatypes.Config?
             known_columns = {
                 "attribute",
                 "year",
                 "timeslice",
                 "limtype",
                 "collection",
                 "stage",
                 "sow",
                 "other_indexes",
             } | query_columns
-            if table.tag == datatypes.Tag.tfm_ins_ts:
-                # ~TFM_INS-TS: Regions should be specified in a column with header=Region and columns in data area are YEARS
-                if "region" not in df.columns:
-                    df["region"] = [regions] * len(df)
-                    df = df.explode(["region"], ignore_index=True)
-            else:
-                # Transpose region columns to new VALUE column and add corresponding regions in new Region column
+            # Handle Regions:
+            if set(df.columns).isdisjoint(
+                {x.lower() for x in regions} | {"allregions", "region"}
+            ):
+                # If there's no region information at all, this table is for all regions:
+                df["region"] = ["allregions"] * len(df)
+            elif "region" not in df.columns:
+                # We have columns whose names are regions, so gather them into a Region column:
                 region_cols = [
                     col_name
                     for col_name in df.columns
                     if col_name in {x.lower() for x in regions} | {"allregions"}
                 ]
                 df = pd.melt(
                     df,
                     id_vars=[c for c in df.columns if c not in region_cols],
                     var_name="region",
                     value_name="value",
                     ignore_index=False,
                 )
                 df = df.sort_index().reset_index(drop=True)  # retain original row order
-            # This expands "allregions" into one row for each region:
-            df["region"] = df["region"].map(
-                lambda x: regions if x == "allregions" else x
-            )
-            df = df.explode(["region"])
-            df["region"] = df["region"].str.upper()
-            unknown_columns = [
-                col_name
-                for col_name in df.columns
-                if col_name not in known_columns | {"region", "value"}
-            ]
-            df.drop(columns=unknown_columns, inplace=True)
-
-            def has_no_wildcards(list):
-                return all(
-                    list.apply(
-                        lambda x: x is not None
-                        and x[0] != "-"
-                        and "*" not in x
-                        and "," not in x
-                        and "?" not in x
-                    )
-                )
+            # TODO handle case where we have a "region" column and columns with region names
-            if (
-                table.tag == datatypes.Tag.tfm_ins_ts
-                and set(df.columns) & query_columns == {"cset_cn"}
-                and has_no_wildcards(df["cset_cn"])
-            ):
-                df["commname"] = df["cset_cn"]
-                df.drop(columns=["cset_cn"], inplace=True)
-                result.append(replace(table, dataframe=df, tag=datatypes.Tag.fi_t))
-            elif (
-                table.tag == datatypes.Tag.tfm_ins_ts
-                and set(df.columns) & query_columns == {"pset_pn"}
-                and has_no_wildcards(df["pset_pn"])
-            ):
-                df.rename(columns={"pset_pn": "techname"}, inplace=True)
-                result.append(replace(table, dataframe=df, tag=datatypes.Tag.fi_t))
-            else:
-                # wildcard expansion will happen later
-                if table.tag == datatypes.Tag.tfm_ins_ts:
-                    # ~TFM_INS-TS: Regions should be specified in a column with header=Region and columns in data area are YEARS
-                    data_columns = [
-                        colname
-                        for colname in df.columns
-                        if colname not in known_columns | {"region", "ts_filter"}
-                    ]
-                    df, years = utils.explode(df, data_columns)
-                    df["year"] = years
-                for standard_col in known_columns:
-                    if standard_col not in df.columns:
-                        df[standard_col] = [None] * len(df)
-                result.append(replace(table, dataframe=df))
+            # This expands "allregions" into one row for each region:
+            df["region"] = df["region"].map(
+                lambda x: regions if x == "allregions" else x
+            )
+            df = df.explode(["region"])
+            df["region"] = df["region"].str.upper()
+
+            # Remove unknown columns and add missing known columns:
+            unknown_columns = [
+                col_name
+                for col_name in df.columns
+                if col_name not in known_columns | {"region", "value"}
+            ]
+            df.drop(columns=unknown_columns, inplace=True)
+            for standard_col in known_columns:
+                if standard_col not in df.columns:
+                    df[standard_col] = [None] * len(df)
+            result.append(replace(table, dataframe=df))
         else:
             dropped.append(table)
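
Note on the new region handling: region-named columns are melted into a "region" column, and "allregions" entries are then expanded into one row per region. A minimal sketch, assuming regions = ["REG1", "REG2"] and toy data for illustration:

    import pandas as pd

    regions = ["REG1", "REG2"]
    df = pd.DataFrame({"attribute": ["ACT_BND"], "reg1": [1.0], "allregions": [9.0]})
    region_cols = [
        c for c in df.columns if c in {x.lower() for x in regions} | {"allregions"}
    ]
    df = pd.melt(
        df,
        id_vars=[c for c in df.columns if c not in region_cols],
        var_name="region",
        value_name="value",
    )
    # Expand "allregions" into one row per region, then normalize case:
    df["region"] = df["region"].map(lambda x: regions if x == "allregions" else x)
    df = df.explode(["region"])
    df["region"] = df["region"].str.upper()
    # Rows: (ACT_BND, REG1, 1.0), (ACT_BND, REG1, 9.0), (ACT_BND, REG2, 9.0)
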
@@ -1999,7 +2049,6 @@ def process_wildcards(
     for tag in [
         datatypes.Tag.tfm_upd,
         datatypes.Tag.tfm_ins,
-        datatypes.Tag.tfm_ins_ts,
         datatypes.Tag.tfm_ins_txt,
     ]:
         if tag in tables:
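
Note on ordering: process_wildcards no longer needs to consider TFM_INS-TS tables because process_transform_insert_variants has already reduced them to TFM_INS, which is why it must run earlier in the pipeline (see the __main__.py hunk at the top of this patch). A minimal sketch of that pipeline shape, with the transform list abbreviated:

    transform_list = [
        transforms.process_transform_insert_variants,  # TFM_INS-TS / TFM_INS-AT -> TFM_INS
        transforms.process_transform_insert,  # TFM_INS, TFM_UPD, TFM_INS-TXT, TFM_COMGRP
    ]
    for transform in transform_list:
        tables = transform(config, tables)
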