Reduce TFM_INS-AT and TFM_INS-TS to TFM_INS (#122)
Contributes to #120
siddharth-krishna authored Oct 6, 2023
1 parent 99da0f7 commit e54e518
Showing 3 changed files with 133 additions and 68 deletions.
1 change: 1 addition & 0 deletions times_reader/__main__.py
@@ -61,6 +61,7 @@ def convert_xl_to_times(
lambda config, tables: [transforms.remove_comment_rows(t) for t in tables],
lambda config, tables: [transforms.remove_comment_cols(t) for t in tables],
transforms.remove_tables_with_formulas, # slow
transforms.process_transform_insert_variants,
transforms.process_transform_insert,
transforms.process_processes,
transforms.process_topology,
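
Note: the new process_transform_insert_variants step runs before process_transform_insert, so by the time the latter runs, all TFM_INS-TS and TFM_INS-AT variants have already been reduced to plain TFM_INS tables (which is why tfm_ins_ts disappears from the tag lists in transforms.py below).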
23 changes: 19 additions & 4 deletions times_reader/datatypes.py
@@ -4,7 +4,7 @@
from itertools import chain
import json
import re
from typing import Dict, Iterable, List
from typing import Any, Dict, Iterable, List, Set, Tuple
from enum import Enum
from pandas.core.frame import DataFrame

@@ -86,14 +86,19 @@ class Config:

times_xl_maps: List[TimesXlMap]
dd_table_order: Iterable[str]
all_attributes: Set[str]
# TODO perhaps have a datatype to represent these tag_infos?
veda_tags_info: List[Any]

def __init__(self, mapping_file: str, times_info_file: str, veda_tags_file: str):
self.times_xl_maps = Config._read_mappings(mapping_file)
self.dd_table_order = Config._compute_dd_table_order(times_info_file)
self.dd_table_order, self.all_attributes = Config._process_times_info(
times_info_file
)
self.veda_tags_info = Config._read_veda_tags_info(veda_tags_file)

@staticmethod
def _compute_dd_table_order(times_info_file: str) -> Iterable[str]:
def _process_times_info(times_info_file: str) -> Tuple[Iterable[str], Set[str]]:
# Read times_info_file and compute dd_table_order:
# We output tables in order by categories: set, subset, subsubset, md-set, and parameter
with resources.open_text("times_reader.config", times_info_file) as f:
@@ -105,7 +110,17 @@ def _compute_dd_table_order(times_info_file: str) -> Iterable[str]:
unknown_cats = {item["gams-cat"] for item in table_info} - set(categories)
if unknown_cats:
print(f"WARNING: Unknown categories in times-info.json: {unknown_cats}")
return chain.from_iterable((sorted(cat_to_tables[c]) for c in categories))
dd_table_order = chain.from_iterable(
(sorted(cat_to_tables[c]) for c in categories)
)

# Compute the set of all attributes, i.e. all entities with category = parameter
attributes = {
item["name"].lower()
for item in table_info
if item["gams-cat"] == "parameter"
}
return dd_table_order, attributes
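
To make the new _process_times_info concrete, here is a minimal self-contained sketch of the two computations above. The table_info entries are hypothetical stand-ins for the parsed times-info.json; the "name" and "gams-cat" keys and the category order mirror the code:

    from itertools import chain

    # Hypothetical stand-in for the parsed times-info.json:
    table_info = [
        {"name": "REG", "gams-cat": "set"},
        {"name": "NCAP_COST", "gams-cat": "parameter"},
        {"name": "ACT_BND", "gams-cat": "parameter"},
    ]
    categories = ["set", "subset", "subsubset", "md-set", "parameter"]
    cat_to_tables = {
        cat: [item["name"] for item in table_info if item["gams-cat"] == cat]
        for cat in categories
    }
    # Tables are output category by category, sorted within each category:
    dd_table_order = list(
        chain.from_iterable(sorted(cat_to_tables[c]) for c in categories)
    )
    # Attributes are the lower-cased names of all entities with category "parameter":
    attributes = {
        item["name"].lower() for item in table_info if item["gams-cat"] == "parameter"
    }
    print(dd_table_order)  # ['REG', 'ACT_BND', 'NCAP_COST']
    print(attributes)      # {'ncap_cost', 'act_bnd'}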

@staticmethod
def _read_mappings(filename: str) -> List[TimesXlMap]:
177 changes: 113 additions & 64 deletions times_reader/transforms.py
@@ -976,7 +976,6 @@ def remove_invalid_values(
def process_units(
config: datatypes.Config, tables: Dict[str, DataFrame]
) -> Dict[str, DataFrame]:

all_units = set()

tags = {
@@ -1666,6 +1665,91 @@ def generate_dummy_processes(
return tables


def process_transform_insert_variants(
config: datatypes.Config,
tables: List[datatypes.EmbeddedXlTable],
) -> List[datatypes.EmbeddedXlTable]:
"""Reduces variants of TFM_INS like TFM_INS-TS to TFM_INS."""

    def has_no_wildcards(series):
        # True iff every entry is non-None and free of wildcard syntax ("-" prefix, "*", ",", "?"):
        return all(
            series.apply(
                lambda x: x is not None
                and x[0] != "-"
                and "*" not in x
                and "," not in x
                and "?" not in x
            )
        )

def is_year(col_name):
"""A column name is a year if it is an int >= 0"""
return col_name.isdigit() and int(col_name) >= 0

result = []
for table in tables:
if table.tag == datatypes.Tag.tfm_ins_ts:
# ~TFM_INS-TS: Gather columns whose names are years into a single "Year" column:
df = table.dataframe
if "year" in df.columns:
raise ValueError(f"TFM_INS-AT table already has Year column: {table}")
# TODO can we remove this hacky shortcut?
if (
table.tag == datatypes.Tag.tfm_ins_ts
and set(df.columns) & query_columns == {"cset_cn"}
and has_no_wildcards(df["cset_cn"])
):
df.rename(columns={"cset_cn": "commname"}, inplace=True)
result.append(replace(table, dataframe=df, tag=datatypes.Tag.fi_t))
continue
elif (
table.tag == datatypes.Tag.tfm_ins_ts
and set(df.columns) & query_columns == {"pset_pn"}
and has_no_wildcards(df["pset_pn"])
):
df.rename(columns={"pset_pn": "techname"}, inplace=True)
result.append(replace(table, dataframe=df, tag=datatypes.Tag.fi_t))
continue

other_columns = [
col_name for col_name in df.columns if not is_year(col_name)
]
df = pd.melt(
df,
id_vars=other_columns,
var_name="year",
value_name="value",
ignore_index=False,
)
# Convert the year column to integer
df["year"] = df["year"].astype("int")
result.append(replace(table, dataframe=df, tag=datatypes.Tag.tfm_ins))
elif table.tag == datatypes.Tag.tfm_ins_at:
            # ~TFM_INS-AT: Gather columns with attribute names into a single "Attribute" column:
df = table.dataframe
if "attribute" in df.columns:
raise ValueError(
f"TFM_INS-AT table already has Attribute column: {table}"
)
other_columns = [
col_name
for col_name in df.columns
if col_name not in config.all_attributes
]
df = pd.melt(
df,
id_vars=other_columns,
var_name="attribute",
value_name="value",
ignore_index=False,
)
result.append(replace(table, dataframe=df, tag=datatypes.Tag.tfm_ins))
else:
result.append(table)

return result
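
To illustrate the TFM_INS-TS melt above, here is a small self-contained sketch; the table contents are hypothetical, while the year test and the melt parameters mirror the code:

    import pandas as pd

    # Hypothetical TFM_INS-TS-style table: one row per region, one column per year.
    df = pd.DataFrame(
        {
            "region": ["REG1", "REG2"],
            "attribute": ["NCAP_COST", "NCAP_COST"],
            "2020": [1.0, 2.0],
            "2030": [3.0, 4.0],
        }
    )
    # Keep every column whose name is not a year as an identifier column:
    other_columns = [c for c in df.columns if not (c.isdigit() and int(c) >= 0)]
    df = pd.melt(
        df, id_vars=other_columns, var_name="year", value_name="value", ignore_index=False
    )
    df["year"] = df["year"].astype("int")
    # df now has one row per (region, attribute, year) with a single "value" column.

The TFM_INS-AT branch performs the same melt, except that it gathers the columns whose names appear in config.all_attributes (computed in datatypes.py above) into an "attribute" column instead of gathering year-named columns into a "year" column.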


# TODO: should we rename this to something more general, since it takes care of more than tfm_ins?
def process_transform_insert(
config: datatypes.Config,
@@ -1690,15 +1774,14 @@ def process_transform_insert(
elif table.tag in [
datatypes.Tag.tfm_ins,
datatypes.Tag.tfm_ins_txt,
datatypes.Tag.tfm_ins_ts,
datatypes.Tag.tfm_upd,
datatypes.Tag.tfm_comgrp,
]:
df = table.dataframe.copy()
nrows = df.shape[0]

# Standardize column names
# TODO: Include other valid column names
# TODO should this go in datatypes.Config?
known_columns = {
"attribute",
"year",
@@ -1710,13 +1793,14 @@
"other_indexes",
} | query_columns

if table.tag == datatypes.Tag.tfm_ins_ts:
# ~TFM_INS-TS: Regions should be specified in a column with header=Region and columns in data area are YEARS
if "region" not in df.columns:
df["region"] = [regions] * len(df)
df = df.explode(["region"], ignore_index=True)
else:
# Transpose region columns to new VALUE column and add corresponding regions in new Region column
# Handle Regions:
if set(df.columns).isdisjoint(
{x.lower() for x in regions} | {"allregions", "region"}
):
# If there's no region information at all, this table is for all regions:
df["region"] = ["allregions"] * len(df)
elif "region" not in df.columns:
# We have columns whose names are regions, so gather them into a Region column:
region_cols = [
col_name
for col_name in df.columns
@@ -1733,61 +1817,27 @@
ignore_index=False,
)
df = df.sort_index().reset_index(drop=True) # retain original row order
# This expands "allregions" into one row for each region:
df["region"] = df["region"].map(
lambda x: regions if x == "allregions" else x
)
df = df.explode(["region"])
df["region"] = df["region"].str.upper()
unknown_columns = [
col_name
for col_name in df.columns
if col_name not in known_columns | {"region", "value"}
]
df.drop(columns=unknown_columns, inplace=True)

def has_no_wildcards(list):
return all(
list.apply(
lambda x: x is not None
and x[0] != "-"
and "*" not in x
and "," not in x
and "?" not in x
)
)
# TODO handle case where we have a "region" column and columns with region names

if (
table.tag == datatypes.Tag.tfm_ins_ts
and set(df.columns) & query_columns == {"cset_cn"}
and has_no_wildcards(df["cset_cn"])
):
df["commname"] = df["cset_cn"]
df.drop(columns=["cset_cn"], inplace=True)
result.append(replace(table, dataframe=df, tag=datatypes.Tag.fi_t))
elif (
table.tag == datatypes.Tag.tfm_ins_ts
and set(df.columns) & query_columns == {"pset_pn"}
and has_no_wildcards(df["pset_pn"])
):
df.rename(columns={"pset_pn": "techname"}, inplace=True)
result.append(replace(table, dataframe=df, tag=datatypes.Tag.fi_t))
else:
# wildcard expansion will happen later
if table.tag == datatypes.Tag.tfm_ins_ts:
# ~TFM_INS-TS: Regions should be specified in a column with header=Region and columns in data area are YEARS
data_columns = [
colname
for colname in df.columns
if colname not in known_columns | {"region", "ts_filter"}
]
df, years = utils.explode(df, data_columns)
df["year"] = years
for standard_col in known_columns:
if standard_col not in df.columns:
df[standard_col] = [None] * len(df)
result.append(replace(table, dataframe=df))
# This expands "allregions" into one row for each region:
df["region"] = df["region"].map(
lambda x: regions if x == "allregions" else x
)
df = df.explode(["region"])
df["region"] = df["region"].str.upper()

# Remove unknown columns and add missing known columns:
unknown_columns = [
col_name
for col_name in df.columns
if col_name not in known_columns | {"region", "value"}
]
df.drop(columns=unknown_columns, inplace=True)
for standard_col in known_columns:
if standard_col not in df.columns:
df[standard_col] = [None] * len(df)

result.append(replace(table, dataframe=df))
else:
dropped.append(table)
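
The "allregions" handling above works by mapping the placeholder onto the full region list and then exploding, so each concrete region gets its own row. A minimal sketch, with hypothetical regions and values; the map/explode/upper steps mirror the code:

    import pandas as pd

    regions = ["reg1", "reg2"]

    df = pd.DataFrame({"region": ["allregions", "reg1"], "value": [1.0, 2.0]})
    # Replace the "allregions" placeholder with the full list of regions...
    df["region"] = df["region"].map(lambda x: regions if x == "allregions" else x)
    # ...then explode so that each region gets its own row:
    df = df.explode(["region"])
    df["region"] = df["region"].str.upper()
    # Result: (REG1, 1.0), (REG2, 1.0), (REG1, 2.0)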

@@ -1999,7 +2049,6 @@ def process_wildcards(
for tag in [
datatypes.Tag.tfm_upd,
datatypes.Tag.tfm_ins,
datatypes.Tag.tfm_ins_ts,
datatypes.Tag.tfm_ins_txt,
]:
if tag in tables:
