Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates to emis/proxies for failed QCs #64

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
439 changes: 439 additions & 0 deletions gch4i/.scratch/task_abandoned_coal_proxy.py

Large diffs are not rendered by default.

16 changes: 9 additions & 7 deletions gch4i/emis_processing/task_field_burning_emi.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
Input Files: - gch4i_data_guide_v3.xlsx
- {V3_DATA_PATH}/ghgi/3F4_fbar/FBAR_90-22_State.xlsx.
Output Files: - {emi_data_dir_path}/barley_emi.csv
/chickpease_emi.csv
/chickpeas_emi.csv
/cotton_emi.csv
/drybeans_emi.csv
/grasshay_emi.csv
Expand All @@ -15,7 +15,7 @@
/maize_emi.csv
/oats_emi.csv
/other_grains_emi.csv
/peanutes_emi.csv
/peanuts_emi.csv
/peas_emi.csv
/potatoes_emi.csv
/rice_emi.csv
Expand All @@ -42,9 +42,11 @@
emi_data_dir_path,
ghgi_data_dir_path,
max_year,
min_year,
min_year
)
from gch4i.utils import tg_to_kt
#from gch4i.utils import tg_to_kt

tg_to_kt = 1000
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is in utils as a constant? Why import separately?



# %% Initialize Parameters
Expand Down Expand Up @@ -136,11 +138,11 @@ def task_field_burning_emi(
)
.rename(columns={"georef": "state_code"})
.set_index("state_code")
# covert "NO" string to numeric (will become np.nan)
# Replace NA values with 0
.replace(0, pd.NA)
.apply(pd.to_numeric, errors="coerce")
# drop states that have all nan values
.dropna(how="all")
# reset the index state back to a column
.fillna(0)
.reset_index()
# make the table long by state/year
.melt(id_vars="state_code", var_name="year", value_name="ch4_tg")
Expand Down
184 changes: 121 additions & 63 deletions gch4i/emis_processing/task_iron_steel_emi.py
Original file line number Diff line number Diff line change
@@ -1,91 +1,81 @@
"""
Name: task_iron_steel_emi.py
Date Last Modified: 2024-12-12
Authors Name: C. COxen
Date Last Modified: 2025-02-13
Authors Name: Chris Coxen
Purpose: Mapping of iron and steel emissions to State, Year, emissions
format
gch4i_name: 2C1_iron_and_steel
Input Files: - 2C1_iron_and_steel/State_Iron-Steel_1990-2022.xlsx
Output Files: - iron_steel_emi.csv
Notes:
Input Files: - {ghgi_data_dir_path}/2C1_iron_and_steel/
State_Iron-Steel_1990-2022.xlsx
Output Files: - {emi_data_dir_path}/
iron_steel_emi.csv
"""

# %% STEP 0. Load packages, configuration files, and local parameters ------------------
from pathlib import Path
from typing import Annotated

import pandas as pd
from pytask import Product, mark, task

from gch4i.config import ( # noqa
from gch4i.config import (
V3_DATA_PATH,
emi_data_dir_path,
ghgi_data_dir_path,
min_year,
max_year
)
from gch4i.utils import tg_to_kt


@mark.persist
@task(id="iron_steel_emi")
def task_get_iron_and_steel_inv_data(
input_path: Path = (
ghgi_data_dir_path / "2C1_iron_and_steel/State_Iron-Steel_1990-2022.xlsx"
),
output_path: Annotated[Path, Product] = emi_data_dir_path / "iron_steel_emi.csv",
) -> None:
"""
Read in the iron and steel data from the GHGI
# %% Step 1. Create Function


def get_iron_and_steel_inv_data(in_path, src):
"""read in the ch4_kt values for each state

Function reads in the inventory data for iron and steel and returns the
emissions in kt for each state and year.

Parameters
----------
in_path : str
path to the input file
src : str
subcategory of interest

Returns
Saves the emissions data to the output path.
"""
emi_df = (
# read in the data
pd.read_excel(
input_path,
sheet_name="InvDB",
skiprows=15,
nrows=457,
usecols="A:BA",

# Read in the data
emi_df = pd.read_excel(
in_path,
sheet_name="InvDB",
skiprows=15,
nrows=457,
usecols="A:BA"
)
# Specify years to keep
year_list = [str(x) for x in list(range(min_year, max_year + 1))]
# Clean and format the data
emi_df = (
# name column names lower
# drop columns we don't need
.drop(
columns=[
"Data Type",
"Sector",
"Subsector",
"Category",
# "Subcategory1",
"Subcategory2",
"Subcategory3",
"Subcategory4",
"Subcategory5",
"Carbon Pool",
"Fuel1",
"Fuel2",
# "GeoRef",
"Exclude",
"CRT Code",
"ID",
"Sensitive (Y or N)",
"Units",
# "GHG",
"GWP",
]
)
# filter on Sinter because it's the only emission type with CH4 emission values
.query("Subcategory1 == 'Sinter Production'")
.drop(columns="Subcategory1")
.rename(columns=lambda x: str(x).lower())
# get just methane emissions
.query("ghg == 'CH4'")
# remove that column
.drop(columns="ghg")
# set the index to state
emi_df.rename(columns=lambda x: str(x).lower())
# Rename state column
.rename(columns={"georef": "state_code"})
# Filter out national data
.query("state_code != 'National'")
# Filter for Sinter Production & CH4
.query("(subcategory1 == 'Sinter Production') & (ghg == 'CH4')")
# Filter for state_code and years
.filter(items=["state_code"] + year_list, axis=1)
.set_index("state_code")
# covert "NO" string to numeric (will become np.nan)
# Replace NA values with 0
.replace(0, pd.NA)
.apply(pd.to_numeric, errors="coerce")
# drop states that have all nan values
.dropna(how="all")
# reset the index state back to a column
.fillna(0)
.reset_index()
# make the table long by state/year
.melt(id_vars="state_code", var_name="year", value_name="ch4_tg")
Expand All @@ -96,5 +86,73 @@ def task_get_iron_and_steel_inv_data(
.fillna({"ghgi_ch4_kt": 0})
# get only the years we need
.query("year.between(@min_year, @max_year)")
)
emi_df.to_csv(output_path, index=False)
# Ensure state/year grouping is unique
.groupby(["state_code", "year"])["ghgi_ch4_kt"]
.sum()
.reset_index()
)
return emi_df


# %% STEP 2. Initialize Parameters
"""
This section initializes the parameters for the task and stores them in the
emi_parameters_dict.

The parameters are read from the emi_proxy_mapping sheet of the gch4i_data_guide_v3.xlsx
file. The parameters are used to create the pytask task for the emi.
"""
# gch4i_name in gch4i_data_guide_v3.xlsx, emi_proxy_mapping sheet
source_name = "2C1_iron_and_steel"
# Directory name for GHGI data
source_path = "2C1_iron_and_steel"

# Data Guide Directory
proxy_file_path = V3_DATA_PATH.parents[1] / "gch4i_data_guide_v3.xlsx"
# Read the emi/proxy mapping sheet and keep only the rows for this source
proxy_data = pd.read_excel(proxy_file_path, sheet_name="emi_proxy_mapping").query(
    f"gch4i_name == '{source_name}'"
)

# Build one parameter set per emi_id: the GHGI workbooks to read, the
# normalized GHGI category names, and the output CSV path for that emi.
emi_parameters_dict = {
    emi_name: {
        "input_paths": [ghgi_data_dir_path / source_path / fname for fname in grp.file_name],
        "source_list": [cat.strip().casefold() for cat in grp.Category.to_list()],
        "output_path": emi_data_dir_path / f"{emi_name}.csv"
    }
    for emi_name, grp in proxy_data.groupby("emi_id")
}

emi_parameters_dict


# %% STEP 3. Create Pytask Function and Loop

for _id, _kwargs in emi_parameters_dict.items():

    @mark.persist
    @task(id=_id, kwargs=_kwargs)
    def task_iron_steel_emi(
        input_paths: list[Path],
        source_list: list[str],
        output_path: Annotated[Path, Product],
    ) -> None:
        """Create the iron & steel emissions CSV for one emi group.

        Parameters
        ----------
        input_paths : list[Path]
            GHGI workbook paths for this emi group.
        source_list : list[str]
            GHGI subcategory names, paired 1:1 with ``input_paths``.
        output_path : Path
            Destination CSV, registered as the pytask product.
        """
        # Initialize the emi_df_list
        emi_df_list = []
        # Loop through the input paths and source list to get the emissions data
        for input_path, ghgi_group in zip(input_paths, source_list):
            individual_emi_df = get_iron_and_steel_inv_data(input_path,
                                                            ghgi_group)
            emi_df_list.append(individual_emi_df)

        # Concatenate the emissions data and group by state and year so
        # overlapping sources sum into a single record per state/year pair
        emission_group_df = (
            pd.concat(emi_df_list)
            .groupby(["state_code", "year"])["ghgi_ch4_kt"]
            .sum()
            .reset_index()
        )
        # Save the emissions data to the output path.
        # index=False keeps the CSV schema consistent with the other emi
        # outputs (state_code, year, ghgi_ch4_kt — no spurious index column).
        emission_group_df.to_csv(output_path, index=False)
Loading