Skip to content

Commit

Permalink
refactor data extraction code
Browse files Browse the repository at this point in the history
  • Loading branch information
sumn2u committed Nov 25, 2023
1 parent 041e67f commit 6955b75
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 29 deletions.
27 changes: 18 additions & 9 deletions src/data/cpu_power_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,18 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Import necessary libraries\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -127,19 +128,22 @@
"[2161 rows x 2 columns]"
]
},
"execution_count": 4,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read CSV data from the provided URL into a Pandas DataFrame\n",
"df = pd.read_csv(\"https://raw.githubusercontent.com/mlco2/codecarbon/master/codecarbon/data/hardware/cpu_power.csv\")\n",
"\n",
"# Display the entire DataFrame\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -182,31 +186,36 @@
"2156 Intel Xeon X5670 95"
]
},
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.query(\"Name == 'Intel Xeon X5670'\")"
"# Query the DataFrame to filter rows where the 'Name' column is equal to 'Intel Xeon X5670\n",
"intel_xeon_data = df.query(\"Name == 'Intel Xeon X5670'\")\n",
"\n",
"# Display the filtered DataFrame for 'Intel Xeon X5670'\n",
"intel_xeon_data"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Convert DataFrame to JSON\n",
"# Convert the entire DataFrame to JSON format, with lowercase column names\n",
"json_data = df.rename(columns=str.lower).to_json(orient='records', indent=2)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Write the JSON data to a file named 'cpu_power.json'\n",
"with open('cpu_power.json', 'w') as json_file:\n",
" json_file.write(json_data)"
]
Expand Down
64 changes: 44 additions & 20 deletions src/data/our_world_in_data-2021_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@
"metadata": {},
"outputs": [],
"source": [
"# Import necessary libraries\n",
"import pandas as pd\n",
"import numpy as np"
"import numpy as np\n",
"import json"
]
},
{
Expand Down Expand Up @@ -445,7 +447,9 @@
}
],
"source": [
"# Read energy data from a CSV file hosted on GitHub into a Pandas DataFrame\n",
"df = pd.read_csv(\"https://raw.githubusercontent.com/owid/energy-data/master/owid-energy-data.csv\")\n",
"# Display the entire DataFrame\n",
"df"
]
},
Expand Down Expand Up @@ -870,6 +874,7 @@
}
],
"source": [
"# Query the DataFrame to filter rows where the 'country' column is equal to 'India'\n",
"df.query(\"country == 'India'\")"
]
},
Expand All @@ -891,8 +896,11 @@
}
],
"source": [
"# Create lists of columns related to electricity, nuclear, and carbon intensity\n",
"elec_col = [c for c in df.columns.to_list() if \"_electricity\" in c]\n",
"nuke_col = [c for c in elec_col if \"nuclear\" in c]\n",
"\n",
"# Display the nuclear-related columns\n",
"nuke_col"
]
},
Expand All @@ -903,6 +911,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Extract columns related to carbon intensity\n",
"carbon_intensity_col = [c for c in df.columns.to_list() if \"intensity\" in c]"
]
},
Expand Down Expand Up @@ -1005,6 +1014,7 @@
}
],
"source": [
"# Query the DataFrame for specific columns and conditions\n",
"df[['country', 'iso_code', 'year'] + carbon_intensity_col + elec_col].query(\"iso_code=='FRA' and year==2021\")"
]
},
Expand Down Expand Up @@ -1201,8 +1211,12 @@
}
],
"source": [
"# Create a filtered DataFrame for the year 2021, removing rows with null ISO codes\n",
"df_filtered = df[['country', 'iso_code', 'year'] + carbon_intensity_col + elec_col].query(\"year==2021 and iso_code.notnull()\")\n",
"\n",
"# Rename columns for better clarity\n",
"columns_names = {\n",
" # mapping old column names to new ones\n",
" 'country':'country_name',\n",
" 'carbon_intensity_elec':'carbon_intensity',\n",
" 'biofuel_electricity':'biofuel_TWh',\n",
Expand All @@ -1220,7 +1234,11 @@
" 'solar_electricity':'solar_TWh',\n",
" 'wind_electricity':'wind_TWh'\n",
"}\n",
"\n",
"# Apply column name changes to the DataFrame\n",
"df_filtered.rename(columns=columns_names, inplace=True)\n",
"\n",
"# Reorder and select specific columns in the DataFrame\n",
"df_filtered = df_filtered[['country_name',\n",
" 'iso_code',\n",
" 'year',\n",
Expand All @@ -1239,6 +1257,8 @@
" 'nuclear_TWh',\n",
" 'biofuel_TWh',\n",
" 'low_carbon_TWh']]\n",
"\n",
"# Display the first 5 rows of the filtered DataFrame\n",
"df_filtered.head(5)"
]
},
Expand Down Expand Up @@ -1367,8 +1387,13 @@
}
],
"source": [
"# Define a list of columns to sum\n",
"columns_to_sum = [\"fossil_TWh\", \"renewables_TWh\", \"nuclear_TWh\"]\n",
"\n",
"# Create a new column 'total_TWh' in the DataFrame by summing specified columns\n",
"df_filtered['total_TWh'] = df_filtered[columns_to_sum].sum(axis=1)\n",
"\n",
"# Display the first 2 rows of the DataFrame with the new column\n",
"df_filtered.head(2)"
]
},
Expand Down Expand Up @@ -1470,6 +1495,7 @@
}
],
"source": [
"# Query the DataFrame for rows where 'iso_code' is 'FRA'\n",
"df_filtered.query(\"iso_code=='FRA'\")"
]
},
Expand Down Expand Up @@ -1571,6 +1597,7 @@
}
],
"source": [
"# Query the DataFrame for rows where 'iso_code' is 'BGD'\n",
"df_filtered.query(\"iso_code=='BGD'\")"
]
},
Expand All @@ -1580,11 +1607,7 @@
"id": "9fbe26d1",
"metadata": {},
"source": [
"renewables_Twh = hydroelectricity_Twh + wind_Twh + solar_Twh + other_renewable_Twh\n",
"\n",
"RTE Production\n",
"\n",
"![RTE](2023-07-07-22-40-48.png)\n"
"renewables_Twh = hydroelectricity_Twh + wind_Twh + solar_Twh + other_renewable_Twh"
]
},
{
Expand All @@ -1599,6 +1622,7 @@
"# From https://bilan-electrique-2021.rte-france.com/# we read 522 TWh for Production, but 468 TWh of consumption.\n",
"# Here we found 550 TWh, which is not the same !\n",
"\n",
"# Assert that a specific value in the DataFrame is equal to a given value\n",
"assert df_filtered.query(\"iso_code=='FRA'\")[\"total_TWh\"].values[0] == 550.38"
]
},
Expand Down Expand Up @@ -1658,17 +1682,21 @@
}
],
"source": [
"# Create a dictionary 'energy_mix' to store information about the energy mix for each country\n",
"energy_mix = {}\n",
"\n",
"# Loop through unique ISO codes in the filtered DataFrame\n",
"for code in df_filtered['iso_code'].unique():\n",
"#for code in ['FRA', 'AFG', 'VNM']:\n",
" # print(code)\n",
" df=df_filtered.query(\"iso_code == @code\")\n",
" max_year = df.year.max()\n",
" #if df.isnull().values.any() == False:\n",
"\n",
" # Check for missing values and populate the 'energy_mix' dictionary\n",
" if len(df)>0 and not np.isnan(df.loc[df.year == max_year].iloc[0][\"carbon_intensity\"]):\n",
" energy_mix[code]=df.loc[df.year == max_year].iloc[0].to_dict()\n",
" else:\n",
" print(f\"Warning: missing carbon_intensity value for {code} - {df.iloc[0].country_name}.\")\n",
"\n",
"# Display the energy mix for the USA\n",
"energy_mix['USA']"
]
},
Expand All @@ -1690,18 +1718,18 @@
}
],
"source": [
"\n",
"# Check if a specific value in the DataFrame is NaN\n",
"np.isnan(df_filtered.loc[df_filtered.iso_code == \"GUF\"].iloc[0][\"carbon_intensity\"])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"id": "0533388d",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"# Write the 'energy_mix' dictionary to a JSON file named 'global_energy_mix.json'\n",
"with open(\"global_energy_mix.json\", \"w\") as outfile:\n",
" json.dump(energy_mix, outfile, indent=4, sort_keys=True)"
]
Expand All @@ -1712,11 +1740,11 @@
"id": "1737311d-a9d7-45ee-a871-c26a9e1763cf",
"metadata": {},
"source": [
"We need only the last available indicator for the columns `carbon_intensity_elec`, not the history, `year` is just an indicator of the freshness.\n",
"To clarify, we are only interested in the latest available data for the `carbon_intensity_elec` columns, and not its historical values. The `year` column only serves as an indicator of the most recent data. \n",
"\n",
"If we don't have `carbon_intensity` it could be computed, but we prefer to not have to do it.\n",
"If the `carbon_intensity` value is missing, we could calculate it, but we would prefer to avoid doing so. \n",
"\n",
"We could keep `*_electricity` to have the electricity mix in `TWh` but it is not mandatory."
"While it's not essential, we might retain `*_electricity` to include the electricity mix in `TWh`."
]
},
{
Expand All @@ -1725,11 +1753,7 @@
"id": "8e8031a6-4b45-4ce5-a037-008173af37bb",
"metadata": {},
"source": [
"So, we have to:\n",
"1. Filter for the last available year\n",
"1. Check if we not loose country we previously had.\n",
"1. Keep only the data we need\n",
"1. Export to JSON"
"We need to filter for the most recent year, retain country data, and export selected data to JSON."
]
}
],
Expand Down

0 comments on commit 6955b75

Please sign in to comment.