Skip to content

Commit

Permalink
add code and test for summary function
Browse files Browse the repository at this point in the history
  • Loading branch information
jessiezhang24 committed Jan 16, 2025
1 parent 3175308 commit 8b86d46
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 2 deletions.
34 changes: 33 additions & 1 deletion src/pyeda/data_summary.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pandas as pd

def get_summary_statistics(df, col=None):
    """
    Generate summary statistics for specified columns or all columns if none are provided.

    Numeric columns are summarised with mean, min, max, std, median, mode and
    range. Every other column, including boolean columns, is summarised with
    its unique values, the number of unique values, and the most frequent
    value together with its frequency.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to summarise.
    col : list, optional
        Labels of the columns to summarise. When None (the default), every
        column of ``df`` is summarised.

    Returns
    -------
    pd.DataFrame
        A DataFrame with summary statistics for the specified columns: one
        column per summarised input column and one row per statistic.

    Raises
    ------
    KeyError
        If a requested column does not exist in ``df``.
    """
    if col is None:
        col = df.columns.tolist()

    summary_stats = {}

    for column in col:
        if column not in df.columns:
            raise KeyError(f"Column '{column}' does not exist in the dataframe.")

        series = df[column]

        # Boolean dtypes count as numeric for pandas, but subtracting two
        # NumPy booleans (needed for "range") raises TypeError, so boolean
        # columns are summarised as categorical data instead.
        is_numeric = pd.api.types.is_numeric_dtype(
            series
        ) and not pd.api.types.is_bool_dtype(series)

        if is_numeric:
            # Compute each reduction once and reuse it below.
            lowest = series.min()
            highest = series.max()
            modes = series.mode()
            summary_stats[column] = {
                "mean": series.mean(),
                "min": lowest,
                "max": highest,
                "std": series.std(),
                "median": series.median(),
                # mode() is empty when the column has no non-missing values.
                "mode": modes.iloc[0] if not modes.empty else None,
                "range": highest - lowest,
            }
        else:
            counts = series.value_counts()
            # value_counts() is empty for an all-missing column.
            has_values = not counts.empty
            summary_stats[column] = {
                "unique_values": series.unique(),
                "num_unique_values": series.nunique(),
                "most_frequent_value": counts.idxmax() if has_values else None,
                "frequency_of_most_frequent_value": counts.max() if has_values else None,
            }

    return pd.DataFrame(summary_stats)
1 change: 0 additions & 1 deletion tests/test_check_csv.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import pytest
import pandas as pd
import os
from pyeda.check_csv import check_csv

Expand Down
43 changes: 43 additions & 0 deletions tests/test_data_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pytest
import pandas as pd
from pyeda.data_summary import get_summary_statistics

@pytest.fixture
def sample_df():
    """Provide a small dataframe with a numeric, a categorical and an all-None column."""
    return pd.DataFrame(
        {
            "numeric": [1, 2, 3, 4, 5],
            "categorical": ["a", "b", "a", "a", "c"],
            "missing": [None] * 5,
        }
    )

def test_for_all_columns(sample_df):
    """Summarising without a column selection reports numeric and categorical statistics."""
    summary = get_summary_statistics(sample_df)

    assert isinstance(summary, pd.DataFrame)

    # Statistic names form the row index of the summary.
    for label in ("mean", "std", "median", "unique_values", "most_frequent_value"):
        assert label in summary.index

    assert summary.loc["mean", "numeric"] == 3
    assert summary.loc["most_frequent_value", "categorical"] == "a"

def test_for_selected_columns(sample_df):
    """Summarising selected columns yields one summary column per requested name."""
    wanted = ["numeric", "categorical"]
    summary = get_summary_statistics(sample_df, col=wanted)

    for name in wanted:
        assert name in summary.columns

def test_for_empty_dataframe():
    """Summarising an empty dataframe gives an empty summary."""
    summary = get_summary_statistics(pd.DataFrame())
    assert summary.empty

def test_for_invalid_columns(sample_df):
    """Requesting a column that is not in the dataframe raises KeyError."""
    unknown = ["non_exist"]
    with pytest.raises(KeyError):
        get_summary_statistics(sample_df, col=unknown)

0 comments on commit 8b86d46

Please sign in to comment.