add code and test for summary function

UBC-MDS · Jan 16, 2025 · 8b86d46 · 8b86d46
1 parent 3175308
commit 8b86d46
Show file tree

Hide file tree

Showing 3 changed files with 76 additions and 2 deletions.
diff --git a/src/pyeda/data_summary.py b/src/pyeda/data_summary.py
@@ -1,3 +1,5 @@
+import pandas as pd
+
 def get_summary_statistics(df, col = None):
     """
     Generate summary statistics for specified columns or all columns if none are provided.
@@ -18,4 +20,34 @@ def get_summary_statistics(df, col = None):
     pd.DataFrame
         A DataFrame with summary statistics for the specified columns.
     """
-    pass
+    if col is None:
+        col = df.columns.tolist()
+    elif isinstance(col, str):
+        col = [col]  # a bare name would otherwise be iterated letter by letter
+    summary_stats = {}
+    for column in col:
+        if column not in df.columns:
+            raise KeyError(f"Column '{column}' does not exist in the dataframe.")
+        series = df[column]
+        # bool is "numeric" to pandas, yet numpy rejects True - False: treat as categorical
+        if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
+            modes, low, high = series.mode(), series.min(), series.max()
+            summary_stats[column] = {
+                "mean": series.mean(),
+                "min": low,
+                "max": high,
+                "std": series.std(),
+                "median": series.median(),
+                "mode": modes.iloc[0] if not modes.empty else None,
+                "range": high - low,
+            }
+        else:
+            counts = series.value_counts()  # missing values are not counted
+            summary_stats[column] = {
+                "unique_values": series.unique(),
+                "num_unique_values": series.nunique(),
+                "most_frequent_value": counts.idxmax() if not counts.empty else None,
+                "frequency_of_most_frequent_value": counts.max() if not counts.empty else None,
+            }
+
+    return pd.DataFrame(summary_stats)
diff --git a/tests/test_check_csv.py b/tests/test_check_csv.py
@@ -1,5 +1,4 @@
 import pytest
-import pandas as pd
 import os
 from pyeda.check_csv import check_csv
 

diff --git a/tests/test_data_summary.py b/tests/test_data_summary.py
@@ -0,0 +1,43 @@
+import pytest
+import pandas as pd
+from pyeda.data_summary import get_summary_statistics
+
+@pytest.fixture
+def sample_df():
+    """Five-row frame with an integer, a string and an all-None column."""
+    return pd.DataFrame({
+        "numeric": [1, 2, 3, 4, 5],
+        "categorical": ["a", "b", "a", "a", "c"],
+        "missing": [None, None, None, None, None],
+    })
+
+def test_for_all_columns(sample_df):
+    """Summarise the whole frame (no `col` given) and spot-check rows and values."""
+    summary = get_summary_statistics(sample_df)
+
+    assert isinstance(summary, pd.DataFrame)
+
+    expected_rows = ("mean", "std", "median", "unique_values", "most_frequent_value")
+    for row_label in expected_rows:
+        assert row_label in summary.index
+
+    assert summary.loc["mean", "numeric"] == 3
+    assert summary.loc["most_frequent_value", "categorical"] == "a"
+
+def test_for_selected_columns(sample_df):
+    """Only the requested columns are summarised, in the order they were requested."""
+    selected = ["numeric", "categorical"]
+    result = get_summary_statistics(sample_df, col=selected)
+
+    assert result.columns.tolist() == selected  # also fails if "missing" leaks in
+
+def test_for_empty_dataframe():
+    """Summarising a frame that has no columns yields an empty frame."""
+    empty_input = pd.DataFrame()
+    summary = get_summary_statistics(empty_input)
+    assert summary.empty
+
+def test_for_invalid_columns(sample_df):
+    """Requesting a column the frame lacks raises a KeyError that names that column."""
+    with pytest.raises(KeyError, match="non_exist"):
+        get_summary_statistics(sample_df, col=["non_exist"])