From 905315e32058b88bd87838abfe1b70e3d91d7eb0 Mon Sep 17 00:00:00 2001 From: Zheng He Date: Thu, 23 Jan 2025 14:30:11 -0800 Subject: [PATCH 1/3] update from df to series --- src/pyeda/pymissing_values_summary.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/pyeda/pymissing_values_summary.py b/src/pyeda/pymissing_values_summary.py index 01e7711..17f2db3 100644 --- a/src/pyeda/pymissing_values_summary.py +++ b/src/pyeda/pymissing_values_summary.py @@ -3,7 +3,7 @@ def missing_values_summary(df): """ - This function is to provide a summary of missing values in the dataset. + This function is to provide a summary of missing values in the dataset as a Series. Parameters ---------- @@ -11,11 +11,11 @@ def missing_values_summary(df): Returns ------- - pd.DataFrame: A DataFrame showing the count and percentage of missing values. + pd.Series: A Series showing the count and percentage of missing values. Examples -------- - >>> missing_values_summary(df) + >>> missing_values_series(df) """ # Calculate the count of missing values for each column missing_count = df.isnull().sum() @@ -23,17 +23,10 @@ def missing_values_summary(df): # Calculate the percentage of missing values for each column missing_percentage = (missing_count / len(df)) * 100 - # Create a summary DataFrame - missing_summary = pd.DataFrame({ - 'Missing Count': missing_count, - 'Missing Percentage': missing_percentage - }) + # Combine count and percentage into a Series, filtering out columns with no missing values + missing_summary = missing_count[missing_count > 0].astype(str) + " (" + \ + missing_percentage[missing_count > 0].round(2).astype(str) + "%)" - # Filter out columns with no missing values - missing_summary = missing_summary[missing_summary['Missing Count'] > 0] - - # Sort the DataFrame by the count of missing values in descending order - missing_summary = missing_summary.sort_values(by='Missing Count', ascending=False) - - return missing_summary + missing_summary.name = "Missing Count (Percentage)" + return missing_summary.sort_values(ascending=False) From a5c87e28dc642527a09e9402e6bd11aec865223b Mon Sep 17 00:00:00 2001 From: Zheng He Date: Thu, 23 Jan 2025 14:39:49 -0800 Subject: [PATCH 2/3] updated the test as Series --- tests/test_missing.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/tests/test_missing.py b/tests/test_missing.py index 75b457a..289810b 100644 --- a/tests/test_missing.py +++ b/tests/test_missing.py @@ -1,5 +1,4 @@ from pyeda.pymissing_values_summary import missing_values_summary - import pytest import pandas as pd @@ -16,7 +15,6 @@ def df(): return pd.DataFrame(data) - def test_missing_values_summary(df): """ Test for the missing_values_summary function. @@ -24,15 +22,14 @@ def test_missing_values_summary(df): result = missing_values_summary(df) # Expected results - expected_columns = ['Missing Count', 'Missing Percentage'] expected_index = ['B', 'A'] # Columns with missing values in descending order of missing count + expected_values = ['2 (50.0%)', '1 (25.0%)'] # Expected values in the Series + + # Check if the result is a Series + assert isinstance(result, pd.Series), "Result should be a Series." - # Check if the resulting DataFrame has the correct structure - assert list(result.columns) == expected_columns - assert list(result.index) == expected_index + # Check if the resulting Series has the correct index + assert list(result.index) == expected_index, "Index does not match the expected order." - # Check the specific values - assert result.loc['B', 'Missing Count'] == 2 - assert result.loc['B', 'Missing Percentage'] == 50.0 - assert result.loc['A', 'Missing Count'] == 1 - assert result.loc['A', 'Missing Percentage'] == 25.0 \ No newline at end of file + # Check the specific values in the Series + assert list(result.values) == expected_values, "Values in the Series do not match the expected output." From 6f515dd1b8231b6cc5b6b93f53dcf98d95703236 Mon Sep 17 00:00:00 2001 From: Zheng He Date: Thu, 23 Jan 2025 14:45:49 -0800 Subject: [PATCH 3/3] fixing docstring --- src/pyeda/pymissing_values_summary.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pyeda/pymissing_values_summary.py b/src/pyeda/pymissing_values_summary.py index 17f2db3..b9f18e0 100644 --- a/src/pyeda/pymissing_values_summary.py +++ b/src/pyeda/pymissing_values_summary.py @@ -15,7 +15,7 @@ def missing_values_summary(df): Examples -------- - >>> missing_values_series(df) + >>> missing_values_summary(df) """ # Calculate the count of missing values for each column missing_count = df.isnull().sum() @@ -28,5 +28,5 @@ def missing_values_summary(df): missing_percentage[missing_count > 0].round(2).astype(str) + "%)" missing_summary.name = "Missing Count (Percentage)" - return missing_summary.sort_values(ascending=False) + return missing_summary.sort_values(ascending=False)