Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update from df to series #18

Merged
merged 3 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 7 additions & 14 deletions src/pyeda/pymissing_values_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@

def missing_values_summary(df):
"""
This function is to provide a summary of missing values in the dataset.
Summarize the missing values in the dataset as a Series.

Parameters
----------
df (pd.DataFrame): The DataFrame containing the data.

Returns
-------
pd.DataFrame: A DataFrame showing the count and percentage of missing values.
pd.Series: A Series of strings of the form 'count (percentage%)', one entry for each column that has missing values.

Examples
--------
Expand All @@ -23,17 +23,10 @@ def missing_values_summary(df):
# Calculate the percentage of missing values for each column
missing_percentage = (missing_count / len(df)) * 100

# Create a summary DataFrame
missing_summary = pd.DataFrame({
'Missing Count': missing_count,
'Missing Percentage': missing_percentage
})
# Combine count and percentage into a Series, filtering out columns with no missing values
missing_summary = missing_count[missing_count > 0].astype(str) + " (" + \
missing_percentage[missing_count > 0].round(2).astype(str) + "%)"

# Filter out columns with no missing values
missing_summary = missing_summary[missing_summary['Missing Count'] > 0]

# Sort the DataFrame by the count of missing values in descending order
missing_summary = missing_summary.sort_values(by='Missing Count', ascending=False)

return missing_summary
missing_summary.name = "Missing Count (Percentage)"
return missing_summary.sort_values(ascending=False)

19 changes: 8 additions & 11 deletions tests/test_missing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from pyeda.pymissing_values_summary import missing_values_summary

import pytest
import pandas as pd

Expand All @@ -16,23 +15,21 @@ def df():
return pd.DataFrame(data)



def test_missing_values_summary(df):
"""
Test for the missing_values_summary function.
"""
result = missing_values_summary(df)

# Expected results
expected_columns = ['Missing Count', 'Missing Percentage']
expected_index = ['B', 'A'] # Columns with missing values in descending order of missing count
expected_values = ['2 (50.0%)', '1 (25.0%)'] # Expected values in the Series

# Check if the result is a Series
assert isinstance(result, pd.Series), "Result should be a Series."

# Check if the resulting DataFrame has the correct structure
assert list(result.columns) == expected_columns
assert list(result.index) == expected_index
# Check if the resulting Series has the correct index
assert list(result.index) == expected_index, "Index does not match the expected order."

# Check the specific values
assert result.loc['B', 'Missing Count'] == 2
assert result.loc['B', 'Missing Percentage'] == 50.0
assert result.loc['A', 'Missing Count'] == 1
assert result.loc['A', 'Missing Percentage'] == 25.0
# Check the specific values in the Series
assert list(result.values) == expected_values, "Values in the Series do not match the expected output."
Loading