Skip to content

Commit

Permalink
Merge pull request #18 from UBC-MDS/checknullvalue
Browse files (browse the repository at this point in the history)
update from df to series
  • Loading branch information
ZhengHe-007 authored Jan 23, 2025
2 parents cc2dafc + 6f515dd commit b14fbe4
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 25 deletions.
21 changes: 7 additions & 14 deletions src/pyeda/pymissing_values_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@

def missing_values_summary(df):
"""
This function is to provide a summary of missing values in the dataset.
This function is to provide a summary of missing values in the dataset as a Series.
Parameters
----------
df (pd.DataFrame): The DataFrame containing the data.
Returns
-------
pd.DataFrame: A DataFrame showing the count and percentage of missing values.
pd.Series: A Series showing the count and percentage of missing values.
Examples
--------
Expand All @@ -23,17 +23,10 @@ def missing_values_summary(df):
# Calculate the percentage of missing values for each column
missing_percentage = (missing_count / len(df)) * 100

# Create a summary DataFrame
missing_summary = pd.DataFrame({
'Missing Count': missing_count,
'Missing Percentage': missing_percentage
})
# Combine count and percentage into a Series, filtering out columns with no missing values
missing_summary = missing_count[missing_count > 0].astype(str) + " (" + \
missing_percentage[missing_count > 0].round(2).astype(str) + "%)"

# Filter out columns with no missing values
missing_summary = missing_summary[missing_summary['Missing Count'] > 0]

# Sort the DataFrame by the count of missing values in descending order
missing_summary = missing_summary.sort_values(by='Missing Count', ascending=False)

return missing_summary
missing_summary.name = "Missing Count (Percentage)"
return missing_summary.sort_values(ascending=False)

19 changes: 8 additions & 11 deletions tests/test_missing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from pyeda.pymissing_values_summary import missing_values_summary

import pytest
import pandas as pd

Expand All @@ -16,23 +15,21 @@ def df():
return pd.DataFrame(data)



def test_missing_values_summary(df):
"""
Test for the missing_values_summary function.
"""
result = missing_values_summary(df)

# Expected results
expected_columns = ['Missing Count', 'Missing Percentage']
expected_index = ['B', 'A'] # Columns with missing values in descending order of missing count
expected_values = ['2 (50.0%)', '1 (25.0%)'] # Expected values in the Series

# Check if the result is a Series
assert isinstance(result, pd.Series), "Result should be a Series."

# Check if the resulting DataFrame has the correct structure
assert list(result.columns) == expected_columns
assert list(result.index) == expected_index
# Check if the resulting Series has the correct index
assert list(result.index) == expected_index, "Index does not match the expected order."

# Check the specific values
assert result.loc['B', 'Missing Count'] == 2
assert result.loc['B', 'Missing Percentage'] == 50.0
assert result.loc['A', 'Missing Count'] == 1
assert result.loc['A', 'Missing Percentage'] == 25.0
# Check the specific values in the Series
assert list(result.values) == expected_values, "Values in the Series do not match the expected output."

0 comments on commit b14fbe4

Please sign in to comment.