Skip to content

Commit

Permalink
Merge pull request #59 from UBC-MDS/column_scaler_improvements
Browse files Browse the repository at this point in the history
Column scaler improvements
  • Loading branch information
davyxuximin authored Feb 3, 2025
2 parents 5f1fd41 + e879af2 commit c06d403
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 12 deletions.
47 changes: 36 additions & 11 deletions src/datamop/column_scaler.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Formula in this function is adapted from Scikit Learn Documentation
# Formula in this function is adapted from Scikit Learn documentation
# https://scikit-learn.org/1.5/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# https://scikit-learn.org/1.6/modules/generated/sklearn.preprocessing.StandardScaler.html

Expand Down Expand Up @@ -33,8 +33,11 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T
Returns
--------
pandas.DataFrame
A copy of the DataFrame with the scaled column replacing the original column if `inplace` is set to `True`.
If `inplace` is set to `False`, the copy of DataFrame is returned with the new scaled column added, keeping the original column.
A copy of the DataFrame with the scaled column
replacing the original column if `inplace` is set to `True`.
If `inplace` is set to `False`,
the copy of DataFrame is returned with the new scaled column added,
keeping the original column.
Raises
------
Expand All @@ -47,7 +50,6 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T
If the `method` is not `minmax` or `standard`.
If the `new_min` value is greater than or equal to `new_max` when using the `minmax` method.
Examples
--------
>>> import pandas as pd
Expand All @@ -58,15 +60,17 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T
0.0
0.5
1.0
"""
# Check input is pd.DataFrame
if not isinstance(data, pd.DataFrame):
raise TypeError("Input must be a pandas DataFrame.")

# Empty df warning
if data.empty:
warnings.warn("Empty DataFrame detected. Empty DataFrame will be returned.", UserWarning)
warnings.warn(
"Empty DataFrame detected. Empty DataFrame will be returned.",
UserWarning
)
return data.copy()

# Error handling
Expand All @@ -79,12 +83,28 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T

# Edge case warning
if data[column].isna().any():
warnings.warn("NaN value detected in column '{column}'. They will be unchanged", UserWarning)
warnings.warn(
f"NaN value detected in column '{column}'. They will be unchanged",
UserWarning
)

if data[column].nunique() == 1:
warnings.warn("Single-value column detected. All values will be scaled to the midpoint of the `new_min` and `new_max`.", UserWarning)
midpoint = (new_min + new_max) / 2
scaled_column = pd.Series([midpoint] * len(data), index=data.index)
if method == "minmax":
warnings.warn(
"Single-value column detected. "
"All values will be scaled to the midpoint of the `new_min` and `new_max`.",
UserWarning
)
midpoint = (new_min + new_max) / 2
scaled_column = pd.Series([midpoint] * len(data), index=data.index)

elif method == "standard":
warnings.warn(
"Standard deviation is zero. "
"All values are set to 0 to prevent division by zero.",
UserWarning
)
scaled_column = pd.Series([0] * len(data), index=data.index)

# Scale the column
else:
Expand All @@ -94,7 +114,12 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T
raise ValueError("`new_min` cannot be greater than `new_max`.")
min_value = data[column].min()
max_value = data[column].max()
scaled_column = ((data[column] - min_value) / (max_value - min_value)) * (new_max - new_min) + new_min
scaled_column = (
((data[column] - min_value) / (max_value - min_value))
* (new_max - new_min)
+ new_min
)

# standard scaling
elif method == "standard":
mean_value = data[column].mean()
Expand Down
23 changes: 22 additions & 1 deletion tests/test_column_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ def one_column_df():
"""Return DataFrame with one column of numeric values. Used for testing."""
return pd.DataFrame({"price": [25, 50, 75]})

@pytest.fixture
def one_column_df_float():
    """Provide a one-column DataFrame of float prices for scaling tests."""
    prices = [25.0, 50.0, 75.0]
    return pd.DataFrame({"price": prices})

@pytest.fixture
def single_val_df():
"""Return DataFrame with one column with single repeated value. Used for testing."""
Expand All @@ -26,11 +31,17 @@ def non_numeric_df():

# Expected use case tests
def test_minmax_scaling_default(one_column_df):
    """Test min-max scaling with default new_min=0 and new_max=1.

    Uses the integer-valued `price` column, which is expected to come back
    as floats in the [0, 1] range.
    """
    scaled_df = column_scaler(one_column_df, column="price", method="minmax")
    expected = [0.0, 0.5, 1.0]
    assert scaled_df["price"].tolist() == expected

def test_minmax_scaling_default_float(one_column_df_float):
    """Test min-max scaling with default new_min=0 and new_max=1 on float values.

    Uses the float-valued `price` column and expects it to be mapped onto
    the [0, 1] range.
    """
    scaled_df = column_scaler(one_column_df_float, column="price", method="minmax")
    expected = [0.0, 0.5, 1.0]
    assert scaled_df["price"].tolist() == expected

def test_minmax_scaling_custom(one_column_df):
"""Test min-max scaling with custom new_min=10 and new_max=20."""
scaled_df = column_scaler(one_column_df, column="price", method="minmax", new_min=10, new_max=20)
Expand Down Expand Up @@ -58,6 +69,15 @@ def test_single_value_column_minmax(single_val_df):
expected = [15.0, 15.0, 15.0]
assert scaled_df["price"].tolist() == expected

def test_single_value_column_standard(single_val_df):
    """Standard scaling of a single-valued column warns and yields all zeros.

    A column holding one repeated value has zero standard deviation, so the
    scaler is expected to emit a UserWarning and return 0 for every row
    rather than dividing by zero.
    """
    warning_pattern = "Standard deviation is zero"
    with pytest.warns(UserWarning, match=warning_pattern):
        result = column_scaler(single_val_df, column="price", method="standard")

    assert result["price"].tolist() == [0, 0, 0]

def test_empty_dataframe(empty_df):
"""Test scaling on empty DataFrame."""
with pytest.warns(UserWarning, match="Empty DataFrame detected"):
Expand All @@ -73,6 +93,7 @@ def test_column_with_nan():
expected = [0.0, np.nan, 1.0]
assert np.allclose(scaled_df["price"], expected, equal_nan=True)


# Erroneous case tests

def test_non_numeric_column(non_numeric_df):
Expand Down

0 comments on commit c06d403

Please sign in to comment.