From c10e06f9ab34c035b2b6e2021a8d921db2721fca Mon Sep 17 00:00:00 2001 From: Sepehr Heydarian Date: Sun, 2 Feb 2025 22:19:30 -0800 Subject: [PATCH 1/4] fixed spacing by removing empty lines in the function code --- src/datamop/column_scaler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/datamop/column_scaler.py b/src/datamop/column_scaler.py index 4938dd1..3f8b93a 100644 --- a/src/datamop/column_scaler.py +++ b/src/datamop/column_scaler.py @@ -70,7 +70,6 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T return data.copy() # Error handling - if column not in data.columns: raise KeyError("Column not found in the DataFrame.") if not pd.api.types.is_numeric_dtype(data[column]): @@ -87,7 +86,6 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T midpoint = (new_min + new_max) / 2 scaled_column = pd.Series([midpoint] * len(data), index=data.index) - # Scale the column else: # minmax scaling From c7799151cb407a252603445578a938ff69876f38 Mon Sep 17 00:00:00 2001 From: Sepehr Heydarian Date: Sun, 2 Feb 2025 22:34:33 -0800 Subject: [PATCH 2/4] Fixed line with too many characters in column_scaler --- src/datamop/column_scaler.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/datamop/column_scaler.py b/src/datamop/column_scaler.py index 3f8b93a..8c11d91 100644 --- a/src/datamop/column_scaler.py +++ b/src/datamop/column_scaler.py @@ -34,7 +34,8 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T -------- pandas.DataFrame A copy of the DataFrame with the scaled column replacing the original column if `inplace` is set to `True`. - If `inplace` is set to `False`, the copy of DataFrame is returned with the new scaled column added, keeping the original column. + If `inplace` is set to `False`, the copy of DataFrame is returned with the new scaled column added, + keeping the original column. 
Raises ------ @@ -58,7 +59,6 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T 0.0 0.5 1.0 - """ # Check input is pd.DataFrame if not isinstance(data, pd.DataFrame): @@ -66,7 +66,10 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T # Empty df warning if data.empty: - warnings.warn("Empty DataFrame detected. Empty DataFrame will be returned.", UserWarning) + warnings.warn( + "Empty DataFrame detected. Empty DataFrame will be returned.", + UserWarning + ) return data.copy() # Error handling @@ -79,10 +82,17 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T # Edge case warning if data[column].isna().any(): - warnings.warn("NaN value detected in column '{column}'. They will be unchanged", UserWarning) + warnings.warn( + "NaN value detected in column '{column}'. They will be unchanged", + UserWarning + ) if data[column].nunique() == 1: - warnings.warn("Single-value column detected. All values will be scaled to the midpoint of the `new_min` and `new_max`.", UserWarning) + warnings.warn( + "Single-value column detected. 
" + "All values will be scaled to the midpoint of the `new_min` and `new_max`.", + UserWarning + ) midpoint = (new_min + new_max) / 2 scaled_column = pd.Series([midpoint] * len(data), index=data.index) @@ -94,7 +104,11 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T raise ValueError("`new_min` cannot be greater than `new_max`.") min_value = data[column].min() max_value = data[column].max() - scaled_column = ((data[column] - min_value) / (max_value - min_value)) * (new_max - new_min) + new_min + scaled_column = ( + ((data[column] - min_value) / (max_value - min_value)) + * (new_max - new_min) + + new_min + ) # standard scaling elif method == "standard": mean_value = data[column].mean() From 66a84be172677e3ca80f5f562bbcf24423f69f4a Mon Sep 17 00:00:00 2001 From: Sepehr Heydarian Date: Sun, 2 Feb 2025 23:38:30 -0800 Subject: [PATCH 3/4] Fixed column_scaler zero variance issue to prevent division by zero --- src/datamop/column_scaler.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/datamop/column_scaler.py b/src/datamop/column_scaler.py index 8c11d91..4c742fc 100644 --- a/src/datamop/column_scaler.py +++ b/src/datamop/column_scaler.py @@ -1,4 +1,4 @@ -# Formula in this function is adapted from Scikit Learn Documentation +# Formula in this function is adapted from Scikit Learn documentation # https://scikit-learn.org/1.5/modules/generated/sklearn.preprocessing.MinMaxScaler.html # https://scikit-learn.org/1.6/modules/generated/sklearn.preprocessing.StandardScaler.html @@ -33,8 +33,10 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T Returns -------- pandas.DataFrame - A copy of the DataFrame with the scaled column replacing the original column if `inplace` is set to `True`. 
- If `inplace` is set to `False`, the copy of DataFrame is returned with the new scaled column added, + A copy of the DataFrame with the scaled column + replacing the original column if `inplace` is set to `True`. + If `inplace` is set to `False`, + the copy of DataFrame is returned with the new scaled column added, keeping the original column. Raises @@ -48,7 +50,6 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T If the `method` is not `minmax` or `standard`. If the `new_min` value is greater or equal to the `new_max` when using `minmax` method. - Examples -------- >>> import pandas as pd @@ -83,18 +84,27 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T # Edge case warning if data[column].isna().any(): warnings.warn( - "NaN value detected in column '{column}'. They will be unchanged", + f"NaN value detected in column '{column}'. They will be unchanged", UserWarning ) if data[column].nunique() == 1: - warnings.warn( - "Single-value column detected. " - "All values will be scaled to the midpoint of the `new_min` and `new_max`.", - UserWarning - ) - midpoint = (new_min + new_max) / 2 - scaled_column = pd.Series([midpoint] * len(data), index=data.index) + if method == "minmax": + warnings.warn( + "Single-value column detected. " + "All values will be scaled to the midpoint of the `new_min` and `new_max`.", + UserWarning + ) + midpoint = (new_min + new_max) / 2 + scaled_column = pd.Series([midpoint] * len(data), index=data.index) + + elif method == "standard": + warnings.warn( + "Standard deviation is zero. 
" + "All values are set to 0 to prevent division by zero.", + UserWarning + ) + scaled_column = pd.Series([0] * len(data), index=data.index) # Scale the column else: @@ -109,6 +119,7 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T * (new_max - new_min) + new_min ) + # standard scaling elif method == "standard": mean_value = data[column].mean() From e879af2c85b8883a4908584d07f3ea26ed8c4e09 Mon Sep 17 00:00:00 2001 From: Sepehr Heydarian Date: Sun, 2 Feb 2025 23:42:08 -0800 Subject: [PATCH 4/4] Added test case for zero division for column_scaler in test_column_scaler.py --- tests/test_column_scaler.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tests/test_column_scaler.py b/tests/test_column_scaler.py index 2b01adb..5c3bf99 100644 --- a/tests/test_column_scaler.py +++ b/tests/test_column_scaler.py @@ -9,6 +9,11 @@ def one_column_df(): """Return DataFrame with one column of numeric values. Used for testing.""" return pd.DataFrame({"price": [25, 50, 75]}) +@pytest.fixture +def one_column_df_float(): + """Return DataFrame with one column of floating values. Used for testing.""" + return pd.DataFrame({"price": [25.0, 50.0, 75.0]}) + @pytest.fixture def single_val_df(): """Return DataFrame with one column with single repeated value. Used for testing.""" @@ -26,11 +31,17 @@ def non_numeric_df(): # Expected use case tests def test_minmax_scaling_default(one_column_df): - """Test min-max scaling with default new_min=0 and new_max=1.""" + """Test min-max scaling with default new_min=0 and new_max=1. 
Use float values.""" scaled_df = column_scaler(one_column_df, column="price", method="minmax") expected = [0.0, 0.5, 1.0] assert scaled_df["price"].tolist() == expected +def test_minmax_scaling_default_float(one_column_df_float): + """Test min-max scaling with default new_min=0 and new_max=1.""" + scaled_df = column_scaler(one_column_df_float, column="price", method="minmax") + expected = [0.0, 0.5, 1.0] + assert scaled_df["price"].tolist() == expected + def test_minmax_scaling_custom(one_column_df): """Test min-max scaling with custom new_min=10 and new_max=20.""" scaled_df = column_scaler(one_column_df, column="price", method="minmax", new_min=10, new_max=20) @@ -58,6 +69,15 @@ def test_single_value_column_minmax(single_val_df): expected = [15.0, 15.0, 15.0] assert scaled_df["price"].tolist() == expected +def test_single_value_column_standard(single_val_df): + """Test standard scaling with column with single repeated values to prevent division by zero.""" + with pytest.warns(UserWarning, + match="Standard deviation is zero"): + scaled_df = column_scaler(single_val_df, column="price", method="standard") + + expected = [0, 0, 0] + assert scaled_df["price"].tolist() == expected + def test_empty_dataframe(empty_df): """Test scaling on empty DataFrame.""" with pytest.warns(UserWarning, match="Empty DataFrame detected"): @@ -73,6 +93,7 @@ def test_column_with_nan(): expected = [0.0, np.nan, 1.0] assert np.allclose(scaled_df["price"], expected, equal_nan=True) + # Erroneous case tests def test_non_numeric_column(non_numeric_df):