Skip to content

Commit

Permalink
Make Outlier respect dtype, remove Clipping
Browse files Browse the repository at this point in the history
I made `Outlier` respect dtypes: series that contain integers are no
longer cast to float, but keep their integer dtype.

I ended up removing the `Clipping` Error Type, because it introduces
a dependency between error_pct and the error type that the library
currently does not support. A bigger effort would be needed to
properly support `Clipping`.

I added one example of how to use `Outlier`.
  • Loading branch information
philipp-jung committed Nov 17, 2024
1 parent 1ac1df9 commit 40f4f09
Show file tree
Hide file tree
Showing 5 changed files with 198 additions and 133 deletions.
1 change: 0 additions & 1 deletion error_generation/error_type/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from ._base import ErrorType
from .add_delta import AddDelta
from .butterfinger import Butterfinger
from .clipping import Clipping
from .extraneous import Extraneous
from .mislabel import Mislabel
from .missing import MissingValue
Expand Down
41 changes: 0 additions & 41 deletions error_generation/error_type/clipping.py

This file was deleted.

12 changes: 10 additions & 2 deletions error_generation/error_type/outlier.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import TYPE_CHECKING

import numpy as np
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_integer_dtype, is_numeric_dtype

from error_generation.error_type import ErrorType
from error_generation.utils import get_column
Expand Down Expand Up @@ -54,6 +54,10 @@ def _apply(self: Outlier, table: pd.DataFrame, error_mask: pd.DataFrame, column:
perturbation_upper = self.config.outlier_coefficient * (upper_boundary - mean_value)
perturbation_lower = self.config.outlier_coefficient * (mean_value - lower_boundary)

if is_integer_dtype(series): # round float to int when series is int
perturbation_upper = np.ceil(perturbation_upper)
perturbation_lower = np.floor(perturbation_lower)

# Get masks for the different outlier types depending on the mean
mask_lower = (series < mean_value) & series_mask
mask_upper = (series > mean_value) & series_mask
Expand All @@ -73,6 +77,10 @@ def _apply(self: Outlier, table: pd.DataFrame, error_mask: pd.DataFrame, column:

# Apply Gaussian noise to simulate the increase in measurement error of the outliers
noise_std = self.config.outlier_noise_coeff * iqr
series.loc[series_mask] += rng.normal(loc=0, scale=noise_std, size=series_mask.sum())

if is_integer_dtype(series): # round float to int when series is int
series.loc[series_mask] += np.rint(rng.normal(loc=0, scale=noise_std, size=series_mask.sum()))
else:
series.loc[series_mask] += rng.normal(loc=0, scale=noise_std, size=series_mask.sum())

return series
5 changes: 0 additions & 5 deletions error_generation/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,6 @@ class ErrorTypeConfig:
replace_what: String that the Replace Error Type replaces with replace_with.
replace_with: String that the Replace Error Type uses to replace replace_what with. Defaults to "".
add_delta_value: Value that is added to the value by the AddDelta Error Type.
clip_lower_quantile: Lower quantile of the clipping range for Clipping Error Type. Default is None, which means no lower clipping.
clip_upper_quantile: Upper quantile of the clipping range for Clipping Error Type. Default is None, which means no upper clipping.
outlier_coefficient: Coefficient that determines the magnitude of the outliers for the Outlier Error Type.
outlier_noise_coeff: Coefficient that influences the standard deviation of the noise added to the outliers for the Outlier Error Type.
"""
Expand Down Expand Up @@ -87,9 +85,6 @@ class ErrorTypeConfig:

add_delta_value: Any | None = None

clip_lower_quantile: float | None = None
clip_upper_quantile: float | None = None

outlier_coefficient: float = 1.0
outlier_noise_coeff: float = 0.1

Expand Down
Loading

0 comments on commit 40f4f09

Please sign in to comment.