diff --git a/error_generation/api/_low_level.py b/error_generation/api/_low_level.py
deleted file mode 100644
index 958a7d7..0000000
--- a/error_generation/api/_low_level.py
+++ /dev/null
@@ -1,60 +0,0 @@
-from __future__ import annotations
-
-import math
-import random
-from dataclasses import dataclass, field
-
-import pandas as pd
-
-
-@dataclass
-class Column:
-    """Describe a column in a Dataframe."""
-
-    name: str | None = field(default=None)
-    index: int | None = field(default=None)
-
-    def __post_init__(self: Column) -> None:
-        if self.name is None and self.index is None:
-            msg = "Specify either column name or index."
-            raise ValueError(msg)
-
-
-class Mechanism:
-    pass
-
-
-class ErrorType:
-    pass
-
-
-def error_function(x):
-    return x
-
-
-def create_errors(
-    table: pd.DataFrame,
-    column: Column,
-    error_rate: float,
-    mechanism: Mechanism,
-    error_type: ErrorType,
-    condition_to_column: Column | None = None,
-) -> tuple[pd.DataFrame, pd.DataFrame]:
-    try:
-        series = table.loc[column.name]
-    except KeyError:  # Assume it's integer index
-        series = table.iloc[column.index]
-    except IndexError:
-        msg = f"Invalid column: {column}"
-        raise ValueError(msg) from None
-    n_rows = len(series)
-
-    # TODO(phju): this should be its own function
-    print(f"And use {mechanism} and {error_type} and {condition_to_column} to infer error positions.")
-    n_errors = math.floor(n_rows * error_rate)
-
-    error_rows = random.sample(n_rows, n_errors)
-    series.apply()
-    error_series = series.iloc[error_rows].apply(error_function)
-    mask = [0 if (i not in error_rows) else 1 for i in range(n_rows)]
-    return error_series, pd.Series(mask)
diff --git a/error_generation/api/low_level.py b/error_generation/api/low_level.py
new file mode 100644
index 0000000..cf998fa
--- /dev/null
+++ b/error_generation/api/low_level.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from error_generation.utils import set_column
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+    from error_generation.error_mechanism import ErrorMechanism
+    from error_generation.error_type import ErrorType
+    from error_generation.utils import Column
+
+
+def create_errors(
+    table: pd.DataFrame,
+    column: Column,
+    error_rate: float,
+    mechanism: ErrorMechanism,
+    error_type: type[ErrorType],
+    condition_to_column: Column | None = None,
+) -> pd.DataFrame:
+    """Inserts errors of a single type into one column of a dataframe.
+
+    The mechanism samples an error mask, the error type derives the erroneous column from it,
+    and that column is written back with 'set_column'.
+
+    Note: 'table' is mutated in place; the returned dataframe is the same object.
+    'error_type' is passed as a class because 'ErrorType' cannot be instantiated.
+    """
+    error_mask = mechanism.sample(table, error_rate, condition_to_column, seed=None)
+    series = error_type.apply(table, error_mask, column)
+    set_column(table, column, series)
+    return table
diff --git a/error_generation/error_type/__init__.py b/error_generation/error_type/__init__.py
index e69de29..da75825 100644
--- a/error_generation/error_type/__init__.py
+++ b/error_generation/error_type/__init__.py
@@ -0,0 +1,2 @@
+from ._base import ErrorType
+from .mojibake import Mojibake
diff --git a/error_generation/error_type/_base.py b/error_generation/error_type/_base.py
index e69de29..053de87 100644
--- a/error_generation/error_type/_base.py
+++ b/error_generation/error_type/_base.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+    from error_generation.utils import Column
+
+
+class NotInstantiableError(Exception):
+    """Raised when a class that is not meant to be instantiated is instantiated."""
+
+    def __init__(self: NotInstantiableError) -> None:
+        super().__init__("This class is not meant to be instantiated.")
+
+
+class ErrorType(ABC):
+    """Base class of all error types. Used through its class and static methods only."""
+
+    def __init__(self: ErrorType) -> None:
+        raise NotInstantiableError
+
+    # TODO (seja): def apply(cls: type[ErrorType], table: pd.DataFrame, error_mask: pd.DataFrame, preserve_dtypes: bool = True) -> tuple[pd.DataFrame, pd.DataFrame]:
+    # 1. validates the parameters
+    #    - table.shape == error_mask.shape
+    # 2. copies 'table'
+    # 3. calls '_get_valid_columns' to get the candidate columns
+    # 4. calls '_apply' with 'table[valid_columns]' to get the modified 'table'
+    # 5. returns the modified 'table' and a mask that indicates which cells were changed
+    @classmethod
+    def apply(cls: type[ErrorType], table: pd.DataFrame, error_mask: pd.DataFrame, column: Column) -> pd.Series:
+        """Checks that the error type fits the column, then returns the column with errors applied."""
+        cls._check_type(table, column)
+        return cls._apply(table, error_mask, column)
+
+    # TODO (seja): def _get_valid_columns(table: pd.DataFrame, preserve_dtypes: bool = True) -> list[Dtype]:
+    # Checks which columns this error can be applied to and returns the corresponding names.
+    @staticmethod
+    @abstractmethod
+    def _check_type(table: pd.DataFrame, column: Column) -> None:
+        """Raises if this error type cannot be applied to the given column of 'table'."""
+
+    # TODO (seja): def _apply(table: pd.DataFrame, error_mask: pd.DataFrame) -> pd.DataFrame:
+    # Expects 'table' to contain valid columns only. Blindly applies the error to every cell for which 'error_mask' is True.
+    # Returns the modified 'table'.
+    @staticmethod
+    @abstractmethod
+    def _apply(table: pd.DataFrame, error_mask: pd.DataFrame, column: Column) -> pd.Series:
+        """Returns the given column with the error applied to the cells selected by 'error_mask'."""
diff --git a/error_generation/error_type/mojibake.py b/error_generation/error_type/mojibake.py
new file mode 100644
index 0000000..e60182e
--- /dev/null
+++ b/error_generation/error_type/mojibake.py
@@ -0,0 +1,44 @@
+import random
+
+import pandas as pd
+from pandas.api.types import is_string_dtype
+
+from error_generation.utils import Column, get_column
+
+from ._base import ErrorType
+
+
+class Mojibake(ErrorType):
+    """Inserts mojibake into a column containing strings."""
+
+    @staticmethod
+    def _check_type(table: pd.DataFrame, column: Column) -> None:
+        """Raises a TypeError unless the column has a string dtype."""
+        series = get_column(table, column)
+
+        if not is_string_dtype(series):
+            msg = f"Column {column} does not contain values of the string dtype. Cannot insert Mojibake."
+            raise TypeError(msg)
+
+    @staticmethod
+    def _apply(table: pd.DataFrame, error_mask: pd.DataFrame, column: Column) -> pd.Series:
+        """Returns a copy of the column in which the masked cells are encoded with one encoding and decoded with another."""
+        # Top 10 most used encodings on the internet
+        # https://w3techs.com/technologies/overview/character_encoding
+        encodings: list[str] = ["utf_8", "iso-8859-1", "windows-1252", "windows-1251", "shift_jis", "euc_jp", "gb2312", "euc_kr", "windows-1250", "iso-8859-2"]
+
+        series = get_column(table, column).copy()
+        encoding_sender, encoding_receiver = random.sample(encodings, 2)
+
+        def garble(value: object) -> object:
+            # Leave non-string cells (e.g. missing values) untouched.
+            if not isinstance(value, str):
+                return value
+            # 'replace' substitutes what an encoding cannot represent instead of raising a UnicodeError.
+            return value.encode(encoding_sender, errors="replace").decode(encoding_receiver, errors="replace")
+
+        # Boolean selection needs '.loc' ('.iloc' rejects a boolean Series), and the result of
+        # 'apply' must be assigned back: it returns a new Series rather than modifying in place.
+        series_mask = get_column(error_mask, column).astype(bool)
+        series.loc[series_mask] = series.loc[series_mask].apply(garble)
+        return series
diff --git a/error_generation/utils/__init__.py b/error_generation/utils/__init__.py
new file mode 100644
index 0000000..e1ce730
--- /dev/null
+++ b/error_generation/utils/__init__.py
@@ -0,0 +1 @@
+from .utils import Column, get_column, set_column
diff --git a/error_generation/utils/utils.py b/error_generation/utils/utils.py
new file mode 100644
index 0000000..d773e79
--- /dev/null
+++ b/error_generation/utils/utils.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Hashable
+
+    import pandas as pd
+
+
+@dataclass
+class Column:
+    """Describe a column in a Dataframe. We support selection by both index and column names."""
+
+    name: str | None = field(default=None)
+    index: int | None = field(default=None)
+
+    def __post_init__(self: Column) -> None:
+        """Ensures that either column name, or an index is set."""
+        if self.name is None and self.index is None:
+            msg = "Specify either column name or index."
+            raise ValueError(msg)
+
+
+def _resolve_label(table: pd.DataFrame, column: Column) -> Hashable:
+    """Returns the label of the column of 'table' that 'column' refers to.
+
+    The name takes precedence; the positional index is the fallback when the name is unset or unknown.
+    Raises a ValueError if neither identifies a column.
+    """
+    if column.name is not None and column.name in table.columns:
+        return column.name
+
+    msg = f"Invalid column: {column}"
+    if column.index is None:
+        raise ValueError(msg)
+    try:
+        return table.columns[column.index]
+    except IndexError:
+        raise ValueError(msg) from None
+
+
+def get_column(table: pd.DataFrame, column: Column) -> pd.Series:
+    """Selects a column from a dataframe and returns it as a series."""
+    # Plain '[]' indexing selects columns; '.loc[label]' / '.iloc[pos]' would select rows.
+    return table[_resolve_label(table, column)]
+
+
+def set_column(table: pd.DataFrame, column: Column, series: pd.Series) -> pd.DataFrame:
+    """Replaces a column in a dataframe with a series. Mutates table and returns it."""
+    # Resolve first: assigning to an unknown label would silently add a new column instead of failing.
+    table[_resolve_label(table, column)] = series
+    return table