From f002aff168685c7572191bf90cbbf9038c73df09 Mon Sep 17 00:00:00 2001 From: "mark.wang" Date: Fri, 6 Sep 2024 23:09:07 +0800 Subject: [PATCH] add fixed value imputation --- docs/api-reference/impute.md | 1 + mkdocs.yml | 3 +- pyproject.toml | 7 +-- python/polarscore/__init__.py | 3 +- python/polarscore/impute.py | 115 ++++++++++++++++++++++++++++++++++ uv.lock | 1 - 6 files changed, 121 insertions(+), 9 deletions(-) create mode 100644 docs/api-reference/impute.md create mode 100644 python/polarscore/impute.py diff --git a/docs/api-reference/impute.md b/docs/api-reference/impute.md new file mode 100644 index 0000000..6184720 --- /dev/null +++ b/docs/api-reference/impute.md @@ -0,0 +1 @@ +:::polarscore.impute diff --git a/mkdocs.yml b/mkdocs.yml index 1dfe5e0..61eb021 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -40,4 +40,5 @@ markdown_extensions: nav: - Quick Start: index.md - API Reference: - - WOE: api-reference/woe.md \ No newline at end of file + - WOE: api-reference/woe.md + - Imputation: api-reference/impute.md \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 1cfa374..0c2fe26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,12 +5,7 @@ description = "Lightning-fast credit scorecard building with Polars" authors = [{ name = "Mark Wang", email = "wxgter@gmail.com" }] readme = "README.md" requires-python = ">=3.9" -dependencies = [ - "altair>=5.4.1", - "polars>=1.6.0", - "polarscore", - "scikit-learn>=1.5.1", -] +dependencies = ["altair>=5.4.1", "polars>=1.6.0", "scikit-learn>=1.5.1"] [tool.ruff] diff --git a/python/polarscore/__init__.py b/python/polarscore/__init__.py index 0542586..4867190 100644 --- a/python/polarscore/__init__.py +++ b/python/polarscore/__init__.py @@ -4,7 +4,7 @@ from polars._typing import IntoExpr from polars.plugins import register_plugin_function -from polarscore import base, bin, feature_selection, woe +from polarscore import base, bin, feature_selection, impute, woe LIB = Path(__file__).parent @@ 
import polars as pl
from sklearn.base import BaseEstimator, TransformerMixin


class FixedValueImputer(TransformerMixin, BaseEstimator):
    """
    Imputer that fills missing values with fixed values for specified columns.

    This transformer allows for imputation of missing (null) values in a
    DataFrame using predefined fixed values for specified columns. Columns
    that are not listed in ``fill_value_dict`` are passed through unchanged.

    Parameters
    ----------
    fill_value_dict : dict
        A dictionary where keys are column names and values are the fixed
        values to use for imputation in those columns.

    Attributes
    ----------
    fill_value_dict_ : dict
        A shallow copy of ``fill_value_dict`` taken when ``fit`` is called,
        mapping column names to their imputation values.

    Methods
    -------
    fit(X, y=None)
        Validate the configured columns against X and store the fill values.
    transform(X)
        Impute missing values in the input DataFrame using the fixed values.

    Examples
    --------
    >>> import polars as pl
    >>> from polarscore.impute import FixedValueImputer
    >>> df = pl.DataFrame({"A": [1, None, 3], "B": ["x", None, "z"]})
    >>> imputer = FixedValueImputer({"A": 0, "B": "unknown"})
    >>> imputer.fit_transform(df)
    shape: (3, 2)
    ┌─────┬─────────┐
    │ A   ┆ B       │
    │ --- ┆ ---     │
    │ i64 ┆ str     │
    ╞═════╪═════════╡
    │ 1   ┆ x       │
    │ 0   ┆ unknown │
    │ 3   ┆ z       │
    └─────┴─────────┘

    Notes
    -----
    This imputer is particularly useful when you have domain knowledge about
    appropriate default values for specific columns in your dataset.
    """

    def __init__(self, fill_value_dict: dict):
        # Kept exactly as passed in: scikit-learn's get_params/clone expect
        # constructor arguments to be stored unmodified under the same name.
        self.fill_value_dict = fill_value_dict

    def fit(self, X: pl.DataFrame, y=None) -> "FixedValueImputer":
        """
        Fit the imputer to the input DataFrame.

        This method validates that all columns specified in ``fill_value_dict``
        are present in the input DataFrame. It then stores a copy of
        ``fill_value_dict`` as ``fill_value_dict_`` for use in ``transform``.

        Parameters
        ----------
        X : pl.DataFrame
            The input DataFrame to fit the imputer on.
        y : None
            Ignored. This parameter exists only for compatibility with
            scikit-learn's transformer interface.

        Returns
        -------
        self : FixedValueImputer
            Returns the instance itself.

        Raises
        ------
        ValueError
            If any column specified in fill_value_dict is not present in X.
            The message lists the offending column names.
        """
        available = set(X.columns)
        missing = [col for col in self.fill_value_dict if col not in available]
        if missing:
            msg = (
                "Some columns in fill_value_dict are not present in the input "
                f"DataFrame: {missing}"
            )
            raise ValueError(msg)

        # Copy so that later mutation of the caller's dict cannot silently
        # change the fitted state.
        self.fill_value_dict_ = dict(self.fill_value_dict)
        return self

    def transform(self, X: pl.DataFrame) -> pl.DataFrame:
        """
        Transform the input DataFrame by imputing missing values with fixed values.

        Each column named in ``fill_value_dict_`` has its null entries replaced
        by the associated value; all other columns are left as they are.

        Parameters
        ----------
        X : pl.DataFrame
            The input DataFrame to transform.

        Returns
        -------
        pl.DataFrame
            A new DataFrame with missing values imputed in the specified columns.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet. This exception derives from
            both ``AttributeError`` and ``ValueError``.

        Notes
        -----
        This method uses the ``fill_value_dict_`` attribute set during ``fit``
        to determine which columns to impute and what values to use.
        """
        # Local import keeps the module's top-level dependencies unchanged.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "fill_value_dict_")
        return X.with_columns(
            [
                pl.col(name).fill_null(value)
                for name, value in self.fill_value_dict_.items()
            ]
        )