Skip to content

Commit

Permalink
Merge pull request #41 from markxwang/28-custom-imputer
Browse files Browse the repository at this point in the history
add fixed value imputation
  • Loading branch information
markxwang authored Sep 6, 2024
2 parents 39d3950 + f002aff commit 227de42
Show file tree
Hide file tree
Showing 6 changed files with 121 additions and 9 deletions.
1 change: 1 addition & 0 deletions docs/api-reference/impute.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:::polarscore.impute
3 changes: 2 additions & 1 deletion mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,5 @@ markdown_extensions:
nav:
- Quick Start: index.md
- API Reference:
- WOE: api-reference/woe.md
- WOE: api-reference/woe.md
- Imputation: api-reference/impute.md
7 changes: 1 addition & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,7 @@ description = "Lightning-fast credit scorecard building with Polars"
authors = [{ name = "Mark Wang", email = "wxgter@gmail.com" }]
readme = "README.md"
requires-python = ">=3.9"
dependencies = [
"altair>=5.4.1",
"polars>=1.6.0",
"polarscore",
"scikit-learn>=1.5.1",
]
dependencies = ["altair>=5.4.1", "polars>=1.6.0", "scikit-learn>=1.5.1"]


[tool.ruff]
Expand Down
3 changes: 2 additions & 1 deletion python/polarscore/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from polars._typing import IntoExpr
from polars.plugins import register_plugin_function

from polarscore import base, bin, feature_selection, woe
from polarscore import base, bin, feature_selection, impute, woe

LIB = Path(__file__).parent

Expand Down Expand Up @@ -42,4 +42,5 @@ def cal_woe(x: IntoExpr, y: IntoExpr) -> pl.Expr: # noqa: D103
"feature_selection",
"bin",
"base",
"impute",
]
115 changes: 115 additions & 0 deletions python/polarscore/impute.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import polars as pl
from sklearn.base import BaseEstimator, TransformerMixin


class FixedValueImputer(TransformerMixin, BaseEstimator):
    """
    Imputer that fills missing values with fixed values for specified columns.

    Each column named in ``fill_value_dict`` has its null entries replaced by
    the associated constant; columns not named in the dictionary are left
    untouched.

    Parameters
    ----------
    fill_value_dict : dict
        A dictionary where keys are column names and values are the fixed
        values to use for imputation in those columns.

    Attributes
    ----------
    fill_value_dict_ : dict
        Shallow copy of ``fill_value_dict`` taken during ``fit``; maps column
        names to their imputation values.

    Examples
    --------
    >>> import polars as pl
    >>> from polarscore.impute import FixedValueImputer
    >>> df = pl.DataFrame({"A": [1, None, 3], "B": ["x", None, "z"]})
    >>> imputer = FixedValueImputer({"A": 0, "B": "unknown"})
    >>> imputer.fit_transform(df)
    shape: (3, 2)
    ┌─────┬─────────┐
    │ A   ┆ B       │
    │ --- ┆ ---     │
    │ i64 ┆ str     │
    ╞═════╪═════════╡
    │ 1   ┆ x       │
    │ 0   ┆ unknown │
    │ 3   ┆ z       │
    └─────┴─────────┘

    Notes
    -----
    This imputer is particularly useful when you have domain knowledge about
    appropriate default values for specific columns in your dataset.
    """

    def __init__(self, fill_value_dict: dict):
        # Stored as-is (no copy, no validation) so that scikit-learn's
        # get_params/clone machinery sees exactly what the caller passed.
        self.fill_value_dict = fill_value_dict

    def fit(self, X: pl.DataFrame, y=None):
        """
        Validate the configured columns against ``X`` and store the fill values.

        Parameters
        ----------
        X : pl.DataFrame
            The input DataFrame to fit the imputer on.
        y : None
            Ignored. This parameter exists only for compatibility with
            scikit-learn's transformer interface.

        Returns
        -------
        self : FixedValueImputer
            Returns the instance itself.

        Raises
        ------
        ValueError
            If any column specified in fill_value_dict is not present in X.
        """
        available = set(X.columns)
        missing = [col for col in self.fill_value_dict if col not in available]
        if missing:
            # Name the offending columns so the caller can fix the mapping.
            msg = (
                "Some columns in fill_value_dict are not present in the input "
                f"DataFrame: {missing}"
            )
            raise ValueError(msg)

        # Shallow copy: later mutation of the constructor argument must not
        # silently change the fitted state.
        self.fill_value_dict_ = dict(self.fill_value_dict)
        return self

    def transform(self, X: pl.DataFrame) -> pl.DataFrame:
        """
        Fill null values in the configured columns with their fixed values.

        Parameters
        ----------
        X : pl.DataFrame
            The input DataFrame to transform.

        Returns
        -------
        pl.DataFrame
            A new DataFrame with missing values imputed in the specified
            columns.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called. This exception subclasses both
            ``ValueError`` and ``AttributeError``.
        """
        # Imported locally so this block does not depend on a new
        # module-level import.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "fill_value_dict_")

        fill_exprs = [
            pl.col(col).fill_null(value)
            for col, value in self.fill_value_dict_.items()
        ]
        return X.with_columns(fill_exprs)
1 change: 0 additions & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 227de42

Please sign in to comment.