Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Imputer fill_value #126

Merged
merged 13 commits into from
Apr 17, 2020
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -205,7 +205,7 @@ Temporary Items



# Unison 1
# Unison
*.unison
*.zip
.unison*
@@ -219,3 +219,6 @@ Temporary Items

# Vim swap files
*.swp

catboost_info
cb_model.json
49 changes: 38 additions & 11 deletions src/fklearn/training/imputation.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from typing import Any, List
from typing import Any, List, Optional

import pandas as pd
from sklearn.impute import SimpleImputer
from toolz import curry
from toolz import curry, identity

from fklearn.common_docstrings import learner_return_docstring, learner_pred_fn_docstring
from fklearn.types import LearnerReturnType
@@ -13,7 +13,8 @@
@log_learner_time(learner_name='imputer')
def imputer(df: pd.DataFrame,
columns_to_impute: List[str],
impute_strategy: str = 'median') -> LearnerReturnType:
impute_strategy: str = 'median',
placeholder_value: Optional[Any] = None) -> LearnerReturnType:
"""
Fits a missing value imputer to the dataset.

@@ -32,23 +33,49 @@ def imputer(df: pd.DataFrame,
- If "mean", then replace missing values using the mean along the axis.
- If "median", then replace missing values using the median along the axis.
- If "most_frequent", then replace missing using the most frequent value along the axis.

placeholder_value : Any, (default=None)
If not None, use this as the default value for features that contain only
NA values during training. At transformation time, NA values in those features
will be replaced by `placeholder_value`.
"""

if placeholder_value is not None:
mask_feat_is_na = df[columns_to_impute].isna().all(axis=0)
columns_to_fill = mask_feat_is_na[mask_feat_is_na].index.values
columns_imputable = mask_feat_is_na[~mask_feat_is_na].index.values

fill_fn, _, fill_logs = placeholder_imputer(
df, columns_to_impute=columns_to_fill, placeholder_value=placeholder_value)
else:
columns_to_fill = list()
columns_imputable = columns_to_impute
fill_fn, _, fill_logs = identity, None, dict()

imp = SimpleImputer(strategy=impute_strategy)

imp.fit(df[columns_to_impute].values)
imp.fit(df[columns_imputable].values)

def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
new_data = imp.transform(new_data_set[columns_to_impute])
new_cols = pd.DataFrame(data=new_data, columns=columns_to_impute).to_dict('list')
return new_data_set.assign(**new_cols)
new_data = imp.transform(new_data_set[columns_imputable])
new_cols = pd.DataFrame(data=new_data, columns=columns_imputable).to_dict('list')
return fill_fn(new_data_set.assign(**new_cols))

p.__doc__ = learner_pred_fn_docstring("imputer")

log = {'imputer': {'impute_strategy': impute_strategy,
'columns_to_impute': columns_to_impute,
'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(),
'statistics': imp.statistics_}}
log = {
'imputer': {
'impute_strategy': impute_strategy,
'placeholder_value': placeholder_value,
'columns_to_impute': columns_to_impute,
'columns_to_fill': columns_to_fill,
'columns_imputable': columns_imputable,
'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(),
'statistics': imp.statistics_,
'placeholder_imputer_fn': fill_fn,
'placeholder_imputer_logs': fill_logs,
}
}

return p, p(df), log

24 changes: 24 additions & 0 deletions tests/training/test_imputation.py
Original file line number Diff line number Diff line change
@@ -30,6 +30,30 @@ def test_imputer():
assert expected2.equals(pred_fn(input_df2))


def test_imputer_with_fill_value():
    """Fit on a frame whose col3 is entirely NA and check the placeholder is used on new data."""
    feature_names = ["col1", "col2", "col3"]
    placeholder = -999

    # Frame the imputer is fitted on: col3 has no observed value at all.
    training_frame = pd.DataFrame({
        "col1": [10, 13, 10],
        "col2": [50, 100, None],
        "col3": [None, None, None],
    })

    # Frame passed to the prediction function: only col3 has missing entries.
    scoring_frame = pd.DataFrame({
        "col1": [10.0, 13.0, 10.0],
        "col2": [50.0, 100.0, 75.0],
        "col3": [10.0, None, None],
    })

    # Same as the scoring frame, with the col3 gaps replaced by the placeholder.
    imputed_frame = pd.DataFrame({
        "col1": [10.0, 13.0, 10.0],
        "col2": [50.0, 100.0, 75.0],
        "col3": [10.0, -999, -999],
    })

    # Only the prediction function is checked; the transformed training data and logs are ignored.
    pred_fn, _, _ = imputer(training_frame, feature_names, "mean", placeholder_value=placeholder)

    result = pred_fn(scoring_frame)
    assert imputed_frame.equals(result)


def test_placeholder_imputer():
input_df = pd.DataFrame({
'col1': [10, 13, 10],