diff --git a/.gitignore b/.gitignore
index 7dbc7433..966aafb1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -205,7 +205,7 @@ Temporary Items
 
 
 
-# Unison 1
+# Unison
 *.unison
 *.zip
 .unison*
@@ -219,3 +219,6 @@ Temporary Items
 
 # Vim swap files
 *.swp
+
+catboost_info
+cb_model.json
diff --git a/src/fklearn/training/imputation.py b/src/fklearn/training/imputation.py
index 2f3c4026..0a811e7a 100644
--- a/src/fklearn/training/imputation.py
+++ b/src/fklearn/training/imputation.py
@@ -1,8 +1,8 @@
-from typing import Any, List
+from typing import Any, List, Optional
 
 import pandas as pd
 from sklearn.impute import SimpleImputer
-from toolz import curry
+from toolz import curry, identity
 
 from fklearn.common_docstrings import learner_return_docstring, learner_pred_fn_docstring
 from fklearn.types import LearnerReturnType
@@ -13,7 +13,8 @@
 @log_learner_time(learner_name='imputer')
 def imputer(df: pd.DataFrame,
             columns_to_impute: List[str],
-            impute_strategy: str = 'median') -> LearnerReturnType:
+            impute_strategy: str = 'median',
+            placeholder_value: Optional[Any] = None) -> LearnerReturnType:
     """
     Fits a missing value imputer to the dataset.
 
@@ -32,23 +33,60 @@ def imputer(df: pd.DataFrame,
         - If "mean", then replace missing values using the mean along the axis.
         - If "median", then replace missing values using the median along the axis.
         - If "most_frequent", then replace missing using the most frequent value along the axis.
+
+    placeholder_value : Any, (default=None)
+        If not None, columns of `columns_to_impute` that contain only NA values
+        on training are left out of the SimpleImputer. NA values in those
+        columns are replaced by `placeholder_value` instead, both on training
+        and on transformation.
     """
 
+    if placeholder_value is not None:
+        # Columns that are entirely NA on training go to the placeholder imputer.
+        mask_feat_is_na = df[columns_to_impute].isna().all(axis=0)
+        # Plain lists keep the log entries consistent with the `else` branch.
+        columns_to_fill = mask_feat_is_na[mask_feat_is_na].index.tolist()
+        columns_imputable = mask_feat_is_na[~mask_feat_is_na].index.tolist()
+
+        fill_fn, _, fill_logs = placeholder_imputer(
+            df, columns_to_impute=columns_to_fill, placeholder_value=placeholder_value)
+    else:
+        columns_to_fill = list()
+        columns_imputable = columns_to_impute
+        fill_fn, _, fill_logs = identity, None, dict()
+
     imp = SimpleImputer(strategy=impute_strategy)
 
-    imp.fit(df[columns_to_impute].values)
+    # SimpleImputer cannot be fitted on zero columns, which happens when every
+    # column to impute is entirely NA on training.
+    has_imputable = len(columns_imputable) > 0
+    if has_imputable:
+        imp.fit(df[columns_imputable].values)
 
     def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
-        new_data = imp.transform(new_data_set[columns_to_impute])
-        new_cols = pd.DataFrame(data=new_data, columns=columns_to_impute).to_dict('list')
-        return new_data_set.assign(**new_cols)
+        if has_imputable:
+            new_data = imp.transform(new_data_set[columns_imputable])
+            new_cols = pd.DataFrame(data=new_data, columns=columns_imputable).to_dict('list')
+        else:
+            new_cols = {}
+        return fill_fn(new_data_set.assign(**new_cols))
 
     p.__doc__ = learner_pred_fn_docstring("imputer")
 
-    log = {'imputer': {'impute_strategy': impute_strategy,
-                       'columns_to_impute': columns_to_impute,
-                       'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(),
-                       'statistics': imp.statistics_}}
+    log = {
+        'imputer': {
+            'impute_strategy': impute_strategy,
+            'placeholder_value': placeholder_value,
+            'columns_to_impute': columns_to_impute,
+            'columns_to_fill': columns_to_fill,
+            'columns_imputable': columns_imputable,
+            'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(),
+            # None when no column was imputable and the SimpleImputer was not fitted.
+            'statistics': imp.statistics_ if has_imputable else None,
+            'placeholder_imputer_fn': fill_fn,
+            'placeholder_imputer_logs': fill_logs,
+        }
+    }
 
     return p, p(df), log
 
diff --git a/tests/training/test_imputation.py b/tests/training/test_imputation.py
index 7d4787a8..977f60a4 100644
--- a/tests/training/test_imputation.py
+++ b/tests/training/test_imputation.py
@@ -30,6 +30,44 @@ def test_imputer():
     assert expected2.equals(pred_fn(input_df2))
 
 
+def test_imputer_with_fill_value():
+    """Columns that are all NA on training are filled with the placeholder value."""
+    input_df = pd.DataFrame({
+        'col1': [10, 13, 10],
+        'col2': [50, 100, None],
+        'col3': [None, None, None]
+    })
+
+    df = pd.DataFrame({
+        'col1': [10.0, 13.0, 10.0],
+        'col2': [50.0, 100.0, 75.0],
+        'col3': [10.0, None, None]
+    })
+
+    expected = pd.DataFrame({
+        'col1': [10.0, 13.0, 10.0],
+        'col2': [50.0, 100.0, 75.0],
+        'col3': [10.0, -999, -999]
+    })
+
+    pred_fn, _, log = imputer(input_df, ["col1", "col2", "col3"], "mean", placeholder_value=-999)
+
+    assert expected.equals(pred_fn(df))
+    assert log["imputer"]["columns_to_fill"] == ["col3"]
+    assert log["imputer"]["columns_imputable"] == ["col1", "col2"]
+
+
+def test_imputer_with_only_na_columns():
+    """The imputer still works when every column to impute is all NA on training."""
+    input_df = pd.DataFrame({'col1': [None, None]})
+    df = pd.DataFrame({'col1': [10.0, None]})
+    expected = pd.DataFrame({'col1': [10.0, -999.0]})
+
+    pred_fn, _, _ = imputer(input_df, ["col1"], "mean", placeholder_value=-999)
+
+    assert expected.equals(pred_fn(df))
+
+
 def test_placeholder_imputer():
     input_df = pd.DataFrame({
         'col1': [10, 13, 10],