nubank · caique-lima · Apr 17, 2020 · Feb 12, 2020 · Feb 12, 2020 · Feb 12, 2020
@@ -205,7 +205,7 @@ Temporary Items
 
 
 
-# Unison                                                                           1
+# Unison
 *.unison
 *.zip
 .unison*
@@ -219,3 +219,6 @@ Temporary Items
 
 # Vim swap files
 *.swp
+
+catboost_info
+cb_model.json
@@ -1,8 +1,8 @@
-from typing import Any, List
+from typing import Any, List, Optional
 
 import pandas as pd
 from sklearn.impute import SimpleImputer
-from toolz import curry
+from toolz import curry, identity
 
 from fklearn.common_docstrings import learner_return_docstring, learner_pred_fn_docstring
 from fklearn.types import LearnerReturnType
@@ -13,7 +13,8 @@
 @log_learner_time(learner_name='imputer')
 def imputer(df: pd.DataFrame,
             columns_to_impute: List[str],
-            impute_strategy: str = 'median') -> LearnerReturnType:
+            impute_strategy: str = 'median',
+            placeholder_value: Optional[Any] = None) -> LearnerReturnType:
     """
     Fits a missing value imputer to the dataset.
 
@@ -32,23 +33,49 @@ def imputer(df: pd.DataFrame,
         - If "mean", then replace missing values using the mean along the axis.
         - If "median", then replace missing values using the median along the axis.
         - If "most_frequent", then replace missing using the most frequent value along the axis.
+
+    placeholder_value : Any, (default=None)
+        If not None, use this as the default value for features that contain
+        only NA values in training. At transformation time, NA values in those
+        features will be replaced by `placeholder_value`.
     """
 
+    if placeholder_value is not None:
+        mask_feat_is_na = df[columns_to_impute].isna().all(axis=0)
+        columns_to_fill = mask_feat_is_na[mask_feat_is_na].index.values
+        columns_imputable = mask_feat_is_na[~mask_feat_is_na].index.values
+
+        fill_fn, _, fill_logs = placeholder_imputer(
+            df, columns_to_impute=columns_to_fill, placeholder_value=placeholder_value)
+    else:
+        columns_to_fill = list()
+        columns_imputable = columns_to_impute
+        fill_fn, _, fill_logs = identity, None, dict()
+
     imp = SimpleImputer(strategy=impute_strategy)
 
-    imp.fit(df[columns_to_impute].values)
+    imp.fit(df[columns_imputable].values)
 
     def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
-        new_data = imp.transform(new_data_set[columns_to_impute])
-        new_cols = pd.DataFrame(data=new_data, columns=columns_to_impute).to_dict('list')
-        return new_data_set.assign(**new_cols)
+        new_data = imp.transform(new_data_set[columns_imputable])
+        new_cols = pd.DataFrame(data=new_data, columns=columns_imputable).to_dict('list')
+        return fill_fn(new_data_set.assign(**new_cols))
 
     p.__doc__ = learner_pred_fn_docstring("imputer")
 
-    log = {'imputer': {'impute_strategy': impute_strategy,
-                       'columns_to_impute': columns_to_impute,
-                       'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(),
-                       'statistics': imp.statistics_}}
+    log = {
+        'imputer': {
+            'impute_strategy': impute_strategy,
+            'placeholder_value': placeholder_value,
+            'columns_to_impute': columns_to_impute,
+            'columns_to_fill': columns_to_fill,
+            'columns_imputable': columns_imputable,
+            'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(),
+            'statistics': imp.statistics_,
+            'placeholder_imputer_fn': fill_fn,
+            'placeholder_imputer_logs': fill_logs,
+        }
+    }
 
     return p, p(df), log
 

@@ -30,6 +30,30 @@ def test_imputer():
     assert expected2.equals(pred_fn(input_df2))
 
 
+def test_imputer_with_fill_value():
+    input_df = pd.DataFrame({
+        'col1': [10, 13, 10],
+        'col2': [50, 100, None],
+        'col3': [None, None, None]
+    })
+
+    df = pd.DataFrame({
+        'col1': [10.0, 13.0, 10.0],
+        'col2': [50.0, 100.0, 75.0],
+        'col3': [10.0, None, None]
+    })
+
+    expected = pd.DataFrame({
+        'col1': [10.0, 13.0, 10.0],
+        'col2': [50.0, 100.0, 75.0],
+        'col3': [10.0, -999, -999]
+    })
+
+    pred_fn, data, log = imputer(input_df, ["col1", "col2", "col3"], "mean", placeholder_value=-999)
+
+    assert expected.equals(pred_fn(df))
+
+
 def test_placeholder_imputer():
     input_df = pd.DataFrame({
         'col1': [10, 13, 10],