nubank · vitorsrg · Feb 12, 2020 · Feb 12, 2020 · Feb 12, 2020 · Feb 12, 2020
@@ -205,7 +205,7 @@ Temporary Items
 
 
 
-# Unison                                                                           1
+# Unison
 *.unison
 *.zip
 .unison*
@@ -219,3 +219,6 @@ Temporary Items
 
 # Vim swap files
 *.swp
+
+catboost_info
+cb_model.json
@@ -1,4 +1,4 @@
-from typing import Any, List
+from typing import Any, List, Optional
 
 import pandas as pd
 from sklearn.impute import SimpleImputer
@@ -13,7 +13,8 @@
 @log_learner_time(learner_name='imputer')
 def imputer(df: pd.DataFrame,
             columns_to_impute: List[str],
-            impute_strategy: str = 'median') -> LearnerReturnType:
+            impute_strategy: str = 'median',
+            fill_value: Optional[Any] = None) -> LearnerReturnType:
     """
     Fits a missing value imputer to the dataset.
 
@@ -32,23 +33,41 @@ def imputer(df: pd.DataFrame,
         - If "mean", then replace missing values using the mean along the axis.
         - If "median", then replace missing values using the median along the axis.
         - If "most_frequent", then replace missing using the most frequent value along the axis.
+
+    fill_value : Any, (default=None)
+        If not None, columns that are entirely NA in the training data are filled with this value rather than imputed.
     """
 
+    columns_to_fill = list()
+    columns_imputable = columns_to_impute
+    if fill_value is not None:
+        df_is_nan = df[columns_to_impute].isna().all(axis=0)
+        columns_to_fill = list(df_is_nan[df_is_nan].index)
+        columns_imputable = list(filter(lambda column: column not in columns_to_fill, columns_to_impute))
+
     imp = SimpleImputer(strategy=impute_strategy)
 
-    imp.fit(df[columns_to_impute].values)
+    imp.fit(df[columns_imputable].values)
 
     def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
-        new_data = imp.transform(new_data_set[columns_to_impute])
-        new_cols = pd.DataFrame(data=new_data, columns=columns_to_impute).to_dict('list')
-        return new_data_set.assign(**new_cols)
+        new_df = new_data_set.copy()  # keep all input columns, as the replaced assign() version did
+        new_df[columns_imputable] = imp.transform(new_df[columns_imputable])  # fitted statistics
+        if columns_to_fill:  # columns that were entirely NA at fit time get the constant instead
+            new_df[columns_to_fill] = new_df[columns_to_fill].fillna(value=fill_value)
+        return new_df
 
     p.__doc__ = learner_pred_fn_docstring("imputer")
 
-    log = {'imputer': {'impute_strategy': impute_strategy,
-                       'columns_to_impute': columns_to_impute,
-                       'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(),
-                       'statistics': imp.statistics_}}
+    log = {
+        'imputer': {
+            'impute_strategy': impute_strategy,
+            'columns_to_impute': columns_to_impute,
+            'columns_to_fill': columns_to_fill,
+            'columns_imputable': columns_imputable,
+            'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(),
+            'statistics': imp.statistics_
+        }
+    }
 
     return p, p(df), log
 

@@ -6,25 +6,29 @@
 def test_imputer():
     input_df = pd.DataFrame({
         'col1': [10, 13, 10],
-        'col2': [50, 100, None]
+        'col2': [50, 100, None],
+        'col3': [None, None, None]  # entirely missing in the training frame
     })
 
     input_df2 = pd.DataFrame({
         'col1': [10, None],
-        'col2': [None, 100]
+        'col2': [None, 100],
+        'col3': [None, 100]  # only partially missing at prediction time
     })
 
     expected1 = pd.DataFrame({
         'col1': [10.0, 13.0, 10.0],
-        'col2': [50.0, 100.0, 75.0]
+        'col2': [50.0, 100.0, 75.0],
+        'col3': [0, 0, 0]  # all-missing column is expected to take fill_value=0
     })
 
     expected2 = pd.DataFrame({
         'col1': [10, 11.0],
-        'col2': [75.0, 100]
+        'col2': [75.0, 100],
+        'col3': [0.0, 100],  # missing entry takes fill_value=0; the observed 100 is kept
     })
 
-    pred_fn, data, log = imputer(input_df, ["col1", "col2"], "mean")
+    pred_fn, data, log = imputer(input_df, ["col1", "col2", "col3"], "mean", fill_value=0)
 
     assert expected1.equals(data)
     assert expected2.equals(pred_fn(input_df2))