From d4ea356de6fe6ea149e59a1112e36988deee26e3 Mon Sep 17 00:00:00 2001 From: Vitor Santa Rosa Date: Tue, 11 Feb 2020 23:14:30 -0300 Subject: [PATCH 01/12] Add imputer fill_value --- src/fklearn/training/imputation.py | 35 ++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/fklearn/training/imputation.py b/src/fklearn/training/imputation.py index 2f3c4026..03c4ea30 100644 --- a/src/fklearn/training/imputation.py +++ b/src/fklearn/training/imputation.py @@ -3,6 +3,7 @@ import pandas as pd from sklearn.impute import SimpleImputer from toolz import curry +from typing import Optional from fklearn.common_docstrings import learner_return_docstring, learner_pred_fn_docstring from fklearn.types import LearnerReturnType @@ -13,7 +14,8 @@ @log_learner_time(learner_name='imputer') def imputer(df: pd.DataFrame, columns_to_impute: List[str], - impute_strategy: str = 'median') -> LearnerReturnType: + impute_strategy: str = 'median', + fill_value: Optional[str] = None) -> LearnerReturnType: """ Fits a missing value imputer to the dataset. @@ -34,21 +36,36 @@ def imputer(df: pd.DataFrame, - If "most_frequent", then replace missing using the most frequent value along the axis. """ + columns_to_fill = list() + columns_imputable = columns_to_impute + if fill_value is not None: + df_is_nan = df[columns_to_impute].isna().all(axis=0) + columns_to_fill = list(df_is_nan[df_is_nan].index) + columns_imputable = list(filter(lambda column: column not in columns_to_fill, columns_to_impute)) + imp = SimpleImputer(strategy=impute_strategy) - imp.fit(df[columns_to_impute].values) + imp.fit(df[columns_imputable].values) def p(new_data_set: pd.DataFrame) -> pd.DataFrame: - new_data = imp.transform(new_data_set[columns_to_impute]) - new_cols = pd.DataFrame(data=new_data, columns=columns_to_impute).to_dict('list') - return new_data_set.assign(**new_cols) + new_df = new_data_set[columns_to_impute].copy() + new_df.loc[:, columns_imputable] = imp.transform(new_df[columns_imputable]) + if columns_to_fill: + new_df.loc[:, columns_to_fill] = new_df.loc[:, columns_to_fill].fillna(value=fill_value) + return new_df p.__doc__ = learner_pred_fn_docstring("imputer") - log = {'imputer': {'impute_strategy': impute_strategy, - 'columns_to_impute': columns_to_impute, - 'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(), - 'statistics': imp.statistics_}} + log = { + 'imputer': { + 'impute_strategy': impute_strategy, + 'columns_to_impute': columns_to_impute, + 'columns_to_fill': columns_to_fill, + 'columns_imputable': columns_imputable, + 'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(), + 'statistics': imp.statistics_ + } + } return p, p(df), log From 9d30077aedf6a95f035832f104679aeac4793b4c Mon Sep 17 00:00:00 2001 From: Vitor Santa Rosa Date: Tue, 11 Feb 2020 23:17:19 -0300 Subject: [PATCH 02/12] Add imputer fill_value tests --- tests/training/test_imputation.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/training/test_imputation.py b/tests/training/test_imputation.py index 7d4787a8..2f4ccfcd 100644 --- a/tests/training/test_imputation.py +++ b/tests/training/test_imputation.py @@ -6,25 +6,29 @@ def test_imputer(): input_df = pd.DataFrame({ 'col1': [10, 13, 10], - 'col2': [50, 100, None] + 'col2': [50, 100, None], + 'col3': [None, None, None] }) input_df2 = pd.DataFrame({ 'col1': [10, None], - 'col2': [None, 100] + 'col2': [None, 100], + 'col3': [None, 100] }) expected1 = pd.DataFrame({ 'col1': [10.0, 13.0, 10.0], - 'col2': [50.0, 100.0, 75.0] + 'col2': [50.0, 100.0, 75.0], + 'col3': [0, 0, 0] }) expected2 = pd.DataFrame({ 'col1': [10, 11.0], - 'col2': [75.0, 100] + 'col2': [75.0, 100], + 'col3': [0.0, 100], }) - pred_fn, data, log = imputer(input_df, ["col1", "col2"], "mean") + pred_fn, data, log = imputer(input_df, ["col1", "col2", "col3"], "mean", fill_value=0) assert expected1.equals(data) assert expected2.equals(pred_fn(input_df2)) From d7f079077045819060814c7f98a8d7755f8e2cae Mon Sep 17 00:00:00 2001 From: Vitor Santa Rosa Date: Tue, 11 Feb 2020 23:19:13 -0300 Subject: [PATCH 03/12] Add to .gitignore tests generated files --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7dbc7433..966aafb1 100644 --- a/.gitignore +++ b/.gitignore @@ -205,7 +205,7 @@ Temporary Items -# Unison 1 +# Unison *.unison *.zip .unison* @@ -219,3 +219,6 @@ Temporary Items # Vim swap files *.swp + +catboost_info +cb_model.json From 646ee93dfb1d4aa2cf3bf7e7bab474d4a45339cf Mon Sep 17 00:00:00 2001 From: Vitor Santa Rosa Date: Tue, 11 Feb 2020 23:27:35 -0300 Subject: [PATCH 04/12] Add documentation for imputer fill_value --- src/fklearn/training/imputation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/fklearn/training/imputation.py b/src/fklearn/training/imputation.py index 03c4ea30..c017f41c 100644 --- a/src/fklearn/training/imputation.py +++ b/src/fklearn/training/imputation.py @@ -3,7 +3,7 @@ import pandas as pd from sklearn.impute import SimpleImputer from toolz import curry -from typing import Optional +from typing import Any, Optional from fklearn.common_docstrings import learner_return_docstring, learner_pred_fn_docstring from fklearn.types import LearnerReturnType @@ -15,7 +15,7 @@ def imputer(df: pd.DataFrame, columns_to_impute: List[str], impute_strategy: str = 'median', - fill_value: Optional[str] = None) -> LearnerReturnType: + fill_value: Optional[Any] = None) -> LearnerReturnType: """ Fits a missing value imputer to the dataset. @@ -34,6 +34,9 @@ def imputer(df: pd.DataFrame, - If "mean", then replace missing values using the mean along the axis. - If "median", then replace missing values using the median along the axis. - If "most_frequent", then replace missing using the most frequent value along the axis. + + fill_value : Any, (default=None) + if not None, use this as default value when some feature only contains NA values. """ columns_to_fill = list() From e6f1d185c7fb43f7a9d8cbfd464615b359002887 Mon Sep 17 00:00:00 2001 From: Vitor Santa Rosa Date: Tue, 11 Feb 2020 23:30:18 -0300 Subject: [PATCH 05/12] Rafactor typing import --- src/fklearn/training/imputation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/fklearn/training/imputation.py b/src/fklearn/training/imputation.py index c017f41c..eee7f59f 100644 --- a/src/fklearn/training/imputation.py +++ b/src/fklearn/training/imputation.py @@ -1,9 +1,8 @@ -from typing import Any, List +from typing import Any, List, Optional import pandas as pd from sklearn.impute import SimpleImputer from toolz import curry -from typing import Any, Optional from fklearn.common_docstrings import learner_return_docstring, learner_pred_fn_docstring from fklearn.types import LearnerReturnType From c6747fb5cd566c47cf481e2ca81ba25f73535656 Mon Sep 17 00:00:00 2001 From: Vitor Santa Rosa Date: Wed, 12 Feb 2020 12:03:02 -0300 Subject: [PATCH 06/12] Split test_imputer_with_fill_value as new test --- tests/training/test_imputation.py | 32 ++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/tests/training/test_imputation.py b/tests/training/test_imputation.py index 2f4ccfcd..419f4bf2 100644 --- a/tests/training/test_imputation.py +++ b/tests/training/test_imputation.py @@ -6,34 +6,48 @@ def test_imputer(): input_df = pd.DataFrame({ 'col1': [10, 13, 10], - 'col2': [50, 100, None], - 'col3': [None, None, None] + 'col2': [50, 100, None] }) input_df2 = pd.DataFrame({ 'col1': [10, None], - 'col2': [None, 100], - 'col3': [None, 100] + 'col2': [None, 100] }) expected1 = pd.DataFrame({ 'col1': [10.0, 13.0, 10.0], - 'col2': [50.0, 100.0, 75.0], - 'col3': [0, 0, 0] + 'col2': [50.0, 100.0, 75.0] }) expected2 = pd.DataFrame({ 'col1': [10, 11.0], - 'col2': [75.0, 100], - 'col3': [0.0, 100], + 'col2': [75.0, 100] }) - pred_fn, data, log = imputer(input_df, ["col1", "col2", "col3"], "mean", fill_value=0) + pred_fn, data, log = imputer(input_df, ["col1", "col2"], "mean") assert expected1.equals(data) assert expected2.equals(pred_fn(input_df2)) +def test_imputer_with_fill_value(): + input_df = pd.DataFrame({ + 'col1': [10, 13, 10], + 'col2': [50, 100, None], + 'col3': [None, None, None] + }) + + expected = pd.DataFrame({ + 'col1': [10.0, 13.0, 10.0], + 'col2': [50.0, 100.0, 75.0], + 'col3': [0, 0, 0] + }) + + pred_fn, data, log = imputer(input_df, ["col1", "col2", "col3"], "mean", fill_value=0) + + assert expected.equals(data) + + def test_placeholder_imputer(): input_df = pd.DataFrame({ 'col1': [10, 13, 10], From 2024b7c9dfc36578d7638f6e1056b480c2b8ed75 Mon Sep 17 00:00:00 2001 From: Vitor Santa Rosa Date: Wed, 12 Feb 2020 13:33:46 -0300 Subject: [PATCH 07/12] Reafactor imputer with fill_value --- src/fklearn/training/imputation.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/fklearn/training/imputation.py b/src/fklearn/training/imputation.py index eee7f59f..30cd480e 100644 --- a/src/fklearn/training/imputation.py +++ b/src/fklearn/training/imputation.py @@ -35,15 +35,23 @@ def imputer(df: pd.DataFrame, - If "most_frequent", then replace missing using the most frequent value along the axis. fill_value : Any, (default=None) - if not None, use this as default value when some feature only contains NA values. + if not None, use this as default value when some features only contains + NA values on training. For transformation, NA values on those features + will be replaced by `fill_value`. """ - columns_to_fill = list() - columns_imputable = columns_to_impute if fill_value is not None: - df_is_nan = df[columns_to_impute].isna().all(axis=0) - columns_to_fill = list(df_is_nan[df_is_nan].index) - columns_imputable = list(filter(lambda column: column not in columns_to_fill, columns_to_impute)) + df_feat_is_nan = df[columns_to_impute].isna().all(axis=0) + columns_to_fill = df_feat_is_nan[df_feat_is_nan].index.values + columns_imputable = df_feat_is_nan[~df_feat_is_nan].index.values + imp_fill = SimpleImputer(strategy='constant', fill_value=fill_value) + + if len(columns_to_fill) > 0: + imp_fill.fit(df[columns_to_fill].values) + else: + columns_to_fill = list() + columns_imputable = columns_to_impute + imp_fill = None imp = SimpleImputer(strategy=impute_strategy) @@ -53,7 +61,7 @@ def p(new_data_set: pd.DataFrame) -> pd.DataFrame: new_df = new_data_set[columns_to_impute].copy() new_df.loc[:, columns_imputable] = imp.transform(new_df[columns_imputable]) if columns_to_fill: - new_df.loc[:, columns_to_fill] = new_df.loc[:, columns_to_fill].fillna(value=fill_value) + new_df.loc[:, columns_to_fill] = imp_fill.transform(new_df[columns_to_fill]) return new_df p.__doc__ = learner_pred_fn_docstring("imputer") @@ -65,7 +73,9 @@ def p(new_data_set: pd.DataFrame) -> pd.DataFrame: 'columns_to_fill': columns_to_fill, 'columns_imputable': columns_imputable, 'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(), - 'statistics': imp.statistics_ + 'training_proportion_of_nulls_fill': df.loc[:, columns_to_fill].isnull().mean(axis=0).to_dict(), + 'statistics': imp.statistics_, + 'statistics_fill': imp_fill.statistics_, } } From 53d6a1760fc58be43c5027a77860a85516702a2b Mon Sep 17 00:00:00 2001 From: Vitor Santa Rosa Date: Wed, 12 Feb 2020 14:15:39 -0300 Subject: [PATCH 08/12] Change imputer to use placeholder_imputer --- src/fklearn/training/imputation.py | 32 ++++++++++++++---------------- tests/training/test_imputation.py | 14 ++++++++++--- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/src/fklearn/training/imputation.py b/src/fklearn/training/imputation.py index 30cd480e..7c03abe8 100644 --- a/src/fklearn/training/imputation.py +++ b/src/fklearn/training/imputation.py @@ -2,7 +2,7 @@ import pandas as pd from sklearn.impute import SimpleImputer -from toolz import curry +from toolz import curry, identity from fklearn.common_docstrings import learner_return_docstring, learner_pred_fn_docstring from fklearn.types import LearnerReturnType @@ -14,7 +14,7 @@ def imputer(df: pd.DataFrame, columns_to_impute: List[str], impute_strategy: str = 'median', - fill_value: Optional[Any] = None) -> LearnerReturnType: + placeholder_value: Optional[Any] = None) -> LearnerReturnType: """ Fits a missing value imputer to the dataset. @@ -35,47 +35,45 @@ def imputer(df: pd.DataFrame, - If "most_frequent", then replace missing using the most frequent value along the axis. fill_value : Any, (default=None) - if not None, use this as default value when some features only contains - NA values on training. For transformation, NA values on those features + if not None, use this as default value when some features only contains + NA values on training. For transformation, NA values on those features will be replaced by `fill_value`. """ - if fill_value is not None: + if placeholder_value is not None: df_feat_is_nan = df[columns_to_impute].isna().all(axis=0) columns_to_fill = df_feat_is_nan[df_feat_is_nan].index.values columns_imputable = df_feat_is_nan[~df_feat_is_nan].index.values - imp_fill = SimpleImputer(strategy='constant', fill_value=fill_value) - - if len(columns_to_fill) > 0: - imp_fill.fit(df[columns_to_fill].values) + + fill_fn, __, fill_logs = placeholder_imputer( + df, columns_to_impute=columns_to_fill, placeholder_value=placeholder_value) else: columns_to_fill = list() columns_imputable = columns_to_impute - imp_fill = None + fill_fn, __, fill_logs = identity, None, dict() imp = SimpleImputer(strategy=impute_strategy) imp.fit(df[columns_imputable].values) def p(new_data_set: pd.DataFrame) -> pd.DataFrame: - new_df = new_data_set[columns_to_impute].copy() - new_df.loc[:, columns_imputable] = imp.transform(new_df[columns_imputable]) - if columns_to_fill: - new_df.loc[:, columns_to_fill] = imp_fill.transform(new_df[columns_to_fill]) - return new_df + new_data = imp.transform(new_data_set[columns_imputable]) + new_cols = pd.DataFrame(data=new_data, columns=columns_imputable).to_dict('list') + return fill_fn(new_data_set.assign(**new_cols)) p.__doc__ = learner_pred_fn_docstring("imputer") log = { 'imputer': { 'impute_strategy': impute_strategy, + 'placeholder_value': placeholder_value, 'columns_to_impute': columns_to_impute, 'columns_to_fill': columns_to_fill, 'columns_imputable': columns_imputable, 'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(), - 'training_proportion_of_nulls_fill': df.loc[:, columns_to_fill].isnull().mean(axis=0).to_dict(), 'statistics': imp.statistics_, - 'statistics_fill': imp_fill.statistics_, + 'placeholder_imputer_fn': fill_fn, + 'placeholder_imputer_logs': fill_logs, } } diff --git a/tests/training/test_imputation.py b/tests/training/test_imputation.py index 419f4bf2..2b52b19e 100644 --- a/tests/training/test_imputation.py +++ b/tests/training/test_imputation.py @@ -37,15 +37,23 @@ def test_imputer_with_fill_value(): 'col3': [None, None, None] }) + df = pd.DataFrame({ + 'col1': [10.0, 13.0, 10.0], + 'col2': [50.0, 100.0, 75.0], + 'col3': [10.0, None, None] + }) + expected = pd.DataFrame({ 'col1': [10.0, 13.0, 10.0], 'col2': [50.0, 100.0, 75.0], - 'col3': [0, 0, 0] + 'col3': [10.0, -999, -999] }) - pred_fn, data, log = imputer(input_df, ["col1", "col2", "col3"], "mean", fill_value=0) + pred_fn, data, log = imputer(input_df, ["col1", "col2", "col3"], "mean", placeholder_value=-999) - assert expected.equals(data) + print(pred_fn(df)) + print(expected) + assert expected.equals(pred_fn(df)) def test_placeholder_imputer(): From 6a75ef5b3724a95c05d3bc024a3881ea973bb197 Mon Sep 17 00:00:00 2001 From: Vitor Santa Rosa Date: Wed, 12 Feb 2020 14:16:32 -0300 Subject: [PATCH 09/12] Rename df_feat_is_nan to mask_feat_is_na --- src/fklearn/training/imputation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/fklearn/training/imputation.py b/src/fklearn/training/imputation.py index 7c03abe8..96f074b3 100644 --- a/src/fklearn/training/imputation.py +++ b/src/fklearn/training/imputation.py @@ -34,16 +34,16 @@ def imputer(df: pd.DataFrame, - If "median", then replace missing values using the median along the axis. - If "most_frequent", then replace missing using the most frequent value along the axis. - fill_value : Any, (default=None) + placeholder_value : Any, (default=None) if not None, use this as default value when some features only contains NA values on training. For transformation, NA values on those features will be replaced by `fill_value`. """ if placeholder_value is not None: - df_feat_is_nan = df[columns_to_impute].isna().all(axis=0) - columns_to_fill = df_feat_is_nan[df_feat_is_nan].index.values - columns_imputable = df_feat_is_nan[~df_feat_is_nan].index.values + mask_feat_is_na = df[columns_to_impute].isna().all(axis=0) + columns_to_fill = mask_feat_is_na[mask_feat_is_na].index.values + columns_imputable = mask_feat_is_na[~mask_feat_is_na].index.values fill_fn, __, fill_logs = placeholder_imputer( df, columns_to_impute=columns_to_fill, placeholder_value=placeholder_value) From ae9bec3a52a3c65c95ddb6025d69ccf280ee4dd2 Mon Sep 17 00:00:00 2001 From: Vitor Santa Rosa Date: Wed, 12 Feb 2020 14:41:52 -0300 Subject: [PATCH 10/12] Change unused variable name --- src/fklearn/training/imputation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fklearn/training/imputation.py b/src/fklearn/training/imputation.py index 96f074b3..f64ce34c 100644 --- a/src/fklearn/training/imputation.py +++ b/src/fklearn/training/imputation.py @@ -45,12 +45,12 @@ def imputer(df: pd.DataFrame, columns_to_fill = mask_feat_is_na[mask_feat_is_na].index.values columns_imputable = mask_feat_is_na[~mask_feat_is_na].index.values - fill_fn, __, fill_logs = placeholder_imputer( + fill_fn, _, fill_logs = placeholder_imputer( df, columns_to_impute=columns_to_fill, placeholder_value=placeholder_value) else: columns_to_fill = list() columns_imputable = columns_to_impute - fill_fn, __, fill_logs = identity, None, dict() + fill_fn, _, fill_logs = identity, None, dict() imp = SimpleImputer(strategy=impute_strategy) From 582047213b76231ea1eda6c071e7da5c6ed049ac Mon Sep 17 00:00:00 2001 From: Vitor Santa Rosa Date: Wed, 12 Feb 2020 14:43:28 -0300 Subject: [PATCH 11/12] Remove whitespaces in blank line --- src/fklearn/training/imputation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fklearn/training/imputation.py b/src/fklearn/training/imputation.py index f64ce34c..0a811e7a 100644 --- a/src/fklearn/training/imputation.py +++ b/src/fklearn/training/imputation.py @@ -44,7 +44,7 @@ def imputer(df: pd.DataFrame, mask_feat_is_na = df[columns_to_impute].isna().all(axis=0) columns_to_fill = mask_feat_is_na[mask_feat_is_na].index.values columns_imputable = mask_feat_is_na[~mask_feat_is_na].index.values - + fill_fn, _, fill_logs = placeholder_imputer( df, columns_to_impute=columns_to_fill, placeholder_value=placeholder_value) else: From 521074e2b61506a9928f11433ee3437a0ffa1fe8 Mon Sep 17 00:00:00 2001 From: Vitor Santa Rosa Date: Thu, 13 Feb 2020 17:37:07 -0300 Subject: [PATCH 12/12] Remove print in tests --- tests/training/test_imputation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/training/test_imputation.py b/tests/training/test_imputation.py index 2b52b19e..977f60a4 100644 --- a/tests/training/test_imputation.py +++ b/tests/training/test_imputation.py @@ -51,8 +51,6 @@ def test_imputer_with_fill_value(): pred_fn, data, log = imputer(input_df, ["col1", "col2", "col3"], "mean", placeholder_value=-999) - print(pred_fn(df)) - print(expected) assert expected.equals(pred_fn(df))