From 0c25a4429d23ddbe65d3a5aaf474d3838a03dd6a Mon Sep 17 00:00:00 2001 From: jineetd <35962652+jineetd@users.noreply.github.com> Date: Fri, 17 Nov 2023 05:16:00 -0500 Subject: [PATCH] Adding changes for Flaml Sklearn integration (#1361) Flaml provides support for Sklearn models like Random Forests, KNN, Extra Trees Regressor, and Logistic Regression with regularization. We plan to integrate these ML models into EVADB. Link for Flaml documentation: https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML --------- Co-authored-by: Jineet Desai --- evadb/configuration/constants.py | 2 ++ evadb/executor/create_function_executor.py | 33 +++++++++++++------ evadb/functions/sklearn.py | 4 +-- evadb/functions/xgboost.py | 4 +-- evadb/utils/generic_utils.py | 27 +++------------ .../long/test_model_train.py | 4 ++- test/markers.py | 7 ++-- 7 files changed, 39 insertions(+), 42 deletions(-) diff --git a/evadb/configuration/constants.py b/evadb/configuration/constants.py index 3665a28727..126e6bcfca 100644 --- a/evadb/configuration/constants.py +++ b/evadb/configuration/constants.py @@ -36,3 +36,5 @@ DEFAULT_DOCUMENT_CHUNK_OVERLAP = 200 DEFAULT_TRAIN_REGRESSION_METRIC = "rmse" DEFAULT_XGBOOST_TASK = "regression" +DEFAULT_SKLEARN_TRAIN_MODEL = "rf" +SKLEARN_SUPPORTED_MODELS = ["rf", "extra_tree", "kneighbor"] diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index 9488195927..f686794184 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -30,9 +30,11 @@ from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry from evadb.catalog.models.function_metadata_catalog import FunctionMetadataCatalogEntry from evadb.configuration.constants import ( + DEFAULT_SKLEARN_TRAIN_MODEL, DEFAULT_TRAIN_REGRESSION_METRIC, DEFAULT_TRAIN_TIME_LIMIT, DEFAULT_XGBOOST_TASK, + SKLEARN_SUPPORTED_MODELS, EvaDB_INSTALLATION_DIR, ) from evadb.database import EvaDBDatabase @@ -45,13 +47,12 @@ from evadb.utils.generic_utils import ( load_function_class_from_file, string_comparison_case_insensitive, + try_to_import_flaml_automl, try_to_import_ludwig, try_to_import_neuralforecast, - try_to_import_sklearn, try_to_import_statsforecast, try_to_import_torch, try_to_import_ultralytics, - try_to_import_xgboost, ) from evadb.utils.logging_manager import logger @@ -169,8 +170,7 @@ def handle_sklearn_function(self): Use Sklearn's regression to train models. """ - try_to_import_sklearn() - from sklearn.linear_model import LinearRegression + try_to_import_flaml_automl() assert ( len(self.children) == 1 @@ -186,13 +186,26 @@ def handle_sklearn_function(self): aggregated_batch.drop_column_alias() arg_map = {arg.key: arg.value for arg in self.node.metadata} - model = LinearRegression() - Y = aggregated_batch.frames[arg_map["predict"]] - aggregated_batch.frames.drop([arg_map["predict"]], axis=1, inplace=True) + from flaml import AutoML + + model = AutoML() + sklearn_model = arg_map.get("model", DEFAULT_SKLEARN_TRAIN_MODEL) + if sklearn_model not in SKLEARN_SUPPORTED_MODELS: + raise ValueError( + f"Sklearn Model {sklearn_model} provided as input is not supported." + ) + settings = { + "time_budget": arg_map.get("time_limit", DEFAULT_TRAIN_TIME_LIMIT), + "metric": arg_map.get("metric", DEFAULT_TRAIN_REGRESSION_METRIC), + "estimator_list": [sklearn_model], + "task": arg_map.get("task", DEFAULT_XGBOOST_TASK), + } start_time = int(time.time()) - model.fit(X=aggregated_batch.frames, y=Y) + model.fit( + dataframe=aggregated_batch.frames, label=arg_map["predict"], **settings + ) train_time = int(time.time()) - start_time - score = model.score(X=aggregated_batch.frames, y=Y) + score = model.best_loss model_path = os.path.join( self.db.catalog().get_configuration_catalog_value("model_dir"), self.node.name, @@ -232,7 +245,7 @@ def handle_xgboost_function(self): We use the Flaml AutoML model for training xgboost models. """ - try_to_import_xgboost() + try_to_import_flaml_automl() assert ( len(self.children) == 1 diff --git a/evadb/functions/sklearn.py b/evadb/functions/sklearn.py index 4ab2b0abfe..1333004b09 100644 --- a/evadb/functions/sklearn.py +++ b/evadb/functions/sklearn.py @@ -17,7 +17,7 @@ import pandas as pd from evadb.functions.abstract.abstract_function import AbstractFunction -from evadb.utils.generic_utils import try_to_import_sklearn +from evadb.utils.generic_utils import try_to_import_flaml_automl class GenericSklearnModel(AbstractFunction): @@ -26,7 +26,7 @@ def name(self) -> str: return "GenericSklearnModel" def setup(self, model_path: str, predict_col: str, **kwargs): - try_to_import_sklearn() + try_to_import_flaml_automl() self.model = pickle.load(open(model_path, "rb")) self.predict_col = predict_col diff --git a/evadb/functions/xgboost.py b/evadb/functions/xgboost.py index 0635294116..9705d09e4b 100644 --- a/evadb/functions/xgboost.py +++ b/evadb/functions/xgboost.py @@ -17,7 +17,7 @@ import pandas as pd from evadb.functions.abstract.abstract_function import AbstractFunction -from evadb.utils.generic_utils import try_to_import_xgboost +from evadb.utils.generic_utils import try_to_import_flaml_automl class GenericXGBoostModel(AbstractFunction): @@ -26,7 +26,7 @@ def name(self) -> str: return "GenericXGBoostModel" def setup(self, model_path: str, predict_col: str, **kwargs): - try_to_import_xgboost() + try_to_import_flaml_automl() self.model = pickle.load(open(model_path, "rb")) self.predict_col = predict_col diff --git a/evadb/utils/generic_utils.py b/evadb/utils/generic_utils.py index d9af319103..8f362e8cb5 100644 --- a/evadb/utils/generic_utils.py +++ b/evadb/utils/generic_utils.py @@ -369,39 +369,20 @@ def is_forecast_available() -> bool: return False -def try_to_import_sklearn(): - try: - import sklearn # noqa: F401 - from sklearn.linear_model import LinearRegression # noqa: F401 - except ImportError: - raise ValueError( - """Could not import sklearn. - Please install it with `pip install scikit-learn`.""" - ) - - -def is_sklearn_available() -> bool: - try: - try_to_import_sklearn() - return True - except ValueError: # noqa: E722 - return False - - -def try_to_import_xgboost(): +def try_to_import_flaml_automl(): try: import flaml # noqa: F401 from flaml import AutoML # noqa: F401 except ImportError: raise ValueError( - """Could not import Flaml AutoML. + """Could not import Flaml AutML. Please install it with `pip install "flaml[automl]"`.""" ) -def is_xgboost_available() -> bool: +def is_flaml_automl_available() -> bool: try: - try_to_import_xgboost() + try_to_import_flaml_automl() return True except ValueError: # noqa: E722 return False diff --git a/test/integration_tests/long/test_model_train.py b/test/integration_tests/long/test_model_train.py index 55c5b5a484..be610674a9 100644 --- a/test/integration_tests/long/test_model_train.py +++ b/test/integration_tests/long/test_model_train.py @@ -116,7 +116,9 @@ def test_sklearn_regression(self): CREATE OR REPLACE FUNCTION PredictHouseRentSklearn FROM ( SELECT number_of_rooms, number_of_bathrooms, days_on_market, rental_price FROM HomeRentals ) TYPE Sklearn - PREDICT 'rental_price'; + PREDICT 'rental_price' + MODEL 'extra_tree' + METRIC 'r2'; """ execute_query_fetch_all(self.evadb, create_predict_function) diff --git a/test/markers.py b/test/markers.py index 6fdd2ad3c7..8273f5f0fb 100644 --- a/test/markers.py +++ b/test/markers.py @@ -20,6 +20,7 @@ from evadb.utils.generic_utils import ( is_chromadb_available, + is_flaml_automl_available, is_forecast_available, is_gpu_available, is_ludwig_available, @@ -27,8 +28,6 @@ is_pinecone_available, is_qdrant_available, is_replicate_available, - is_sklearn_available, - is_xgboost_available, ) asyncio_skip_marker = pytest.mark.skipif( @@ -93,11 +92,11 @@ ) sklearn_skip_marker = pytest.mark.skipif( - is_sklearn_available() is False, reason="Run only if sklearn is available" + is_flaml_automl_available() is False, reason="Run only if Flaml AutoML is available" ) xgboost_skip_marker = pytest.mark.skipif( - is_xgboost_available() is False, reason="Run only if xgboost is available" + is_flaml_automl_available() is False, reason="Run only if Flaml AutoML is available" ) chatgpt_skip_marker = pytest.mark.skip(