diff --git a/evadb/configuration/constants.py b/evadb/configuration/constants.py index 3665a2872..126e6bcfc 100644 --- a/evadb/configuration/constants.py +++ b/evadb/configuration/constants.py @@ -36,3 +36,5 @@ DEFAULT_DOCUMENT_CHUNK_OVERLAP = 200 DEFAULT_TRAIN_REGRESSION_METRIC = "rmse" DEFAULT_XGBOOST_TASK = "regression" +DEFAULT_SKLEARN_TRAIN_MODEL = "rf" +SKLEARN_SUPPORTED_MODELS = ["rf", "extra_tree", "kneighbor"] diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index 948819592..f68679418 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -30,9 +30,11 @@ from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry from evadb.catalog.models.function_metadata_catalog import FunctionMetadataCatalogEntry from evadb.configuration.constants import ( + DEFAULT_SKLEARN_TRAIN_MODEL, DEFAULT_TRAIN_REGRESSION_METRIC, DEFAULT_TRAIN_TIME_LIMIT, DEFAULT_XGBOOST_TASK, + SKLEARN_SUPPORTED_MODELS, EvaDB_INSTALLATION_DIR, ) from evadb.database import EvaDBDatabase @@ -45,13 +47,12 @@ from evadb.utils.generic_utils import ( load_function_class_from_file, string_comparison_case_insensitive, + try_to_import_flaml_automl, try_to_import_ludwig, try_to_import_neuralforecast, - try_to_import_sklearn, try_to_import_statsforecast, try_to_import_torch, try_to_import_ultralytics, - try_to_import_xgboost, ) from evadb.utils.logging_manager import logger @@ -169,8 +170,7 @@ def handle_sklearn_function(self): Use Sklearn's regression to train models. 
""" - try_to_import_sklearn() - from sklearn.linear_model import LinearRegression + try_to_import_flaml_automl() assert ( len(self.children) == 1 @@ -186,13 +186,26 @@ def handle_sklearn_function(self): aggregated_batch.drop_column_alias() arg_map = {arg.key: arg.value for arg in self.node.metadata} - model = LinearRegression() - Y = aggregated_batch.frames[arg_map["predict"]] - aggregated_batch.frames.drop([arg_map["predict"]], axis=1, inplace=True) + from flaml import AutoML + + model = AutoML() + sklearn_model = arg_map.get("model", DEFAULT_SKLEARN_TRAIN_MODEL) + if sklearn_model not in SKLEARN_SUPPORTED_MODELS: + raise ValueError( + f"Sklearn Model {sklearn_model} provided as input is not supported." + ) + settings = { + "time_budget": arg_map.get("time_limit", DEFAULT_TRAIN_TIME_LIMIT), + "metric": arg_map.get("metric", DEFAULT_TRAIN_REGRESSION_METRIC), + "estimator_list": [sklearn_model], + "task": arg_map.get("task", DEFAULT_XGBOOST_TASK), + } start_time = int(time.time()) - model.fit(X=aggregated_batch.frames, y=Y) + model.fit( + dataframe=aggregated_batch.frames, label=arg_map["predict"], **settings + ) train_time = int(time.time()) - start_time - score = model.score(X=aggregated_batch.frames, y=Y) + score = model.best_loss model_path = os.path.join( self.db.catalog().get_configuration_catalog_value("model_dir"), self.node.name, @@ -232,7 +245,7 @@ def handle_xgboost_function(self): We use the Flaml AutoML model for training xgboost models. 
""" - try_to_import_xgboost() + try_to_import_flaml_automl() assert ( len(self.children) == 1 diff --git a/evadb/functions/sklearn.py b/evadb/functions/sklearn.py index 4ab2b0abf..1333004b0 100644 --- a/evadb/functions/sklearn.py +++ b/evadb/functions/sklearn.py @@ -17,7 +17,7 @@ import pandas as pd from evadb.functions.abstract.abstract_function import AbstractFunction -from evadb.utils.generic_utils import try_to_import_sklearn +from evadb.utils.generic_utils import try_to_import_flaml_automl class GenericSklearnModel(AbstractFunction): @@ -26,7 +26,7 @@ def name(self) -> str: return "GenericSklearnModel" def setup(self, model_path: str, predict_col: str, **kwargs): - try_to_import_sklearn() + try_to_import_flaml_automl() self.model = pickle.load(open(model_path, "rb")) self.predict_col = predict_col diff --git a/evadb/functions/xgboost.py b/evadb/functions/xgboost.py index 063529411..9705d09e4 100644 --- a/evadb/functions/xgboost.py +++ b/evadb/functions/xgboost.py @@ -17,7 +17,7 @@ import pandas as pd from evadb.functions.abstract.abstract_function import AbstractFunction -from evadb.utils.generic_utils import try_to_import_xgboost +from evadb.utils.generic_utils import try_to_import_flaml_automl class GenericXGBoostModel(AbstractFunction): @@ -26,7 +26,7 @@ def name(self) -> str: return "GenericXGBoostModel" def setup(self, model_path: str, predict_col: str, **kwargs): - try_to_import_xgboost() + try_to_import_flaml_automl() self.model = pickle.load(open(model_path, "rb")) self.predict_col = predict_col diff --git a/evadb/utils/generic_utils.py b/evadb/utils/generic_utils.py index d9af31910..8f362e8cb 100644 --- a/evadb/utils/generic_utils.py +++ b/evadb/utils/generic_utils.py @@ -369,39 +369,20 @@ def is_forecast_available() -> bool: return False -def try_to_import_sklearn(): - try: - import sklearn # noqa: F401 - from sklearn.linear_model import LinearRegression # noqa: F401 - except ImportError: - raise ValueError( - """Could not import sklearn. 
- Please install it with `pip install scikit-learn`.""" - ) - - -def is_sklearn_available() -> bool: - try: - try_to_import_sklearn() - return True - except ValueError: # noqa: E722 - return False - - -def try_to_import_xgboost(): +def try_to_import_flaml_automl(): try: import flaml # noqa: F401 from flaml import AutoML # noqa: F401 except ImportError: raise ValueError( - """Could not import Flaml AutoML. + """Could not import Flaml AutoML. Please install it with `pip install "flaml[automl]"`.""" ) -def is_xgboost_available() -> bool: +def is_flaml_automl_available() -> bool: try: - try_to_import_xgboost() + try_to_import_flaml_automl() return True except ValueError: # noqa: E722 return False diff --git a/test/integration_tests/long/test_model_train.py b/test/integration_tests/long/test_model_train.py index 55c5b5a48..be610674a 100644 --- a/test/integration_tests/long/test_model_train.py +++ b/test/integration_tests/long/test_model_train.py @@ -116,7 +116,9 @@ def test_sklearn_regression(self): CREATE OR REPLACE FUNCTION PredictHouseRentSklearn FROM ( SELECT number_of_rooms, number_of_bathrooms, days_on_market, rental_price FROM HomeRentals ) TYPE Sklearn - PREDICT 'rental_price'; + PREDICT 'rental_price' + MODEL 'extra_tree' + METRIC 'r2'; """ execute_query_fetch_all(self.evadb, create_predict_function) diff --git a/test/markers.py b/test/markers.py index 6fdd2ad3c..8273f5f0f 100644 --- a/test/markers.py +++ b/test/markers.py @@ -20,6 +20,7 @@ from evadb.utils.generic_utils import ( is_chromadb_available, + is_flaml_automl_available, is_forecast_available, is_gpu_available, is_ludwig_available, @@ -27,8 +28,6 @@ is_pinecone_available, is_qdrant_available, is_replicate_available, - is_sklearn_available, - is_xgboost_available, ) asyncio_skip_marker = pytest.mark.skipif( @@ -93,11 +92,11 @@ ) sklearn_skip_marker = pytest.mark.skipif( - is_sklearn_available() is False, reason="Run only 
if Flaml AutoML is available" ) xgboost_skip_marker = pytest.mark.skipif( - is_xgboost_available() is False, reason="Run only if xgboost is available" + is_flaml_automl_available() is False, reason="Run only if Flaml AutoML is available" ) chatgpt_skip_marker = pytest.mark.skip(