Adding changes for Flaml Sklearn integration #1361

Merged (2 commits, Nov 17, 2023)
evadb/configuration/constants.py (2 additions, 0 deletions)

@@ -36,3 +36,5 @@
 DEFAULT_DOCUMENT_CHUNK_OVERLAP = 200
 DEFAULT_TRAIN_REGRESSION_METRIC = "rmse"
 DEFAULT_XGBOOST_TASK = "regression"
+DEFAULT_SKLEARN_TRAIN_MODEL = "rf"
+SKLEARN_SUPPORTED_MODELS = ["rf", "extra_tree", "kneighbor"]
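The new constants name FLAML estimator aliases rather than scikit-learn class names. A minimal sketch of how a handler could resolve them; the alias descriptions and the helper function are illustrative assumptions, not code from this PR:

# Sketch only: the aliases in SKLEARN_SUPPORTED_MODELS are FLAML estimator
# names that wrap the corresponding scikit-learn models (mapping assumed here
# for illustration).
DEFAULT_SKLEARN_TRAIN_MODEL = "rf"
SKLEARN_SUPPORTED_MODELS = ["rf", "extra_tree", "kneighbor"]

FLAML_ALIAS_HINT = {
    "rf": "random forest (sklearn.ensemble)",
    "extra_tree": "extremely randomized trees (sklearn.ensemble)",
    "kneighbor": "k-nearest neighbors (sklearn.neighbors)",
}


def resolve_sklearn_model(arg_map: dict) -> str:
    """Pick the requested estimator alias, falling back to the default."""
    model = arg_map.get("model", DEFAULT_SKLEARN_TRAIN_MODEL)
    if model not in SKLEARN_SUPPORTED_MODELS:
        raise ValueError(f"Sklearn Model {model} provided as input is not supported.")
    return model


print(resolve_sklearn_model({"model": "extra_tree"}), FLAML_ALIAS_HINT["extra_tree"])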
evadb/executor/create_function_executor.py (23 additions, 10 deletions)

@@ -29,9 +29,11 @@
 from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry
 from evadb.catalog.models.function_metadata_catalog import FunctionMetadataCatalogEntry
 from evadb.configuration.constants import (
+    DEFAULT_SKLEARN_TRAIN_MODEL,
     DEFAULT_TRAIN_REGRESSION_METRIC,
     DEFAULT_TRAIN_TIME_LIMIT,
     DEFAULT_XGBOOST_TASK,
+    SKLEARN_SUPPORTED_MODELS,
     EvaDB_INSTALLATION_DIR,
 )
 from evadb.database import EvaDBDatabase
@@ -44,13 +46,12 @@
 from evadb.utils.generic_utils import (
     load_function_class_from_file,
     string_comparison_case_insensitive,
+    try_to_import_flaml_automl,
     try_to_import_ludwig,
     try_to_import_neuralforecast,
-    try_to_import_sklearn,
     try_to_import_statsforecast,
     try_to_import_torch,
     try_to_import_ultralytics,
-    try_to_import_xgboost,
 )
 from evadb.utils.logging_manager import logger

@@ -164,8 +165,7 @@ def handle_sklearn_function(self):

         Use Sklearn's regression to train models.
         """
-        try_to_import_sklearn()
-        from sklearn.linear_model import LinearRegression
+        try_to_import_flaml_automl()

         assert (
             len(self.children) == 1
@@ -181,13 +181,26 @@
         aggregated_batch.drop_column_alias()

         arg_map = {arg.key: arg.value for arg in self.node.metadata}
-        model = LinearRegression()
-        Y = aggregated_batch.frames[arg_map["predict"]]
-        aggregated_batch.frames.drop([arg_map["predict"]], axis=1, inplace=True)
+        from flaml import AutoML
+
+        model = AutoML()
+        sklearn_model = arg_map.get("model", DEFAULT_SKLEARN_TRAIN_MODEL)
+        if sklearn_model not in SKLEARN_SUPPORTED_MODELS:
+            raise ValueError(
+                f"Sklearn Model {sklearn_model} provided as input is not supported."
+            )
+        settings = {
+            "time_budget": arg_map.get("time_limit", DEFAULT_TRAIN_TIME_LIMIT),
+            "metric": arg_map.get("metric", DEFAULT_TRAIN_REGRESSION_METRIC),
+            "estimator_list": [sklearn_model],
+            "task": arg_map.get("task", DEFAULT_XGBOOST_TASK),
+        }
         start_time = int(time.time())
-        model.fit(X=aggregated_batch.frames, y=Y)
+        model.fit(
+            dataframe=aggregated_batch.frames, label=arg_map["predict"], **settings
+        )
         train_time = int(time.time()) - start_time
-        score = model.score(X=aggregated_batch.frames, y=Y)
+        score = model.best_loss
         model_path = os.path.join(
             self.db.catalog().get_configuration_catalog_value("model_dir"),
             self.node.name,
@@ -227,7 +240,7 @@ def handle_xgboost_function(self):

         We use the Flaml AutoML model for training xgboost models.
         """
-        try_to_import_xgboost()
+        try_to_import_flaml_automl()

         assert (
             len(self.children) == 1
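For context, here is a standalone sketch of the FLAML AutoML flow that both handlers above now share. It assumes `flaml[automl]` and pandas are installed; the DataFrame contents and settings values are made up, and this mirrors but does not reproduce the EvaDB executor code:

# Standalone sketch (illustrative data and settings).
import pandas as pd
from flaml import AutoML

# Tiny synthetic dataset; column names only echo the HomeRentals example.
df = pd.DataFrame(
    {
        "number_of_rooms": [1, 2, 3, 2, 4, 3, 1, 5, 2, 4, 3, 2],
        "number_of_bathrooms": [1, 1, 2, 1, 3, 2, 1, 3, 2, 2, 1, 1],
        "rental_price": [700, 1100, 1500, 1150, 2100, 1650, 720, 2600, 1250, 2000, 1400, 1120],
    }
)

model = AutoML()
settings = {
    "time_budget": 10,          # seconds; the handler uses DEFAULT_TRAIN_TIME_LIMIT
    "metric": "rmse",           # DEFAULT_TRAIN_REGRESSION_METRIC
    "estimator_list": ["rf"],   # one of SKLEARN_SUPPORTED_MODELS
    "task": "regression",       # DEFAULT_XGBOOST_TASK
}
# FLAML accepts a DataFrame plus the label column name, so no manual X/Y split
# is needed (which is why the LinearRegression code path above was dropped).
model.fit(dataframe=df, label="rental_price", **settings)
print(model.best_estimator, model.best_loss)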
evadb/functions/sklearn.py (2 additions, 2 deletions)

@@ -17,7 +17,7 @@
 import pandas as pd

 from evadb.functions.abstract.abstract_function import AbstractFunction
-from evadb.utils.generic_utils import try_to_import_sklearn
+from evadb.utils.generic_utils import try_to_import_flaml_automl


 class GenericSklearnModel(AbstractFunction):
@@ -26,7 +26,7 @@ def name(self) -> str:
         return "GenericSklearnModel"

     def setup(self, model_path: str, predict_col: str, **kwargs):
-        try_to_import_sklearn()
+        try_to_import_flaml_automl()

         self.model = pickle.load(open(model_path, "rb"))
         self.predict_col = predict_col
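A hypothetical sketch of how such a pickled FLAML model could be used at prediction time. This is not the PR's forward() implementation; the helper name and the column handling are assumptions for illustration only:

# Hypothetical inference helper (not from the PR).
import pickle

import pandas as pd


def predict_with_saved_model(model_path: str, predict_col: str, frames: pd.DataFrame) -> pd.DataFrame:
    """Load a pickled FLAML AutoML object and predict the target column."""
    with open(model_path, "rb") as f:
        model = pickle.load(f)
    # Drop the label column if the caller passed it along with the features.
    features = frames.drop(columns=[predict_col], errors="ignore")
    return pd.DataFrame({predict_col: model.predict(features)})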
evadb/functions/xgboost.py (2 additions, 2 deletions)

@@ -17,7 +17,7 @@
 import pandas as pd

 from evadb.functions.abstract.abstract_function import AbstractFunction
-from evadb.utils.generic_utils import try_to_import_xgboost
+from evadb.utils.generic_utils import try_to_import_flaml_automl


 class GenericXGBoostModel(AbstractFunction):
@@ -26,7 +26,7 @@ def name(self) -> str:
         return "GenericXGBoostModel"

     def setup(self, model_path: str, predict_col: str, **kwargs):
-        try_to_import_xgboost()
+        try_to_import_flaml_automl()

         self.model = pickle.load(open(model_path, "rb"))
         self.predict_col = predict_col
evadb/utils/generic_utils.py (4 additions, 23 deletions)

@@ -369,39 +369,20 @@ def is_forecast_available() -> bool:
         return False


-def try_to_import_sklearn():
-    try:
-        import sklearn  # noqa: F401
-        from sklearn.linear_model import LinearRegression  # noqa: F401
-    except ImportError:
-        raise ValueError(
-            """Could not import sklearn.
-                Please install it with `pip install scikit-learn`."""
-        )
-
-
-def is_sklearn_available() -> bool:
-    try:
-        try_to_import_sklearn()
-        return True
-    except ValueError:  # noqa: E722
-        return False
-
-
-def try_to_import_xgboost():
+def try_to_import_flaml_automl():
     try:
         import flaml  # noqa: F401
         from flaml import AutoML  # noqa: F401
     except ImportError:
         raise ValueError(
-            """Could not import Flaml AutoML.
+            """Could not import Flaml AutML.
                 Please install it with `pip install "flaml[automl]"`."""
         )


-def is_xgboost_available() -> bool:
+def is_flaml_automl_available() -> bool:
     try:
-        try_to_import_xgboost()
+        try_to_import_flaml_automl()
         return True
     except ValueError:  # noqa: E722
         return False
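The renamed helpers follow the repository's usual guard pattern: the try_to_import_* variant raises a ValueError carrying an install hint, while the is_*_available probe returns a bool. A small illustrative check, assuming an EvaDB install where this PR is merged:

# Illustration only: how callers are expected to use the two helpers.
from evadb.utils.generic_utils import (
    is_flaml_automl_available,
    try_to_import_flaml_automl,
)

if is_flaml_automl_available():
    try_to_import_flaml_automl()  # no-op when the import succeeds
    print("FLAML AutoML is installed")
else:
    # Mirrors the error text above.
    print('FLAML AutoML is missing; install it with: pip install "flaml[automl]"')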
test/integration_tests/long/test_model_train.py (3 additions, 1 deletion)

@@ -116,7 +116,9 @@ def test_sklearn_regression(self):
             CREATE OR REPLACE FUNCTION PredictHouseRentSklearn FROM
             ( SELECT number_of_rooms, number_of_bathrooms, days_on_market, rental_price FROM HomeRentals )
             TYPE Sklearn
-            PREDICT 'rental_price';
+            PREDICT 'rental_price'
+            MODEL 'extra_tree'
+            METRIC 'r2';
         """
         execute_query_fetch_all(self.evadb, create_predict_function)

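The new MODEL and METRIC options reach the executor as function metadata. A rough illustration of that mapping follows; the metadata tuples and the lowercasing step are assumptions about the parsing path, but the keys match the arg_map lookups in handle_sklearn_function above:

# Illustration only: CREATE FUNCTION options become key/value metadata,
# which the handler reads through arg_map.
metadata = [("predict", "rental_price"), ("model", "extra_tree"), ("metric", "r2")]
arg_map = {key: value for key, value in metadata}

assert arg_map.get("model", "rf") == "extra_tree"   # falls back to DEFAULT_SKLEARN_TRAIN_MODEL
assert arg_map.get("metric", "rmse") == "r2"        # falls back to DEFAULT_TRAIN_REGRESSION_METRIC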
test/markers.py (3 additions, 4 deletions)

@@ -20,15 +20,14 @@

 from evadb.utils.generic_utils import (
     is_chromadb_available,
+    is_flaml_automl_available,
     is_forecast_available,
     is_gpu_available,
     is_ludwig_available,
     is_milvus_available,
     is_pinecone_available,
     is_qdrant_available,
     is_replicate_available,
-    is_sklearn_available,
-    is_xgboost_available,
 )

 asyncio_skip_marker = pytest.mark.skipif(
@@ -93,11 +92,11 @@
 )

 sklearn_skip_marker = pytest.mark.skipif(
-    is_sklearn_available() is False, reason="Run only if sklearn is available"
+    is_flaml_automl_available() is False, reason="Run only if Flaml AutoML is available"
 )

 xgboost_skip_marker = pytest.mark.skipif(
-    is_xgboost_available() is False, reason="Run only if xgboost is available"
+    is_flaml_automl_available() is False, reason="Run only if Flaml AutoML is available"
 )
Expand Down