Skip to content

Commit 7a934e2

Browse files
[FSTORE-743] Add online support for External Feature Groups (#976)
1 parent 0223120 commit 7a934e2

7 files changed

+372
-208
lines changed

python/hsfs/core/external_feature_group_engine.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,54 @@ def save(self, feature_group):
4747

4848
self._feature_group_api.save(feature_group)
4949

50+
def insert(
    self,
    feature_group,
    feature_dataframe,
    write_options: dict,
    validation_options: dict = None,
):
    """Insert a dataframe into the online storage of an external feature group.

    # Arguments
        feature_group: ExternalFeatureGroup. Must have `online_enabled=True`;
            external feature groups can only write to online storage.
        feature_dataframe: Dataframe holding the rows to ingest.
        write_options: dict. Options forwarded to the engine's online/offline
            write path.
        validation_options: dict, optional. Options forwarded to the Great
            Expectations validation step. Defaults to an empty dict.

    # Returns
        Tuple of (write result, validation report). The write result is `None`
        when validation rejects the ingestion.

    # Raises
        FeatureStoreException: If the feature group is not online enabled.
    """
    # Use a None sentinel instead of a mutable `{}` default: a shared dict
    # default would leak state between calls.
    if validation_options is None:
        validation_options = {}

    if not feature_group.online_enabled:
        raise FeatureStoreException(
            "Online storage is not enabled for this feature group. External feature groups can only store data in"
            + " online storage. To create an offline only external feature group, use the `save` method."
        )

    schema = engine.get_instance().parse_schema_feature_group(feature_dataframe)

    if not feature_group._id:
        # only save metadata if feature group does not exist
        feature_group.features = schema
        self.save(feature_group)
    else:
        # else, just verify that feature group schema matches user-provided dataframe
        self._verify_schema_compatibility(feature_group.features, schema)

    # ge validation on python and non stream feature groups on spark
    ge_report = feature_group._great_expectation_engine.validate(
        feature_group=feature_group,
        dataframe=feature_dataframe,
        validation_options=validation_options,
        ingestion_result="INGESTED",
        ge_type=False,
    )

    # A rejected validation report short-circuits the write entirely.
    if ge_report is not None and ge_report.ingestion_result == "REJECTED":
        return None, ge_report

    return (
        engine.get_instance().save_dataframe(
            feature_group=feature_group,
            dataframe=feature_dataframe,
            operation=None,
            online_enabled=feature_group.online_enabled,
            storage="online",
            offline_write_options=write_options,
            online_write_options=write_options,
        ),
        ge_report,
    )
97+
5098
def _update_features_metadata(self, feature_group, features):
5199
# perform changes on copy in case the update fails, so we don't leave
52100
# the user object in corrupted state

python/hsfs/core/feature_group_base_engine.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#
1616

1717
from hsfs.core import feature_group_api, storage_connector_api, tags_api, kafka_api
18+
from hsfs.client.exceptions import FeatureStoreException
1819

1920

2021
class FeatureGroupBaseEngine:
@@ -107,3 +108,49 @@ def new_feature_list(self, feature_group, updated_features):
107108
):
108109
new_features.append(feature)
109110
return new_features + updated_features
111+
112+
def _verify_schema_compatibility(self, feature_group_features, dataframe_features):
113+
err = []
114+
feature_df_dict = {feat.name: feat.type for feat in dataframe_features}
115+
for feature_fg in feature_group_features:
116+
fg_type = feature_fg.type.lower().replace(" ", "")
117+
# check if feature exists dataframe
118+
if feature_fg.name in feature_df_dict:
119+
df_type = feature_df_dict[feature_fg.name].lower().replace(" ", "")
120+
# remove match from lookup table
121+
del feature_df_dict[feature_fg.name]
122+
123+
# check if types match
124+
if fg_type != df_type:
125+
# don't check structs for exact match
126+
if fg_type.startswith("struct") and df_type.startswith("struct"):
127+
continue
128+
129+
err += [
130+
f"{feature_fg.name} ("
131+
f"expected type: '{fg_type}', "
132+
f"derived from input: '{df_type}') has the wrong type."
133+
]
134+
135+
else:
136+
err += [
137+
f"{feature_fg.name} (type: '{feature_fg.type}') is missing from "
138+
f"input dataframe."
139+
]
140+
141+
# any features that are left in lookup table are superfluous
142+
for feature_df_name, feature_df_type in feature_df_dict.items():
143+
err += [
144+
f"{feature_df_name} (type: '{feature_df_type}') does not exist "
145+
f"in feature group."
146+
]
147+
148+
# raise exception if any errors were found.
149+
if len(err) > 0:
150+
raise FeatureStoreException(
151+
"Features are not compatible with Feature Group schema: "
152+
+ "".join(["\n - " + e for e in err])
153+
)
154+
155+
def get_subject(self, feature_group):
    """Return the schema-registry subject for the feature group's online topic."""
    topic_name = feature_group._online_topic_name
    return self._kafka_api.get_topic_subject(topic_name)

python/hsfs/core/feature_group_engine.py

Lines changed: 0 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from hsfs import engine, client, util
1818
from hsfs import feature_group as fg
1919
from hsfs.client import exceptions
20-
from hsfs.client.exceptions import FeatureStoreException
2120
from hsfs.core import feature_group_base_engine, hudi_engine
2221
from hsfs.core.deltastreamer_jobconf import DeltaStreamerJobConf
2322

@@ -234,9 +233,6 @@ def update_description(self, feature_group, description):
234233
feature_group, copy_feature_group, "updateMetadata"
235234
)
236235

237-
def get_subject(self, feature_group):
238-
return self._kafka_api.get_topic_subject(feature_group._online_topic_name)
239-
240236
def insert_stream(
241237
self,
242238
feature_group,
@@ -305,49 +301,6 @@ def insert_stream(
305301

306302
return streaming_query
307303

308-
def _verify_schema_compatibility(self, feature_group_features, dataframe_features):
309-
err = []
310-
feature_df_dict = {feat.name: feat.type for feat in dataframe_features}
311-
for feature_fg in feature_group_features:
312-
fg_type = feature_fg.type.lower().replace(" ", "")
313-
# check if feature exists dataframe
314-
if feature_fg.name in feature_df_dict:
315-
df_type = feature_df_dict[feature_fg.name].lower().replace(" ", "")
316-
# remove match from lookup table
317-
del feature_df_dict[feature_fg.name]
318-
319-
# check if types match
320-
if fg_type != df_type:
321-
# don't check structs for exact match
322-
if fg_type.startswith("struct") and df_type.startswith("struct"):
323-
continue
324-
325-
err += [
326-
f"{feature_fg.name} ("
327-
f"expected type: '{fg_type}', "
328-
f"derived from input: '{df_type}') has the wrong type."
329-
]
330-
331-
else:
332-
err += [
333-
f"{feature_fg.name} (type: '{feature_fg.type}') is missing from "
334-
f"input dataframe."
335-
]
336-
337-
# any features that are left in lookup table are superfluous
338-
for feature_df_name, feature_df_type in feature_df_dict.items():
339-
err += [
340-
f"{feature_df_name} (type: '{feature_df_type}') does not exist "
341-
f"in feature group."
342-
]
343-
344-
# raise exception if any errors were found.
345-
if len(err) > 0:
346-
raise FeatureStoreException(
347-
"Features are not compatible with Feature Group schema: "
348-
+ "".join(["\n - " + e for e in err])
349-
)
350-
351304
def _save_feature_group_metadata(
352305
self, feature_group, dataframe_features, write_options
353306
):

python/hsfs/engine/python.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
from sqlalchemy import sql
4545

4646
from hsfs import client, feature, util
47+
from hsfs.feature_group import ExternalFeatureGroup
4748
from hsfs.client.exceptions import FeatureStoreException
4849
from hsfs.core import (
4950
feature_group_api,
@@ -436,7 +437,10 @@ def save_dataframe(
436437
online_write_options: dict,
437438
validation_id: int = None,
438439
):
439-
if feature_group.stream:
440+
if (
441+
isinstance(feature_group, ExternalFeatureGroup)
442+
and feature_group.online_enabled
443+
) or feature_group.stream:
440444
return self._write_dataframe_kafka(
441445
feature_group, dataframe, offline_write_options
442446
)
@@ -896,13 +900,16 @@ def acked(err, msg):
896900
progress_bar.close()
897901

898902
# start backfilling job
899-
if offline_write_options is not None and offline_write_options.get(
900-
"start_offline_backfill", True
903+
if (
904+
not isinstance(feature_group, ExternalFeatureGroup)
905+
and offline_write_options is not None
906+
and offline_write_options.get("start_offline_backfill", True)
901907
):
902908
feature_group.backfill_job.run(
903909
await_termination=offline_write_options.get("wait_for_job", True)
904910
)
905-
911+
if isinstance(feature_group, ExternalFeatureGroup):
912+
return None
906913
return feature_group.backfill_job
907914

908915
def _kafka_produce(

python/hsfs/engine/spark.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
pass
6767

6868
from hsfs import feature, training_dataset_feature, client, util
69+
from hsfs.feature_group import ExternalFeatureGroup
6970
from hsfs.storage_connector import StorageConnector
7071
from hsfs.client.exceptions import FeatureStoreException
7172
from hsfs.core import hudi_engine, transformation_function_engine, kafka_api
@@ -261,7 +262,10 @@ def save_dataframe(
261262
validation_id=None,
262263
):
263264
try:
264-
if feature_group.stream:
265+
if (
266+
isinstance(feature_group, ExternalFeatureGroup)
267+
and feature_group.online_enabled
268+
) or feature_group.stream:
265269
self._save_online_dataframe(
266270
feature_group, dataframe, online_write_options
267271
)

0 commit comments

Comments
 (0)