diff --git a/howso/utilities/feature_attributes/base.py b/howso/utilities/feature_attributes/base.py index 9ea6c755..22f170d9 100644 --- a/howso/utilities/feature_attributes/base.py +++ b/howso/utilities/feature_attributes/base.py @@ -240,7 +240,7 @@ def get_names(self, *, types: t.Optional[str | Container] = None, ] def _validate_bounds(self, data: pd.DataFrame, feature: str, # noqa: C901 - attributes: dict) -> list[str]: + attributes: FeatureAttributes) -> list[str]: """Validate the feature bounds of the provided DataFrame.""" # Import here to avoid circular import from howso.utilities import date_to_epoch @@ -388,7 +388,7 @@ def _validate_dtype(self, data: pd.DataFrame, feature: str, # noqa: C901 return errors @staticmethod - def _allows_null(attributes: dict) -> bool: + def _allows_null(attributes: FeatureAttributes) -> bool: """Return whether the given attributes indicates the allowance of null values.""" return 'bounds' in attributes and attributes['bounds'].get('allow_null', False) @@ -397,7 +397,7 @@ def _validate_df(self, data: pd.DataFrame, coerce: bool = False, # noqa: C901 allow_missing_features: bool = False, localize_datetimes=True, nullable_int_dtype='Int64'): errors = [] coerced_df = data.copy(deep=True) - features = self[table_name] if table_name else self + features = t.cast(dict[str, "FeatureAttributes"], self[table_name] if table_name else self) for feature, attributes in features.items(): if feature not in data.columns: @@ -472,6 +472,11 @@ def _validate_df(self, data: pd.DataFrame, coerce: bool = False, # noqa: C901 errors.extend(self._validate_dtype(data, feature, 'datetime64', coerced_df, coerce=coerce, localize_datetimes=localize_datetimes)) + + # Check semi-structured type (object) + elif attributes.get("data_type") in {"json", "yaml", "amalgam", "string", "string_mixable"}: + errors.extend(self._validate_dtype(data, feature, "object", coerced_df, coerce=coerce)) + # Check type (float) elif attributes.get('decimal_places', -1) > 0: 
errors.extend(self._validate_dtype(data, feature, 'float64', diff --git a/howso/utilities/feature_attributes/tests/test_infer_feature_attributes.py b/howso/utilities/feature_attributes/tests/test_infer_feature_attributes.py index 68ce8d9b..a7488ab9 100644 --- a/howso/utilities/feature_attributes/tests/test_infer_feature_attributes.py +++ b/howso/utilities/feature_attributes/tests/test_infer_feature_attributes.py @@ -656,6 +656,46 @@ def test_validate_df_multiple_dtypes(ftype, data_type, decimal_places, bounds, d assert pd.api.types.is_datetime64_any_dtype(coerced_df['DATE'].dtype) +@pytest.mark.parametrize( + ("data", "expected_data_type", "expected_orig_type"), + ( + ({"a": 1}, "json", "container"), + ([1, 2, 3], "json", "container"), + ('{"a": 1}', "json", "string"), + ('["a", "b", "c"]', "json", "string"), + ("doc:\n abc: 1", "yaml", "string"), + ("(list 1 2 3)", "amalgam", "string"), + ('(assoc "a" 1 "b" 2)', "amalgam", "string"), + ), +) +def test_validate_df_semi_structured(data, expected_data_type: str, expected_orig_type: str): + """Test validate_df handles semi-structured features correctly.""" + df = pd.DataFrame({ + "id": [1, 2, 3], + "doc": [data, None, data], + }) + with warnings.catch_warnings(): + warnings.simplefilter("error") + feature_attributes = infer_feature_attributes(df) + + attrs = feature_attributes["doc"] + + assert "original_type" in attrs + assert attrs["original_type"]["data_type"] == expected_orig_type + + if expected_data_type == "amalgam": + # Amalgam is not automatically inferred; set it manually + attrs["type"] = "continuous" + attrs["data_type"] = "amalgam" + else: + assert attrs["type"] == "continuous" + assert attrs.get("data_type") == expected_data_type + + with warnings.catch_warnings(): + warnings.simplefilter("error") + feature_attributes.validate(df, raise_errors=True) + + @pytest.mark.parametrize("extra_attrs, success", ( ({}, False), ({'auto_derive_on_train': False}, False), diff --git a/howso/utilities/features.py 
b/howso/utilities/features.py index 69b6207c..f3958d75 100644 --- a/howso/utilities/features.py +++ b/howso/utilities/features.py @@ -269,7 +269,7 @@ def format_dataframe(cls, df: pd.DataFrame, features: Mapping, new_values = cls.format_column(df[col], attributes, tokenizer=tokenizer) df = df.drop(columns=col) df[col] = new_values - + return df[original_feature_order] @classmethod