11 changes: 8 additions & 3 deletions howso/utilities/feature_attributes/base.py
@@ -240,7 +240,7 @@ def get_names(self, *, types: t.Optional[str | Container] = None,
         ]

     def _validate_bounds(self, data: pd.DataFrame, feature: str,  # noqa: C901
-                         attributes: dict) -> list[str]:
+                         attributes: FeatureAttributes) -> list[str]:
         """Validate the feature bounds of the provided DataFrame."""
         # Import here to avoid circular import
         from howso.utilities import date_to_epoch
@@ -388,7 +388,7 @@ def _validate_dtype(self, data: pd.DataFrame, feature: str,  # noqa: C901
         return errors

     @staticmethod
-    def _allows_null(attributes: dict) -> bool:
+    def _allows_null(attributes: FeatureAttributes) -> bool:
         """Return whether the given attributes indicates the allowance of null values."""
         return 'bounds' in attributes and attributes['bounds'].get('allow_null', False)

@@ -397,7 +397,7 @@ def _validate_df(self, data: pd.DataFrame, coerce: bool = False,  # noqa: C901
                      allow_missing_features: bool = False, localize_datetimes=True, nullable_int_dtype='Int64'):
         errors = []
         coerced_df = data.copy(deep=True)
-        features = self[table_name] if table_name else self
+        features = t.cast(dict[str, "FeatureAttributes"], self[table_name] if table_name else self)

        for feature, attributes in features.items():
            if feature not in data.columns:
@@ -472,6 +472,11 @@ def _validate_df(self, data: pd.DataFrame, coerce: bool = False,  # noqa: C901
                 errors.extend(self._validate_dtype(data, feature, 'datetime64',
                                                    coerced_df, coerce=coerce,
                                                    localize_datetimes=localize_datetimes))
+
+            # Check semi-structured type (object)
+            elif attributes.get("data_type") in {"json", "yaml", "amalgam", "string", "string_mixable"}:
+                errors.extend(self._validate_dtype(data, feature, "object", coerced_df, coerce=coerce))
+
             # Check type (float)
             elif attributes.get('decimal_places', -1) > 0:
                 errors.extend(self._validate_dtype(data, feature, 'float64',
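The new branch means `validate` now expects semi-structured features (JSON, YAML, Amalgam, and mixable strings) to be backed by pandas `object` columns. A minimal sketch of how this surfaces through the public API, assuming `infer_feature_attributes` is importable from `howso.utilities` (as in the new test) and using a hypothetical DataFrame:

```python
import pandas as pd

from howso.utilities import infer_feature_attributes

# Hypothetical frame: "doc" holds JSON strings, which land in an
# object-dtype column.
df = pd.DataFrame({
    "id": [1, 2, 3],
    "doc": ['{"a": 1}', None, '{"a": 2}'],
})

attrs = infer_feature_attributes(df)
# "doc" should be inferred as continuous with data_type "json";
# validate() now checks that such features have the "object" dtype.
attrs.validate(df, raise_errors=True)
```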
@@ -656,6 +656,46 @@ def test_validate_df_multiple_dtypes(ftype, data_type, decimal_places, bounds, d
     assert pd.api.types.is_datetime64_any_dtype(coerced_df['DATE'].dtype)


+@pytest.mark.parametrize(
+    ("data", "expected_data_type", "expected_orig_type"),
+    (
+        ({"a": 1}, "json", "container"),
+        ([1, 2, 3], "json", "container"),
+        ('{"a": 1}', "json", "string"),
+        ('["a", "b", "c"]', "json", "string"),
+        ("doc:\n abc: 1", "yaml", "string"),
+        ("(list 1 2 3)", "amalgam", "string"),
+        ('(assoc "a" 1 "b" 2)', "amalgam", "string"),
+    ),
+)
+def test_validate_df_semi_structured(data, expected_data_type: str, expected_orig_type: str):
+    """Test validate_df handles semi-structured features correctly."""
+    df = pd.DataFrame({
+        "id": [1, 2, 3],
+        "doc": [data, None, data],
+    })
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        feature_attributes = infer_feature_attributes(df)
+
+    attrs = feature_attributes["doc"]
+
+    assert "original_type" in attrs
+    assert attrs["original_type"]["data_type"] == expected_orig_type
+
+    if expected_data_type == "amalgam":
+        # Amalgam is not automatically inferred; set it manually.
+        attrs["type"] = "continuous"
+        attrs["data_type"] = "amalgam"
+    else:
+        assert attrs["type"] == "continuous"
+        assert attrs.get("data_type") == expected_data_type
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        feature_attributes.validate(df, raise_errors=True)
+
+
 @pytest.mark.parametrize("extra_attrs, success", (
     ({}, False),
     ({'auto_derive_on_train': False}, False),
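As the test's amalgam cases show, Amalgam is not inferred automatically, so callers must set the feature's `type` and `data_type` by hand before validating. A small sketch of that override, mirroring the test but with a hypothetical column name:

```python
import pandas as pd

from howso.utilities import infer_feature_attributes

# Hypothetical frame: "code" holds Amalgam expressions as strings.
df = pd.DataFrame({"code": ["(list 1 2 3)", None, "(list 1 2 3)"]})

attrs = infer_feature_attributes(df)
# Mark the feature as Amalgam manually before validating.
attrs["code"]["type"] = "continuous"
attrs["code"]["data_type"] = "amalgam"
attrs.validate(df, raise_errors=True)
```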
2 changes: 1 addition & 1 deletion howso/utilities/features.py
@@ -269,7 +269,7 @@ def format_dataframe(cls, df: pd.DataFrame, features: Mapping,
             new_values = cls.format_column(df[col], attributes, tokenizer=tokenizer)
             df = df.drop(columns=col)
             df[col] = new_values

+        return df[original_feature_order]

     @classmethod
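For context on the one-line fix: formatted results are re-attached by dropping and re-adding each column, which pushes that column to the end of the frame, so returning `df[original_feature_order]` restores the caller's column order. A pandas-only sketch of the effect (the frame and names here are illustrative):

```python
import pandas as pd

df = pd.DataFrame({"a": [1], "b": [2], "c": [3]})
original_feature_order = list(df.columns)

col = "b"
new_values = df[col].astype(str)  # stand-in for cls.format_column(...)
df = df.drop(columns=col)
df[col] = new_values  # "b" is re-added as the last column

assert list(df.columns) == ["a", "c", "b"]
# Reindexing restores the original order, as the fix now does.
assert list(df[original_feature_order].columns) == ["a", "b", "c"]
```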