Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 0 additions & 86 deletions howso/client/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -1440,89 +1440,3 @@ def test_tokenizable_strings_reaction(self):
)
assert reaction["action"].iloc[0]["review"] == df.iloc[0]["review"]
assert reaction["details"]["influential_cases"][0].iloc[0]["review"] == df.iloc[0]["review"]

def test_json_feature_types(self):
    """Test that JSON features stored as Python data structures have their primitive types maintained."""
    # Each entry pairs a raw JSON-like case with the primitive-type schema IFA should infer for it.
    tests = [
        ({"a": "str", "b": "1", "c": "2.7", "d": True, "e": {"a1": "str", "b1": {"c1": [1, 2, 3]}}},
         {"a": "string", "b": "integer", "c": "numeric", "d": "boolean", "e": {"a1": "string", "b1": {"c1": "integer"}}}),
        ({"a": "str", "b": "1", "c": "3.3", "d": False, "e": {"a1": "str2", "b1": {"c1": [1, 2, 3, 4, 5, 6, 7]}}},
         {"a": "string", "b": "integer", "c": "numeric", "d": "boolean", "e": {"a1": "string", "b1": {"c1": "integer"}}}),
        ({"a": 3, "b": 1.5, "c": 2.7, "d": True, "e": {"a1": 5, "b1": {"c1": [1, 2, 3]}}},
         {"a": "integer", "b": "numeric", "c": "numeric", "d": "boolean", "e": {"a1": "integer", "b1": {"c1": "integer"}}}),
        ({"a": 3, "b": 1, "c": 2.7, "d": True, "e": {"a1": "str", "b1": {"c1": [1, True, "foo"]}}},
         {"a": "integer", "b": "integer", "c": "numeric", "d": "boolean", "e": {"a1": "string", "b1": {"c1": "object"}}}),
    ]
    data_uniform_types = pd.DataFrame({"foo": [tests[0][0], tests[1][0]], "bar": ["a", "b"]})
    data_uniform_except_list = pd.DataFrame({"foo": [tests[2][0], tests[3][0]], "bar": ["a", "b"]})
    data_non_uniform = pd.DataFrame({"foo": [tests[0][0], tests[1][0], tests[2][0]], "bar": ["a", "b", "c"]})

    def _assert_json_attributes(feature_attributes):
        """Assert 'foo' was inferred as a continuous JSON container feature."""
        assert feature_attributes["foo"]["data_type"] == "json"
        assert feature_attributes["foo"]["type"] == "continuous"
        assert feature_attributes["foo"]["original_type"]["data_type"] == "container"

    def _train_and_react(feature_attributes, data, context_value):
        """Train a fresh trainee on `data`, then react to one 'bar' context requesting 'foo'."""
        client = HowsoClient()
        t = Trainee()
        client.set_feature_attributes(t.id, feature_attributes)
        client.train(t.id, data)
        return client.react(
            t.id,
            contexts=[[context_value]],
            context_features=['bar'],
            action_features=['foo'],
            details={"influential_cases": True},
            desired_conviction=5,
        )

    # Types should be preserved with no warnings (dict)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        feature_attributes = infer_feature_attributes(data_uniform_types)
    _assert_json_attributes(feature_attributes)
    reaction = _train_and_react(feature_attributes, data_uniform_types, "a")
    # Uniform schema: the reacted case should round-trip with original Python types intact.
    assert reaction["action"].iloc[0]["foo"] == data_uniform_types.iloc[0]["foo"]
    assert reaction["details"]["influential_cases"][0].iloc[0]["foo"] == tests[0][0]

    # All types except for the nested list should be preserved and a warning issued
    with pytest.warns(match="contains a key 'c1' whose value is a list of mixed types"):
        feature_attributes = infer_feature_attributes(data_uniform_except_list)
    _assert_json_attributes(feature_attributes)
    reaction = _train_and_react(feature_attributes, data_uniform_except_list, "b")
    expected_case = deepcopy(tests[3][0])
    # The list under this key has mixed types so it will come back as-is when deserialized
    expected_case["e"]["b1"]["c1"] = json.loads(json.dumps(expected_case["e"]["b1"]["c1"]))
    assert reaction["action"].iloc[0]["foo"] == expected_case
    assert reaction["details"]["influential_cases"][0].iloc[0]["foo"] == expected_case

    # Types cannot be preserved, warning issued
    with pytest.warns(match="inconsistent types and/or keys across cases."):
        feature_attributes = infer_feature_attributes(data_non_uniform)
    _assert_json_attributes(feature_attributes)
    reaction = _train_and_react(feature_attributes, data_non_uniform, "a")
    # Cases of "foo" have mixed types so they will come back as-is when deserialized
    expected_case = json.loads(json.dumps(tests[0][0]))
    assert reaction["action"].iloc[0]["foo"] == expected_case
    assert reaction["details"]["influential_cases"][0].iloc[0]["foo"] == expected_case
4 changes: 0 additions & 4 deletions howso/utilities/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
SingleTableFeatureAttributes,
)
from .features import ( # noqa: F401
cast_primitive_from_feature_type,
convert_primitive_to_feature_type,
deserialize_cases,
FeatureType,
format_column,
Expand Down Expand Up @@ -65,8 +63,6 @@
"align_data",
"build_react_series_df",
"check_feature_names",
"cast_primitive_from_feature_type",
"convert_primitive_to_feature_type",
"format_confusion_matrix",
"date_format_is_iso",
"date_to_epoch",
Expand Down
73 changes: 2 additions & 71 deletions howso/utilities/feature_attributes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import pandas as pd
import yaml

from howso.utilities.features import convert_primitive_to_feature_type, FeatureType
from howso.utilities.features import FeatureType
from howso.utilities.utilities import is_valid_datetime_format, time_to_seconds
from ..utilities import determine_iso_format

Expand Down Expand Up @@ -1148,74 +1148,6 @@ def _infer_boolean_attributes(self, feature_name: str) -> dict:
def _infer_integer_attributes(self, feature_name: str) -> dict:
"""Get inferred attributes for the given integer column."""

def _get_primitive_type_schema(self, feature_name: str) -> dict | bool | None:  # noqa: C901
    """
    Get a map of keys to types for a JSON feature stored as a Python dict or list.

    Returns False when the feature has no non-null data, and None when the
    sampled cases disagree on keys and/or types (a warning is issued). Otherwise
    returns the schema inferred from the first sampled case.
    """
    # If there is no data, return False
    first_non_none = self._get_first_non_null(feature_name)
    if first_non_none is None:
        return False

    # Keep track of whether there are any non-primitive types in the data
    has_complex_type = False

    def _recursive_get_types(data: t.Any, key: str = None) -> t.Any:
        """Recursively determine primitive types for an arbitrary Python data structure."""
        nonlocal has_complex_type
        # Value is a list
        if isinstance(data, MutableSequence):
            list_type = FeatureType.UNKNOWN.value
            # Iterate through 10 random values of a list of len>10, or through the entire list if len<=10.
            iterations = min(len(data), 10)
            if len(data) > 10:
                # Shuffle data so that the first 10 indices are randomized
                data = list(pd.Series(data).sample(frac=1))
            for idx in range(iterations):
                rand_val = data[idx]
                if rand_val is None:
                    # We can still retain primitive types with NoneTypes present in the data structure
                    continue
                value_type = convert_primitive_to_feature_type(rand_val)
                if value_type == FeatureType.UNKNOWN.value:
                    # A non-primitive type was found in the data
                    has_complex_type = True
                if list_type == FeatureType.UNKNOWN.value:
                    list_type = value_type
                elif list_type != value_type:
                    warnings.warn(f"JSON feature '{feature_name}' contains a key '{key}' whose value is a list of "
                                  "mixed types. Original types under this key will not be preserved.")
                    return FeatureType.UNKNOWN.value
            return list_type
        # Base case: not a list or dict
        elif not isinstance(data, Mapping):
            return convert_primitive_to_feature_type(data)
        # Value is a dict
        return {key: _recursive_get_types(data[key], key=key) for key in data.keys()}

    # Sample up to 10 random values
    # OR every value if < 10
    type_maps = []
    if (count := self._get_unique_count(feature_name)) < 10:
        for sample in self._get_unique_values(feature_name):
            type_maps.append(_recursive_get_types(sample))
    else:
        count = 10
        for idx in range(10):
            sample = self._get_random_value(feature_name, no_nulls=True)
            type_maps.append(_recursive_get_types(sample))

    # Issue a warning if keys or types are not consistent across cases
    for idx in range(1, count):
        if type_maps[0] != type_maps[idx]:
            # NOTE: tests match on the "inconsistent types and/or keys across cases." suffix.
            warnings.warn(f"JSON feature '{feature_name}' has inconsistent types and/or keys across cases. "
                          "Original types will not be preserved.")
            return None

    # Issue a warning if any non-primitive types were found
    if has_complex_type:
        warnings.warn(f"JSON feature '{feature_name}' contains at least one instance of a non-primitive type. "
                      "Only uniform, primitive types will be preserved in semistructured features.")

    return type_maps[0]

def _infer_string_attributes(self, feature_name: str) -> dict:
"""Get inferred attributes for the given string column."""
# Column has arbitrary string values, first check if they
Expand All @@ -1239,11 +1171,10 @@ def _infer_string_attributes(self, feature_name: str) -> dict:
elif self._is_json_feature(feature_name):
first_non_null = self._get_first_non_null(feature_name)
if isinstance(first_non_null, Mapping) or isinstance(first_non_null, MutableSequence):
type_map = self._get_primitive_type_schema(feature_name) or {}
return {
"type": "continuous",
"data_type": "json",
"original_type": {"type_map": type_map, "data_type": FeatureType.CONTAINER.value},
"original_type": {"data_type": FeatureType.CONTAINER.value},
}
return {
"type": "continuous",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1084,44 +1084,3 @@ def test_infer_tokenizable_string():
# Product should still be a nominal string
assert feature_attributes["product"]["data_type"] == "string"
assert feature_attributes["product"]["type"] == "nominal"


def test_json_features_types():
    """Test that IFA includes type information for JSON features that are Python dicts/lists."""
    tests = [
        ({"a": "str", "b": 1, "c": 2.7, "d": True, "e": {"a1": "str", "b1": {"c1": [1, 2, 3]}}},
         {"a": "string", "b": "integer", "c": "numeric", "d": "boolean", "e": {"a1": "string", "b1": {"c1": "integer"}}}),
        ({"a": "str", "b": 9, "c": 3.3, "d": False, "e": {"a1": "str2", "b1": {"c1": [1, 2, 3, 4, 5, 6, 7]}}},
         {"a": "string", "b": "integer", "c": "numeric", "d": "boolean", "e": {"a1": "string", "b1": {"c1": "integer"}}}),
        ({"a": 3, "b": 1.5, "c": 2.7, "d": True, "e": {"a1": 5, "b1": {"c1": [1, 2, 3]}}},
         {"a": "integer", "b": "numeric", "c": "numeric", "d": "boolean", "e": {"a1": "integer", "b1": {"c1": "integer"}}}),
        ({"a": 3, "b": 1, "c": 2.7, "d": True, "e": {"a1": "str", "b1": {"c1": [1, True, "foo"]}}},
         {"a": "integer", "b": "integer", "c": "numeric", "d": "boolean", "e": {"a1": "string", "b1": {"c1": "object"}}}),
        ([1.1, 2.2, 3.3, 4.4], "numeric"),
    ]
    # First verify each case independently yields the expected type map.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for case, expected_map in tests:
            attributes = infer_feature_attributes(pd.DataFrame({"foo": [case]}))
            foo_attrs = attributes["foo"]
            assert foo_attrs["data_type"] == "json"
            assert foo_attrs["original_type"]["data_type"] == FeatureType.CONTAINER.value
            assert foo_attrs["original_type"]["type_map"] == expected_map
    # Multiple cases sharing one schema must infer cleanly with zero warnings.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        attributes = infer_feature_attributes(pd.DataFrame({"foo": [tests[0][0], tests[1][0]]}))
        assert attributes["foo"]["data_type"] == "json"
        assert attributes["foo"]["original_type"]["data_type"] == FeatureType.CONTAINER.value
        assert attributes["foo"]["original_type"]["type_map"] == tests[0][1]

    # Now exercise the warning paths.
    with pytest.warns(match="contains a key 'c1' whose value is a list of mixed types"):
        infer_feature_attributes(pd.DataFrame({"foo": [tests[3][0]]}))
    with pytest.warns(match="inconsistent types and/or keys across cases."):
        attributes = infer_feature_attributes(pd.DataFrame({"foo": [tests[0][0], tests[2][0], tests[1][0]]}))
        assert not attributes["foo"]["original_type"]["type_map"]
Original file line number Diff line number Diff line change
Expand Up @@ -602,23 +602,3 @@ def test_infer_tokenizable_string(adc):
# Product should still be a nominal string
assert feature_attributes["product"]["data_type"] == "string"
assert feature_attributes["product"]["type"] == "nominal"


@pytest.mark.parametrize('adc', [
    # Only MongoDBData and DataFrameData support Python objects as data
    ("MongoDBData", pd.DataFrame()),
    ("DataFrameData", pd.DataFrame()),
], indirect=True)
def test_json_features_types(adc):
    """Test that IFA includes type information for JSON features that are Python dicts/lists for applicable ADCs."""
    # One representative case plus the primitive-type schema IFA should infer for it.
    case, expected_map = (
        {"a": "str", "b": 1, "c": 2.7, "d": True, "e": {"a1": "str", "b1": {"c1": [1, 2, 3]}}},
        {"a": "string", "b": "integer", "c": "numeric", "d": "boolean", "e": {"a1": "string", "b1": {"c1": "integer"}}}
    )
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        convert_data(DataFrameData(pd.DataFrame({"foo": [case]})), adc)
        attributes = infer_feature_attributes(adc)
        assert attributes["foo"]["data_type"] == "json"
        assert attributes["foo"]["original_type"]["type_map"] == expected_map
34 changes: 0 additions & 34 deletions howso/utilities/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@


__all__ = [
"cast_primitive_from_feature_type",
"convert_primitive_to_feature_type",
"FeatureSerializer",
"FeatureType",
"deserialize_cases",
Expand Down Expand Up @@ -67,38 +65,6 @@ def __str__(self):
"""Return a string representation."""
return str(self.value)


def cast_primitive_from_feature_type(data: int | float | str | bool, new_type: str):
    """
    Cast a primitive value to the provided FeatureType value if it does not match.

    Best-effort: if the cast fails or `new_type` is unrecognized, the value is
    returned unchanged rather than raising.
    """
    try:
        if new_type == FeatureType.STRING.value and not isinstance(data, str):
            return str(data)
        elif new_type == FeatureType.BOOLEAN.value and not isinstance(data, bool):
            return bool(data)
        elif new_type == FeatureType.INTEGER.value and not isinstance(data, int):
            return int(data)
        elif new_type in (FeatureType.FLOAT.value, FeatureType.NUMERIC.value) and not isinstance(data, float):
            # `convert_primitive_to_feature_type` maps floats to NUMERIC, so type maps
            # built there carry NUMERIC rather than FLOAT; accept both spellings.
            return float(data)
    except Exception:  # noqa: Intentionally broad
        # This is a QoL operation and it should not stop execution if there is a problem
        pass
    return data


def convert_primitive_to_feature_type(value: t.Any) -> str:
    """Convert a primitive value's data type to FeatureType. Returns 'object' if not primitive."""
    # bool is checked before int deliberately: bool is a subclass of int.
    if isinstance(value, str):
        return FeatureType.STRING.value
    elif isinstance(value, bool):
        return FeatureType.BOOLEAN.value
    elif isinstance(value, int):
        return FeatureType.INTEGER.value
    elif isinstance(value, float):
        return FeatureType.NUMERIC.value
    # A non-primitive type. Return the `.value` string (previously the enum member
    # itself was returned, inconsistent with every other branch and with the
    # docstring, breaking string comparisons in callers).
    return FeatureType.UNKNOWN.value


class FeatureSerializer:
"""Adapter for serialization and deserialization of feature data."""

Expand Down
16 changes: 1 addition & 15 deletions howso/utilities/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,18 +808,7 @@ def stringify_json(cases: list[list[t.Any]], features: Iterable[str], feature_at
case_group[idx] = json.dumps(case_group[idx])


def _convert_json_subtypes(data: t.Any, type_map: dict[str, t.Any] | t.Any):
    """Recursively convert primitive types according to the type map for an arbitrary Python data structure."""
    # Avoid circular import
    from .features import cast_primitive_from_feature_type
    if isinstance(data, list):
        # For a list, the map entry is a single element type shared by every item.
        return [cast_primitive_from_feature_type(item, type_map) for item in data]
    if isinstance(data, Mapping):
        # Recurse per key; keys absent from the map fall back to "object" (no cast).
        return {
            name: _convert_json_subtypes(data[name], type_map.get(name, "object"))
            for name in data.keys()
        }
    # Leaf value: cast directly against its mapped type.
    return cast_primitive_from_feature_type(data, type_map)


def destringify_json(cases: pd.Series, feature_attributes: Mapping) -> None:
def destringify_json(cases: pd.Series, feature_attributes: Mapping) -> pd.Series:
"""
Ensures that any JSON features have their cases destringified.

Expand All @@ -833,9 +822,6 @@ def destringify_json(cases: pd.Series, feature_attributes: Mapping) -> None:
destringified_cases = []
for case_to_destringify in cases:
formatted_case = json.loads(case_to_destringify)
type_map = feature_attributes.get("original_type", {}).get("type_map")
if type_map:
formatted_case = _convert_json_subtypes(formatted_case, type_map)
destringified_cases.append(formatted_case)
return pd.Series(destringified_cases)

Expand Down
Loading