From 3e1e37039a658ef2d3cee9e7aba95a80fb71fc5c Mon Sep 17 00:00:00 2001
From: apbassett <43486400+apbassett@users.noreply.github.com>
Date: Wed, 4 Feb 2026 16:07:06 -0500
Subject: [PATCH 1/4] Removes unnecessary logic for JSON feature primitive type preservation

---
 howso/client/tests/test_client.py           | 86 -------------------
 howso/utilities/feature_attributes/base.py  | 71 +--------------
 .../tests/test_infer_feature_attributes.py  | 41 ---------
 .../test_infer_feature_attributes_adc.py    | 20 -----
 howso/utilities/features.py                 | 34 --------
 howso/utilities/utilities.py                | 14 ---
 6 files changed, 1 insertion(+), 265 deletions(-)

diff --git a/howso/client/tests/test_client.py b/howso/client/tests/test_client.py
index b3924a65..4e285b8b 100644
--- a/howso/client/tests/test_client.py
+++ b/howso/client/tests/test_client.py
@@ -1440,89 +1440,3 @@ def test_tokenizable_strings_reaction(self):
         )
         assert reaction["action"].iloc[0]["review"] == df.iloc[0]["review"]
         assert reaction["details"]["influential_cases"][0].iloc[0]["review"] == df.iloc[0]["review"]
-
-    def test_json_feature_types(self):
-        """Test that JSON features stored as Python data structures have their primitive types maintained."""
-        tests = [
-            ({"a": "str", "b": "1", "c": "2.7", "d": True, "e": {"a1": "str", "b1": {"c1": [1, 2, 3]}}},
-             {"a": "string", "b": "integer", "c": "numeric", "d": "boolean", "e": {"a1": "string", "b1": {"c1": "integer"}}}),
-            ({"a": "str", "b": "1", "c": "3.3", "d": False, "e": {"a1": "str2", "b1": {"c1": [1, 2, 3, 4, 5, 6, 7]}}},
-             {"a": "string", "b": "integer", "c": "numeric", "d": "boolean", "e": {"a1": "string", "b1": {"c1": "integer"}}}),
-            ({"a": 3, "b": 1.5, "c": 2.7, "d": True, "e": {"a1": 5, "b1": {"c1": [1, 2, 3]}}},
-             {"a": "integer", "b": "numeric", "c": "numeric", "d": "boolean", "e": {"a1": "integer", "b1": {"c1": "integer"}}}),
-            ({"a": 3, "b": 1, "c": 2.7, "d": True, "e": {"a1": "str", "b1": {"c1": [1, True, "foo"]}}},
-             {"a": "integer", "b": "integer", "c": "numeric", "d": "boolean", "e": {"a1": "string", "b1": {"c1": "object"}}}),
-        ]
-        data_uniform_types = pd.DataFrame({"foo": [tests[0][0], tests[1][0]], "bar": ["a", "b"]})
-        data_uniform_except_list = pd.DataFrame({"foo": [tests[2][0], tests[3][0]], "bar": ["a", "b"]})
-        data_non_uniform = pd.DataFrame({"foo": [tests[0][0], tests[1][0], tests[2][0]], "bar": ["a", "b", "c"]})
-
-        # Types should be preserved with no warnings (dict)
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            feature_attributes = infer_feature_attributes(data_uniform_types)
-        assert feature_attributes["foo"]["data_type"] == "json"
-        assert feature_attributes["foo"]["type"] == "continuous"
-        assert feature_attributes["foo"]["original_type"]["data_type"] == "container"
-        client = HowsoClient()
-        t = Trainee()
-        client.set_feature_attributes(t.id, feature_attributes)
-        client.train(t.id, data_uniform_types)
-        reaction = client.react(
-            t.id,
-            contexts=[["a"]],
-            context_features=['bar'],
-            action_features=['foo'],
-            details={"influential_cases": True},
-            desired_conviction=5,
-        )
-        # Cannot compare
-        assert reaction["action"].iloc[0]["foo"] == data_uniform_types.iloc[0]["foo"]
-        assert reaction["details"]["influential_cases"][0].iloc[0]["foo"] == tests[0][0]
-
-        # All types except for the nested list should be preserved and a warning issued
-        with pytest.warns(match="contains a key 'c1' whose value is a list of mixed types"):
-            feature_attributes = infer_feature_attributes(data_uniform_except_list)
-        assert feature_attributes["foo"]["data_type"] == "json"
-        assert feature_attributes["foo"]["type"] == "continuous"
-        assert feature_attributes["foo"]["original_type"]["data_type"] == "container"
-        client = HowsoClient()
-        t = Trainee()
-        client.set_feature_attributes(t.id, feature_attributes)
-        client.train(t.id, data_uniform_except_list)
-        reaction = client.react(
-            t.id,
-            contexts=[["b"]],
-            context_features=['bar'],
-            action_features=['foo'],
-            details={"influential_cases": True},
-            desired_conviction=5,
-        )
-        expected_case = deepcopy(tests[3][0])
-        # The list under this key has mixed types so it will come back as-is when deserialized
-        expected_case["e"]["b1"]["c1"] = json.loads(json.dumps(expected_case["e"]["b1"]["c1"]))
-        assert reaction["action"].iloc[0]["foo"] == expected_case
-        assert reaction["details"]["influential_cases"][0].iloc[0]["foo"] == expected_case
-
-        # Types cannot be preserved, warning issued
-        with pytest.warns(match="inconsistent types and/or keys across cases."):
-            feature_attributes = infer_feature_attributes(data_non_uniform)
-        assert feature_attributes["foo"]["data_type"] == "json"
-        assert feature_attributes["foo"]["type"] == "continuous"
-        assert feature_attributes["foo"]["original_type"]["data_type"] == "container"
-        client = HowsoClient()
-        t = Trainee()
-        client.set_feature_attributes(t.id, feature_attributes)
-        client.train(t.id, data_non_uniform)
-        reaction = client.react(
-            t.id,
-            contexts=[["a"]],
-            context_features=['bar'],
-            action_features=['foo'],
-            details={"influential_cases": True},
-            desired_conviction=5,
-        )
-        # Cases of "foo" have mixed types so they will come back as-is when deserialized
-        expected_case = json.loads(json.dumps(tests[0][0]))
-        assert reaction["action"].iloc[0]["foo"] == expected_case
-        assert reaction["details"]["influential_cases"][0].iloc[0]["foo"] == expected_case
diff --git a/howso/utilities/feature_attributes/base.py b/howso/utilities/feature_attributes/base.py
index 2f3f2df4..8cfbcc4a 100644
--- a/howso/utilities/feature_attributes/base.py
+++ b/howso/utilities/feature_attributes/base.py
@@ -1148,74 +1148,6 @@ def _infer_boolean_attributes(self, feature_name: str) -> dict:
     def _infer_integer_attributes(self, feature_name: str) -> dict:
         """Get inferred attributes for the given integer column."""

-    def _get_primitive_type_schema(self, feature_name: str) -> dict:  # noqa: C901
-        """Get a map of keys to types for a JSON feature stored as a Python dict or list."""
-        # If there is no data, return False
-        first_non_none = self._get_first_non_null(feature_name)
-        if first_non_none is None:
-            return False
-
-        # Keep track of whether there are any non-primitive types in the data
-        has_complex_type = False
-
-        def _recursive_get_types(data: t.Any, key: str = None) -> dict:
-            """Recursively determine primitive types for an arbitrary Python data structure."""
-            nonlocal has_complex_type
-            # Value is a list
-            if isinstance(data, MutableSequence):
-                list_type = FeatureType.UNKNOWN.value
-                # Iterate through 10 random values of a list of len>10, or through the entire list if len<=10.
-                iterations = min(len(data), 10)
-                if len(data) > 10:
-                    # Shuffle data so that the first 10 indices are randomized
-                    data = list(pd.Series(data).sample(frac=1))
-                for idx in range(iterations):
-                    rand_val = data[idx]
-                    if rand_val is None:
-                        # We can still retain primitive types with NoneTypes present in the data structure
-                        continue
-                    elif list_type == FeatureType.UNKNOWN.value:
-                        list_type = convert_primitive_to_feature_type(rand_val)
-                    elif list_type != convert_primitive_to_feature_type(rand_val):
-                        warnings.warn(f"JSON feature '{feature_name}' contains a key '{key}' whose value is a list of "
-                                      "mixed types. Original types under this key will not be preserved.")
-                        return FeatureType.UNKNOWN.value
-                    elif list_type == FeatureType.UNKNOWN.value:
-                        # A non-primitive type was found in the data
-                        has_complex_type = True
-                return list_type
-            # Base case: not a list or dict
-            elif not isinstance(data, Mapping):
-                return convert_primitive_to_feature_type(data)
-            # Value is a dict
-            return {key: _recursive_get_types(data[key], key=key) for key in data.keys()}
-
-        # Sample up to 10 random values
-        # OR every value if < 10
-        type_maps = []
-        if (count := self._get_unique_count(feature_name)) < 10:
-            for sample in self._get_unique_values(feature_name):
-                type_maps.append(_recursive_get_types(sample))
-        else:
-            count = 10
-            for idx in range(10):
-                sample = self._get_random_value(feature_name, no_nulls=True)
-                type_maps.append(_recursive_get_types(sample))
-
-        # Issue a warning if keys or types are not consistent across cases
-        for idx in range(1, count):
-            if type_maps[0] != type_maps[idx]:
-                warnings.warn(f"JSON feature '{feature_name} has inconsistent types and/or keys across cases. "
-                              "Original types will not be preserved.")
-                return
-
-        # Issue a warning if any non-primitive types were found
-        if has_complex_type:
-            warnings.warn(f"JSON feature '{feature_name}' contains at least one instance of a non-primitive type. "
-                          "Only uniform, primitive types will be preserved in semistructured features.")
-
-        return type_maps[0]
-
     def _infer_string_attributes(self, feature_name: str) -> dict:
         """Get inferred attributes for the given string column."""
         # Column has arbitrary string values, first check if they
@@ -1239,11 +1171,10 @@ def _infer_string_attributes(self, feature_name: str) -> dict:
         elif self._is_json_feature(feature_name):
             first_non_null = self._get_first_non_null(feature_name)
             if isinstance(first_non_null, Mapping) or isinstance(first_non_null, MutableSequence):
-                type_map = self._get_primitive_type_schema(feature_name) or {}
                 return {
                     "type": "continuous",
                     "data_type": "json",
-                    "original_type": {"type_map": type_map, "data_type": FeatureType.CONTAINER.value},
+                    "original_type": {"data_type": FeatureType.CONTAINER.value},
                 }
         return {
             "type": "continuous",
diff --git a/howso/utilities/feature_attributes/tests/test_infer_feature_attributes.py b/howso/utilities/feature_attributes/tests/test_infer_feature_attributes.py
index 14873c73..68ce8d9b 100644
--- a/howso/utilities/feature_attributes/tests/test_infer_feature_attributes.py
+++ b/howso/utilities/feature_attributes/tests/test_infer_feature_attributes.py
@@ -1084,44 +1084,3 @@ def test_infer_tokenizable_string():
     # Product should still be a nominal string
     assert feature_attributes["product"]["data_type"] == "string"
     assert feature_attributes["product"]["type"] == "nominal"
-
-
-def test_json_features_types():
-    """Test that IFA includes type information for JSON features that are Python dicts/lists."""
-    tests = [
-        ({"a": "str", "b": 1, "c": 2.7, "d": True, "e": {"a1": "str", "b1": {"c1": [1, 2, 3]}}},
-         {"a": "string", "b": "integer", "c": "numeric", "d": "boolean", "e": {"a1": "string", "b1": {"c1": "integer"}}}),
-        ({"a": "str", "b": 9, "c": 3.3, "d": False, "e": {"a1": "str2", "b1": {"c1": [1, 2, 3, 4, 5, 6, 7]}}},
-         {"a": "string", "b": "integer", "c": "numeric", "d": "boolean", "e": {"a1": "string", "b1": {"c1": "integer"}}}),
-        ({"a": 3, "b": 1.5, "c": 2.7, "d": True, "e": {"a1": 5, "b1": {"c1": [1, 2, 3]}}},
-         {"a": "integer", "b": "numeric", "c": "numeric", "d": "boolean", "e": {"a1": "integer", "b1": {"c1": "integer"}}}),
-        ({"a": 3, "b": 1, "c": 2.7, "d": True, "e": {"a1": "str", "b1": {"c1": [1, True, "foo"]}}},
-         {"a": "integer", "b": "integer", "c": "numeric", "d": "boolean", "e": {"a1": "string", "b1": {"c1": "object"}}}),
-        ([1.1, 2.2, 3.3, 4.4], "numeric"),
-    ]
-    # First test that the type maps are correct
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        for test in tests:
-            df = pd.DataFrame({"foo": [test[0]]})
-            attributes = infer_feature_attributes(df)
-            assert attributes["foo"]["data_type"] == "json"
-            assert attributes["foo"]["original_type"]["data_type"] == FeatureType.CONTAINER.value
-            assert attributes["foo"]["original_type"]["type_map"] == test[1]
-    with warnings.catch_warnings():
-        warnings.simplefilter("error")
-        # Ensure no warnings also with multiple cases of the same schema
-        df = pd.DataFrame({"foo": [tests[0][0], tests[1][0]]})
-        attributes = infer_feature_attributes(df)
-        assert attributes["foo"]["data_type"] == "json"
-        assert attributes["foo"]["original_type"]["data_type"] == FeatureType.CONTAINER.value
-        assert attributes["foo"]["original_type"]["type_map"] == tests[0][1]
-
-    # Test applicable warnings
-    with pytest.warns(match="contains a key 'c1' whose value is a list of mixed types"):
-        df = pd.DataFrame({"foo": [tests[3][0]]})
-        infer_feature_attributes(df)
-    with pytest.warns(match="inconsistent types and/or keys across cases."):
-        df = pd.DataFrame({"foo": [tests[0][0], tests[2][0], tests[1][0]]})
-        attributes = infer_feature_attributes(df)
-        assert not attributes["foo"]["original_type"]["type_map"]
diff --git a/howso/utilities/feature_attributes/tests/test_infer_feature_attributes_adc.py b/howso/utilities/feature_attributes/tests/test_infer_feature_attributes_adc.py
index d9b57e22..670071fa 100644
--- a/howso/utilities/feature_attributes/tests/test_infer_feature_attributes_adc.py
+++ b/howso/utilities/feature_attributes/tests/test_infer_feature_attributes_adc.py
@@ -602,23 +602,3 @@ def test_infer_tokenizable_string(adc):
     # Product should still be a nominal string
     assert feature_attributes["product"]["data_type"] == "string"
     assert feature_attributes["product"]["type"] == "nominal"
-
-
-@pytest.mark.parametrize('adc', [
-    # Only MongoDBData and DataFrameData support Python objects as data
-    ("MongoDBData", pd.DataFrame()),
-    ("DataFrameData", pd.DataFrame()),
-], indirect=True)
-def test_json_features_types(adc):
-    """Test that IFA includes type information for JSON features that are Python dicts/lists for applicable ADCs."""
-    test = (
-        {"a": "str", "b": 1, "c": 2.7, "d": True, "e": {"a1": "str", "b1": {"c1": [1, 2, 3]}}},
-        {"a": "string", "b": "integer", "c": "numeric", "d": "boolean", "e": {"a1": "string", "b1": {"c1": "integer"}}}
-    )
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        df = pd.DataFrame({"foo": [test[0]]})
-        convert_data(DataFrameData(df), adc)
-        attributes = infer_feature_attributes(adc)
-        assert attributes["foo"]["data_type"] == "json"
-        assert attributes["foo"]["original_type"]["type_map"] == test[1]
diff --git a/howso/utilities/features.py b/howso/utilities/features.py
index dc694b2c..69b6207c 100644
--- a/howso/utilities/features.py
+++ b/howso/utilities/features.py
@@ -38,8 +38,6 @@


 __all__ = [
-    "cast_primitive_from_feature_type",
-    "convert_primitive_to_feature_type",
     "FeatureSerializer",
     "FeatureType",
     "deserialize_cases",
@@ -67,38 +65,6 @@ def __str__(self):
         """Return a string representation."""
         return str(self.value)

-
-def cast_primitive_from_feature_type(data: int | float | str | bool, new_type: str):
-    """Cast a primitive value to the provided FeatureType value if it does not match."""
-    try:
-        if new_type == FeatureType.STRING.value and not isinstance(data, str):
-            return str(data)
-        elif new_type == FeatureType.BOOLEAN.value and not isinstance(data, bool):
-            return bool(data)
-        elif new_type == FeatureType.INTEGER.value and not isinstance(data, int):
-            return int(data)
-        elif new_type == FeatureType.FLOAT.value and not isinstance(data, float):
-            return float(data)
-    except Exception:  # noqa: Intentionally broad
-        # This is a QoL operation and it should not stop execution if there is a problem
-        pass
-    return data
-
-
-def convert_primitive_to_feature_type(value: t.Any):
-    """Convert a primitive value's data type to FeatureType. Returns 'object' if not primitive."""
-    if isinstance(value, str):
-        return FeatureType.STRING.value
-    elif isinstance(value, bool):
-        return FeatureType.BOOLEAN.value
-    elif isinstance(value, int):
-        return FeatureType.INTEGER.value
-    elif isinstance(value, float):
-        return FeatureType.NUMERIC.value
-    # A non-primitive type
-    return FeatureType.UNKNOWN
-
-
 class FeatureSerializer:
     """Adapter for serialization and deserialization of feature data."""

diff --git a/howso/utilities/utilities.py b/howso/utilities/utilities.py
index d2e83253..eaa1750b 100644
--- a/howso/utilities/utilities.py
+++ b/howso/utilities/utilities.py
@@ -808,17 +808,6 @@ def stringify_json(cases: list[list[t.Any]], features: Iterable[str], feature_at
             case_group[idx] = json.dumps(case_group[idx])


-def _convert_json_subtypes(data: t.Any, type_map: dict[str, t.Any] | t.Any):
-    """Recursively convert primitive types according to the type map for an arbitrary Python data structure."""
-    # Avoid circular import
-    from .features import cast_primitive_from_feature_type
-    if isinstance(data, list):
-        return [cast_primitive_from_feature_type(d, type_map) for d in data]
-    elif not isinstance(data, Mapping):
-        return cast_primitive_from_feature_type(data, type_map)
-    return {key: _convert_json_subtypes(data[key], type_map.get(key, "object")) for key in data.keys()}
-
-
 def destringify_json(cases: pd.Series, feature_attributes: Mapping) -> None:
     """
     Ensures that any JSON features have their cases destringified.
@@ -833,9 +822,6 @@ def destringify_json(cases: pd.Series, feature_attributes: Mapping) -> None:
     destringified_cases = []
     for case_to_destringify in cases:
         formatted_case = json.loads(case_to_destringify)
-        type_map = feature_attributes.get("original_type", {}).get("type_map")
-        if type_map:
-            formatted_case = _convert_json_subtypes(formatted_case, type_map)
         destringified_cases.append(formatted_case)
     return pd.Series(destringified_cases)


From c2af41ef0e3918b9a6ddfb67a54d1172073345be Mon Sep 17 00:00:00 2001
From: apbassett <43486400+apbassett@users.noreply.github.com>
Date: Wed, 4 Feb 2026 16:12:53 -0500
Subject: [PATCH 2/4] Remove nonexistent imports

---
 howso/utilities/__init__.py                | 4 ----
 howso/utilities/feature_attributes/base.py | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/howso/utilities/__init__.py b/howso/utilities/__init__.py
index eee26968..335b857e 100644
--- a/howso/utilities/__init__.py
+++ b/howso/utilities/__init__.py
@@ -6,8 +6,6 @@
     SingleTableFeatureAttributes,
 )
 from .features import (  # noqa: F401
-    cast_primitive_from_feature_type,
-    convert_primitive_to_feature_type,
     deserialize_cases,
     FeatureType,
     format_column,
@@ -65,8 +63,6 @@
     "align_data",
     "build_react_series_df",
     "check_feature_names",
-    "cast_primitive_from_feature_type",
-    "convert_primitive_to_feature_type",
     "format_confusion_matrix",
     "date_format_is_iso",
     "date_to_epoch",
diff --git a/howso/utilities/feature_attributes/base.py b/howso/utilities/feature_attributes/base.py
index 8cfbcc4a..9ea6c755 100644
--- a/howso/utilities/feature_attributes/base.py
+++ b/howso/utilities/feature_attributes/base.py
@@ -21,7 +21,7 @@
 import pandas as pd
 import yaml

-from howso.utilities.features import convert_primitive_to_feature_type, FeatureType
+from howso.utilities.features import FeatureType
 from howso.utilities.utilities import is_valid_datetime_format, time_to_seconds

 from ..utilities import determine_iso_format

From 2da8f8579d31e7da384474184a1a4f58385685cc Mon Sep 17 00:00:00 2001
From: apbassett <43486400+apbassett@users.noreply.github.com>
Date: Wed, 4 Feb 2026 16:25:46 -0500
Subject: [PATCH 3/4] Update return type

---
 howso/utilities/utilities.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/howso/utilities/utilities.py b/howso/utilities/utilities.py
index eaa1750b..d34c0ff8 100644
--- a/howso/utilities/utilities.py
+++ b/howso/utilities/utilities.py
@@ -808,7 +808,7 @@ def stringify_json(cases: list[list[t.Any]], features: Iterable[str], feature_at
             case_group[idx] = json.dumps(case_group[idx])


-def destringify_json(cases: pd.Series, feature_attributes: Mapping) -> None:
+def destringify_json(cases: pd.Series, feature_attributes: Mapping) -> None | pd.Series:
     """
     Ensures that any JSON features have their cases destringified.

From 44b0262f2f4c6f0c6f0bc507c248a77818712498 Mon Sep 17 00:00:00 2001
From: apbassett <43486400+apbassett@users.noreply.github.com>
Date: Wed, 4 Feb 2026 16:44:03 -0500
Subject: [PATCH 4/4] Another type hint fix

---
 howso/utilities/utilities.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/howso/utilities/utilities.py b/howso/utilities/utilities.py
index d34c0ff8..8cc35718 100644
--- a/howso/utilities/utilities.py
+++ b/howso/utilities/utilities.py
@@ -808,7 +808,7 @@ def stringify_json(cases: list[list[t.Any]], features: Iterable[str], feature_at
             case_group[idx] = json.dumps(case_group[idx])


-def destringify_json(cases: pd.Series, feature_attributes: Mapping) -> None | pd.Series:
+def destringify_json(cases: pd.Series, feature_attributes: Mapping) -> pd.Series:
     """
     Ensures that any JSON features have their cases destringified.
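Illustration only, not part of the patch series: a minimal sketch of how a dict-valued column is typed once this logic is removed. The import path and the sample DataFrame below are assumptions for the example; the asserted keys follow the updated return value in the _infer_string_attributes hunk of base.py, and any additional attributes infer_feature_attributes may attach (for example bounds) are not checked here.

    import pandas as pd

    from howso.utilities.feature_attributes import infer_feature_attributes

    # A column holding Python dicts is treated as a JSON feature.
    df = pd.DataFrame({
        "foo": [{"a": "str", "b": 1, "c": 2.7}, {"a": "str2", "b": 5, "c": 3.3}],
        "bar": ["x", "y"],
    })
    attrs = infer_feature_attributes(df)

    # Per the base.py hunk above, "original_type" now carries only the container
    # data type; no per-key "type_map" is emitted anymore.
    assert attrs["foo"]["type"] == "continuous"
    assert attrs["foo"]["data_type"] == "json"
    assert attrs["foo"]["original_type"]["data_type"] == "container"
    assert "type_map" not in attrs["foo"]["original_type"]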