11 changes: 8 additions & 3 deletions howso/utilities/feature_attributes/base.py
@@ -240,7 +240,7 @@ def get_names(self, *, types: t.Optional[str | Container] = None,
         ]

     def _validate_bounds(self, data: pd.DataFrame, feature: str,  # noqa: C901
-                         attributes: dict) -> list[str]:
+                         attributes: FeatureAttributes) -> list[str]:
         """Validate the feature bounds of the provided DataFrame."""
         # Import here to avoid circular import
         from howso.utilities import date_to_epoch
@@ -388,7 +388,7 @@ def _validate_dtype(self, data: pd.DataFrame, feature: str,  # noqa: C901
         return errors

     @staticmethod
-    def _allows_null(attributes: dict) -> bool:
+    def _allows_null(attributes: FeatureAttributes) -> bool:
         """Return whether the given attributes indicates the allowance of null values."""
         return 'bounds' in attributes and attributes['bounds'].get('allow_null', False)

@@ -397,7 +397,7 @@ def _validate_df(self, data: pd.DataFrame, coerce: bool = False,  # noqa: C901
                      allow_missing_features: bool = False, localize_datetimes=True, nullable_int_dtype='Int64'):
         errors = []
         coerced_df = data.copy(deep=True)
-        features = self[table_name] if table_name else self
+        features = t.cast(dict[str, "FeatureAttributes"], self[table_name] if table_name else self)

        for feature, attributes in features.items():
            if feature not in data.columns:
@@ -472,6 +472,11 @@ def _validate_df(self, data: pd.DataFrame, coerce: bool = False,  # noqa: C901
                 errors.extend(self._validate_dtype(data, feature, 'datetime64',
                                                    coerced_df, coerce=coerce,
                                                    localize_datetimes=localize_datetimes))
+
+            # Check semi-structured type (object)
+            elif attributes.get("data_type") in {"json", "yaml", "amalgam", "string", "string_mixable"}:
+                errors.extend(self._validate_dtype(data, feature, "object", coerced_df, coerce=coerce))
+
             # Check type (float)
             elif attributes.get('decimal_places', -1) > 0:
                 errors.extend(self._validate_dtype(data, feature, 'float64',
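The new branch means `validate` now expects semi-structured features (JSON, YAML, Amalgam, and mixable strings) to be backed by pandas `object` columns. A minimal sketch of how this surfaces through the public API, assuming `infer_feature_attributes` is importable from `howso.utilities` (as in the new test) and using a hypothetical DataFrame:

```python
import pandas as pd

from howso.utilities import infer_feature_attributes

# Hypothetical frame: "doc" holds JSON strings, which land in an
# object-dtype column.
df = pd.DataFrame({
    "id": [1, 2, 3],
    "doc": ['{"a": 1}', None, '{"a": 2}'],
})

attrs = infer_feature_attributes(df)
# "doc" should be inferred as continuous with data_type "json";
# validate() now checks that such features have the "object" dtype.
attrs.validate(df, raise_errors=True)
```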
@@ -656,6 +656,46 @@ def test_validate_df_multiple_dtypes(ftype, data_type, decimal_places, bounds, d
     assert pd.api.types.is_datetime64_any_dtype(coerced_df['DATE'].dtype)


+@pytest.mark.parametrize(
+    ("data", "expected_data_type", "expected_orig_type"),
+    (
+        ({"a": 1}, "json", "container"),
+        ([1, 2, 3], "json", "container"),
+        ('{"a": 1}', "json", "string"),
+        ('["a", "b", "c"]', "json", "string"),
+        ("doc:\n abc: 1", "yaml", "string"),
+        ("(list 1 2 3)", "amalgam", "string"),
+        ('(assoc "a" 1 "b" 2)', "amalgam", "string"),
+    ),
+)
+def test_validate_df_semi_structured(data, expected_data_type: str, expected_orig_type: str):
+    """Test validate_df handles semi-structured features correctly."""
+    df = pd.DataFrame({
+        "id": [1, 2, 3],
+        "doc": [data, None, data],
+    })
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        feature_attributes = infer_feature_attributes(df)
+
+    attrs = feature_attributes["doc"]
+
+    assert "original_type" in attrs
+    assert attrs["original_type"]["data_type"] == expected_orig_type
+
+    if expected_data_type == "amalgam":
+        # Amalgam is not automatically inferred; set it manually.
+        attrs["type"] = "continuous"
+        attrs["data_type"] = "amalgam"
+    else:
+        assert attrs["type"] == "continuous"
+        assert attrs.get("data_type") == expected_data_type
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        feature_attributes.validate(df, raise_errors=True)
+
+
 @pytest.mark.parametrize("extra_attrs, success", (
     ({}, False),
     ({'auto_derive_on_train': False}, False),
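As the test's amalgam cases show, Amalgam is not inferred automatically, so callers must set the feature's `type` and `data_type` by hand before validating. A small sketch of that override, mirroring the test but with a hypothetical column name:

```python
import pandas as pd

from howso.utilities import infer_feature_attributes

# Hypothetical frame: "code" holds Amalgam expressions as strings.
df = pd.DataFrame({"code": ["(list 1 2 3)", None, "(list 1 2 3)"]})

attrs = infer_feature_attributes(df)
# Mark the feature as Amalgam manually before validating.
attrs["code"]["type"] = "continuous"
attrs["code"]["data_type"] = "amalgam"
attrs.validate(df, raise_errors=True)
```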
2 changes: 1 addition & 1 deletion howso/utilities/features.py
@@ -269,7 +269,7 @@ def format_dataframe(cls, df: pd.DataFrame, features: Mapping,
             new_values = cls.format_column(df[col], attributes, tokenizer=tokenizer)
             df = df.drop(columns=col)
             df[col] = new_values

+        return df[original_feature_order]

     @classmethod
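For context on the one-line fix: formatted results are re-attached by dropping and re-adding each column, which pushes that column to the end of the frame, so returning `df[original_feature_order]` restores the caller's column order. A pandas-only sketch of the effect (the frame and names here are illustrative):

```python
import pandas as pd

df = pd.DataFrame({"a": [1], "b": [2], "c": [3]})
original_feature_order = list(df.columns)

col = "b"
new_values = df[col].astype(str)  # stand-in for cls.format_column(...)
df = df.drop(columns=col)
df[col] = new_values  # "b" is re-added as the last column

assert list(df.columns) == ["a", "c", "b"]
# Reindexing restores the original order, as the fix now does.
assert list(df[original_feature_order].columns) == ["a", "b", "c"]
```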