Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
from sentence_transformers import SentenceTransformer, util
from sklearn.preprocessing import QuantileTransformer

from ...artifacts.analyzers.field_features import describe_field
from ...config.evaluate import QUASI_IDENTIFIER_COUNT
from ...config.parameters import SafeSynthesizerParameters
from ...observability import get_logger
Expand All @@ -30,6 +29,7 @@
from ..data_model.evaluation_score import EvaluationScore, PrivacyGrade
from ..nearest_neighbors import NearestNeighborSearch
from . import multi_modal_figures as figures
from .privacy_metric_utils import divide_tabular_text, embed_text, find_text_fields

logger = get_logger(__name__)

Expand Down Expand Up @@ -183,42 +183,6 @@ def _is_really_categorical(column: str) -> bool:

return False

@staticmethod
def _divide_tabular_text(df: pd.DataFrame, text_fields: list) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Split a dataframe into tabular-only and text-only subsets."""
tabular_fields = []
for col in df.columns:
if col not in text_fields:
tabular_fields.append(col)
df_tabular = df.filter(tabular_fields)
df_text = df.filter(text_fields)

return (df_tabular, df_text)

@staticmethod
def _embed_text(df: pd.DataFrame, embedder: SentenceTransformer) -> pd.DataFrame:
"""Embed each text column and average into a single embedding per row."""
embeddings = {}
for col in df.columns:
data = df[col].to_list()
data = [str(r) for r in data]
embeddings[col] = embedder.encode(data, show_progress_bar=False, convert_to_tensor=True)

avg_embeddings = []
for i in range(len(df)):
# TODO: Is this average what we want? When there are more than 2 columns, we will
# overweight later columns relative to earlier columns.
norm = embeddings[df.columns[0]][i]
for j in range(1, len(df.columns)):
field = df.columns[j]
norm = torch.mean(torch.stack([norm, embeddings[field][i]]), dim=0)

avg_embeddings.append(norm)

df_embeddings = pd.DataFrame({"embedding": list(avg_embeddings)})

return df_embeddings

@staticmethod
def _parse_dates(value: str | int | float, scalar_type: str | None = None) -> list[tuple[str, datetime]] | None:
if scalar_type == "number" and isinstance(value, str):
Expand Down Expand Up @@ -253,8 +217,8 @@ def _get_synth_nn(

# First divide out text and non-text
if len(text_columns) > 0:
df_train_use, df_train_text = AttributeInferenceProtection._divide_tabular_text(df_train_use, text_columns)
df_synth_use, df_synth_text = AttributeInferenceProtection._divide_tabular_text(df_synth_use, text_columns)
df_train_use, df_train_text = divide_tabular_text(df_train_use, text_columns)
df_synth_use, df_synth_text = divide_tabular_text(df_synth_use, text_columns)

# Normalize the tabular data if there is any
tabular_columns = numeric_columns + nominal_columns
Expand Down Expand Up @@ -283,8 +247,8 @@ def _get_synth_nn(
if len(tabular_columns) == 0:
# Create embeddings for text fields
assert embedder is not None
df_train_embeddings = AttributeInferenceProtection._embed_text(df_train_text, embedder)
df_synth_embeddings = AttributeInferenceProtection._embed_text(df_synth_text, embedder)
df_train_embeddings = embed_text(df_train_text, embedder)
df_synth_embeddings = embed_text(df_synth_text, embedder)
hits = util.semantic_search(
torch.stack(df_train_embeddings["embedding"].tolist()),
torch.stack(df_synth_embeddings["embedding"].tolist()),
Expand All @@ -305,8 +269,8 @@ def _get_synth_nn(
# Get the text embeddings and then the 1000 NN based on just the text

assert embedder is not None
df_train_embeddings = AttributeInferenceProtection._embed_text(df_train_text, embedder)
df_synth_embeddings = AttributeInferenceProtection._embed_text(df_synth_text, embedder)
df_train_embeddings = embed_text(df_train_text, embedder)
df_synth_embeddings = embed_text(df_synth_text, embedder)
search_synth_k = min(1000, len(df_synth_embeddings))
hits = util.semantic_search(
torch.stack(df_train_embeddings["embedding"].tolist()),
Expand Down Expand Up @@ -419,12 +383,7 @@ def _aia(
numeric_columns = [column for column in df_train.columns if column not in nominal_columns]

# Now separate out the text columns from the nominal

text_columns = []
for col in nominal_columns:
result = describe_field(col, df_train[col])
if result.type.value == "text":
text_columns.append(col)
text_columns = find_text_fields(df_train[nominal_columns])
nominal_columns = [x for x in nominal_columns if x not in text_columns]

# If there are text columns, create an embedder
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import QuantileTransformer

from ...artifacts.analyzers.field_features import describe_field
from ...config.evaluate import DEFAULT_RECORD_COUNT
from ...config.parameters import SafeSynthesizerParameters
from ...evaluation.components.component import Component
Expand All @@ -25,6 +24,7 @@
from ...evaluation.nearest_neighbors import NearestNeighborSearch
from ...observability import get_logger
from . import multi_modal_figures as figures
from .privacy_metric_utils import divide_tabular_text, embed_text, find_text_fields

logger = get_logger(__name__)

Expand Down Expand Up @@ -416,54 +416,6 @@ def _compute_mia(
fp_cnts,
)

@staticmethod
def find_text_fields(df: pd.DataFrame) -> list[str]:
    """Return the names of columns whose content is classified as free text.

    Every column is run through ``describe_field``; those whose detected
    type is ``"text"`` are collected, in original column order.
    """
    return [col for col in df.columns if describe_field(col, df[col]).type.value == "text"]

@staticmethod
def embed_text(df: pd.DataFrame) -> pd.DataFrame:
    """Embed each text column and average into a single embedding per row.

    Args:
        df: DataFrame whose columns are all free text.

    Returns:
        Single-column DataFrame ``"embedding"`` holding one 1-D tensor per
        input row.
    """
    # NOTE(review): the model is loaded on every call; hoist construction to
    # the caller if this becomes a hot path.
    embedder = SentenceTransformer("distiluse-base-multilingual-cased-v2")

    per_column = []
    for col in df.columns:
        data = [str(r) for r in df[col].to_list()]
        per_column.append(embedder.encode(data, show_progress_bar=False, convert_to_tensor=True))

    # Stack to (n_cols, n_rows, embed_dim) and average over the column axis
    # in one reduction so every column has equal weight. The previous
    # pairwise running mean overweighted later columns whenever there were
    # 3+ columns (the issue flagged by the old TODO).
    avg_embeddings = torch.mean(torch.stack(per_column, dim=0), dim=0)

    return pd.DataFrame({"embedding": list(avg_embeddings)})

@staticmethod
def divide_tabular_text(df: pd.DataFrame, text_fields: list) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Split a dataframe into tabular-only and text-only subsets."""
tabular_fields = []
for col in df.columns:
if col not in text_fields:
tabular_fields.append(col)
df_tabular = df.filter(tabular_fields)
df_text = df.filter(text_fields)

return (df_tabular, df_text)

@staticmethod
def mia(
df_train: pd.DataFrame,
Expand Down Expand Up @@ -506,7 +458,7 @@ def mia(
df_test = df_test.filter([column_name])
df_synth = df_synth.filter([column_name])

text_fields = MembershipInferenceProtection.find_text_fields(df_train)
text_fields = find_text_fields(df_train)
text_cnt = len(text_fields)
tabular_cnt = len(df_train.columns) - text_cnt

Expand All @@ -525,13 +477,11 @@ def mia(
df_train_use = df_train.sample(n=train_size_needed, random_state=1)

# Divide the dataframes into text and tabular
text_fields = MembershipInferenceProtection.find_text_fields(df_train_use)
text_fields = find_text_fields(df_train_use)
if len(text_fields) > 0:
df_train_use, df_train_text = MembershipInferenceProtection.divide_tabular_text(
df_train_use, text_fields
)
df_test, df_test_text = MembershipInferenceProtection.divide_tabular_text(df_test, text_fields)
df_synth, df_synth_text = MembershipInferenceProtection.divide_tabular_text(df_synth, text_fields)
df_train_use, df_train_text = divide_tabular_text(df_train_use, text_fields)
df_test, df_test_text = divide_tabular_text(df_test, text_fields)
df_synth, df_synth_text = divide_tabular_text(df_synth, text_fields)

# Normalize the tabular data (adjusted for multimodal)
if tabular_cnt > 0:
Expand All @@ -555,9 +505,10 @@ def mia(
# Create embeddings for text fields and combine the normalized tabular and the
# new text embeddings into one dataframe.
if len(text_fields) > 0:
df_train_embeddings = MembershipInferenceProtection.embed_text(df_train_text)
df_test_embeddings = MembershipInferenceProtection.embed_text(df_test_text)
df_synth_embeddings = MembershipInferenceProtection.embed_text(df_synth_text)
embedder = SentenceTransformer("distiluse-base-multilingual-cased-v2")
df_train_embeddings = embed_text(df_train_text, embedder)
df_test_embeddings = embed_text(df_test_text, embedder)
df_synth_embeddings = embed_text(df_synth_text, embedder)
df_train_norm = pd.concat([df_train_norm, df_train_embeddings], axis=1)
df_test_norm = pd.concat([df_test_norm, df_test_embeddings], axis=1)
df_synth_norm = pd.concat([df_synth_norm, df_synth_embeddings], axis=1)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from typing import TYPE_CHECKING

import pandas as pd
import torch

if TYPE_CHECKING:
from sentence_transformers import SentenceTransformer

from ...artifacts.analyzers.field_features import describe_field


def find_text_fields(df: pd.DataFrame) -> list[str]:
    """Identify columns in ``df`` whose content is free-form text.

    Each column is run through ``describe_field``; columns whose detected
    type is ``"text"`` are returned, in original column order.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        Column names classified as free-form text.
    """
    return [col for col in df.columns if describe_field(col, df[col]).type.value == "text"]


def divide_tabular_text(df: pd.DataFrame, text_fields: list[str]) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``df`` into a tabular-only and a text-only DataFrame.

    Columns present in ``text_fields`` populate the text DataFrame; the
    remaining columns populate the tabular DataFrame.

    Args:
        df: Source DataFrame to split.
        text_fields: Column names to treat as text.

    Returns:
        A ``(tabular_df, text_df)`` tuple where ``tabular_df`` contains only
        the non-text columns and ``text_df`` contains only the text columns.
    """
    non_text = []
    for column in df.columns:
        if column not in text_fields:
            non_text.append(column)
    tabular_part = df.filter(non_text)
    text_part = df.filter(text_fields)
    return tabular_part, text_part


def embed_text(df: pd.DataFrame, embedder: SentenceTransformer) -> pd.DataFrame:
    """Embed every text column in ``df`` and return one averaged embedding per row.

    For each column the ``embedder`` yields a ``(n_rows, embed_dim)`` matrix.
    The per-column matrices are stacked and reduced with a single mean over
    the column axis, so each column contributes equally to the result.

    Args:
        df: DataFrame whose columns are all text to be embedded.
        embedder: Sentence-transformer model used to produce embeddings.

    Returns:
        Single-column DataFrame with column ``"embedding"`` whose values are
        1-D tensors of shape ``(embed_dim,)``.
    """
    column_matrices = []
    for field in df.columns:
        texts = [str(value) for value in df[field].to_list()]
        encoded = embedder.encode(texts, show_progress_bar=False, convert_to_tensor=True)
        column_matrices.append(torch.as_tensor(encoded))

    # (n_cols, n_rows, embed_dim) -> mean over axis 0 -> (n_rows, embed_dim)
    row_embeddings = torch.stack(column_matrices).mean(dim=0)

    return pd.DataFrame({"embedding": [row for row in row_embeddings]})
60 changes: 60 additions & 0 deletions tests/evaluation/components/test_privacy_metric_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from unittest.mock import MagicMock

import numpy as np
import pandas as pd
import pytest
import torch

from nemo_safe_synthesizer.evaluation.components.privacy_metric_utils import (
divide_tabular_text,
embed_text,
)


@pytest.fixture
def mock_embedder():
    """A mock SentenceTransformer whose .encode() returns deterministic tensors."""

    def _encode(data, **kwargs):
        # Deterministic, distinct embedding per string; the string length
        # acts as a simple reproducible seed.
        rows = [[float(len(s)), float(len(s)) * 2, float(len(s)) * 3] for s in data]
        return torch.tensor(rows, dtype=torch.float32)

    embedder = MagicMock()
    embedder.encode = MagicMock(side_effect=_encode)
    return embedder


def test_divide_tabular_text(train_df):
    text_fields = ["text", "other"]
    tabular, text = divide_tabular_text(train_df, text_fields)

    # The text columns must land entirely in the text frame...
    assert set(text.columns) == {"other", "text"}
    # ...and be absent from the tabular frame.
    assert "text" not in tabular.columns
    assert "other" not in tabular.columns
    # Row counts are preserved on both sides of the split.
    assert len(tabular) == len(train_df)
    assert len(text) == len(train_df)


def test_embed_text(mock_embedder):
    """Regression test: the old pairwise running average over-weighted later
    columns once there were 3+ columns. The corrected stack/mean reduction
    must weight every column equally.
    """
    df = pd.DataFrame(
        {
            "a": ["x"],  # len 1 -> embedding [1, 2, 3]
            "b": ["xx"],  # len 2 -> embedding [2, 4, 6]
            "c": ["xxxx"],  # len 4 -> embedding [4, 8, 12]
        }
    )

    result = embed_text(df, mock_embedder)
    row = result["embedding"].iloc[0]

    assert isinstance(row, torch.Tensor)
    # Column-wise mean: [(1+2+4)/3, (2+4+8)/3, (3+6+12)/3] = [7/3, 14/3, 7]
    np.testing.assert_array_almost_equal(row.numpy(), np.array([7 / 3, 14 / 3, 7.0]))
Loading