Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
from sentence_transformers import SentenceTransformer, util
from sklearn.preprocessing import QuantileTransformer

from ...artifacts.analyzers.field_features import describe_field
from ...config.evaluate import QUASI_IDENTIFIER_COUNT
from ...config.parameters import SafeSynthesizerParameters
from ...observability import get_logger
Expand All @@ -30,6 +29,7 @@
from ..data_model.evaluation_score import EvaluationScore, PrivacyGrade
from ..nearest_neighbors import NearestNeighborSearch
from . import multi_modal_figures as figures
from .privacy_metric_utils import divide_tabular_text, embed_text, find_text_fields

logger = get_logger(__name__)

Expand Down Expand Up @@ -183,42 +183,6 @@ def _is_really_categorical(column: str) -> bool:

return False

@staticmethod
def _divide_tabular_text(df: pd.DataFrame, text_fields: list) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Split a dataframe into tabular-only and text-only subsets."""
tabular_fields = []
for col in df.columns:
if col not in text_fields:
tabular_fields.append(col)
df_tabular = df.filter(tabular_fields)
df_text = df.filter(text_fields)

return (df_tabular, df_text)

@staticmethod
def _embed_text(df: pd.DataFrame, embedder: SentenceTransformer) -> pd.DataFrame:
"""Embed each text column and average into a single embedding per row."""
embeddings = {}
for col in df.columns:
data = df[col].to_list()
data = [str(r) for r in data]
embeddings[col] = embedder.encode(data, show_progress_bar=False, convert_to_tensor=True)

avg_embeddings = []
for i in range(len(df)):
# TODO: Is this average what we want? When there are more than 2 columns, we will
# overweight later columns relative to earlier columns.
norm = embeddings[df.columns[0]][i]
for j in range(1, len(df.columns)):
field = df.columns[j]
norm = torch.mean(torch.stack([norm, embeddings[field][i]]), dim=0)

avg_embeddings.append(norm)

df_embeddings = pd.DataFrame({"embedding": list(avg_embeddings)})

return df_embeddings

@staticmethod
def _parse_dates(value: str | int | float, scalar_type: str | None = None) -> list[tuple[str, datetime]] | None:
if scalar_type == "number" and isinstance(value, str):
Expand Down Expand Up @@ -253,8 +217,8 @@ def _get_synth_nn(

# First divide out text and non-text
if len(text_columns) > 0:
df_train_use, df_train_text = AttributeInferenceProtection._divide_tabular_text(df_train_use, text_columns)
df_synth_use, df_synth_text = AttributeInferenceProtection._divide_tabular_text(df_synth_use, text_columns)
df_train_use, df_train_text = divide_tabular_text(df_train_use, text_columns)
df_synth_use, df_synth_text = divide_tabular_text(df_synth_use, text_columns)

# Normalize the tabular data if there is any
tabular_columns = numeric_columns + nominal_columns
Expand Down Expand Up @@ -283,8 +247,8 @@ def _get_synth_nn(
if len(tabular_columns) == 0:
# Create embeddings for text fields
assert embedder is not None
df_train_embeddings = AttributeInferenceProtection._embed_text(df_train_text, embedder)
df_synth_embeddings = AttributeInferenceProtection._embed_text(df_synth_text, embedder)
df_train_embeddings = embed_text(df_train_text, embedder)
df_synth_embeddings = embed_text(df_synth_text, embedder)
hits = util.semantic_search(
torch.stack(df_train_embeddings["embedding"].tolist()),
torch.stack(df_synth_embeddings["embedding"].tolist()),
Expand All @@ -305,8 +269,8 @@ def _get_synth_nn(
# Get the text embeddings and then the 1000 NN based on just the text

assert embedder is not None
df_train_embeddings = AttributeInferenceProtection._embed_text(df_train_text, embedder)
df_synth_embeddings = AttributeInferenceProtection._embed_text(df_synth_text, embedder)
df_train_embeddings = embed_text(df_train_text, embedder)
df_synth_embeddings = embed_text(df_synth_text, embedder)
search_synth_k = min(1000, len(df_synth_embeddings))
hits = util.semantic_search(
torch.stack(df_train_embeddings["embedding"].tolist()),
Expand Down Expand Up @@ -419,12 +383,7 @@ def _aia(
numeric_columns = [column for column in df_train.columns if column not in nominal_columns]

# Now separate out the text columns from the nominal

text_columns = []
for col in nominal_columns:
result = describe_field(col, df_train[col])
if result.type.value == "text":
text_columns.append(col)
text_columns = find_text_fields(df_train[nominal_columns])
nominal_columns = [x for x in nominal_columns if x not in text_columns]

# If there are text columns, create an embedder
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import QuantileTransformer

from ...artifacts.analyzers.field_features import describe_field
from ...config.evaluate import DEFAULT_RECORD_COUNT
from ...config.parameters import SafeSynthesizerParameters
from ...evaluation.components.component import Component
Expand All @@ -25,6 +24,7 @@
from ...evaluation.nearest_neighbors import NearestNeighborSearch
from ...observability import get_logger
from . import multi_modal_figures as figures
from .privacy_metric_utils import divide_tabular_text, embed_text, find_text_fields

logger = get_logger(__name__)

Expand Down Expand Up @@ -416,54 +416,6 @@ def _compute_mia(
fp_cnts,
)

@staticmethod
def find_text_fields(df: pd.DataFrame) -> list[str]:
    """Return the names of columns whose content is classified as free text.

    Every column is run through ``describe_field``; those whose detected
    type is ``"text"`` are collected, in original column order.
    """
    return [col for col in df.columns if describe_field(col, df[col]).type.value == "text"]

@staticmethod
def embed_text(df: pd.DataFrame) -> pd.DataFrame:
    """Embed each text column and average into a single embedding per row.

    Args:
        df: DataFrame whose columns are all free text.

    Returns:
        Single-column DataFrame ``"embedding"`` holding one 1-D tensor per
        input row.
    """
    # NOTE(review): the model is loaded on every call; hoist construction to
    # the caller if this becomes a hot path.
    embedder = SentenceTransformer("distiluse-base-multilingual-cased-v2")

    per_column = []
    for col in df.columns:
        data = [str(r) for r in df[col].to_list()]
        per_column.append(embedder.encode(data, show_progress_bar=False, convert_to_tensor=True))

    # Stack to (n_cols, n_rows, embed_dim) and average over the column axis
    # in one reduction so every column has equal weight. The previous
    # pairwise running mean overweighted later columns whenever there were
    # 3+ columns (the issue flagged by the old TODO).
    avg_embeddings = torch.mean(torch.stack(per_column, dim=0), dim=0)

    return pd.DataFrame({"embedding": list(avg_embeddings)})

@staticmethod
def divide_tabular_text(df: pd.DataFrame, text_fields: list) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Split a dataframe into tabular-only and text-only subsets."""
tabular_fields = []
for col in df.columns:
if col not in text_fields:
tabular_fields.append(col)
df_tabular = df.filter(tabular_fields)
df_text = df.filter(text_fields)

return (df_tabular, df_text)

@staticmethod
def mia(
df_train: pd.DataFrame,
Expand Down Expand Up @@ -506,7 +458,7 @@ def mia(
df_test = df_test.filter([column_name])
df_synth = df_synth.filter([column_name])

text_fields = MembershipInferenceProtection.find_text_fields(df_train)
text_fields = find_text_fields(df_train)
text_cnt = len(text_fields)
tabular_cnt = len(df_train.columns) - text_cnt

Expand All @@ -525,13 +477,11 @@ def mia(
df_train_use = df_train.sample(n=train_size_needed, random_state=1)

# Divide the dataframes into text and tabular
text_fields = MembershipInferenceProtection.find_text_fields(df_train_use)
text_fields = find_text_fields(df_train_use)
if len(text_fields) > 0:
df_train_use, df_train_text = MembershipInferenceProtection.divide_tabular_text(
df_train_use, text_fields
)
df_test, df_test_text = MembershipInferenceProtection.divide_tabular_text(df_test, text_fields)
df_synth, df_synth_text = MembershipInferenceProtection.divide_tabular_text(df_synth, text_fields)
df_train_use, df_train_text = divide_tabular_text(df_train_use, text_fields)
df_test, df_test_text = divide_tabular_text(df_test, text_fields)
df_synth, df_synth_text = divide_tabular_text(df_synth, text_fields)

# Normalize the tabular data (adjusted for multimodal)
if tabular_cnt > 0:
Expand All @@ -555,9 +505,10 @@ def mia(
# Create embeddings for text fields and combine the normalized tabular and the
# new text embeddings into one dataframe.
if len(text_fields) > 0:
df_train_embeddings = MembershipInferenceProtection.embed_text(df_train_text)
df_test_embeddings = MembershipInferenceProtection.embed_text(df_test_text)
df_synth_embeddings = MembershipInferenceProtection.embed_text(df_synth_text)
embedder = SentenceTransformer("distiluse-base-multilingual-cased-v2")
df_train_embeddings = embed_text(df_train_text, embedder)
df_test_embeddings = embed_text(df_test_text, embedder)
df_synth_embeddings = embed_text(df_synth_text, embedder)
df_train_norm = pd.concat([df_train_norm, df_train_embeddings], axis=1)
df_test_norm = pd.concat([df_test_norm, df_test_embeddings], axis=1)
df_synth_norm = pd.concat([df_synth_norm, df_synth_embeddings], axis=1)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from typing import TYPE_CHECKING

import pandas as pd
import torch

if TYPE_CHECKING:
from sentence_transformers import SentenceTransformer

from ...artifacts.analyzers.field_features import describe_field


def find_text_fields(df: pd.DataFrame) -> list[str]:
    """Identify columns in ``df`` whose content is free-form text.

    Each column is run through ``describe_field``; columns whose detected
    type is ``"text"`` are returned, in original column order.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        Column names classified as free-form text.
    """
    return [col for col in df.columns if describe_field(col, df[col]).type.value == "text"]


def divide_tabular_text(df: pd.DataFrame, text_fields: list[str]) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``df`` into a tabular-only and a text-only DataFrame.

    Columns present in ``text_fields`` populate the text DataFrame; the
    remaining columns populate the tabular DataFrame.

    Args:
        df: Source DataFrame to split.
        text_fields: Column names to treat as text.

    Returns:
        A ``(tabular_df, text_df)`` tuple where ``tabular_df`` contains only
        the non-text columns and ``text_df`` contains only the text columns.
    """
    non_text = []
    for column in df.columns:
        if column not in text_fields:
            non_text.append(column)
    tabular_part = df.filter(non_text)
    text_part = df.filter(text_fields)
    return tabular_part, text_part


def embed_text(df: pd.DataFrame, embedder: SentenceTransformer) -> pd.DataFrame:
    """Embed every text column in ``df`` and return one averaged embedding per row.

    For each column the ``embedder`` yields a ``(n_rows, embed_dim)`` matrix.
    The per-column matrices are stacked and reduced with a single mean over
    the column axis, so each column contributes equally to the result.

    Args:
        df: DataFrame whose columns are all text to be embedded.
        embedder: Sentence-transformer model used to produce embeddings.

    Returns:
        Single-column DataFrame with column ``"embedding"`` whose values are
        1-D tensors of shape ``(embed_dim,)``.
    """
    column_matrices = []
    for field in df.columns:
        texts = [str(value) for value in df[field].to_list()]
        encoded = embedder.encode(texts, show_progress_bar=False, convert_to_tensor=True)
        column_matrices.append(torch.as_tensor(encoded))

    # (n_cols, n_rows, embed_dim) -> mean over axis 0 -> (n_rows, embed_dim)
    row_embeddings = torch.stack(column_matrices).mean(dim=0)

    return pd.DataFrame({"embedding": [row for row in row_embeddings]})
60 changes: 60 additions & 0 deletions tests/evaluation/components/test_privacy_metric_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from unittest.mock import MagicMock

import numpy as np
import pandas as pd
import pytest
import torch

from nemo_safe_synthesizer.evaluation.components.privacy_metric_utils import (
divide_tabular_text,
embed_text,
)


@pytest.fixture
def mock_embedder():
    """A mock SentenceTransformer whose .encode() returns deterministic tensors."""

    def _encode(data, **kwargs):
        # Deterministic, distinct embedding per string; the string length
        # acts as a simple reproducible seed.
        rows = [[float(len(s)), float(len(s)) * 2, float(len(s)) * 3] for s in data]
        return torch.tensor(rows, dtype=torch.float32)

    embedder = MagicMock()
    embedder.encode = MagicMock(side_effect=_encode)
    return embedder


def test_divide_tabular_text(train_df):
    text_fields = ["text", "other"]
    tabular, text = divide_tabular_text(train_df, text_fields)

    # The text columns must land entirely in the text frame...
    assert set(text.columns) == {"other", "text"}
    # ...and be absent from the tabular frame.
    assert "text" not in tabular.columns
    assert "other" not in tabular.columns
    # Row counts are preserved on both sides of the split.
    assert len(tabular) == len(train_df)
    assert len(text) == len(train_df)


def test_embed_text(mock_embedder):
    """Regression test: the old pairwise running average over-weighted later
    columns once there were 3+ columns. The corrected stack/mean reduction
    must weight every column equally.
    """
    df = pd.DataFrame(
        {
            "a": ["x"],  # len 1 -> embedding [1, 2, 3]
            "b": ["xx"],  # len 2 -> embedding [2, 4, 6]
            "c": ["xxxx"],  # len 4 -> embedding [4, 8, 12]
        }
    )

    result = embed_text(df, mock_embedder)
    row = result["embedding"].iloc[0]

    assert isinstance(row, torch.Tensor)
    # Column-wise mean: [(1+2+4)/3, (2+4+8)/3, (3+6+12)/3] = [7/3, 14/3, 7]
    np.testing.assert_array_almost_equal(row.numpy(), np.array([7 / 3, 14 / 3, 7.0]))
Loading