Skip to content

Commit b86051c

Browse files
committed
PR feedback: add early fail to order by check, remove unnecessary internal method, clean up docstrings, etc.
Signed-off-by: nina-xu <19981858+nina-xu@users.noreply.github.com>
1 parent 53769a0 commit b86051c

File tree

11 files changed

+111
-72
lines changed

11 files changed

+111
-72
lines changed

src/nemo_safe_synthesizer/data_processing/assembler.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
RunningStatistics,
3131
Statistics,
3232
)
33+
from ..data_processing.validation import MISSING_GROUP_BY_COLUMN_ERROR, MISSING_ORDER_BY_COLUMN_ERROR
3334
from ..defaults import (
3435
DEFAULT_CACHE_PREFIX,
3536
PSEUDO_GROUP_COLUMN,
@@ -864,10 +865,10 @@ def _validate_columns(self, dataset: Dataset) -> None:
864865
ParameterError: If group or order column is not found in dataset.
865866
"""
866867
if self.group_by_column not in dataset.column_names:
867-
raise ParameterError(f"Group by column '{self.group_by_column}' not found in dataset.")
868+
raise ParameterError(MISSING_GROUP_BY_COLUMN_ERROR.format(group_by=self.group_by_column))
868869

869870
if self.order_by_column not in dataset.column_names:
870-
raise ParameterError(f"Order by column '{self.order_by_column}' not found in dataset.")
871+
raise ParameterError(MISSING_ORDER_BY_COLUMN_ERROR.format(order_by=self.order_by_column))
871872

872873
def _reorder_columns(self, dataset: Dataset) -> Dataset:
873874
"""Reorder columns: group_by first, order_by second, then the rest.

src/nemo_safe_synthesizer/data_processing/validation.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,10 @@
1111

1212
MISSING_GROUP_BY_COLUMN_ERROR = (
1313
"Group by column '{group_by}' not found in input dataset columns. "
14-
"Please set `data.group_training_examples_by` to an existing column or disable grouping."
14+
"Please set `data.group_training_examples_by` to an existing column or to `null`/`None` to disable grouping."
1515
)
1616
MISSING_GROUP_BY_VALUES_ERROR = "Group by column '{group_by}' has missing values. Please remove/replace them."
17+
MISSING_ORDER_BY_COLUMN_ERROR = "Order by column '{order_by}' not found in the input data."
1718

1819

1920
def validate_groupby_column(df: pd.DataFrame, group_by: str | None) -> None:
@@ -35,3 +36,20 @@ def validate_groupby_column(df: pd.DataFrame, group_by: str | None) -> None:
3536

3637
if df[group_by].isna().any():
3738
raise DataError(MISSING_GROUP_BY_VALUES_ERROR.format(group_by=group_by))
39+
40+
41+
def validate_orderby_column(df: pd.DataFrame, order_by: str | None) -> None:
42+
"""Validate that the configured order-by column exists.
43+
44+
Args:
45+
df: Dataframe to validate.
46+
order_by: Name of the configured ordering column.
47+
48+
Raises:
49+
ParameterError: If ``order_by`` is configured but not present in ``df``.
50+
"""
51+
if order_by is None:
52+
return
53+
54+
if order_by not in df.columns:
55+
raise ParameterError(MISSING_ORDER_BY_COLUMN_ERROR.format(order_by=order_by))

src/nemo_safe_synthesizer/holdout/holdout.py

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
from ..config.data import DEFAULT_HOLDOUT, MIN_HOLDOUT
1515
from ..config.parameters import SafeSynthesizerParameters
16-
from ..data_processing.validation import MISSING_GROUP_BY_COLUMN_ERROR, validate_groupby_column
16+
from ..data_processing.validation import validate_groupby_column
1717
from ..observability import get_logger
1818

1919
MIN_RECORDS_FOR_TEXT_AND_PRIVACY_METRICS = 200
@@ -24,20 +24,11 @@
2424
INPUT_DATA_TOO_SMALL_ERROR = (
2525
f"Dataset must have at least {MIN_RECORDS_FOR_TEXT_AND_PRIVACY_METRICS} records to use holdout."
2626
)
27+
2728
logger = get_logger(__name__)
2829

2930
DataFrameOptionalTuple = tuple[pd.DataFrame, pd.DataFrame] | tuple[pd.DataFrame, None]
3031

31-
__all__ = [
32-
"HOLDOUT_TOO_SMALL_ERROR",
33-
"INPUT_DATA_TOO_SMALL_ERROR",
34-
"MISSING_GROUP_BY_COLUMN_ERROR",
35-
"MIN_RECORDS_FOR_TEXT_AND_PRIVACY_METRICS",
36-
"Holdout",
37-
"grouped_train_test_split",
38-
"naive_train_test_split",
39-
]
40-
4132

4233
def naive_train_test_split(df, test_size, random_state=None) -> DataFrameOptionalTuple:
4334
"""Split a dataframe into train and test sets with a random shuffle.
@@ -83,12 +74,10 @@ def grouped_train_test_split(df, test_size, group_by, random_state=None) -> Data
8374
grouped split could be produced.
8475
8576
Raises:
86-
ValueError: If the ``group_by`` column contains missing values.
77+
ParameterError: If the ``group_by`` column is not present in ``df``.
78+
DataError: If the ``group_by`` column contains missing values.
8779
"""
88-
# Do not continue the split process if the groupby column has missing values.
89-
if df[group_by].isna().any():
90-
msg = f"Group by column '{group_by}' has missing values. Please remove/replace them."
91-
raise ValueError(msg)
80+
validate_groupby_column(df, group_by)
9281

9382
if test_size > df.groupby(group_by).ngroups or test_size == 1 or test_size == 0:
9483
logger.info(

src/nemo_safe_synthesizer/sdk/library_builder.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
SafeSynthesizerParameters,
2424
)
2525
from ..config.autoconfig import AutoConfigResolver
26-
from ..data_processing.validation import validate_groupby_column
26+
from ..data_processing.validation import validate_groupby_column, validate_orderby_column
2727
from ..evaluation.evaluator import Evaluator
2828
from ..generation.timeseries_backend import TimeseriesBackend
2929
from ..generation.vllm_backend import VllmBackend
@@ -242,9 +242,11 @@ def load_from_save_path(self) -> SafeSynthesizer:
242242
def process_data(self) -> SafeSynthesizer:
243243
"""Perform train/test split, auto-config resolution, and optional PII replacement.
244244
245-
Splits the data via ``Holdout``, runs ``AutoConfigResolver`` to
246-
resolve ``"auto"`` parameters, applies PII replacement to the
247-
training set when enabled, and persists the splits to the workdir.
245+
Validates configured grouping/ordering columns against the input
246+
dataset, splits the data via ``Holdout``, runs
247+
``AutoConfigResolver`` to resolve ``"auto"`` parameters, applies
248+
PII replacement to the training set when enabled, and persists the
249+
splits to the workdir.
248250
249251
Returns:
250252
Self for method chaining.
@@ -267,6 +269,7 @@ def process_data(self) -> SafeSynthesizer:
267269
assert isinstance(self._data_source, pd.DataFrame)
268270

269271
validate_groupby_column(self._data_source, self._nss_config.data.group_training_examples_by)
272+
validate_orderby_column(self._data_source, self._nss_config.data.order_training_examples_by)
270273

271274
holdout = Holdout(self._nss_config)
272275
original_train_df, self._test_df = holdout.train_test_split(self._data_source)

src/nemo_safe_synthesizer/training/backend.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -220,11 +220,11 @@ def __subclasshook__(cls, subclass):
220220
def prepare_training_data(self):
221221
"""Load, validate, and tokenize the training dataset.
222222
223-
Runs auto-config resolution, validates groupby/orderby columns,
224-
applies time-series processing and ``action_executor`` preprocessing,
225-
then assembles tokenized training examples. Populates
226-
``training_examples``, ``dataset_schema``, ``df_train``, and
227-
``data_fraction``.
223+
Validates grouping/ordering columns (where applicable), resolves
224+
auto-config values, applies time-series processing and
225+
``action_executor`` preprocessing, then assembles tokenized training
226+
examples. Populates ``training_examples``, ``dataset_schema``,
227+
``df_train``, and ``data_fraction``.
228228
"""
229229
...
230230

src/nemo_safe_synthesizer/training/huggingface_backend.py

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
from ..config.autoconfig import AutoConfigResolver
3939
from ..data_processing.assembler import TrainingExampleAssembler
4040
from ..data_processing.dataset import make_json_schema
41-
from ..data_processing.validation import validate_groupby_column
41+
from ..data_processing.validation import validate_groupby_column, validate_orderby_column
4242
from ..defaults import (
4343
DEFAULT_VALID_RECORD_EVAL_BATCH_SIZE,
4444
EVAL_STEPS,
@@ -529,19 +529,6 @@ def prepare_params(self, **training_args):
529529
self.trainer = self._create_trainer(self.train_args, data_collator)
530530
self._configure_trainer_callbacks(self.trainer, training_args)
531531

532-
def _validate_groupby_column(self, df) -> None:
533-
"""Validate the groupby column exists and has no missing values.
534-
535-
Args:
536-
df: The DataFrame to validate.
537-
538-
Raises:
539-
ParameterError: If the groupby column doesn't exist.
540-
DataError: If the groupby column has missing values.
541-
"""
542-
col = self.params.data.group_training_examples_by
543-
validate_groupby_column(df, col)
544-
545532
def _validate_orderby_column(self, df) -> None:
546533
"""Validate the orderby column exists in the dataset.
547534
@@ -558,10 +545,7 @@ def _validate_orderby_column(self, df) -> None:
558545
if self.params.time_series.is_timeseries and self.params.time_series.timestamp_column is None:
559546
return
560547

561-
if orderby_col and orderby_col not in df.columns:
562-
msg = f"Order by column '{orderby_col}' not found in the input data."
563-
logger.error(msg)
564-
raise ParameterError(msg)
548+
validate_orderby_column(df, orderby_col)
565549

566550
def _apply_preprocessing(self, df):
567551
"""Apply action_executor preprocessing if available.
@@ -642,8 +626,8 @@ def _log_dataset_statistics(self, assembler) -> None:
642626
def prepare_training_data(self):
643627
"""Validate, preprocess, and tokenize the training dataset.
644628
645-
Runs auto-config resolution, time-series processing, groupby /
646-
orderby validation, and assembles tokenized training examples.
629+
Validates groupby/orderby columns, resolves auto-config values,
630+
runs time-series preprocessing, and assembles tokenized training examples.
647631
Populates ``training_examples``, ``dataset_schema``,
648632
``df_train``, and ``data_fraction``.
649633
@@ -660,7 +644,7 @@ def prepare_training_data(self):
660644
raise DataError("Expected DataFrame from to_pandas(), got an iterator")
661645

662646
# Validate groupby/orderby parameters as a preprocessing step.
663-
self._validate_groupby_column(df_all)
647+
validate_groupby_column(df_all, self.params.data.group_training_examples_by)
664648
self._validate_orderby_column(df_all)
665649
self.params = AutoConfigResolver(df_all, self.params).resolve()
666650

tests/data_processing/test_assembler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -670,7 +670,7 @@ def test_sequential_assembler_raises_for_missing_group_column(
670670
fixture_sequential_metadata: ModelMetadata,
671671
):
672672
"""Test that SequentialExampleAssembler raises for missing group column."""
673-
with pytest.raises(ParameterError, match="Group by column.*not found in dataset"):
673+
with pytest.raises(ParameterError, match="Group by column.*not found"):
674674
SequentialExampleAssembler(
675675
dataset=fixture_iris_dataset,
676676
tokenizer=fixture_tokenizer,
@@ -689,7 +689,7 @@ def test_sequential_assembler_raises_for_missing_order_column(
689689
fixture_sequential_metadata: ModelMetadata,
690690
):
691691
"""Test that SequentialExampleAssembler raises for missing order column."""
692-
with pytest.raises(ParameterError, match="Order by column.*not found in dataset"):
692+
with pytest.raises(ParameterError, match="Order by column.*not found"):
693693
SequentialExampleAssembler(
694694
dataset=fixture_iris_dataset,
695695
tokenizer=fixture_tokenizer,

tests/data_processing/test_validation.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
from nemo_safe_synthesizer.data_processing.validation import (
88
MISSING_GROUP_BY_COLUMN_ERROR,
99
MISSING_GROUP_BY_VALUES_ERROR,
10+
MISSING_ORDER_BY_COLUMN_ERROR,
1011
validate_groupby_column,
12+
validate_orderby_column,
1113
)
1214
from nemo_safe_synthesizer.errors import DataError, ParameterError
1315

@@ -20,12 +22,24 @@ def test_validate_groupby_column_noop_when_groupby_is_none() -> None:
2022
def test_validate_groupby_column_raises_for_missing_column() -> None:
2123
df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
2224
with pytest.raises(ParameterError) as excinfo:
23-
validate_groupby_column(df, "missing")
24-
assert str(excinfo.value) == MISSING_GROUP_BY_COLUMN_ERROR.format(group_by="missing")
25+
validate_groupby_column(df, "missing_group")
26+
assert str(excinfo.value) == MISSING_GROUP_BY_COLUMN_ERROR.format(group_by="missing_group")
2527

2628

2729
def test_validate_groupby_column_raises_for_missing_values() -> None:
2830
df = pd.DataFrame({"group": ["x", None], "value": [1, 2]})
2931
with pytest.raises(DataError) as excinfo:
3032
validate_groupby_column(df, "group")
3133
assert str(excinfo.value) == MISSING_GROUP_BY_VALUES_ERROR.format(group_by="group")
34+
35+
36+
def test_validate_orderby_column_noop_when_orderby_is_none() -> None:
37+
df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
38+
validate_orderby_column(df, None)
39+
40+
41+
def test_validate_orderby_column_raises_for_missing_column() -> None:
42+
df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
43+
with pytest.raises(ParameterError) as excinfo:
44+
validate_orderby_column(df, "missing_order")
45+
assert str(excinfo.value) == MISSING_ORDER_BY_COLUMN_ERROR.format(order_by="missing_order")

tests/holdout/test_holdout.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
import pytest
77

88
from nemo_safe_synthesizer.config.parameters import SafeSynthesizerParameters
9+
from nemo_safe_synthesizer.errors import DataError, ParameterError
910
from nemo_safe_synthesizer.holdout.holdout import (
1011
HOLDOUT_TOO_SMALL_ERROR,
1112
INPUT_DATA_TOO_SMALL_ERROR,
12-
MISSING_GROUP_BY_COLUMN_ERROR,
1313
Holdout,
1414
naive_train_test_split,
1515
)
@@ -106,10 +106,16 @@ def test_does_group_by_holdout(df):
106106

107107
def test_raises_on_group_by_holdout_with_bad_column(df):
108108
holdout = Holdout(SafeSynthesizerParameters.from_params(group_training_examples_by="dne"))
109-
with pytest.raises(ValueError) as excinfo:
109+
with pytest.raises(ParameterError, match="Group by column 'dne' not found"):
110110
holdout.train_test_split(df)
111111

112-
assert str(excinfo.value) == MISSING_GROUP_BY_COLUMN_ERROR.format(group_by="dne")
112+
113+
def test_raises_on_group_by_holdout_with_missing_values(df):
114+
df_with_missing_group = df.copy()
115+
df_with_missing_group.loc[0, "big_cat"] = None
116+
holdout = Holdout(SafeSynthesizerParameters.from_params(group_training_examples_by="big_cat"))
117+
with pytest.raises(DataError, match="Group by column 'big_cat' has missing values"):
118+
holdout.train_test_split(df_with_missing_group)
113119

114120

115121
def test_complains_when_training_dataset_is_too_small():

tests/sdk/test_process_data.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -585,7 +585,9 @@ class TestProcessDataConfigValidation:
585585
methods after construction are not visible to the Pydantic validator
586586
until ``_resolve_nss_config()`` is called. ``process_data`` must
587587
call it at the top of the method so invalid configs are caught
588-
immediately -- before holdout split, PII replacement, or any disk I/O.
588+
immediately. It also validates configured group/order columns against
589+
the input dataset before holdout split, autoconfig resolution, PII
590+
replacement, or any disk I/O.
589591
"""
590592

591593
def test_dp_and_explicit_unsloth_raises_at_process_data(self, fixture_workdir: Workdir) -> None:
@@ -615,11 +617,37 @@ def test_invalid_groupby_raises_before_holdout(
615617
downstream ``KeyError``.
616618
"""
617619
ss = SafeSynthesizer(
618-
config=SafeSynthesizerParameters.from_params(group_training_examples_by="non_existent"),
620+
config=SafeSynthesizerParameters.from_params(group_training_examples_by="non_existent_group"),
619621
workdir=fixture_workdir,
620622
).with_data_source(fixture_sample_patient_dataframe)
621623

622-
with pytest.raises(ParameterError, match="Group by column 'non_existent' not found"):
624+
with pytest.raises(ParameterError, match="Group by column 'non_existent_group' not found"):
625+
ss.process_data()
626+
627+
mock_holdout_cls.assert_not_called()
628+
629+
@patch("nemo_safe_synthesizer.sdk.library_builder.Holdout")
630+
def test_invalid_orderby_raises_before_holdout(
631+
self,
632+
mock_holdout_cls,
633+
fixture_workdir: Workdir,
634+
fixture_sample_patient_dataframe: pd.DataFrame,
635+
) -> None:
636+
"""Missing order-by column raises immediately during ``process_data``.
637+
638+
This catches invalid ``order_training_examples_by`` before holdout split
639+
or autoconfig runs, ensuring a clear ``ParameterError`` instead of a
640+
downstream pandas error.
641+
"""
642+
ss = SafeSynthesizer(
643+
config=SafeSynthesizerParameters.from_params(
644+
group_training_examples_by="patient_name",
645+
order_training_examples_by="non_existent_order",
646+
),
647+
workdir=fixture_workdir,
648+
).with_data_source(fixture_sample_patient_dataframe)
649+
650+
with pytest.raises(ParameterError, match="Order by column 'non_existent_order' not found"):
623651
ss.process_data()
624652

625653
mock_holdout_cls.assert_not_called()

0 commit comments

Comments (0)