diff --git a/packages/data-designer/src/data_designer/interface/data_designer.py b/packages/data-designer/src/data_designer/interface/data_designer.py
index 0293dfd12..58dcb3f8b 100644
--- a/packages/data-designer/src/data_designer/interface/data_designer.py
+++ b/packages/data-designer/src/data_designer/interface/data_designer.py
@@ -213,11 +213,25 @@ def create(
             raise DataDesignerGenerationError(f"🛑 Error generating dataset: {e}")
 
         try:
-            profiler = self._create_dataset_profiler(config_builder, resource_provider)
-            analysis = profiler.profile_dataset(
-                num_records,
-                builder.artifact_storage.load_dataset_with_dropped_columns(),
+            dataset_for_profiler = builder.artifact_storage.load_dataset_with_dropped_columns()
+        except Exception as e:
+            raise DataDesignerGenerationError(
+                f"🛑 Failed to load generated dataset — all records may have been dropped "
+                f"due to generation failures. Check the warnings above for details. Original error: {e}"
             )
+
+        # Defensive: the batch manager skips writing when the buffer is empty, so in
+        # practice load_dataset_with_dropped_columns() would raise before returning a
+        # zero-row DataFrame. This guard protects against future changes to that contract.
+        if len(dataset_for_profiler) == 0:
+            raise DataDesignerGenerationError(
+                "🛑 Dataset is empty — all records were dropped due to generation failures. "
+                "Check the warnings above for details on which columns failed."
+            )
+
+        try:
+            profiler = self._create_dataset_profiler(config_builder, resource_provider)
+            analysis = profiler.profile_dataset(num_records, dataset_for_profiler)
         except Exception as e:
             raise DataDesignerProfilingError(f"🛑 Error profiling dataset: {e}")
 
@@ -267,6 +281,12 @@ def preview(
         except Exception as e:
             raise DataDesignerGenerationError(f"🛑 Error generating preview dataset: {e}")
 
+        if len(processed_dataset) == 0:
+            raise DataDesignerGenerationError(
+                "🛑 Dataset is empty — all records were dropped due to generation or processing failures. "
+                "Check the warnings above for details on which columns failed."
+            )
+
         dropped_columns = raw_dataset.columns.difference(processed_dataset.columns)
         if len(dropped_columns) > 0:
             dataset_for_profiler = lazy.pd.concat([processed_dataset, raw_dataset[dropped_columns]], axis=1)
@@ -283,11 +303,7 @@ def preview(
         for name in builder.artifact_storage.list_processor_names():
             processor_artifacts[name] = builder.artifact_storage.load_processor_dataset(name).to_dict(orient="records")
 
-        if (
-            len(processed_dataset) > 0
-            and isinstance(analysis, DatasetProfilerResults)
-            and len(analysis.column_statistics) > 0
-        ):
+        if isinstance(analysis, DatasetProfilerResults) and len(analysis.column_statistics) > 0:
             logger.info(f"{RandomEmoji.success()} Preview complete!")
 
         # Create dataset metadata from the resource provider
diff --git a/packages/data-designer/tests/interface/test_data_designer.py b/packages/data-designer/tests/interface/test_data_designer.py
index 1e60bf709..f3a517383 100644
--- a/packages/data-designer/tests/interface/test_data_designer.py
+++ b/packages/data-designer/tests/interface/test_data_designer.py
@@ -304,6 +304,79 @@ def test_preview_raises_error_when_profiler_fails(
             data_designer.preview(stub_sampler_only_config_builder, num_records=3)
 
 
+def test_create_raises_generation_error_when_dataset_is_empty(
+    stub_artifact_path: Path,
+    stub_model_providers: list[ModelProvider],
+    stub_sampler_only_config_builder: DataDesignerConfigBuilder,
+    stub_managed_assets_path: Path,
+) -> None:
+    """When all records are dropped during generation, create should raise
+    DataDesignerGenerationError with a clear message instead of a misleading profiler error.
+    """
+    data_designer = DataDesigner(
+        artifact_path=stub_artifact_path,
+        model_providers=stub_model_providers,
+        secret_resolver=PlaintextResolver(),
+        managed_assets_path=stub_managed_assets_path,
+    )
+
+    with patch(
+        "data_designer.engine.storage.artifact_storage.ArtifactStorage.load_dataset_with_dropped_columns",
+        return_value=lazy.pd.DataFrame(),
+    ):
+        with pytest.raises(DataDesignerGenerationError, match="Dataset is empty"):
+            data_designer.create(stub_sampler_only_config_builder, num_records=1)
+
+
+def test_create_raises_generation_error_when_load_dataset_fails(
+    stub_artifact_path: Path,
+    stub_model_providers: list[ModelProvider],
+    stub_sampler_only_config_builder: DataDesignerConfigBuilder,
+    stub_managed_assets_path: Path,
+) -> None:
+    """When no parquet was written (e.g. all records dropped), load_dataset_with_dropped_columns
+    raises an exception. create() should surface this as DataDesignerGenerationError, not
+    DataDesignerProfilingError.
+    """
+    data_designer = DataDesigner(
+        artifact_path=stub_artifact_path,
+        model_providers=stub_model_providers,
+        secret_resolver=PlaintextResolver(),
+        managed_assets_path=stub_managed_assets_path,
+    )
+
+    with patch(
+        "data_designer.engine.storage.artifact_storage.ArtifactStorage.load_dataset_with_dropped_columns",
+        side_effect=FileNotFoundError("No parquet files found"),
+    ):
+        with pytest.raises(DataDesignerGenerationError, match="Failed to load generated dataset"):
+            data_designer.create(stub_sampler_only_config_builder, num_records=1)
+
+
+def test_preview_raises_generation_error_when_dataset_is_empty(
+    stub_artifact_path: Path,
+    stub_model_providers: list[ModelProvider],
+    stub_sampler_only_config_builder: DataDesignerConfigBuilder,
+    stub_managed_assets_path: Path,
+) -> None:
+    """When all records are dropped during generation, preview should raise
+    DataDesignerGenerationError with a clear message instead of a misleading profiler error.
+    """
+    data_designer = DataDesigner(
+        artifact_path=stub_artifact_path,
+        model_providers=stub_model_providers,
+        secret_resolver=PlaintextResolver(),
+        managed_assets_path=stub_managed_assets_path,
+    )
+
+    with patch(
+        "data_designer.engine.dataset_builders.column_wise_builder.ColumnWiseDatasetBuilder.process_preview",
+        return_value=lazy.pd.DataFrame(),
+    ):
+        with pytest.raises(DataDesignerGenerationError, match="Dataset is empty"):
+            data_designer.preview(stub_sampler_only_config_builder, num_records=1)
+
+
 def test_preview_with_dropped_columns(
     stub_artifact_path, stub_model_providers, stub_model_configs, stub_managed_assets_path
 ):