Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -213,11 +213,25 @@ def create(
raise DataDesignerGenerationError(f"🛑 Error generating dataset: {e}")

try:
profiler = self._create_dataset_profiler(config_builder, resource_provider)
analysis = profiler.profile_dataset(
num_records,
builder.artifact_storage.load_dataset_with_dropped_columns(),
dataset_for_profiler = builder.artifact_storage.load_dataset_with_dropped_columns()
except Exception as e:
raise DataDesignerGenerationError(
f"🛑 Failed to load generated dataset — all records may have been dropped "
f"due to generation failures. Check the warnings above for details. Original error: {e}"
)

# Defensive: the batch manager skips writing when the buffer is empty, so in
# practice load_dataset_with_dropped_columns() would raise before returning a
# zero-row DataFrame. This guard protects against future changes to that contract.
if len(dataset_for_profiler) == 0:
raise DataDesignerGenerationError(
"🛑 Dataset is empty — all records were dropped due to generation failures. "
"Check the warnings above for details on which columns failed."
)

try:
profiler = self._create_dataset_profiler(config_builder, resource_provider)
analysis = profiler.profile_dataset(num_records, dataset_for_profiler)
except Exception as e:
raise DataDesignerProfilingError(f"🛑 Error profiling dataset: {e}")

Expand Down Expand Up @@ -267,6 +281,12 @@ def preview(
except Exception as e:
raise DataDesignerGenerationError(f"🛑 Error generating preview dataset: {e}")

if len(processed_dataset) == 0:
raise DataDesignerGenerationError(
"🛑 Dataset is empty — all records were dropped due to generation or processing failures. "
"Check the warnings above for details on which columns failed."
)

dropped_columns = raw_dataset.columns.difference(processed_dataset.columns)
if len(dropped_columns) > 0:
dataset_for_profiler = lazy.pd.concat([processed_dataset, raw_dataset[dropped_columns]], axis=1)
Expand All @@ -283,11 +303,7 @@ def preview(
for name in builder.artifact_storage.list_processor_names():
processor_artifacts[name] = builder.artifact_storage.load_processor_dataset(name).to_dict(orient="records")

if (
len(processed_dataset) > 0
and isinstance(analysis, DatasetProfilerResults)
and len(analysis.column_statistics) > 0
):
if isinstance(analysis, DatasetProfilerResults) and len(analysis.column_statistics) > 0:
logger.info(f"{RandomEmoji.success()} Preview complete!")

# Create dataset metadata from the resource provider
Expand Down
67 changes: 67 additions & 0 deletions packages/data-designer/tests/interface/test_data_designer.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,73 @@ def test_preview_raises_error_when_profiler_fails(
data_designer.preview(stub_sampler_only_config_builder, num_records=3)


def test_create_raises_generation_error_when_dataset_is_empty(
    stub_artifact_path: Path,
    stub_model_providers: list[ModelProvider],
    stub_sampler_only_config_builder: DataDesignerConfigBuilder,
    stub_managed_assets_path: Path,
) -> None:
    """Verify that ``create`` rejects an empty generated dataset.

    ``ArtifactStorage.load_dataset_with_dropped_columns`` is patched to return a
    zero-row DataFrame, standing in for a run in which every record was dropped.
    ``create`` is expected to raise ``DataDesignerGenerationError`` with a message
    containing "Dataset is empty" rather than a misleading profiler error.
    """
    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
        managed_assets_path=stub_managed_assets_path,
    )

    # Build the patcher first so both context managers fit on a single ``with``;
    # the patch only becomes active once the ``with`` block is entered.
    empty_dataset_load = patch(
        "data_designer.engine.storage.artifact_storage.ArtifactStorage.load_dataset_with_dropped_columns",
        return_value=lazy.pd.DataFrame(),
    )
    with empty_dataset_load, pytest.raises(DataDesignerGenerationError, match="Dataset is empty"):
        data_designer.create(stub_sampler_only_config_builder, num_records=1)


def test_create_raises_generation_error_when_load_dataset_fails(
    stub_artifact_path: Path,
    stub_model_providers: list[ModelProvider],
    stub_sampler_only_config_builder: DataDesignerConfigBuilder,
    stub_managed_assets_path: Path,
) -> None:
    """Verify that a failed dataset load surfaces as a generation error.

    ``ArtifactStorage.load_dataset_with_dropped_columns`` is patched to raise
    ``FileNotFoundError``, simulating the case where no parquet output exists
    (e.g. all records were dropped). ``create`` must report this as
    ``DataDesignerGenerationError`` with a "Failed to load generated dataset"
    message, not as ``DataDesignerProfilingError``.
    """
    designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
        managed_assets_path=stub_managed_assets_path,
    )

    # The patcher is created here but only takes effect inside the ``with`` block.
    failing_load = patch(
        "data_designer.engine.storage.artifact_storage.ArtifactStorage.load_dataset_with_dropped_columns",
        side_effect=FileNotFoundError("No parquet files found"),
    )
    expected_failure = pytest.raises(DataDesignerGenerationError, match="Failed to load generated dataset")
    with failing_load, expected_failure:
        designer.create(stub_sampler_only_config_builder, num_records=1)


def test_preview_raises_generation_error_when_dataset_is_empty(
    stub_artifact_path: Path,
    stub_model_providers: list[ModelProvider],
    stub_sampler_only_config_builder: DataDesignerConfigBuilder,
    stub_managed_assets_path: Path,
) -> None:
    """Verify that ``preview`` rejects an empty processed dataset.

    ``ColumnWiseDatasetBuilder.process_preview`` is patched to return a zero-row
    DataFrame, standing in for a preview in which every record was dropped.
    ``preview`` is expected to raise ``DataDesignerGenerationError`` with a message
    containing "Dataset is empty" rather than a misleading profiler error.
    """
    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
        managed_assets_path=stub_managed_assets_path,
    )

    # Build the patcher first so both context managers fit on a single ``with``;
    # the patch only becomes active once the ``with`` block is entered.
    empty_preview = patch(
        "data_designer.engine.dataset_builders.column_wise_builder.ColumnWiseDatasetBuilder.process_preview",
        return_value=lazy.pd.DataFrame(),
    )
    with empty_preview, pytest.raises(DataDesignerGenerationError, match="Dataset is empty"):
        data_designer.preview(stub_sampler_only_config_builder, num_records=1)


def test_preview_with_dropped_columns(
stub_artifact_path, stub_model_providers, stub_model_configs, stub_managed_assets_path
):
Expand Down