Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions packages/data-designer/src/data_designer/cli/commands/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@ def create_command(
"-o",
help="Path where generated artifacts will be stored. Defaults to ./artifacts.",
),
output_format: str | None = typer.Option(
None,
"--output-format",
"-f",
help=(
"Export the dataset to a single file after generation. "
"Supported formats: jsonl, csv, parquet. "
"The file is written to <artifact-path>/<dataset-name>/dataset.<format>."
),
),
) -> None:
"""Create a full dataset and save results to disk.

Expand All @@ -60,4 +70,5 @@ def create_command(
num_records=num_records,
dataset_name=dataset_name,
artifact_path=artifact_path,
output_format=output_format,
)
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def run_create(
num_records: int,
dataset_name: str,
artifact_path: str | None,
output_format: str | None = None,
) -> None:
"""Load config, create a full dataset, and save results to disk.

Expand All @@ -124,7 +125,17 @@ def run_create(
num_records: Number of records to generate.
dataset_name: Name for the generated dataset folder.
artifact_path: Path where generated artifacts will be stored, or None for default.
output_format: If set, export the dataset to a single file in this format after
generation. One of 'jsonl', 'csv', 'parquet'.
"""
from data_designer.interface.results import SUPPORTED_EXPORT_FORMATS

if output_format is not None and output_format not in SUPPORTED_EXPORT_FORMATS:
print_error(
f"Unsupported export format: {output_format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}."
)
raise typer.Exit(code=1)

config_builder = self._load_config(config_source)

resolved_artifact_path = Path(artifact_path) if artifact_path else Path.cwd() / "artifacts"
Expand Down Expand Up @@ -157,6 +168,16 @@ def run_create(
console.print()
print_success(f"Dataset created — {len(dataset)} record(s) generated")
console.print(f" Artifacts saved to: [bold]{results.artifact_storage.base_dataset_path}[/bold]")

if output_format is not None:
export_path = results.artifact_storage.base_dataset_path / f"dataset.{output_format}"
try:
results.export(export_path, format=output_format) # type: ignore[arg-type]
except Exception as e:
print_error(f"Export failed: {e}")
raise typer.Exit(code=1)
console.print(f" Exported to: [bold]{export_path}[/bold]")

console.print()

def _load_config(self, config_source: str) -> DataDesignerConfigBuilder:
Expand Down
41 changes: 40 additions & 1 deletion packages/data-designer/src/data_designer/interface/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Literal

from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
from data_designer.config.config_builder import DataDesignerConfigBuilder
Expand All @@ -19,6 +19,9 @@

from data_designer.engine.dataset_builders.utils.task_model import TaskTrace

# Type-level constraint for DatasetCreationResults.export's `format` argument.
ExportFormat = Literal["jsonl", "csv", "parquet"]
# Runtime counterpart of ExportFormat; used for validation and error messages.
# Keep the two in sync — they must list the same formats.
SUPPORTED_EXPORT_FORMATS: tuple[str, ...] = ("jsonl", "csv", "parquet")


class DatasetCreationResults(WithRecordSamplerMixin):
"""Results container for a Data Designer dataset creation run.
Expand Down Expand Up @@ -95,6 +98,42 @@ def get_path_to_processor_artifacts(self, processor_name: str) -> Path:
raise ArtifactStorageError(f"Processor {processor_name} has no artifacts.")
return self.artifact_storage.processors_outputs_path / processor_name

def export(self, path: Path | str, *, format: ExportFormat = "jsonl") -> Path:
    """Export the generated dataset to a single file.

    Args:
        path: Output file path. The extension is not inferred from *format* —
            the exact path is used as-is. Missing parent directories are
            created automatically.
        format: Output format. One of ``'jsonl'``, ``'csv'``, or ``'parquet'``.
            Defaults to ``'jsonl'``.

    Returns:
        Path to the written file.

    Raises:
        ValueError: If an unsupported format is requested.

    Example:
        >>> results = data_designer.create(config, num_records=1000)
        >>> results.export("output.jsonl")
        PosixPath('output.jsonl')
        >>> results.export("output.csv", format="csv")
        PosixPath('output.csv')
    """
    # Validate eagerly so we fail before the (potentially expensive) dataset load.
    if format not in SUPPORTED_EXPORT_FORMATS:
        raise ValueError(
            f"Unsupported export format: {format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}."
        )
    path = Path(path)
    # Robustness fix: previously this raised FileNotFoundError when the target
    # directory did not exist; create it so callers can export anywhere.
    path.parent.mkdir(parents=True, exist_ok=True)
    df = self.load_dataset()
    if format == "jsonl":
        # force_ascii=False keeps non-ASCII text human-readable in the output.
        df.to_json(path, orient="records", lines=True, force_ascii=False)
    elif format == "csv":
        df.to_csv(path, index=False)
    elif format == "parquet":
        df.to_parquet(path, index=False)
    return path

def push_to_hub(
self,
repo_id: str,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,15 @@ def test_create_command_delegates_to_controller(mock_ctrl_cls: MagicMock) -> Non
mock_ctrl = MagicMock()
mock_ctrl_cls.return_value = mock_ctrl

create_command(config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None)
create_command(config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None, output_format=None)

mock_ctrl_cls.assert_called_once()
mock_ctrl.run_create.assert_called_once_with(
config_source="config.yaml",
num_records=10,
dataset_name="dataset",
artifact_path=None,
output_format=None,
)


Expand All @@ -40,13 +41,15 @@ def test_create_command_passes_custom_options(mock_ctrl_cls: MagicMock) -> None:
num_records=100,
dataset_name="my_data",
artifact_path="/custom/output",
output_format=None,
)

mock_ctrl.run_create.assert_called_once_with(
config_source="config.py",
num_records=100,
dataset_name="my_data",
artifact_path="/custom/output",
output_format=None,
)


Expand All @@ -56,11 +59,35 @@ def test_create_command_default_artifact_path_is_none(mock_ctrl_cls: MagicMock)
mock_ctrl = MagicMock()
mock_ctrl_cls.return_value = mock_ctrl

create_command(config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None)
create_command(config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None, output_format=None)

mock_ctrl.run_create.assert_called_once_with(
config_source="config.yaml",
num_records=5,
dataset_name="ds",
artifact_path=None,
output_format=None,
)


@patch("data_designer.cli.commands.create.GenerationController")
def test_create_command_passes_output_format(mock_ctrl_cls: MagicMock) -> None:
    """create_command forwards --output-format to the controller unchanged."""
    controller = MagicMock()
    mock_ctrl_cls.return_value = controller

    # One kwargs dict guarantees the command is called with exactly what we
    # expect to see forwarded to run_create.
    expected_kwargs = {
        "config_source": "config.yaml",
        "num_records": 10,
        "dataset_name": "dataset",
        "artifact_path": None,
        "output_format": "jsonl",
    }
    create_command(**expected_kwargs)

    controller.run_create.assert_called_once_with(**expected_kwargs)
1 change: 1 addition & 0 deletions packages/data-designer/tests/cli/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,5 @@ def test_app_dispatches_lazy_create_command(mock_controller_cls: Mock) -> None:
num_records=DEFAULT_NUM_RECORDS,
dataset_name="dataset",
artifact_path=None,
output_format=None,
)
71 changes: 71 additions & 0 deletions packages/data-designer/tests/interface/test_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,77 @@ def test_load_dataset_independent_of_record_sampler_cache(stub_dataset_creation_
stub_artifact_storage.load_dataset.assert_called_once()


@pytest.mark.parametrize("fmt", ["jsonl", "csv", "parquet"])
def test_export_writes_file(stub_dataset_creation_results, tmp_path, fmt):
    """export() produces a non-empty file for every supported format."""
    target = tmp_path / f"out.{fmt}"

    returned = stub_dataset_creation_results.export(target, format=fmt)

    assert returned == target
    assert target.exists()
    assert target.stat().st_size > 0


def test_export_jsonl_content(stub_dataset_creation_results, stub_dataframe, tmp_path):
    """JSONL export emits exactly one valid JSON object per record."""
    import json

    destination = tmp_path / "out.jsonl"
    stub_dataset_creation_results.export(destination, format="jsonl")

    raw_lines = destination.read_text(encoding="utf-8").splitlines()
    assert len(raw_lines) == len(stub_dataframe)
    # Parsing every line proves the file is line-delimited JSON throughout;
    # json.loads raises (failing the test) on any malformed line.
    parsed = [json.loads(raw) for raw in raw_lines]
    assert len(parsed) == len(raw_lines)


def test_export_csv_content(stub_dataset_creation_results, stub_dataframe, tmp_path):
    """CSV export preserves the column set and the record count."""
    import data_designer.lazy_heavy_imports as lazy

    destination = tmp_path / "out.csv"
    stub_dataset_creation_results.export(destination, format="csv")

    round_tripped = lazy.pd.read_csv(destination)
    # Header row must match the source columns; body must have one row per record.
    assert list(round_tripped.columns) == list(stub_dataframe.columns)
    assert len(round_tripped) == len(stub_dataframe)


def test_export_parquet_content(stub_dataset_creation_results, stub_dataframe, tmp_path):
    """Parquet export round-trips losslessly to the original DataFrame."""
    import data_designer.lazy_heavy_imports as lazy

    destination = tmp_path / "out.parquet"
    stub_dataset_creation_results.export(destination, format="parquet")

    # Drop the index on both sides so only column data is compared.
    recovered = lazy.pd.read_parquet(destination).reset_index(drop=True)
    expected = stub_dataframe.reset_index(drop=True)
    lazy.pd.testing.assert_frame_equal(recovered, expected)


def test_export_default_format_is_jsonl(stub_dataset_creation_results, tmp_path):
    """export() defaults to JSONL when no format is given."""
    import json

    out = tmp_path / "out.jsonl"
    stub_dataset_creation_results.export(out)
    lines = out.read_text(encoding="utf-8").splitlines()
    # Fix: without this guard the test passed vacuously on an empty output
    # file, since the loop below iterates zero times.
    assert lines
    # All lines must be valid JSON
    for line in lines:
        json.loads(line)


def test_export_unsupported_format_raises(stub_dataset_creation_results, tmp_path):
    """export() raises ValueError for unknown formats."""
    target = tmp_path / "out.xyz"
    # "xlsx" is deliberately outside SUPPORTED_EXPORT_FORMATS.
    with pytest.raises(ValueError, match="Unsupported export format"):
        stub_dataset_creation_results.export(target, format="xlsx")  # type: ignore[arg-type]


def test_export_returns_path_object(stub_dataset_creation_results, tmp_path):
    """export() returns a Path regardless of whether str or Path was passed."""
    from pathlib import Path

    # Pass a plain string on purpose; the return value must still be a Path.
    destination = str(tmp_path / "out.jsonl")
    returned = stub_dataset_creation_results.export(destination)
    assert isinstance(returned, Path)


def test_preview_results_dataset_metadata() -> None:
"""Test that PreviewResults uses DatasetMetadata in display_sample_record."""
config_builder = MagicMock(spec=DataDesignerConfigBuilder)
Expand Down
Loading