diff --git a/docs/assets/recipes/plugin_development/markdown_seed_reader.py b/docs/assets/recipes/plugin_development/markdown_seed_reader.py
new file mode 100644
index 000000000..8388874eb
--- /dev/null
+++ b/docs/assets/recipes/plugin_development/markdown_seed_reader.py
@@ -0,0 +1,284 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "data-designer",
+# ]
+# ///
+"""Markdown Section Seed Reader Recipe
+
+Prototype a custom FileSystemSeedReader inline by overriding how one
+DataDesigner instance handles DirectorySeedSource inputs. The reader keeps a
+file-based manifest and fans each Markdown file out into one row per section.
+This keeps the example in the same single-file format as the other recipes
+while still showing the core `build_manifest(...)` and `hydrate_row(...)`
+contract for a custom filesystem-backed seed reader.
+
+Run:
+    uv run markdown_seed_reader.py
+"""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Any, ClassVar
+
+import data_designer.config as dd
+from data_designer.config.seed import IndexRange
+from data_designer.engine.resources.seed_reader import FileSystemSeedReader, SeedReaderFileSystemContext
+from data_designer.interface import DataDesigner
+
+# ATX heading: one to six `#` characters, required whitespace, then the title text.
+_ATX_HEADING_PATTERN = re.compile(r"^(#{1,6})[ \t]+(.+?)\s*$")
+# Code fence: up to three spaces of indentation, then a run of three or more backticks or tildes.
+_CODE_FENCE_PATTERN = re.compile(r"^ {0,3}(`{3,}|~{3,})")
+
+
+class MarkdownSectionDirectorySeedReader(FileSystemSeedReader[dd.DirectorySeedSource]):
+    """Turn each Markdown file matched by DirectorySeedSource into section rows.
+
+    The manifest holds one row per matched file. Hydration reads that file and
+    emits one record per heading section, so one manifest row can fan out into
+    several seed rows.
+    """
+
+    # Keys carried by every record that `hydrate_row(...)` emits.
+    output_columns: ClassVar[list[str]] = [
+        "relative_path",
+        "file_name",
+        "section_index",
+        "section_header",
+        "section_content",
+    ]
+
+    def build_manifest(self, *, context: SeedReaderFileSystemContext) -> list[dict[str, str]]:
+        """Return one cheap manifest row per matched Markdown file.
+
+        File contents are not read here; each row only records the matched
+        relative path and its final path component.
+        """
+
+        matched_paths = self.get_matching_relative_paths(
+            context=context,
+            file_pattern=self.source.file_pattern,
+            recursive=self.source.recursive,
+        )
+        return [
+            {
+                "relative_path": relative_path,
+                "file_name": Path(relative_path).name,
+            }
+            for relative_path in matched_paths
+        ]
+
+    def hydrate_row(
+        self,
+        *,
+        manifest_row: dict[str, Any],
+        context: SeedReaderFileSystemContext,
+    ) -> list[dict[str, Any]]:
+        """Read one Markdown file and fan it out into one record per heading section.
+
+        Args:
+            manifest_row: A row shaped like the ones `build_manifest(...)` returns;
+                its `relative_path` and `file_name` values are used.
+            context: Filesystem context whose `fs` object opens the file.
+
+        Returns:
+            One record per extracted section, with `section_index` counting
+            from zero within the file.
+        """
+
+        relative_path = str(manifest_row["relative_path"])
+        file_name = str(manifest_row["file_name"])
+        with context.fs.open(relative_path, "r", encoding="utf-8") as handle:
+            markdown_text = handle.read()
+
+        # The file name doubles as the header for any text before the first heading.
+        sections = extract_markdown_sections(markdown_text=markdown_text, fallback_header=file_name)
+        return [
+            {
+                "relative_path": relative_path,
+                "file_name": file_name,
+                "section_index": section_index,
+                "section_header": section_header,
+                "section_content": section_content,
+            }
+            for section_index, (section_header, section_content) in enumerate(sections)
+        ]
+
+
+def extract_markdown_sections(*, markdown_text: str, fallback_header: str) -> list[tuple[str, str]]:
+    """Split Markdown into `(header, content)` pairs using ATX headings.
+
+    Non-blank text before the first heading, or a document with no headings at
+    all, is emitted under `fallback_header`. Lines inside fenced code blocks are
+    never treated as headings, so a `# comment` in a shell or Python snippet
+    stays in the surrounding section instead of starting a new one.
+
+    Args:
+        markdown_text: Raw Markdown document text.
+        fallback_header: Header used for content that precedes the first heading.
+
+    Returns:
+        `(header, content)` tuples in document order, with leading and trailing
+        whitespace stripped from each content block. Pairs whose header and
+        content are both empty are dropped.
+    """
+
+    sections: list[tuple[str, str]] = []
+    current_header = fallback_header
+    current_lines: list[str] = []
+    saw_heading = False
+    open_fence: str | None = None
+
+    for line in markdown_text.splitlines():
+        fence_match = _CODE_FENCE_PATTERN.match(line)
+        if fence_match is not None:
+            fence = fence_match.group(1)
+            remainder = line[fence_match.end() :]
+            if open_fence is None:
+                # A backtick fence cannot carry backticks in its info string;
+                # such a line is inline code, not the start of a block.
+                if fence[0] != "`" or "`" not in remainder:
+                    open_fence = fence
+            elif fence[0] == open_fence[0] and len(fence) >= len(open_fence) and not remainder.strip():
+                # A closing fence reuses the opening character, is at least as
+                # long as the opener, and has nothing after it.
+                open_fence = None
+            current_lines.append(line)
+            continue
+
+        heading_match = _ATX_HEADING_PATTERN.match(line) if open_fence is None else None
+        if heading_match is None:
+            current_lines.append(line)
+            continue
+
+        # Close the running section, but skip a preamble made only of blank lines.
+        if saw_heading or any(existing_line.strip() for existing_line in current_lines):
+            sections.append((current_header, "\n".join(current_lines).strip()))
+        current_header = heading_match.group(2).strip()
+        current_lines = []
+        saw_heading = True
+
+    if saw_heading or markdown_text.strip():
+        sections.append((current_header, "\n".join(current_lines).strip()))
+
+    return [
+        (section_header, section_content)
+        for section_header, section_content in sections
+        if section_header or section_content
+    ]
+
+
+def create_sample_markdown_files(seed_dir: Path) -> None:
+    """Create a tiny Markdown corpus that keeps the recipe self-contained.
+
+    Writes `faq.md` and `guide.md` into `seed_dir`; each file has two headings.
+    """
+
+    (seed_dir / "faq.md").write_text(
+        "# FAQ\nAnswers to frequent questions.\n\n## Support\nContact support@example.com.",
+        encoding="utf-8",
+    )
+    (seed_dir / "guide.md").write_text(
+        "# Quickstart\nInstall Data Designer.\n\n## Usage\nRun the recipe with uv.",
+        encoding="utf-8",
+    )
+
+
+def build_config(
+    *,
+    seed_path: Path,
+    selection_strategy: IndexRange | None = None,
+) -> dd.DataDesignerConfigBuilder:
+    """Create the dataset config used by both preview runs in the recipe.
+
+    Args:
+        seed_path: Directory that holds the Markdown seed files.
+        selection_strategy: Optional selection forwarded to `with_seed_dataset(...)`.
+
+    Returns:
+        A builder with the directory seed source and a `section_summary`
+        expression column attached.
+    """
+
+    config_builder = dd.DataDesignerConfigBuilder()
+    config_builder.with_seed_dataset(
+        dd.DirectorySeedSource(path=str(seed_path), file_pattern="*.md"),
+        selection_strategy=selection_strategy,
+    )
+    config_builder.add_column(
+        dd.ExpressionColumnConfig(
+            name="section_summary",
+            expr="{{ file_name }} :: {{ section_header }}",
+        )
+    )
+    return config_builder
+
+
+def print_preview(
+    *,
+    data_designer: DataDesigner,
+    title: str,
+    config_builder: dd.DataDesignerConfigBuilder,
+    num_records: int,
+) -> None:
+    """Run a preview and print the columns that matter for the walkthrough.
+
+    Args:
+        data_designer: Instance used to run the preview.
+        title: Line printed above the preview table.
+        config_builder: Dataset config to preview.
+        num_records: Record count passed to `preview(...)`.
+    """
+
+    print(title)
+    preview = data_designer.preview(config_builder, num_records=num_records)
+    print(
+        preview.dataset[
+            [
+                "relative_path",
+                "section_index",
+                "section_header",
+                "section_summary",
+            ]
+        ].to_string(index=False)
+    )
+    print()
+
+
+def main() -> None:
+    """Build sample input files and print previews with and without selection."""
+
+    with TemporaryDirectory(prefix="markdown-seed-reader-") as temp_dir:
+        seed_dir = Path(temp_dir) / "sample_markdown"
+        seed_dir.mkdir()
+        create_sample_markdown_files(seed_dir)
+
+        data_designer = DataDesigner(seed_readers=[MarkdownSectionDirectorySeedReader()])
+
+        # Two sample files with two headings each give four section rows.
+        print_preview(
+            data_designer=data_designer,
+            title="Full preview across all markdown files",
+            config_builder=build_config(seed_path=seed_dir),
+            num_records=4,
+        )
+        # The index range is applied to manifest rows (files), not to section rows.
+        print_preview(
+            data_designer=data_designer,
+            title="Manifest-based selection of only the second matched file",
+            config_builder=build_config(
+                seed_path=seed_dir,
+                selection_strategy=IndexRange(start=1, end=1),
+            ),
+            num_records=2,
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/plugins/example.md b/docs/plugins/example.md
index 4f9e561f4..ce847be93 100644
--- a/docs/plugins/example.md
+++ b/docs/plugins/example.md
@@ -4,7 +4,7 @@
 
 # Example Plugin: Column Generator
 
-Data Designer supports two plugin types: **column generators** and **seed readers**. This page walks through a complete column generator example.
+Data Designer supports three plugin types: **column generators**, **seed readers**, and **processors**. This page walks through a complete column generator example. For filesystem-backed seed reader plugins, see [FileSystemSeedReader Plugins](filesystem_seed_reader.md).
 
 A Data Designer plugin is implemented as a Python package with three main components:
 
diff --git a/docs/plugins/filesystem_seed_reader.md b/docs/plugins/filesystem_seed_reader.md
new file mode 100644
index 000000000..04e32daff
--- /dev/null
+++ b/docs/plugins/filesystem_seed_reader.md
@@ -0,0 +1,167 @@
+# FileSystemSeedReader Plugins
+
+!!! warning "Experimental Feature"
+    The plugin system is currently **experimental** and under active development. The documentation, examples, and plugin interface are subject to significant changes in future releases. If you encounter any issues, have questions, or have ideas for improvement, please consider starting [a discussion on GitHub](https://github.com/NVIDIA-NeMo/DataDesigner/discussions).
+
+`FileSystemSeedReader` is the simplest way to build a seed reader plugin when your source data lives in a directory of files. You describe the files cheaply in `build_manifest(...)`, then optionally read and reshape them in `hydrate_row(...)`.
+ +This guide focuses on the filesystem-specific contract. The fastest way to learn it is usually to start with an inline reader over `DirectorySeedSource`, then package that reader later only if you need automatic plugin discovery or a brand-new `seed_type`. For a runnable single-file example, see the [Markdown Section Seed Reader recipe](../recipes/plugin_development/markdown_seed_reader.md). + +## What the framework owns + +When you inherit from `FileSystemSeedReader`, Data Designer already handles: + +- attachment-scoped filesystem context reuse +- file matching with `file_pattern` and `recursive` +- manifest sampling, `IndexRange`, `PartitionBlock`, and shuffle +- batching and DuckDB registration +- hydrated output schema validation via `output_columns` + +Most readers only need to implement `build_manifest(...)` and `hydrate_row(...)`. + +## Start with an existing filesystem config + +If your source data already fits `DirectorySeedSource` or `FileContentsSeedSource`, you do not need a new config model just to learn or prototype a reader. Reuse the built-in source type and override how one `DataDesigner` instance interprets that seed source. 
+ +The Markdown recipe uses `DirectorySeedSource(path=..., file_pattern="*.md")` and pairs it with an inline reader: + +```python +import data_designer.config as dd +from pathlib import Path +from typing import Any + +from data_designer.engine.resources.seed_reader import FileSystemSeedReader, SeedReaderFileSystemContext + + +class MarkdownSectionDirectorySeedReader(FileSystemSeedReader[dd.DirectorySeedSource]): + output_columns = [ + "relative_path", + "file_name", + "section_index", + "section_header", + "section_content", + ] + + def build_manifest(self, *, context: SeedReaderFileSystemContext) -> list[dict[str, str]]: + matched_paths = self.get_matching_relative_paths( + context=context, + file_pattern=self.source.file_pattern, + recursive=self.source.recursive, + ) + return [ + { + "relative_path": relative_path, + "file_name": Path(relative_path).name, + } + for relative_path in matched_paths + ] + + def hydrate_row( + self, + *, + manifest_row: dict[str, Any], + context: SeedReaderFileSystemContext, + ) -> list[dict[str, Any]]: + ... +``` + +This approach lets you inspect the manifest and hydration contract without first creating a package, entry points, or a new `seed_type`. + +## Step 1: Build a cheap manifest + +`build_manifest(...)` should be inexpensive. Usually that means enumerating matching files and returning one logical row per file, without reading file contents yet. + +In this example, the manifest only tracks: + +- `relative_path` +- `file_name` + +That keeps selection and partitioning file-based. 
+ +## Step 2: Hydrate one file into one or many rows + +`hydrate_row(...)` can return either: + +- a single record dict for `1:1` hydration +- an iterable of record dicts for `1:N` hydration + +If hydration changes the schema, set `output_columns` to the exact emitted schema: + +```python +output_columns = [ + "relative_path", + "file_name", + "section_index", + "section_header", + "section_content", +] +``` + +In the recipe implementation, `hydrate_row(...)` reads one file and emits one record per ATX heading section. + +Every emitted record must match `output_columns` exactly. Data Designer will raise a plugin-facing error if a hydrated record is missing a declared column or includes an undeclared one. + +## Step 3: Pass the reader to Data Designer + +Register the inline reader on the `DataDesigner` instance you want to use: + +```python +import data_designer.config as dd +from data_designer.interface import DataDesigner + +data_designer = DataDesigner(seed_readers=[MarkdownSectionDirectorySeedReader()]) + +builder = dd.DataDesignerConfigBuilder() +builder.with_seed_dataset( + dd.DirectorySeedSource(path="sample_data", file_pattern="*.md"), +) +``` + +That pattern overrides how this `DataDesigner` instance handles the built-in `directory` seed source. Because `seed_readers` sets the registry for that instance, include any other readers you still want available. This is a good fit for local experiments, tests, and docs recipes. + +## Manifest-Based Selection Semantics + +Selection stays manifest-based even when `hydrate_row(...)` fans out. 
+ +If the matched files are: + +```text +0 -> faq.md +1 -> guide.md +``` + +and `guide.md` hydrates into two section rows, then: + +```python +import data_designer.config as dd +from data_designer.config.seed import IndexRange + +builder.with_seed_dataset( + dd.DirectorySeedSource(path="sample_data", file_pattern="*.md"), + selection_strategy=IndexRange(start=1, end=1), +) +``` + +selects only `guide.md`, then returns **all** section rows emitted from `guide.md`. + +That means `get_seed_dataset_size()`, `IndexRange`, `PartitionBlock`, and shuffle all operate on manifest rows before hydration. + +## Package it later when needed + +If you want the same reader to be installable and auto-discovered as a plugin, then move from the inline pattern to a package: + +- define a config class that inherits from `FileSystemSeedSource` +- give it a unique `seed_type` +- create a `Plugin` object with `plugin_type=PluginType.SEED_READER` +- register that plugin via a `data_designer.plugins` entry point + +That extra packaging step is only necessary when you need a reusable plugin boundary. The reader logic itself still lives in the same `build_manifest(...)` and `hydrate_row(...)` methods shown above. + +## Advanced Hooks + +If you need more control, `FileSystemSeedReader` also lets you override: + +- `on_attach(...)` for per-attachment setup +- `create_filesystem_context(...)` for custom rooted filesystem behavior + +Most filesystem plugins do not need either hook. diff --git a/docs/plugins/overview.md b/docs/plugins/overview.md index cbfa30490..45a469c80 100644 --- a/docs/plugins/overview.md +++ b/docs/plugins/overview.md @@ -25,7 +25,7 @@ uv pip install -e /path/to/your/plugin pip install data-designer-{plugin-name} ``` -Once installed, plugins are automatically discovered and ready to use — no additional registration or configuration needed. See the [example plugin](example.md) for a complete walkthrough. 
+Once installed, plugins are automatically discovered and ready to use — no additional registration or configuration needed. See the [example plugin](example.md) for a complete walkthrough, or jump to [FileSystemSeedReader Plugins](filesystem_seed_reader.md) for filesystem-backed seed reader authoring. ## How do you create plugins? @@ -41,7 +41,7 @@ Each plugin has three components, and we recommend organizing them into separate - Processor plugins: inherit from `ProcessorConfig` with a `processor_type` discriminator - **`impl.py`** -- Implementation class containing the core logic - Column generator plugins: inherit from `ColumnGeneratorFullColumn` or `ColumnGeneratorCellByCell` - - Seed reader plugins: inherit from `SeedReader` + - Seed reader plugins: inherit from `SeedReader`, or from `FileSystemSeedReader` for directory-backed sources - Processor plugins: inherit from `Processor` and override callback methods (`process_before_batch`, `process_after_batch`, `process_after_generation`) - **`plugin.py`** -- A `Plugin` instance that connects the config and implementation classes @@ -81,4 +81,8 @@ my_processor_plugin = Plugin( ) ``` -**Ready to get started?** See the [Example Plugin](example.md) for a complete walkthrough of creating a column generator plugin. +**Ready to get started?** + +- See the [Example Plugin](example.md) for a column generator walkthrough +- See [FileSystemSeedReader Plugins](filesystem_seed_reader.md) for filesystem-backed seed reader plugins +- See the [Markdown Section Seed Reader recipe](../recipes/plugin_development/markdown_seed_reader.md) for a runnable single-file `1:N` filesystem reader example diff --git a/docs/recipes/cards.md b/docs/recipes/cards.md index acfac41d8..58ac0fae5 100644 --- a/docs/recipes/cards.md +++ b/docs/recipes/cards.md @@ -154,4 +154,22 @@ Each recipe is a self-contained example that can be run independently. 
[:material-book-open-page-variant: View Recipe](mcp_and_tooluse/search_agent.md){ .md-button } [Download Code :octicons-download-24:](../assets/recipes/mcp_and_tooluse/search_agent.py){ .md-button download="search_agent.py" } +- :material-file-document-multiple:{ .lg .middle } **Markdown Section Seed Reader** + + Define a custom `FileSystemSeedReader` inline and turn Markdown files into one seed row per heading section. + + --- + + **Demonstrates:** + + - Single-file custom seed reader pattern + - `hydrate_row()` fanout from `1 -> N` + - Manifest-based file selection semantics + - `DirectorySeedSource` customization without a new `seed_type` + + --- + + [:material-book-open-page-variant: View Recipe](plugin_development/markdown_seed_reader.md){ .md-button } + [Download Code :octicons-download-24:](../assets/recipes/plugin_development/markdown_seed_reader.py){ .md-button download="markdown_seed_reader.py" } + diff --git a/docs/recipes/plugin_development/markdown_seed_reader.md b/docs/recipes/plugin_development/markdown_seed_reader.md new file mode 100644 index 000000000..22c8a8aed --- /dev/null +++ b/docs/recipes/plugin_development/markdown_seed_reader.md @@ -0,0 +1,31 @@ +# Markdown Section Seed Reader + +Turn a directory of Markdown files into a seed dataset with one row per section. This recipe stays in the same single-file format as the other recipes: it creates sample files, defines an inline `FileSystemSeedReader[DirectorySeedSource]`, and passes that reader to `DataDesigner(seed_readers=[...])`. + +This keeps the example focused on the actual seed reader contract: + +- implementing `build_manifest(...)` +- returning `1:N` hydrated rows from `hydrate_row(...)` +- declaring `output_columns` for the hydrated schema +- keeping `IndexRange` selection manifest-based + +Because the example reuses `DirectorySeedSource`, it does not register a brand-new `seed_type`. 
If you later want to package the same reader as an installable plugin, see [FileSystemSeedReader Plugins](../../plugins/filesystem_seed_reader.md). + +## Run the Recipe + +Run the script directly: + +```bash +uv run markdown_seed_reader.py +``` + +The script prints two previews: + +- the full section dataset across all Markdown files +- a manifest-only selection using `IndexRange(start=1, end=1)` that still returns every section from the selected file + +[Download Code :octicons-download-24:](../../assets/recipes/plugin_development/markdown_seed_reader.py){ .md-button download="markdown_seed_reader.py" } + +```python +--8<-- "assets/recipes/plugin_development/markdown_seed_reader.py" +``` diff --git a/mkdocs.yml b/mkdocs.yml index fd2cf2d5e..89e945da1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -44,6 +44,8 @@ nav: - Text to Python: recipes/code_generation/text_to_python.md - Text to SQL: recipes/code_generation/text_to_sql.md - "Nemotron Super Text to SQL": recipes/code_generation/enterprise_text_to_sql.md + - Plugin Development: + - Markdown Section Seed Reader: recipes/plugin_development/markdown_seed_reader.md - QA and Chat: - Product Info QA: recipes/qa_and_chat/product_info_qa.md - Multi-Turn Chat: recipes/qa_and_chat/multi_turn_chat.md @@ -54,6 +56,7 @@ nav: - Plugins: - Overview: plugins/overview.md - Example Plugin: plugins/example.md + - FileSystemSeedReader Plugins: plugins/filesystem_seed_reader.md - Available Plugin List: plugins/available.md - Code Reference: - models: code_reference/models.md