Skip to content

Commit e411688

Browse files
committed
test: trim FileSystemSeedReader follow-up coverage
1 parent 3d33680 commit e411688

3 files changed

Lines changed: 133 additions & 125 deletions

File tree

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from __future__ import annotations
5+
6+
from pathlib import Path
7+
from typing import Any
8+
9+
import data_designer.lazy_heavy_imports as lazy
10+
from data_designer.config.seed_source import DirectorySeedSource
11+
from data_designer.engine.resources.seed_reader import FileSystemSeedReader, SeedReaderFileSystemContext
12+
13+
14+
class LineFanoutDirectorySeedReader(FileSystemSeedReader[DirectorySeedSource]):
    """Seed reader that fans each matched file out into one record per line.

    The manifest contains one row per matched file. Hydrating a manifest row
    reads that file as UTF-8 text and emits a record for every line in it.
    Each hydrated path is appended to ``hydrated_relative_paths`` so callers
    can inspect which manifest rows were hydrated and in what order.
    """

    def __init__(self, *, include_file_name: bool = False) -> None:
        """Declare the output columns and start with an empty hydration log.

        Args:
            include_file_name: When true, a ``file_name`` column is declared
                directly after ``relative_path`` and is carried through both
                manifest rows and hydrated records.
        """
        self._include_file_name = include_file_name
        self.hydrated_relative_paths: list[str] = []
        leading_columns = ["relative_path", "file_name"] if include_file_name else ["relative_path"]
        self.output_columns = [*leading_columns, "line_index", "line"]

    def build_manifest(self, *, context: SeedReaderFileSystemContext) -> lazy.pd.DataFrame | list[dict[str, str]]:
        """Return one manifest row for every path matching the attached source."""
        return list(
            map(
                self._build_manifest_row,
                self.get_matching_relative_paths(
                    context=context,
                    file_pattern=self.source.file_pattern,
                    recursive=self.source.recursive,
                ),
            )
        )

    def hydrate_row(
        self,
        *,
        manifest_row: dict[str, Any],
        context: SeedReaderFileSystemContext,
    ) -> list[dict[str, Any]]:
        """Read the file named by ``manifest_row`` and return one record per line.

        The path is logged in ``hydrated_relative_paths`` before the file is
        opened. A file with no lines yields an empty list.
        """
        path_key = str(manifest_row["relative_path"])
        self.hydrated_relative_paths.append(path_key)
        with context.fs.open(path_key, "r", encoding="utf-8") as text_file:
            file_text = text_file.read()

        records: list[dict[str, Any]] = []
        for position, text in enumerate(file_text.splitlines()):
            # The file name is looked up per line, so a manifest row is only
            # required to carry it when the file actually produces records.
            name = str(manifest_row["file_name"]) if self._include_file_name else None
            records.append(
                self._build_hydrated_row(
                    relative_path=path_key,
                    file_name=name,
                    line_index=position,
                    line=text,
                )
            )
        return records

    def _build_manifest_row(self, relative_path: str) -> dict[str, str]:
        """Build the manifest entry for one path, adding ``file_name`` if enabled."""
        if not self._include_file_name:
            return {"relative_path": relative_path}
        return {"relative_path": relative_path, "file_name": Path(relative_path).name}

    def _build_hydrated_row(
        self,
        *,
        relative_path: str,
        file_name: str | None,
        line_index: int,
        line: str,
    ) -> dict[str, Any]:
        """Build the record for a single line; ``file_name`` is added last when given."""
        record: dict[str, Any] = {
            "relative_path": relative_path,
            "line_index": line_index,
            "line": line,
        }
        if file_name is None:
            return record
        record["file_name"] = file_name
        return record

packages/data-designer-engine/tests/engine/resources/test_seed_reader.py

Lines changed: 59 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from __future__ import annotations
55

6+
from collections.abc import Callable
67
from pathlib import Path
78
from typing import Any
89

@@ -23,6 +24,7 @@
2324
SeedReaderRegistry,
2425
)
2526
from data_designer.engine.secret_resolver import PlaintextResolver
27+
from data_designer.engine.testing.seed_readers import LineFanoutDirectorySeedReader
2628

2729

2830
class TrackingFileContentsSeedReader(FileContentsSeedReader):
@@ -134,50 +136,10 @@ def hydrate_row(
134136
}
135137

136138

137-
class FanoutDirectorySeedReader(FileSystemSeedReader[DirectorySeedSource]):
138-
output_columns = ["relative_path", "file_name", "line_index", "line"]
139-
140-
def __init__(self) -> None:
141-
self.hydrated_relative_paths: list[str] = []
142-
143-
def build_manifest(self, *, context: SeedReaderFileSystemContext) -> lazy.pd.DataFrame | list[dict[str, str]]:
144-
matched_paths = self.get_matching_relative_paths(
145-
context=context,
146-
file_pattern=self.source.file_pattern,
147-
recursive=self.source.recursive,
148-
)
149-
return [
150-
{
151-
"relative_path": relative_path,
152-
"file_name": Path(relative_path).name,
153-
}
154-
for relative_path in matched_paths
155-
]
156-
157-
def hydrate_row(
158-
self,
159-
*,
160-
manifest_row: dict[str, str],
161-
context: SeedReaderFileSystemContext,
162-
) -> list[dict[str, Any]]:
163-
relative_path = manifest_row["relative_path"]
164-
self.hydrated_relative_paths.append(relative_path)
165-
with context.fs.open(relative_path, "r", encoding="utf-8") as handle:
166-
lines = handle.read().splitlines()
167-
return [
168-
{
169-
"relative_path": relative_path,
170-
"file_name": manifest_row["file_name"],
171-
"line_index": line_index,
172-
"line": line,
173-
}
174-
for line_index, line in enumerate(lines)
175-
]
176-
177-
178-
class InvalidHydrationReturnSeedReader(FileSystemSeedReader[DirectorySeedSource]):
179-
def __init__(self, hydrated_return: Any) -> None:
139+
class ConfigurableHydrationDirectorySeedReader(FileSystemSeedReader[DirectorySeedSource]):
140+
def __init__(self, *, hydrated_return: Any, output_columns: list[str] | None = None) -> None:
180141
self._hydrated_return = hydrated_return
142+
self.output_columns = output_columns
181143

182144
def build_manifest(self, *, context: SeedReaderFileSystemContext) -> lazy.pd.DataFrame | list[dict[str, str]]:
183145
matched_paths = self.get_matching_relative_paths(
@@ -190,36 +152,13 @@ def build_manifest(self, *, context: SeedReaderFileSystemContext) -> lazy.pd.Dat
190152
def hydrate_row(
191153
self,
192154
*,
193-
manifest_row: dict[str, str],
155+
manifest_row: dict[str, Any],
194156
context: SeedReaderFileSystemContext,
195157
) -> Any:
196158
del manifest_row, context
197159
return self._hydrated_return
198160

199161

200-
class SchemaMismatchFanoutSeedReader(FileSystemSeedReader[DirectorySeedSource]):
201-
def __init__(self, *, output_columns: list[str], hydrated_rows: list[dict[str, str]]) -> None:
202-
self.output_columns = output_columns
203-
self._hydrated_rows = hydrated_rows
204-
205-
def build_manifest(self, *, context: SeedReaderFileSystemContext) -> lazy.pd.DataFrame | list[dict[str, str]]:
206-
matched_paths = self.get_matching_relative_paths(
207-
context=context,
208-
file_pattern=self.source.file_pattern,
209-
recursive=self.source.recursive,
210-
)
211-
return [{"relative_path": relative_path} for relative_path in matched_paths]
212-
213-
def hydrate_row(
214-
self,
215-
*,
216-
manifest_row: dict[str, str],
217-
context: SeedReaderFileSystemContext,
218-
) -> list[dict[str, str]]:
219-
del manifest_row, context
220-
return self._hydrated_rows
221-
222-
223162
class ContextCountingDirectorySeedReader(FileSystemSeedReader[DirectorySeedSource]):
224163
def __init__(self) -> None:
225164
self.filesystem_context_calls = 0
@@ -237,6 +176,16 @@ def build_manifest(self, *, context: SeedReaderFileSystemContext) -> lazy.pd.Dat
237176
return [{"relative_path": relative_path} for relative_path in matched_paths]
238177

239178

179+
@pytest.fixture
def write_alpha_beta_text_files(tmp_path: Path) -> Callable[[str, str], Path]:
    """Provide a writer that populates ``alpha.txt`` and ``beta.txt`` in ``tmp_path``.

    The returned callable takes the contents for the two files, writes both as
    UTF-8 (alpha first, then beta), and returns the directory holding them.
    """

    def _write_alpha_beta_text_files(alpha_contents: str, beta_contents: str) -> Path:
        contents_by_name = {"alpha.txt": alpha_contents, "beta.txt": beta_contents}
        for file_name, contents in contents_by_name.items():
            (tmp_path / file_name).write_text(contents, encoding="utf-8")
        return tmp_path

    return _write_alpha_beta_text_files
187+
188+
240189
def test_one_reader_per_seed_type():
241190
local_1 = LocalFileSeedReader()
242191
local_2 = LocalFileSeedReader()
@@ -308,7 +257,7 @@ def test_plugin_style_filesystem_seed_reader_can_fan_out_rows(tmp_path: Path) ->
308257
(tmp_path / "alpha.txt").write_text("alpha-0\nalpha-1", encoding="utf-8")
309258
(tmp_path / "beta.txt").write_text("beta-0", encoding="utf-8")
310259

311-
reader = FanoutDirectorySeedReader()
260+
reader = LineFanoutDirectorySeedReader(include_file_name=True)
312261
reader.attach(
313262
DirectorySeedSource(path=str(tmp_path), file_pattern="*.txt"),
314263
PlaintextResolver(),
@@ -444,13 +393,14 @@ def test_file_contents_seed_reader_hydrates_only_selected_manifest_rows(tmp_path
444393
assert reader.hydrated_relative_paths == ["beta.txt"]
445394

446395

447-
def test_filesystem_seed_reader_fanout_keeps_manifest_based_index_selection(tmp_path: Path) -> None:
448-
(tmp_path / "alpha.txt").write_text("alpha-0\nalpha-1", encoding="utf-8")
449-
(tmp_path / "beta.txt").write_text("beta-0\nbeta-1", encoding="utf-8")
396+
def test_filesystem_seed_reader_fanout_keeps_manifest_based_index_selection(
397+
write_alpha_beta_text_files: Callable[[str, str], Path],
398+
) -> None:
399+
seed_dir = write_alpha_beta_text_files("alpha-0\nalpha-1", "beta-0\nbeta-1")
450400

451-
reader = FanoutDirectorySeedReader()
401+
reader = LineFanoutDirectorySeedReader(include_file_name=True)
452402
reader.attach(
453-
DirectorySeedSource(path=str(tmp_path), file_pattern="*.txt"),
403+
DirectorySeedSource(path=str(seed_dir), file_pattern="*.txt"),
454404
PlaintextResolver(),
455405
)
456406

@@ -467,14 +417,13 @@ def test_filesystem_seed_reader_fanout_keeps_manifest_based_index_selection(tmp_
467417

468418

469419
def test_filesystem_seed_reader_batch_reader_raises_for_selected_manifest_rows_with_empty_fanout(
470-
tmp_path: Path,
420+
write_alpha_beta_text_files: Callable[[str, str], Path],
471421
) -> None:
472-
(tmp_path / "alpha.txt").write_text("alpha-0", encoding="utf-8")
473-
(tmp_path / "beta.txt").write_text("", encoding="utf-8")
422+
seed_dir = write_alpha_beta_text_files("alpha-0", "")
474423

475-
reader = FanoutDirectorySeedReader()
424+
reader = LineFanoutDirectorySeedReader(include_file_name=True)
476425
reader.attach(
477-
DirectorySeedSource(path=str(tmp_path), file_pattern="*.txt"),
426+
DirectorySeedSource(path=str(seed_dir), file_pattern="*.txt"),
478427
PlaintextResolver(),
479428
)
480429

@@ -494,14 +443,13 @@ def test_filesystem_seed_reader_batch_reader_raises_for_selected_manifest_rows_w
494443

495444

496445
def test_filesystem_seed_reader_batch_reader_skips_empty_fanout_rows_before_returning_records(
497-
tmp_path: Path,
446+
write_alpha_beta_text_files: Callable[[str, str], Path],
498447
) -> None:
499-
(tmp_path / "alpha.txt").write_text("", encoding="utf-8")
500-
(tmp_path / "beta.txt").write_text("beta-0\nbeta-1", encoding="utf-8")
448+
seed_dir = write_alpha_beta_text_files("", "beta-0\nbeta-1")
501449

502-
reader = FanoutDirectorySeedReader()
450+
reader = LineFanoutDirectorySeedReader(include_file_name=True)
503451
reader.attach(
504-
DirectorySeedSource(path=str(tmp_path), file_pattern="*.txt"),
452+
DirectorySeedSource(path=str(seed_dir), file_pattern="*.txt"),
505453
PlaintextResolver(),
506454
)
507455

@@ -518,14 +466,13 @@ def test_filesystem_seed_reader_batch_reader_skips_empty_fanout_rows_before_retu
518466

519467

520468
def test_filesystem_seed_reader_batch_reader_stops_cleanly_after_emitting_records_when_only_empty_fanout_rows_remain(
521-
tmp_path: Path,
469+
write_alpha_beta_text_files: Callable[[str, str], Path],
522470
) -> None:
523-
(tmp_path / "alpha.txt").write_text("alpha-0", encoding="utf-8")
524-
(tmp_path / "beta.txt").write_text("", encoding="utf-8")
471+
seed_dir = write_alpha_beta_text_files("alpha-0", "")
525472

526-
reader = FanoutDirectorySeedReader()
473+
reader = LineFanoutDirectorySeedReader(include_file_name=True)
527474
reader.attach(
528-
DirectorySeedSource(path=str(tmp_path), file_pattern="*.txt"),
475+
DirectorySeedSource(path=str(seed_dir), file_pattern="*.txt"),
529476
PlaintextResolver(),
530477
)
531478

@@ -545,6 +492,21 @@ def test_filesystem_seed_reader_batch_reader_stops_cleanly_after_emitting_record
545492
assert reader.hydrated_relative_paths == ["alpha.txt", "beta.txt"]
546493

547494

495+
def test_filesystem_seed_reader_full_output_raises_when_all_manifest_rows_fan_out_to_empty(
    write_alpha_beta_text_files: Callable[[str, str], Path],
) -> None:
    """Reading the full dataset raises when both matched files are empty."""
    empty_seed_dir = write_alpha_beta_text_files("", "")

    reader = LineFanoutDirectorySeedReader(include_file_name=True)
    seed_source = DirectorySeedSource(path=str(empty_seed_dir), file_pattern="*.txt")
    reader.attach(seed_source, PlaintextResolver())

    # Connection creation stays inside the context manager so an error raised
    # at any step of the read is the one being asserted on.
    with pytest.raises(SeedReaderError, match="Seed source at .* did not produce any rows"):
        connection = reader.create_duckdb_connection()
        connection.execute(f"SELECT * FROM '{reader.get_dataset_uri()}'").df()
508+
509+
548510
def test_local_file_seed_reader_uses_load_time_runtime_path_when_cwd_changes(
549511
tmp_path: Path,
550512
monkeypatch: pytest.MonkeyPatch,
@@ -649,10 +611,11 @@ def test_filesystem_seed_reader_raises_for_undeclared_hydrated_columns(
649611
@pytest.mark.parametrize(
650612
("hydrated_return", "error_pattern"),
651613
[
614+
(None, "Manifest row index 0 returned NoneType"),
652615
(123, "Manifest row index 0 returned int"),
653616
(["not-a-record"], "Manifest row index 0 returned an iterable containing str"),
654617
],
655-
ids=["scalar", "iterable-of-invalid-records"],
618+
ids=["none", "scalar", "iterable-of-invalid-records"],
656619
)
657620
def test_filesystem_seed_reader_rejects_invalid_hydrate_row_returns(
658621
tmp_path: Path,
@@ -661,15 +624,15 @@ def test_filesystem_seed_reader_rejects_invalid_hydrate_row_returns(
661624
) -> None:
662625
(tmp_path / "alpha.txt").write_text("alpha", encoding="utf-8")
663626

664-
reader = InvalidHydrationReturnSeedReader(hydrated_return)
627+
reader = ConfigurableHydrationDirectorySeedReader(hydrated_return=hydrated_return)
665628
reader.attach(DirectorySeedSource(path=str(tmp_path), file_pattern="*.txt"), PlaintextResolver())
666629

667630
with pytest.raises(SeedReaderError, match=error_pattern):
668631
reader.create_duckdb_connection().execute(f"SELECT * FROM '{reader.get_dataset_uri()}'").df()
669632

670633

671634
@pytest.mark.parametrize(
672-
("output_columns", "hydrated_rows", "error_pattern"),
635+
("output_columns", "hydrated_return", "error_pattern"),
673636
[
674637
(
675638
["relative_path", "content"],
@@ -693,12 +656,15 @@ def test_filesystem_seed_reader_rejects_invalid_hydrate_row_returns(
693656
def test_filesystem_seed_reader_validates_each_fanout_record_against_output_columns(
694657
tmp_path: Path,
695658
output_columns: list[str],
696-
hydrated_rows: list[dict[str, str]],
659+
hydrated_return: list[dict[str, str]],
697660
error_pattern: str,
698661
) -> None:
699662
(tmp_path / "alpha.txt").write_text("alpha", encoding="utf-8")
700663

701-
reader = SchemaMismatchFanoutSeedReader(output_columns=output_columns, hydrated_rows=hydrated_rows)
664+
reader = ConfigurableHydrationDirectorySeedReader(
665+
output_columns=output_columns,
666+
hydrated_return=hydrated_return,
667+
)
702668
reader.attach(DirectorySeedSource(path=str(tmp_path), file_pattern="*.txt"), PlaintextResolver())
703669

704670
with pytest.raises(SeedReaderError, match=error_pattern):

0 commit comments

Comments
 (0)