33
44from __future__ import annotations
55
6+ from collections .abc import Callable
67from pathlib import Path
78from typing import Any
89
2324 SeedReaderRegistry ,
2425)
2526from data_designer .engine .secret_resolver import PlaintextResolver
27+ from data_designer .engine .testing .seed_readers import LineFanoutDirectorySeedReader
2628
2729
2830class TrackingFileContentsSeedReader (FileContentsSeedReader ):
@@ -134,50 +136,10 @@ def hydrate_row(
134136 }
135137
136138
class FanoutDirectorySeedReader(FileSystemSeedReader[DirectorySeedSource]):
    """Test seed reader that fans each matched file out into one record per line.

    ``build_manifest`` yields one manifest row per matched file and
    ``hydrate_row`` expands such a row into one record for every line of that
    file. Each hydrated path is recorded on the instance so tests can assert
    which manifest rows were actually read.
    """

    output_columns = ["relative_path", "file_name", "line_index", "line"]

    def __init__(self) -> None:
        # Relative paths passed to hydrate_row, in call order.
        self.hydrated_relative_paths: list[str] = []

    def build_manifest(self, *, context: SeedReaderFileSystemContext) -> lazy.pd.DataFrame | list[dict[str, str]]:
        """Return one manifest row (relative path and file name) per matching file."""
        matched_paths = self.get_matching_relative_paths(
            context=context,
            file_pattern=self.source.file_pattern,
            recursive=self.source.recursive,
        )
        return [
            {
                "relative_path": relative_path,
                "file_name": Path(relative_path).name,
            }
            for relative_path in matched_paths
        ]

    def hydrate_row(
        self,
        *,
        manifest_row: dict[str, str],
        context: SeedReaderFileSystemContext,
    ) -> list[dict[str, Any]]:
        """Read the file named by ``manifest_row`` and return one record per line.

        An empty file produces an empty list, i.e. the manifest row fans out
        to zero records.
        """
        relative_path = manifest_row["relative_path"]
        self.hydrated_relative_paths.append(relative_path)
        with context.fs.open(relative_path, "r", encoding="utf-8") as handle:
            lines = handle.read().splitlines()
        return [
            {
                "relative_path": relative_path,
                "file_name": manifest_row["file_name"],
                "line_index": line_index,
                "line": line,
            }
            for line_index, line in enumerate(lines)
        ]
176-
177-
178- class InvalidHydrationReturnSeedReader (FileSystemSeedReader [DirectorySeedSource ]):
179- def __init__ (self , hydrated_return : Any ) -> None :
139+ class ConfigurableHydrationDirectorySeedReader (FileSystemSeedReader [DirectorySeedSource ]):
140+ def __init__ (self , * , hydrated_return : Any , output_columns : list [str ] | None = None ) -> None :
180141 self ._hydrated_return = hydrated_return
142+ self .output_columns = output_columns
181143
182144 def build_manifest (self , * , context : SeedReaderFileSystemContext ) -> lazy .pd .DataFrame | list [dict [str , str ]]:
183145 matched_paths = self .get_matching_relative_paths (
@@ -190,36 +152,13 @@ def build_manifest(self, *, context: SeedReaderFileSystemContext) -> lazy.pd.Dat
190152 def hydrate_row (
191153 self ,
192154 * ,
193- manifest_row : dict [str , str ],
155+ manifest_row : dict [str , Any ],
194156 context : SeedReaderFileSystemContext ,
195157 ) -> Any :
196158 del manifest_row , context
197159 return self ._hydrated_return
198160
199161
class SchemaMismatchFanoutSeedReader(FileSystemSeedReader[DirectorySeedSource]):
    """Test seed reader whose declared columns and hydrated rows are injected.

    The caller supplies both ``output_columns`` and the exact rows that
    ``hydrate_row`` returns, so a test can pair a declared schema with records
    that do not match it.
    """

    def __init__(self, *, output_columns: list[str], hydrated_rows: list[dict[str, str]]) -> None:
        self.output_columns = output_columns
        # Returned unchanged for every manifest row.
        self._hydrated_rows = hydrated_rows

    def build_manifest(self, *, context: SeedReaderFileSystemContext) -> lazy.pd.DataFrame | list[dict[str, str]]:
        """Return one manifest row holding the relative path of each matching file."""
        matched_paths = self.get_matching_relative_paths(
            context=context,
            file_pattern=self.source.file_pattern,
            recursive=self.source.recursive,
        )
        return [{"relative_path": relative_path} for relative_path in matched_paths]

    def hydrate_row(
        self,
        *,
        manifest_row: dict[str, str],
        context: SeedReaderFileSystemContext,
    ) -> list[dict[str, str]]:
        """Return the preconfigured rows, ignoring the manifest row and context."""
        del manifest_row, context
        return self._hydrated_rows
221-
222-
223162class ContextCountingDirectorySeedReader (FileSystemSeedReader [DirectorySeedSource ]):
224163 def __init__ (self ) -> None :
225164 self .filesystem_context_calls = 0
@@ -237,6 +176,16 @@ def build_manifest(self, *, context: SeedReaderFileSystemContext) -> lazy.pd.Dat
237176 return [{"relative_path" : relative_path } for relative_path in matched_paths ]
238177
239178
@pytest.fixture
def write_alpha_beta_text_files(tmp_path: Path) -> Callable[[str, str], Path]:
    """Provide a factory that writes ``alpha.txt`` and ``beta.txt`` under ``tmp_path``.

    The returned callable takes the contents for the two files (alpha first,
    then beta), writes them as UTF-8, and returns the directory that holds
    them so it can be passed to a seed source.
    """

    def _write_alpha_beta_text_files(alpha_contents: str, beta_contents: str) -> Path:
        (tmp_path / "alpha.txt").write_text(alpha_contents, encoding="utf-8")
        (tmp_path / "beta.txt").write_text(beta_contents, encoding="utf-8")
        return tmp_path

    return _write_alpha_beta_text_files
187+
188+
240189def test_one_reader_per_seed_type ():
241190 local_1 = LocalFileSeedReader ()
242191 local_2 = LocalFileSeedReader ()
@@ -308,7 +257,7 @@ def test_plugin_style_filesystem_seed_reader_can_fan_out_rows(tmp_path: Path) ->
308257 (tmp_path / "alpha.txt" ).write_text ("alpha-0\n alpha-1" , encoding = "utf-8" )
309258 (tmp_path / "beta.txt" ).write_text ("beta-0" , encoding = "utf-8" )
310259
311- reader = FanoutDirectorySeedReader ( )
260+ reader = LineFanoutDirectorySeedReader ( include_file_name = True )
312261 reader .attach (
313262 DirectorySeedSource (path = str (tmp_path ), file_pattern = "*.txt" ),
314263 PlaintextResolver (),
@@ -444,13 +393,14 @@ def test_file_contents_seed_reader_hydrates_only_selected_manifest_rows(tmp_path
444393 assert reader .hydrated_relative_paths == ["beta.txt" ]
445394
446395
447- def test_filesystem_seed_reader_fanout_keeps_manifest_based_index_selection (tmp_path : Path ) -> None :
448- (tmp_path / "alpha.txt" ).write_text ("alpha-0\n alpha-1" , encoding = "utf-8" )
449- (tmp_path / "beta.txt" ).write_text ("beta-0\n beta-1" , encoding = "utf-8" )
396+ def test_filesystem_seed_reader_fanout_keeps_manifest_based_index_selection (
397+ write_alpha_beta_text_files : Callable [[str , str ], Path ],
398+ ) -> None :
399+ seed_dir = write_alpha_beta_text_files ("alpha-0\n alpha-1" , "beta-0\n beta-1" )
450400
451- reader = FanoutDirectorySeedReader ( )
401+ reader = LineFanoutDirectorySeedReader ( include_file_name = True )
452402 reader .attach (
453- DirectorySeedSource (path = str (tmp_path ), file_pattern = "*.txt" ),
403+ DirectorySeedSource (path = str (seed_dir ), file_pattern = "*.txt" ),
454404 PlaintextResolver (),
455405 )
456406
@@ -467,14 +417,13 @@ def test_filesystem_seed_reader_fanout_keeps_manifest_based_index_selection(tmp_
467417
468418
469419def test_filesystem_seed_reader_batch_reader_raises_for_selected_manifest_rows_with_empty_fanout (
470- tmp_path : Path ,
420+ write_alpha_beta_text_files : Callable [[ str , str ], Path ] ,
471421) -> None :
472- (tmp_path / "alpha.txt" ).write_text ("alpha-0" , encoding = "utf-8" )
473- (tmp_path / "beta.txt" ).write_text ("" , encoding = "utf-8" )
422+ seed_dir = write_alpha_beta_text_files ("alpha-0" , "" )
474423
475- reader = FanoutDirectorySeedReader ( )
424+ reader = LineFanoutDirectorySeedReader ( include_file_name = True )
476425 reader .attach (
477- DirectorySeedSource (path = str (tmp_path ), file_pattern = "*.txt" ),
426+ DirectorySeedSource (path = str (seed_dir ), file_pattern = "*.txt" ),
478427 PlaintextResolver (),
479428 )
480429
@@ -494,14 +443,13 @@ def test_filesystem_seed_reader_batch_reader_raises_for_selected_manifest_rows_w
494443
495444
496445def test_filesystem_seed_reader_batch_reader_skips_empty_fanout_rows_before_returning_records (
497- tmp_path : Path ,
446+ write_alpha_beta_text_files : Callable [[ str , str ], Path ] ,
498447) -> None :
499- (tmp_path / "alpha.txt" ).write_text ("" , encoding = "utf-8" )
500- (tmp_path / "beta.txt" ).write_text ("beta-0\n beta-1" , encoding = "utf-8" )
448+ seed_dir = write_alpha_beta_text_files ("" , "beta-0\n beta-1" )
501449
502- reader = FanoutDirectorySeedReader ( )
450+ reader = LineFanoutDirectorySeedReader ( include_file_name = True )
503451 reader .attach (
504- DirectorySeedSource (path = str (tmp_path ), file_pattern = "*.txt" ),
452+ DirectorySeedSource (path = str (seed_dir ), file_pattern = "*.txt" ),
505453 PlaintextResolver (),
506454 )
507455
@@ -518,14 +466,13 @@ def test_filesystem_seed_reader_batch_reader_skips_empty_fanout_rows_before_retu
518466
519467
520468def test_filesystem_seed_reader_batch_reader_stops_cleanly_after_emitting_records_when_only_empty_fanout_rows_remain (
521- tmp_path : Path ,
469+ write_alpha_beta_text_files : Callable [[ str , str ], Path ] ,
522470) -> None :
523- (tmp_path / "alpha.txt" ).write_text ("alpha-0" , encoding = "utf-8" )
524- (tmp_path / "beta.txt" ).write_text ("" , encoding = "utf-8" )
471+ seed_dir = write_alpha_beta_text_files ("alpha-0" , "" )
525472
526- reader = FanoutDirectorySeedReader ( )
473+ reader = LineFanoutDirectorySeedReader ( include_file_name = True )
527474 reader .attach (
528- DirectorySeedSource (path = str (tmp_path ), file_pattern = "*.txt" ),
475+ DirectorySeedSource (path = str (seed_dir ), file_pattern = "*.txt" ),
529476 PlaintextResolver (),
530477 )
531478
@@ -545,6 +492,21 @@ def test_filesystem_seed_reader_batch_reader_stops_cleanly_after_emitting_record
545492 assert reader .hydrated_relative_paths == ["alpha.txt" , "beta.txt" ]
546493
547494
def test_filesystem_seed_reader_full_output_raises_when_all_manifest_rows_fan_out_to_empty(
    write_alpha_beta_text_files: Callable[[str, str], Path],
) -> None:
    """Querying the full dataset raises when both seed files are empty."""
    # Both files are empty, so no manifest row can contribute a record.
    seed_dir = write_alpha_beta_text_files("", "")

    reader = LineFanoutDirectorySeedReader(include_file_name=True)
    reader.attach(
        DirectorySeedSource(path=str(seed_dir), file_pattern="*.txt"),
        PlaintextResolver(),
    )

    with pytest.raises(SeedReaderError, match="Seed source at .* did not produce any rows"):
        reader.create_duckdb_connection().execute(f"SELECT * FROM '{reader.get_dataset_uri()}'").df()
508+
509+
548510def test_local_file_seed_reader_uses_load_time_runtime_path_when_cwd_changes (
549511 tmp_path : Path ,
550512 monkeypatch : pytest .MonkeyPatch ,
@@ -649,10 +611,11 @@ def test_filesystem_seed_reader_raises_for_undeclared_hydrated_columns(
649611@pytest .mark .parametrize (
650612 ("hydrated_return" , "error_pattern" ),
651613 [
614+ (None , "Manifest row index 0 returned NoneType" ),
652615 (123 , "Manifest row index 0 returned int" ),
653616 (["not-a-record" ], "Manifest row index 0 returned an iterable containing str" ),
654617 ],
655- ids = ["scalar" , "iterable-of-invalid-records" ],
618+ ids = ["none" , " scalar" , "iterable-of-invalid-records" ],
656619)
657620def test_filesystem_seed_reader_rejects_invalid_hydrate_row_returns (
658621 tmp_path : Path ,
@@ -661,15 +624,15 @@ def test_filesystem_seed_reader_rejects_invalid_hydrate_row_returns(
661624) -> None :
662625 (tmp_path / "alpha.txt" ).write_text ("alpha" , encoding = "utf-8" )
663626
664- reader = InvalidHydrationReturnSeedReader ( hydrated_return )
627+ reader = ConfigurableHydrationDirectorySeedReader ( hydrated_return = hydrated_return )
665628 reader .attach (DirectorySeedSource (path = str (tmp_path ), file_pattern = "*.txt" ), PlaintextResolver ())
666629
667630 with pytest .raises (SeedReaderError , match = error_pattern ):
668631 reader .create_duckdb_connection ().execute (f"SELECT * FROM '{ reader .get_dataset_uri ()} '" ).df ()
669632
670633
671634@pytest .mark .parametrize (
672- ("output_columns" , "hydrated_rows " , "error_pattern" ),
635+ ("output_columns" , "hydrated_return " , "error_pattern" ),
673636 [
674637 (
675638 ["relative_path" , "content" ],
@@ -693,12 +656,15 @@ def test_filesystem_seed_reader_rejects_invalid_hydrate_row_returns(
693656def test_filesystem_seed_reader_validates_each_fanout_record_against_output_columns (
694657 tmp_path : Path ,
695658 output_columns : list [str ],
696- hydrated_rows : list [dict [str , str ]],
659+ hydrated_return : list [dict [str , str ]],
697660 error_pattern : str ,
698661) -> None :
699662 (tmp_path / "alpha.txt" ).write_text ("alpha" , encoding = "utf-8" )
700663
701- reader = SchemaMismatchFanoutSeedReader (output_columns = output_columns , hydrated_rows = hydrated_rows )
664+ reader = ConfigurableHydrationDirectorySeedReader (
665+ output_columns = output_columns ,
666+ hydrated_return = hydrated_return ,
667+ )
702668 reader .attach (DirectorySeedSource (path = str (tmp_path ), file_pattern = "*.txt" ), PlaintextResolver ())
703669
704670 with pytest .raises (SeedReaderError , match = error_pattern ):
0 commit comments