77
88import pytest
99
10+ import data_designer .config as dd
1011import data_designer .lazy_heavy_imports as lazy
1112from data_designer .config .errors import InvalidFilePathError
12- from data_designer .config .seed_source import LocalFileSeedSource
13+ from data_designer .config .seed_source import DirectorySeedSource , FileContentsSeedSource , LocalFileSeedSource
1314from data_designer .config .seed_source_dataframe import DataFrameSeedSource
1415
1516
@@ -64,6 +65,27 @@ def test_local_source_from_dataframe(tmp_path: Path):
6465 lazy .pd .testing .assert_frame_equal (df , lazy .pd .read_parquet (filepath ))
6566
6667
def test_local_seed_source_caches_runtime_path_across_cwd_changes(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Check that ``runtime_path`` stays anchored to the cwd in effect at creation.

    The source is built from a relative glob while the working directory is
    ``initial``. After switching to ``later`` the test expects ``path`` (and its
    JSON dump) to still be the relative input, and ``runtime_path`` to be the
    glob under the resolved ``initial/seed-dir``.
    """
    relative_glob = "seed-dir/*.parquet"
    first_cwd = tmp_path / "initial"
    second_cwd = tmp_path / "later"
    seed_dir = first_cwd / "seed-dir"
    seed_dir.mkdir(parents=True)
    create_partitions_in_path(seed_dir, "parquet", num_files=1)
    second_cwd.mkdir()

    monkeypatch.chdir(first_cwd)
    source = LocalFileSeedSource(path=relative_glob)
    expected_runtime_path = str(seed_dir.resolve() / "*.parquet")

    # Move away so that re-resolving the relative input would land elsewhere.
    monkeypatch.chdir(second_cwd)

    assert source.path == relative_glob
    assert source.runtime_path == expected_runtime_path
    assert source.model_dump(mode="json")["path"] == relative_glob
87+
88+
6789def test_dataframe_seed_source_serialization ():
6890 """Test that DataFrameSeedSource excludes the DataFrame field during serialization."""
6991 df = lazy .pd .DataFrame ({"col1" : [1 , 2 , 3 ], "col2" : ["a" , "b" , "c" ]})
@@ -73,3 +95,142 @@ def test_dataframe_seed_source_serialization():
7395 serialized = source .model_dump (mode = "json" )
7496 assert "df" not in serialized
7597 assert serialized == {"seed_type" : "df" }
98+
99+
def test_directory_seed_source_requires_directory(tmp_path: Path) -> None:
    """Building a ``DirectorySeedSource`` from a regular file must raise ``InvalidFilePathError``."""
    regular_file = tmp_path / "file.txt"
    regular_file.write_text("alpha", encoding="utf-8")
    not_a_directory = str(regular_file)

    with pytest.raises(InvalidFilePathError, match="is not a directory"):
        DirectorySeedSource(path=not_a_directory)
106+
107+
def test_directory_seed_source_preserves_relative_path_input(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A relative ``path`` is kept verbatim on the model and in its JSON dump.

    Also checks the values of ``file_pattern`` and ``recursive`` when neither
    is passed to the constructor.
    """
    relative_dir = "seed-dir"
    (tmp_path / relative_dir).mkdir()
    monkeypatch.chdir(tmp_path)

    source = DirectorySeedSource(path=relative_dir)
    dumped = source.model_dump(mode="json")

    assert source.path == relative_dir
    assert dumped["path"] == relative_dir
    assert source.file_pattern == "*"
    assert source.recursive is True
119+
120+
def test_file_contents_seed_source_defaults() -> None:
    """Check the seed type tag, the explicitly passed fields, and the encoding left unset."""
    init_kwargs = {"path": ".", "file_pattern": "*.md", "recursive": False}
    source = FileContentsSeedSource(**init_kwargs)

    assert source.seed_type == "file_contents"
    assert source.file_pattern == "*.md"
    assert source.recursive is False
    # ``encoding`` was not passed above, so this is the value the model supplies.
    assert source.encoding == "utf-8"
128+
129+
def test_file_contents_seed_source_preserves_relative_path_input(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A relative ``path`` is kept verbatim on the model and in its JSON dump."""
    relative_dir = "seed-dir"
    (tmp_path / relative_dir).mkdir()
    monkeypatch.chdir(tmp_path)

    source = FileContentsSeedSource(path=relative_dir, file_pattern="*.txt")
    dumped = source.model_dump(mode="json")

    assert source.path == relative_dir
    assert dumped["path"] == relative_dir
142+
143+
@pytest.mark.parametrize(
    ("source_type", "source_kwargs"),
    [
        pytest.param(DirectorySeedSource, {}, id="directory"),
        pytest.param(FileContentsSeedSource, {"file_pattern": "*.txt"}, id="file-contents"),
    ],
)
def test_filesystem_seed_sources_cache_runtime_path_across_cwd_changes(
    source_type: type[DirectorySeedSource] | type[FileContentsSeedSource],
    source_kwargs: dict[str, str],
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check that ``runtime_path`` stays anchored to the cwd in effect at creation.

    Runs once per filesystem-backed source type. The source is created from the
    relative path ``seed-dir`` while inside ``initial``. After the working
    directory moves to ``later``, ``path`` must still be the relative input and
    ``runtime_path`` must equal the resolved ``initial/seed-dir``.
    """
    relative_dir = "seed-dir"
    first_cwd = tmp_path / "initial"
    second_cwd = tmp_path / "later"
    seed_dir = first_cwd / relative_dir
    seed_dir.mkdir(parents=True)
    second_cwd.mkdir()

    monkeypatch.chdir(first_cwd)
    source = source_type(path=relative_dir, **source_kwargs)
    expected_runtime_path = str(seed_dir.resolve())

    # Move away so that re-resolving the relative input would land elsewhere.
    monkeypatch.chdir(second_cwd)

    assert source.path == relative_dir
    assert source.runtime_path == expected_runtime_path
    assert source.model_dump(mode="json")["path"] == relative_dir
172+
173+
def test_seed_source_path_descriptions_document_cwd_resolution() -> None:
    """The JSON-schema description of ``path`` must mention both resolution phrases.

    Checked for the local-file, directory and file-contents seed sources.
    """
    source_types = (LocalFileSeedSource, DirectorySeedSource, FileContentsSeedSource)
    # Collect every description up front, then assert in a fixed order.
    path_descriptions = [
        source_type.model_json_schema()["properties"]["path"]["description"] for source_type in source_types
    ]

    for path_description in path_descriptions:
        assert "current working directory" in path_description
        assert "config file location" in path_description
185+
186+
def test_seed_sources_are_exported_from_config_module(tmp_path: Path) -> None:
    """Both filesystem seed sources are reachable through the ``dd`` config alias."""
    existing_dir = str(tmp_path)

    directory_source = dd.DirectorySeedSource(path=existing_dir)
    file_contents_source = dd.FileContentsSeedSource(path=existing_dir, file_pattern="*.txt")

    assert directory_source.seed_type == "directory"
    assert file_contents_source.seed_type == "file_contents"
193+
194+
def test_file_contents_seed_source_parses_from_dict(tmp_path: Path) -> None:
    """``model_validate`` on a plain dict carries the given field values onto the model."""
    raw_config = {
        "path": str(tmp_path),
        "file_pattern": "*.txt",
        "recursive": False,
        "encoding": "latin-1",
    }

    source = FileContentsSeedSource.model_validate(raw_config)

    assert source.file_pattern == "*.txt"
    assert source.recursive is False
    assert source.encoding == "latin-1"
208+
209+
def test_file_contents_seed_source_rejects_unknown_encoding(tmp_path: Path) -> None:
    """An encoding name that is not recognised must be rejected with ``ValueError``."""
    init_kwargs = {"path": str(tmp_path), "file_pattern": "*.txt", "encoding": "utf-999"}

    with pytest.raises(ValueError, match="Unknown encoding"):
        FileContentsSeedSource(**init_kwargs)
213+
214+
# NOTE(review): in the "contents-windows" case the raw string keeps both
# backslashes, so the pattern holds two literal backslash characters.
# Confirm that is the intended Windows-style input.
@pytest.mark.parametrize(
    ("source_type", "file_pattern", "error_message"),
    [
        pytest.param(DirectorySeedSource, "", "non-empty string", id="directory-empty"),
        pytest.param(
            DirectorySeedSource,
            "subdir/*.txt",
            "match file names, not relative paths",
            id="directory-posix",
        ),
        pytest.param(FileContentsSeedSource, "", "non-empty string", id="contents-empty"),
        pytest.param(
            FileContentsSeedSource,
            r"subdir\\*.txt",
            "match file names, not relative paths",
            id="contents-windows",
        ),
    ],
)
def test_filesystem_seed_sources_reject_path_like_file_patterns(
    source_type: type[DirectorySeedSource] | type[FileContentsSeedSource],
    file_pattern: str,
    error_message: str,
    tmp_path: Path,
) -> None:
    """Empty or separator-containing ``file_pattern`` values must raise ``ValueError``.

    Each case pairs a source type with a bad pattern and the message fragment
    the raised error is expected to match.
    """
    init_kwargs = {"path": str(tmp_path), "file_pattern": file_pattern}

    with pytest.raises(ValueError, match=error_message):
        source_type(**init_kwargs)
0 commit comments