Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit b1627ff

Browse files
Merge branch 'main' into andreatgretel/feat/async-scheduler-buffer
2 parents 0cd3b04 + 28c8345 commit b1627ff

18 files changed

Lines changed: 1810 additions & 136 deletions

File tree

‎packages/data-designer-config/src/data_designer/config/__init__.py‎

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@
8888
SeedConfig,
8989
)
9090
from data_designer.config.seed_source import ( # noqa: F401
91+
DirectorySeedSource,
92+
FileContentsSeedSource,
9193
HuggingFaceSeedSource,
9294
LocalFileSeedSource,
9395
)
@@ -197,6 +199,8 @@
197199
"SeedConfig": (_MOD_SEED, "SeedConfig"),
198200
# seed_source
199201
"DataFrameSeedSource": (f"{_MOD_BASE}.seed_source_dataframe", "DataFrameSeedSource"),
202+
"DirectorySeedSource": (_MOD_SEED_SOURCE, "DirectorySeedSource"),
203+
"FileContentsSeedSource": (_MOD_SEED_SOURCE, "FileContentsSeedSource"),
200204
"HuggingFaceSeedSource": (_MOD_SEED_SOURCE, "HuggingFaceSeedSource"),
201205
"LocalFileSeedSource": (_MOD_SEED_SOURCE, "LocalFileSeedSource"),
202206
# utils

‎packages/data-designer-config/src/data_designer/config/seed_source.py‎

Lines changed: 103 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,15 @@
33

44
from __future__ import annotations
55

6+
import codecs
67
from abc import ABC
7-
from typing import TYPE_CHECKING, Literal
8+
from pathlib import Path
9+
from typing import TYPE_CHECKING, Any, Literal
810

9-
from pydantic import BaseModel, Field, field_validator
11+
from pydantic import BaseModel, Field, PrivateAttr, field_validator
1012
from typing_extensions import Self
1113

14+
from data_designer.config.errors import InvalidFilePathError
1215
from data_designer.config.utils.io_helpers import (
1316
VALID_DATASET_FILE_EXTENSIONS,
1417
validate_dataset_file_path,
@@ -31,8 +34,15 @@ class SeedSource(BaseModel, ABC):
3134

3235
class LocalFileSeedSource(SeedSource):
3336
seed_type: Literal["local"] = "local"
37+
_runtime_path: str | None = PrivateAttr(default=None)
3438

35-
path: str
39+
path: str = Field(
40+
...,
41+
description=(
42+
"Path to a local seed dataset file or wildcard pattern. Relative paths are resolved from the "
43+
"current working directory when the config is loaded, not from the config file location."
44+
),
45+
)
3646

3747
@field_validator("path", mode="after")
3848
def validate_path(cls, v: str) -> str:
@@ -46,6 +56,15 @@ def validate_path(cls, v: str) -> str:
4656
validate_dataset_file_path(v)
4757
return v
4858

59+
def model_post_init(self, __context: Any) -> None:
60+
self._runtime_path = _resolve_local_file_runtime_path(self.path)
61+
62+
@property
63+
def runtime_path(self) -> str:
64+
if self._runtime_path is None:
65+
self._runtime_path = _resolve_local_file_runtime_path(self.path)
66+
return self._runtime_path
67+
4968
@classmethod
5069
def from_dataframe(cls, df: pd.DataFrame, path: str) -> Self:
5170
df.to_parquet(path, index=False)
@@ -65,3 +84,84 @@ class HuggingFaceSeedSource(SeedSource):
6584
)
6685
token: str | None = None
6786
endpoint: str = "https://huggingface.co"
87+
88+
89+
class FileSystemSeedSource(SeedSource, ABC):
    # Shared base for seed sources that point at a directory on disk and select files
    # inside it by basename pattern. Kept as comments (not a class docstring) so the
    # generated JSON schema is not given a model-level description.

    # Absolute form of ``path``, captured once so later working-directory changes do not move it.
    _runtime_path: str | None = PrivateAttr(default=None)

    path: str = Field(
        ...,
        description=(
            "Directory containing seed artifacts. Relative paths are resolved from the current working "
            "directory when the config is loaded, not from the config file location."
        ),
    )
    file_pattern: str = Field(
        default="*",
        description=(
            "Case-sensitive filename pattern used to match files under the provided directory. "
            "Patterns match basenames only, not relative paths."
        ),
    )
    recursive: bool = Field(
        default=True,
        description="Whether to search nested subdirectories under the provided directory for matching files.",
    )

    @field_validator("path", mode="after")
    def validate_path(cls, value: str) -> str:
        """Require ``value`` to name an existing directory; the original string is returned untouched.

        Raises:
            InvalidFilePathError: If the expanded, resolved path is not a directory.
        """
        candidate = Path(value).expanduser().resolve()
        if candidate.is_dir():
            return value
        raise InvalidFilePathError(f"🛑 Path {candidate} is not a directory.")

    @field_validator("file_pattern", mode="after")
    def validate_file_pattern(cls, value: str) -> str:
        """Reject blank patterns and patterns that contain a path separator.

        Raises:
            ValueError: If the pattern is empty/whitespace or contains ``/`` or ``\\``.
        """
        if value.strip() == "":
            raise ValueError("🛑 FileSystemSeedSource.file_pattern must be a non-empty string.")
        if any(separator in value for separator in ("/", "\\")):
            raise ValueError("🛑 FileSystemSeedSource.file_pattern must match file names, not relative paths.")
        return value

    def model_post_init(self, __context: Any) -> None:
        """Resolve and cache the absolute runtime path as soon as the model is built."""
        self._runtime_path = _resolve_filesystem_runtime_path(self.path)

    @property
    def runtime_path(self) -> str:
        """Absolute directory path; resolved lazily if the cached value is still unset."""
        cached = self._runtime_path
        if cached is None:
            cached = _resolve_filesystem_runtime_path(self.path)
            self._runtime_path = cached
        return cached
134+
135+
136+
class DirectorySeedSource(FileSystemSeedSource):
    # Filesystem-backed seed source identified by ``seed_type == "directory"``.
    # It adds no fields or validators beyond those inherited from FileSystemSeedSource.
    seed_type: Literal["directory"] = "directory"
138+
139+
140+
class FileContentsSeedSource(FileSystemSeedSource):
    # Filesystem-backed seed source identified by ``seed_type == "file_contents"``.
    # Adds the text encoding to use when matching files are read.
    seed_type: Literal["file_contents"] = "file_contents"

    encoding: str = Field(
        default="utf-8",
        description="Text encoding used when reading matching files into the `content` column.",
    )

    @field_validator("encoding", mode="after")
    def validate_encoding(cls, value: str) -> str:
        """Accept ``value`` only if Python's codec registry recognises it.

        Raises:
            ValueError: If ``codecs.lookup`` does not know the encoding name.
        """
        try:
            codecs.lookup(value)
        except LookupError as exc:
            raise ValueError(f"🛑 Unknown encoding: {value!r}. Use a valid Python codec name.") from exc
        else:
            return value
155+
156+
157+
def _resolve_filesystem_runtime_path(path: str) -> str:
158+
return str(Path(path).expanduser().resolve())
159+
160+
161+
def _resolve_local_file_runtime_path(path: str) -> str:
162+
if "*" not in path:
163+
return _resolve_filesystem_runtime_path(path)
164+
165+
path_prefix, glob_suffix = path.split("*", 1)
166+
resolved_prefix = Path(path_prefix or ".").expanduser().resolve()
167+
return str(resolved_prefix / f"*{glob_suffix}")

‎packages/data-designer-config/src/data_designer/config/seed_source_types.py‎

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,20 @@
88
from pydantic import Field
99
from typing_extensions import TypeAlias
1010

11-
from data_designer.config.seed_source import HuggingFaceSeedSource, LocalFileSeedSource
11+
from data_designer.config.seed_source import (
12+
DirectorySeedSource,
13+
FileContentsSeedSource,
14+
HuggingFaceSeedSource,
15+
LocalFileSeedSource,
16+
)
1217
from data_designer.config.seed_source_dataframe import DataFrameSeedSource
1318
from data_designer.plugin_manager import PluginManager
1419

1520
plugin_manager = PluginManager()
1621

17-
_SeedSourceT: TypeAlias = LocalFileSeedSource | HuggingFaceSeedSource | DataFrameSeedSource
22+
_SeedSourceT: TypeAlias = (
23+
LocalFileSeedSource | HuggingFaceSeedSource | DataFrameSeedSource | DirectorySeedSource | FileContentsSeedSource
24+
)
1825
_SeedSourceT = plugin_manager.inject_into_seed_source_type_union(_SeedSourceT)
1926

2027
SeedSourceT = Annotated[_SeedSourceT, Field(discriminator="seed_type")]

‎packages/data-designer-config/tests/config/test_seed_source.py‎

Lines changed: 162 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@
77

88
import pytest
99

10+
import data_designer.config as dd
1011
import data_designer.lazy_heavy_imports as lazy
1112
from data_designer.config.errors import InvalidFilePathError
12-
from data_designer.config.seed_source import LocalFileSeedSource
13+
from data_designer.config.seed_source import DirectorySeedSource, FileContentsSeedSource, LocalFileSeedSource
1314
from data_designer.config.seed_source_dataframe import DataFrameSeedSource
1415

1516

@@ -64,6 +65,27 @@ def test_local_source_from_dataframe(tmp_path: Path):
6465
lazy.pd.testing.assert_frame_equal(df, lazy.pd.read_parquet(filepath))
6566

6667

68+
def test_local_seed_source_caches_runtime_path_across_cwd_changes(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """The runtime path computed at construction must not follow a later ``chdir``."""
    relative_pattern = "seed-dir/*.parquet"
    first_cwd = tmp_path / "initial"
    second_cwd = tmp_path / "later"
    seed_dir = first_cwd / "seed-dir"

    seed_dir.mkdir(parents=True)
    create_partitions_in_path(seed_dir, "parquet", num_files=1)
    second_cwd.mkdir()

    # Build the source while the relative pattern is valid ...
    monkeypatch.chdir(first_cwd)
    source = LocalFileSeedSource(path=relative_pattern)
    expected_runtime_path = str(seed_dir.resolve() / "*.parquet")

    # ... then move somewhere the relative pattern no longer points at the data.
    monkeypatch.chdir(second_cwd)

    assert source.path == relative_pattern
    assert source.runtime_path == expected_runtime_path
    assert source.model_dump(mode="json")["path"] == relative_pattern
87+
88+
6789
def test_dataframe_seed_source_serialization():
6890
"""Test that DataFrameSeedSource excludes the DataFrame field during serialization."""
6991
df = lazy.pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
@@ -73,3 +95,142 @@ def test_dataframe_seed_source_serialization():
7395
serialized = source.model_dump(mode="json")
7496
assert "df" not in serialized
7597
assert serialized == {"seed_type": "df"}
98+
99+
100+
def test_directory_seed_source_requires_directory(tmp_path: Path) -> None:
    """Pointing a directory seed source at a regular file is rejected."""
    not_a_directory = tmp_path / "file.txt"
    not_a_directory.write_text("alpha", encoding="utf-8")

    with pytest.raises(InvalidFilePathError, match="is not a directory"):
        DirectorySeedSource(path=str(not_a_directory))
106+
107+
108+
def test_directory_seed_source_preserves_relative_path_input(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
    """The user-supplied relative path is stored and serialized unchanged, with default options."""
    relative_dir = "seed-dir"
    (tmp_path / relative_dir).mkdir()
    monkeypatch.chdir(tmp_path)

    source = DirectorySeedSource(path=relative_dir)

    assert source.path == relative_dir
    assert source.model_dump(mode="json")["path"] == relative_dir
    assert source.file_pattern == "*"
    assert source.recursive is True
119+
120+
121+
def test_file_contents_seed_source_defaults() -> None:
    """Explicit options are kept and ``encoding`` falls back to UTF-8."""
    overrides = {"path": ".", "file_pattern": "*.md", "recursive": False}

    source = FileContentsSeedSource(**overrides)

    assert source.seed_type == "file_contents"
    assert source.file_pattern == "*.md"
    assert source.recursive is False
    assert source.encoding == "utf-8"
128+
129+
130+
def test_file_contents_seed_source_preserves_relative_path_input(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The user-supplied relative path is stored and serialized unchanged."""
    relative_dir = "seed-dir"
    (tmp_path / relative_dir).mkdir()
    monkeypatch.chdir(tmp_path)

    source = FileContentsSeedSource(path=relative_dir, file_pattern="*.txt")

    assert source.path == relative_dir
    assert source.model_dump(mode="json")["path"] == relative_dir
142+
143+
144+
@pytest.mark.parametrize(
    ("source_type", "source_kwargs"),
    [
        pytest.param(DirectorySeedSource, {}, id="directory"),
        pytest.param(FileContentsSeedSource, {"file_pattern": "*.txt"}, id="file-contents"),
    ],
)
def test_filesystem_seed_sources_cache_runtime_path_across_cwd_changes(
    source_type: type[DirectorySeedSource] | type[FileContentsSeedSource],
    source_kwargs: dict[str, str],
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The runtime path computed at construction must not follow a later ``chdir``."""
    relative_dir = "seed-dir"
    first_cwd = tmp_path / "initial"
    second_cwd = tmp_path / "later"
    seed_dir = first_cwd / relative_dir

    seed_dir.mkdir(parents=True)
    second_cwd.mkdir()

    # Build the source while the relative path is valid ...
    monkeypatch.chdir(first_cwd)
    source = source_type(path=relative_dir, **source_kwargs)
    expected_runtime_path = str(seed_dir.resolve())

    # ... then move somewhere the relative path no longer exists.
    monkeypatch.chdir(second_cwd)

    assert source.path == relative_dir
    assert source.runtime_path == expected_runtime_path
    assert source.model_dump(mode="json")["path"] == relative_dir
172+
173+
174+
def test_seed_source_path_descriptions_document_cwd_resolution() -> None:
    """Every path-based seed source documents that relative paths resolve from the cwd."""
    for source_type in (LocalFileSeedSource, DirectorySeedSource, FileContentsSeedSource):
        path_description = source_type.model_json_schema()["properties"]["path"]["description"]

        assert "current working directory" in path_description
        assert "config file location" in path_description
185+
186+
187+
def test_seed_sources_are_exported_from_config_module(tmp_path: Path) -> None:
    """Both filesystem seed sources are reachable through the ``data_designer.config`` namespace."""
    directory = str(tmp_path)

    directory_source = dd.DirectorySeedSource(path=directory)
    file_contents_source = dd.FileContentsSeedSource(path=directory, file_pattern="*.txt")

    assert directory_source.seed_type == "directory"
    assert file_contents_source.seed_type == "file_contents"
193+
194+
195+
def test_file_contents_seed_source_parses_from_dict(tmp_path: Path) -> None:
    """A plain dict payload validates into a model carrying the same option values."""
    payload = {
        "path": str(tmp_path),
        "file_pattern": "*.txt",
        "recursive": False,
        "encoding": "latin-1",
    }

    source = FileContentsSeedSource.model_validate(payload)

    assert source.file_pattern == "*.txt"
    assert source.recursive is False
    assert source.encoding == "latin-1"
208+
209+
210+
def test_file_contents_seed_source_rejects_unknown_encoding(tmp_path: Path) -> None:
    """An encoding name unknown to Python's codec registry fails validation."""
    bogus_kwargs = {"path": str(tmp_path), "file_pattern": "*.txt", "encoding": "utf-999"}

    with pytest.raises(ValueError, match="Unknown encoding"):
        FileContentsSeedSource(**bogus_kwargs)
213+
214+
215+
@pytest.mark.parametrize(
    ("source_type", "file_pattern", "error_message"),
    [
        pytest.param(DirectorySeedSource, "", "non-empty string", id="directory-empty"),
        # Whitespace-only input exercises the ``strip()`` branch of the validator.
        pytest.param(DirectorySeedSource, "   ", "non-empty string", id="directory-blank"),
        pytest.param(DirectorySeedSource, "subdir/*.txt", "match file names, not relative paths", id="directory-posix"),
        pytest.param(FileContentsSeedSource, "", "non-empty string", id="contents-empty"),
        pytest.param(
            FileContentsSeedSource,
            # Raw string with a single backslash: a realistic Windows-style relative path.
            r"subdir\*.txt",
            "match file names, not relative paths",
            id="contents-windows",
        ),
    ],
)
def test_filesystem_seed_sources_reject_path_like_file_patterns(
    source_type: type[DirectorySeedSource] | type[FileContentsSeedSource],
    file_pattern: str,
    error_message: str,
    tmp_path: Path,
) -> None:
    """Blank patterns and patterns containing a path separator fail validation."""
    with pytest.raises(ValueError, match=error_message):
        source_type(path=str(tmp_path), file_pattern=file_pattern)

‎packages/data-designer-engine/pyproject.toml‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ dependencies = [
3737
"data-designer-config=={{ version }}",
3838
"duckdb>=1.5.0,<2",
3939
"faker>=20.1.0,<21",
40+
"fsspec>=2025.3.0,<2026",
4041
"httpx>=0.27.2,<1",
4142
"httpx-retries>=0.4.2,<1",
4243
"huggingface-hub>=1.0.1,<2",

0 commit comments

Comments
 (0)