-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.py
More file actions
100 lines (78 loc) · 3.24 KB
/
config.py
File metadata and controls
100 lines (78 loc) · 3.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""Configuration loading for DTD Pipeline."""
import json
import os
from pathlib import Path
from typing import Optional
from pydantic import BaseModel, Field
from models import PodcastConfig
class Config(BaseModel):
    """Settings container for the pipeline.

    Every field carries a default, so the model can be built with no
    arguments; ``load_config`` fills it from environment variables.
    """

    # --- Taddy API ---
    taddy_api_key: str = Field("")
    taddy_user_id: str = Field("")
    lookback_days: int = Field(7)

    # --- Whisper ---
    whisper_model: str = Field("large-v3")
    whisper_device: str = Field("cuda")
    whisper_compute_type: str = Field("float16")

    # --- Pyannote ---
    hf_token: str = Field("")
    pyannote_model: str = Field("pyannote/speaker-diarization-3.1")

    # --- Speaker ID ---
    speaker_match_threshold: float = Field(0.70)

    # --- Supabase ---
    supabase_url: str = Field("")
    supabase_key: str = Field("")
    supabase_bucket: str = Field("podcast-transcripts")

    # --- Filesystem locations ---
    data_dir: Path = Field(Path("data"))
    audio_dir: Path = Field(Path("data/audio"))
    speakers_dir: Path = Field(Path("data/speakers"))
    logs_dir: Path = Field(Path("data/logs"))
    state_db_path: Path = Field(Path("data/state.db"))

    # --- Podcast list file ---
    podcasts_config_path: Path = Field(Path("podcasts.json"))
def _env_number(name: str, default: str, cast):
    """Read numeric environment variable *name* and convert it with *cast*.

    An unset or blank variable falls back to *default*. A value that *cast*
    rejects raises ValueError naming the offending variable, so a bad
    deployment setting is easy to trace.
    """
    raw = os.getenv(name, "").strip() or default
    try:
        return cast(raw)
    except ValueError as err:
        raise ValueError(f"Invalid value for {name}: {raw!r}") from err


def _env_path(name: str, fallback: Path) -> Path:
    """Return the path held in environment variable *name*, or *fallback* if unset."""
    raw = os.getenv(name)
    return fallback if raw is None else Path(raw)


def load_config() -> Config:
    """Load configuration from environment variables.

    Each field is read from the upper-case variable of the same name
    (``TADDY_API_KEY``, ``LOOKBACK_DAYS``, ...). Unset variables fall back to
    built-in defaults.

    ``AUDIO_DIR``, ``SPEAKERS_DIR``, ``LOGS_DIR`` and ``STATE_DB_PATH`` default
    to locations inside ``DATA_DIR`` (``audio``, ``speakers``, ``logs`` and
    ``state.db``), so relocating the data directory moves them too unless they
    are overridden individually. With ``DATA_DIR`` unset these are the usual
    ``data/...`` paths.

    Returns:
        A populated ``Config``.

    Raises:
        ValueError: If ``LOOKBACK_DAYS`` or ``SPEAKER_MATCH_THRESHOLD`` is set
            to text that is not a valid int / float.
    """
    # Resolve the data directory first: the other storage paths hang off it.
    data_dir = Path(os.getenv("DATA_DIR", "data"))

    return Config(
        # Taddy
        taddy_api_key=os.getenv("TADDY_API_KEY", ""),
        taddy_user_id=os.getenv("TADDY_USER_ID", ""),
        lookback_days=_env_number("LOOKBACK_DAYS", "7", int),
        # Whisper
        whisper_model=os.getenv("WHISPER_MODEL", "large-v3"),
        whisper_device=os.getenv("WHISPER_DEVICE", "cuda"),
        whisper_compute_type=os.getenv("WHISPER_COMPUTE_TYPE", "float16"),
        # Pyannote
        hf_token=os.getenv("HF_TOKEN", ""),
        pyannote_model=os.getenv("PYANNOTE_MODEL", "pyannote/speaker-diarization-3.1"),
        # Speaker ID
        speaker_match_threshold=_env_number("SPEAKER_MATCH_THRESHOLD", "0.70", float),
        # Supabase
        supabase_url=os.getenv("SUPABASE_URL", ""),
        supabase_key=os.getenv("SUPABASE_KEY", ""),
        supabase_bucket=os.getenv("SUPABASE_BUCKET", "podcast-transcripts"),
        # Paths
        data_dir=data_dir,
        audio_dir=_env_path("AUDIO_DIR", data_dir / "audio"),
        speakers_dir=_env_path("SPEAKERS_DIR", data_dir / "speakers"),
        logs_dir=_env_path("LOGS_DIR", data_dir / "logs"),
        state_db_path=_env_path("STATE_DB_PATH", data_dir / "state.db"),
        podcasts_config_path=Path(os.getenv("PODCASTS_CONFIG_PATH", "podcasts.json")),
    )
def load_podcasts(config: Config) -> list[PodcastConfig]:
    """Load podcast configurations from the JSON file named in *config*.

    The file is read as UTF-8 and is expected to hold a JSON object whose
    ``"podcasts"`` key maps to a list of objects; each object is expanded as
    keyword arguments to ``PodcastConfig``.

    Args:
        config: Pipeline configuration; only ``podcasts_config_path`` is read.

    Returns:
        One ``PodcastConfig`` per entry, or an empty list when the file does
        not exist or its ``"podcasts"`` entry is absent, null or empty.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
    """
    try:
        # Pin the encoding: the default depends on the platform locale and can
        # mis-decode non-ASCII podcast names.
        with open(config.podcasts_config_path, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # A missing file simply means no podcasts are configured.
        return []

    # "or []" also covers an explicit null under the "podcasts" key.
    return [PodcastConfig(**p) for p in data.get("podcasts") or []]
# Module-wide cached configuration; remains None until get_config() first runs.
_config: Optional[Config] = None


def get_config() -> Config:
    """Return the shared Config, building it via load_config() on first use.

    Later calls return the same cached instance without reloading.
    """
    global _config
    if _config is not None:
        return _config
    _config = load_config()
    return _config