Codestin Search App

89 lines (66 loc) · 3.18 KB
# =============================================================================
# DTD Pipeline Configuration
# =============================================================================
# Copy this file to .env and fill in your credentials
# Sections whose variables must be set are marked with (REQUIRED)
# =============================================================================
# TADDY API (REQUIRED)
# =============================================================================
# Get credentials from: https://taddy.org/developers
TADDY_API_KEY=your_taddy_api_key_here
TADDY_USER_ID=your_taddy_user_id_here
# Days to look back for new episodes
LOOKBACK_DAYS=7
# =============================================================================
# HUGGINGFACE (REQUIRED)
# =============================================================================
# Get token from: https://huggingface.co/settings/tokens
# IMPORTANT: You must accept the pyannote model license first:
# https://huggingface.co/pyannote/speaker-diarization-3.1
HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxx
# =============================================================================
# SUPABASE (REQUIRED)
# =============================================================================
# Get credentials from: Supabase Dashboard → Settings → API
SUPABASE_URL=https://xxxxxxxxxxxxx.supabase.co
SUPABASE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.xxxxx
# Storage bucket name (create in Supabase Dashboard → Storage)
SUPABASE_BUCKET=podcast-transcripts
# =============================================================================
# WHISPER CONFIGURATION
# =============================================================================
# Model size: tiny, base, small, medium, large-v2, large-v3
# Larger = better quality, more VRAM required
# large-v3: ~6GB VRAM, medium: ~4GB, small: ~2GB
WHISPER_MODEL=large-v3
# Device: cuda (GPU) or cpu
WHISPER_DEVICE=cuda
# Compute type: float16 (fastest), int8 (less VRAM), float32 (most accurate)
# NOTE: float16 typically requires a GPU; if WHISPER_DEVICE=cpu, prefer int8 or float32
WHISPER_COMPUTE_TYPE=float16
# =============================================================================
# PYANNOTE CONFIGURATION
# =============================================================================
# Speaker diarization model (requires ~3GB VRAM)
PYANNOTE_MODEL=pyannote/speaker-diarization-3.1
# =============================================================================
# SPEAKER IDENTIFICATION
# =============================================================================
# Cosine similarity threshold for matching known speakers (0.0-1.0)
# Higher = stricter matching (fewer false positives, more unknowns)
# Lower = looser matching (more matches, risk of false positives)
SPEAKER_MATCH_THRESHOLD=0.70
# =============================================================================
# PATHS
# =============================================================================
# Base data directory
DATA_DIR=data
# Audio download directory (files are auto-deleted after processing)
AUDIO_DIR=data/audio
# Speaker embeddings directory
SPEAKERS_DIR=data/speakers
# Log files directory
LOGS_DIR=data/logs
# SQLite state database path
STATE_DB_PATH=data/state.db
# Podcast configuration file
PODCASTS_CONFIG_PATH=podcasts.json
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

.env.example

Latest commit

History

.env.example

File metadata and controls