refactor(KDP): improving auto configuration functionality and UX

piotrlaczkowski · piotrlaczkowski · commit 7b76a99d027c · 2025-03-30T22:45:32.000+02:00
diff --git a/docs/auto_configuration.md b/docs/auto_configuration.md
@@ -11,69 +11,45 @@ The automatic model configuration system leverages statistical analysis to:
 3. **Optimize global settings** - Recommends global parameters for improved model performance
 4. **Generate code** - Provides ready-to-use Python code implementing the recommendations
 
-## 🛠️ How It Works
-
-The system works in two main phases:
-
-### 1. Statistics Collection
-
-First, the `DatasetStatistics` class analyzes your dataset to compute various statistical properties:
-
-- **Numerical features**: Mean, variance, distribution shape metrics (estimated skewness/kurtosis)
-- **Categorical features**: Vocabulary size, cardinality, unique values
-- **Text features**: Vocabulary statistics, average sequence length
-- **Date features**: Cyclical patterns, temporal variance
-
-### 2. Configuration Recommendation
-
-Then, the `ModelAdvisor` analyzes these statistics to recommend:
-
-- **Feature-specific transformations**: Based on the detected distribution type
-- **Advanced encoding options**: Such as distribution-aware encoding for complex distributions
-- **Attention mechanisms**: Tabular attention or multi-resolution attention when appropriate
-- **Global parameters**: Overall architecture suggestions based on the feature mix
-
 ## 🚀 Using the Configuration Advisor
 
-### Method 1: Using the Python API
+The simplest way to use the automatic configuration system is through the `auto_configure` function:
 
 ```python
-from kdp.stats import DatasetStatistics
-from kdp.processor import PreprocessingModel
+from kdp import auto_configure
 
-# Initialize statistics calculator
-stats_calculator = DatasetStatistics(
-    path_data="data/my_dataset.csv",
-    features_specs=features_specs  # Optional, will be inferred if not provided
-)
+# Analyze your dataset and get recommendations
+config = auto_configure("data/my_dataset.csv")
 
-# Calculate statistics
-stats = stats_calculator.main()
+# Get the ready-to-use code snippet
+print(config["code_snippet"])
 
-# Generate recommendations
-recommendations = stats_calculator.recommend_model_configuration()
+# Get feature-specific recommendations
+print(config["recommendations"])
 
-# Use the recommendations to build a model
-# You can directly use the generated code snippet or access specific recommendations
-print(recommendations["code_snippet"])
+# Get computed statistics (if save_stats=True)
+print(config["statistics"])
 ```
 
-### Method 2: Using the Command-Line Tool
+### Advanced Usage
 
-KDP provides a command-line tool to analyze datasets and generate recommendations:
+You can customize the analysis with additional parameters:
 
-```bash
-python scripts/analyze_dataset.py --data path/to/data.csv --output recommendations.json
+```python
+config = auto_configure(
+    data_path="data/my_dataset.csv",
+    features_specs={
+        "age": "NumericalFeature",
+        "category": "CategoricalFeature",
+        "text": "TextFeature"
+    },
+    batch_size=100_000,
+    save_stats=True,
+    stats_path="my_stats.json",
+    overwrite_stats=False
+)
 ```
 
-Options:
-- `--data`, `-d`: Path to CSV data file or directory (required)
-- `--output`, `-o`: Path to save recommendations (default: recommendations.json)
-- `--stats`, `-s`: Path to save/load feature statistics (default: features_stats.json)
-- `--batch-size`, `-b`: Batch size for processing (default: 50000)
-- `--overwrite`, `-w`: Overwrite existing statistics file
-- `--feature-types`, `-f`: JSON file specifying feature types (optional)
-
 ## 🔮 Distribution Detection
 
 The system can detect and recommend specific configurations for various distribution types:
diff --git a/kdp/__init__.py b/kdp/__init__.py
@@ -16,6 +16,7 @@
     TransformerBlockPlacementOptions,
 )
 from kdp.stats import DatasetStatistics
+from kdp.auto_config import auto_configure
 
 __all__ = [
     "ProcessingStep",
@@ -33,4 +34,5 @@
     "TransformerBlockPlacementOptions",
     "OutputModeOptions",
     "TabularAttentionPlacementOptions",
+    "auto_configure",
 ]
diff --git a/kdp/auto_config.py b/kdp/auto_config.py
@@ -0,0 +1,92 @@
+"""
+Automatic model configuration module that provides a simple interface for
+analyzing datasets and generating optimal preprocessing configurations.
+"""
+
+from pathlib import Path
+from typing import Dict, Any, Optional, Union
+
+from loguru import logger
+
+from kdp.stats import DatasetStatistics
+from kdp.model_advisor import ModelAdvisor
+
+
+def auto_configure(
+    data_path: Union[str, Path],
+    features_specs: Optional[Dict[str, Any]] = None,
+    batch_size: int = 50_000,
+    save_stats: bool = True,
+    stats_path: Optional[Union[str, Path]] = None,
+    overwrite_stats: bool = False,
+) -> Dict[str, Any]:
+    """
+    Automatically analyze a dataset and generate optimal preprocessing configurations.
+
+    This is a high-level function that handles all the complexity of analyzing your dataset
+    and recommending the best preprocessing strategies. It will:
+    1. Calculate comprehensive statistics about your features
+    2. Analyze the distributions and characteristics of each feature
+    3. Generate specific recommendations for preprocessing each feature
+    4. Provide global configuration recommendations
+    5. Generate ready-to-use code implementing the recommendations
+
+    Args:
+        data_path: Path to your dataset (CSV file or directory of CSVs)
+        features_specs: Optional dictionary specifying feature types and configurations
+        batch_size: Batch size for processing large datasets (default: 50000)
+        save_stats: Whether to include the computed statistics in the output (default: True)
+        stats_path: Optional path to save/load statistics (default: features_stats.json)
+        overwrite_stats: Whether to overwrite existing statistics file (default: False)
+
+    Returns:
+        Dictionary containing:
+        - feature-specific recommendations
+        - global configuration recommendations
+        - ready-to-use code snippet (key "code_snippet")
+        - computed statistics (key "statistics", only if save_stats=True)
+
+    Example:
+        >>> config = auto_configure("data/my_dataset.csv")
+        >>> print(config["code_snippet"])  # Get ready-to-use code
+        >>> print(config["recommendations"])  # Get feature-specific recommendations
+    """
+    # Convert paths to Path objects
+    data_path = Path(data_path)
+    if stats_path is None:
+        stats_path = Path("features_stats.json")
+    else:
+        stats_path = Path(stats_path)
+
+    # Initialize statistics calculator
+    stats_calculator = DatasetStatistics(
+        path_data=str(data_path),
+        features_specs=features_specs,
+        features_stats_path=stats_path,
+        overwrite_stats=overwrite_stats,
+        batch_size=batch_size,
+    )
+
+    # Calculate statistics
+    logger.info("Calculating dataset statistics...")
+    stats = stats_calculator.main()
+
+    # Generate recommendations
+    logger.info("Generating preprocessing recommendations...")
+    advisor = ModelAdvisor(stats)
+    recommendations = advisor.analyze_feature_stats()
+
+    # Generate code snippet
+    logger.info("Generating code snippet...")
+    code_snippet = advisor.generate_code_snippet()
+
+    # Prepare output
+    output = {
+        "recommendations": recommendations,
+        "code_snippet": code_snippet,
+    }
+
+    if save_stats:
+        output["statistics"] = stats
+
+    return output
diff --git a/kdp/model_advisor.py b/kdp/model_advisor.py

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@`
`16`	`16`	`TransformerBlockPlacementOptions,`
`17`	`17`	`)`
`18`	`18`	`from kdp.stats import DatasetStatistics`
	`19`	`+from kdp.auto_config import auto_configure`
`19`	`20`
`20`	`21`	`__all__ = [`
`21`	`22`	`"ProcessingStep",`
`@@ -33,4 +34,5 @@`
`33`	`34`	`"TransformerBlockPlacementOptions",`
`34`	`35`	`"OutputModeOptions",`
`35`	`36`	`"TabularAttentionPlacementOptions",`
	`37`	`+ "auto_configure",`
`36`	`38`	`]`