fix(KDP): added preferred_distribution parameter for NumericalFeatures

Gandalfdore · Gandalfdore · commit 84b2eb546749 · 2025-02-13T16:43:52.000+02:00
diff --git a/kdp/custom_layers.py b/kdp/custom_layers.py
@@ -462,7 +462,7 @@ def __init__(
         mixture_components: int = 3,
         trainable: bool = True,
         name: str = None,
-        specified_distribution: DistributionType = None,
+        prefered_distribution: DistributionType = None,
         **kwargs,
     ) -> None:
         """Initialize the DistributionAwareEncoder.
@@ -476,17 +476,18 @@ def __init__(
             mixture_components: Number of components for mixture models
             trainable: Whether parameters are trainable
             name: Name of the layer
-            specified_distribution: Specific distribution type to use
+            prefered_distribution: Specific distribution type to use
             **kwargs: Additional layer arguments
         """
         super().__init__(name=name, trainable=trainable, **kwargs)
+        self.name = name
         self.num_bins = num_bins
         self.epsilon = epsilon
         self.detect_periodicity = detect_periodicity
         self.handle_sparsity = handle_sparsity
         self.adaptive_binning = adaptive_binning
         self.mixture_components = mixture_components
-        self.specified_distribution = specified_distribution
+        self.prefered_distribution = prefered_distribution
 
         # Initialize TFP distributions
         self.normal_dist = tfp.distributions.Normal
@@ -543,14 +544,12 @@ def build(self, input_shape) -> None:
 
         super().build(input_shape)
 
-    def _estimate_distribution(
-        self, inputs: tf.Tensor, feature_name: str = "unknown"
-    ) -> dict:
+    def _estimate_distribution(self, inputs: tf.Tensor, name: str = "unknown") -> dict:
         """Estimate distribution type with comprehensive checks or use specified distribution type.
 
         Args:
             inputs: Input tensor to analyze
-            feature_name: Name of the feature being analyzed
+            name: Name of the feature being analyzed
         """
 
         # Otherwise, perform automatic detection
@@ -638,13 +637,15 @@ def _estimate_distribution(
             "zero_ratio": zero_ratio,
         }
 
-        if self.specified_distribution:
+        feature_name = name.rsplit("_", 1)[-1]
+
+        if self.prefered_distribution:
             tf.print(
                 "\n--------------------------------",
-                f"Using manually specified distribution for {feature_name}: {self.specified_distribution}",
+                f'Using manually specified distribution for "{feature_name}": {self.prefered_distribution}',
             )
             return {
-                "type": self.specified_distribution,
+                "type": self.prefered_distribution,
                 "stats": stats_dict,
             }
         else:
@@ -667,7 +668,7 @@ def _estimate_distribution(
             )
             tf.print(
                 "\n--------------------------------",
-                f"Determined distribution type for {feature_name}: {distrib_dict_determined}",
+                f'Determined distribution type for "{feature_name}": {distrib_dict_determined}',
             )
             return {
                 "type": distrib_dict_determined,
@@ -1260,7 +1261,7 @@ def call(self, inputs: tf.Tensor) -> tf.Tensor:
         Returns:
             Transformed tensor
         """
-        dist_info = self._estimate_distribution(inputs)
+        dist_info = self._estimate_distribution(inputs, name=self.name)
         # print(f"Distribution info: {dist_info}")
         return self._transform_distribution(inputs, dist_info)
 
@@ -1279,7 +1280,7 @@ def get_config(self) -> dict:
                 "handle_sparsity": self.handle_sparsity,
                 "adaptive_binning": self.adaptive_binning,
                 "mixture_components": self.mixture_components,
-                "specified_distribution": self.specified_distribution,
+                "prefered_distribution": self.prefered_distribution,
             },
         )
         return config
diff --git a/kdp/features.py b/kdp/features.py
@@ -123,20 +123,20 @@ def __init__(
         self,
         name: str,
         feature_type: FeatureType = FeatureType.FLOAT_NORMALIZED,
-        distribution: DistributionType | None = None,
+        preferred_distribution: DistributionType | None = None,
         **kwargs,
     ) -> None:
         """Initializes a NumericalFeature instance.
 
         Args:
             name (str): The name of the feature.
             feature_type (FeatureType): The type of the feature.
-            distribution (DistributionType | None): The distribution type for the feature.
+            preferred_distribution (DistributionType | None): The preferred distribution type for the feature.
             **kwargs: Additional keyword arguments for the feature.
         """
         super().__init__(name, feature_type, **kwargs)
         self.dtype = tf.float32
-        self.distribution = distribution
+        self.preferred_distribution = preferred_distribution
 
 
 class CategoricalFeature(Feature):
diff --git a/kdp/layers_factory.py b/kdp/layers_factory.py
@@ -62,7 +62,7 @@ def distribution_aware_encoder(
         handle_sparsity: bool = True,
         adaptive_binning: bool = True,
         mixture_components: int = 3,
-        specified_distribution: "DistributionType" = None,
+        prefered_distribution: "DistributionType" = None,
         **kwargs,
     ) -> tf.keras.layers.Layer:
         """Create a DistributionAwareEncoder layer.
@@ -89,7 +89,7 @@ def distribution_aware_encoder(
             handle_sparsity=handle_sparsity,
             adaptive_binning=adaptive_binning,
             mixture_components=mixture_components,
-            specified_distribution=specified_distribution,
+            prefered_distribution=prefered_distribution,
             **kwargs,
         )
 
diff --git a/kdp/processor.py b/kdp/processor.py
@@ -112,8 +112,16 @@ class instances (NumericalFeature, CategoricalFeature, TextFeature), or strings.
                     FeatureType.FLOAT_RESCALED,
                     FeatureType.FLOAT_DISCRETIZED,
                 }:
+                    # Get preferred_distribution from kwargs if provided
+                    preferred_distribution = (
+                        spec.kwargs.get("preferred_distribution")
+                        if isinstance(spec, Feature)
+                        else None
+                    )
                     feature_instance = NumericalFeature(
-                        name=name, feature_type=feature_type
+                        name=name,
+                        feature_type=feature_type,
+                        preferred_distribution=preferred_distribution,
                     )
                 elif feature_type in {
                     FeatureType.INTEGER_CATEGORICAL,
@@ -182,7 +190,6 @@ def __init__(
         feature_selection_placement: str = FeatureSelectionPlacementOptions.NONE.value,
         use_distribution_aware: bool = False,
         distribution_aware_bins: int = 1000,
-        specified_distribution: str = None,
         feature_selection_units: int = 32,
         feature_selection_dropout: float = 0.2,
     ) -> None:
@@ -218,9 +225,6 @@ def __init__(
             feature_selection_dropout (float): Dropout rate for feature selection.
             use_distribution_aware (bool): Whether to use distribution-aware encoding for features.
             distribution_aware_bins (int): Number of bins to use for distribution-aware encoding.
-            specified_distribution (str, optional): The specified distribution type for
-                distribution-aware encoding. Options: 'normal', 'lognormal', 'exponential', etc.
-                Defaults to None (automatic detection).
         """
         self.path_data = path_data
         self.batch_size = batch_size or 50_000
@@ -273,8 +277,6 @@ def __init__(
         # initializing stats
         self._init_stats()
 
-        self.specified_distribution = specified_distribution
-
     def _monitor_performance(func: Callable) -> Callable:
         """Decorator to monitor the performance of a function.
 
@@ -603,16 +605,27 @@ def _add_pipeline_numeric(
                     layer_creator=PreprocessorLayerFactory.cast_to_float32_layer,
                     name=f"pre_dist_cast_to_float_{feature_name}",
                 )
+                # Check if manually specified distribution is provided
+                _prefered_distribution = _feature.kwargs.get("prefered_distribution")
+                if _prefered_distribution is not None:
+                    logger.info(
+                        f"Using manually specified distribution for {feature_name}"
+                    )
+                else:
+                    logger.info(
+                        f"Using automatic distribution detection for {feature_name}"
+                    )
+
                 # Apply distribution-aware encoding
                 preprocessor.add_processing_step(
                     layer_creator=PreprocessorLayerFactory.distribution_aware_encoder,
-                    name=f"distribution_aware_{feature_name}",
+                    name=f"distribution_aware_layer_{feature_name}",
                     num_bins=self.distribution_aware_bins,
                     detect_periodicity=True,
                     handle_sparsity=True,
                     adaptive_binning=True,
                     mixture_components=3,
-                    specified_distribution=self.specified_distribution,
+                    prefered_distribution=_prefered_distribution,
                 )
                 # Cast to float32 after distribution-aware encoding
                 preprocessor.add_processing_step(