Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 84b2eb5

Browse files
committed
fix(KDP): added preferred_distribution parameter for NumericalFeatures
1 parent a3fe7e1 commit 84b2eb5

File tree

4 files changed

+41
-27
lines changed

4 files changed

+41
-27
lines changed

kdp/custom_layers.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,7 @@ def __init__(
462462
mixture_components: int = 3,
463463
trainable: bool = True,
464464
name: str = None,
465-
specified_distribution: DistributionType = None,
465+
prefered_distribution: DistributionType = None,
466466
**kwargs,
467467
) -> None:
468468
"""Initialize the DistributionAwareEncoder.
@@ -476,17 +476,18 @@ def __init__(
476476
mixture_components: Number of components for mixture models
477477
trainable: Whether parameters are trainable
478478
name: Name of the layer
479-
specified_distribution: Specific distribution type to use
479+
prefered_distribution: Specific distribution type to use
480480
**kwargs: Additional layer arguments
481481
"""
482482
super().__init__(name=name, trainable=trainable, **kwargs)
483+
self.name = name
483484
self.num_bins = num_bins
484485
self.epsilon = epsilon
485486
self.detect_periodicity = detect_periodicity
486487
self.handle_sparsity = handle_sparsity
487488
self.adaptive_binning = adaptive_binning
488489
self.mixture_components = mixture_components
489-
self.specified_distribution = specified_distribution
490+
self.prefered_distribution = prefered_distribution
490491

491492
# Initialize TFP distributions
492493
self.normal_dist = tfp.distributions.Normal
@@ -543,14 +544,12 @@ def build(self, input_shape) -> None:
543544

544545
super().build(input_shape)
545546

546-
def _estimate_distribution(
547-
self, inputs: tf.Tensor, feature_name: str = "unknown"
548-
) -> dict:
547+
def _estimate_distribution(self, inputs: tf.Tensor, name: str = "unknown") -> dict:
549548
"""Estimate distribution type with comprehensive checks or use specified distribution type.
550549
551550
Args:
552551
inputs: Input tensor to analyze
553-
feature_name: Name of the feature being analyzed
552+
name: Name of the feature being analyzed
554553
"""
555554

556555
# Otherwise, perform automatic detection
@@ -638,13 +637,15 @@ def _estimate_distribution(
638637
"zero_ratio": zero_ratio,
639638
}
640639

641-
if self.specified_distribution:
640+
feature_name = name.rsplit("_", 1)[-1]
641+
642+
if self.prefered_distribution:
642643
tf.print(
643644
"\n--------------------------------",
644-
f"Using manually specified distribution for {feature_name}: {self.specified_distribution}",
645+
f'Using manually specified distribution for "{feature_name}": {self.prefered_distribution}',
645646
)
646647
return {
647-
"type": self.specified_distribution,
648+
"type": self.prefered_distribution,
648649
"stats": stats_dict,
649650
}
650651
else:
@@ -667,7 +668,7 @@ def _estimate_distribution(
667668
)
668669
tf.print(
669670
"\n--------------------------------",
670-
f"Determined distribution type for {feature_name}: {distrib_dict_determined}",
671+
f'Determined distribution type for "{feature_name}": {distrib_dict_determined}',
671672
)
672673
return {
673674
"type": distrib_dict_determined,
@@ -1260,7 +1261,7 @@ def call(self, inputs: tf.Tensor) -> tf.Tensor:
12601261
Returns:
12611262
Transformed tensor
12621263
"""
1263-
dist_info = self._estimate_distribution(inputs)
1264+
dist_info = self._estimate_distribution(inputs, name=self.name)
12641265
# print(f"Distribution info: {dist_info}")
12651266
return self._transform_distribution(inputs, dist_info)
12661267

@@ -1279,7 +1280,7 @@ def get_config(self) -> dict:
12791280
"handle_sparsity": self.handle_sparsity,
12801281
"adaptive_binning": self.adaptive_binning,
12811282
"mixture_components": self.mixture_components,
1282-
"specified_distribution": self.specified_distribution,
1283+
"prefered_distribution": self.prefered_distribution,
12831284
},
12841285
)
12851286
return config

kdp/features.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,20 +123,20 @@ def __init__(
123123
self,
124124
name: str,
125125
feature_type: FeatureType = FeatureType.FLOAT_NORMALIZED,
126-
distribution: DistributionType | None = None,
126+
preferred_distribution: DistributionType | None = None,
127127
**kwargs,
128128
) -> None:
129129
"""Initializes a NumericalFeature instance.
130130
131131
Args:
132132
name (str): The name of the feature.
133133
feature_type (FeatureType): The type of the feature.
134-
distribution (DistributionType | None): The distribution type for the feature.
134+
preferred_distribution (DistributionType | None): The preferred distribution type for the feature.
135135
**kwargs: Additional keyword arguments for the feature.
136136
"""
137137
super().__init__(name, feature_type, **kwargs)
138138
self.dtype = tf.float32
139-
self.distribution = distribution
139+
self.preferred_distribution = preferred_distribution
140140

141141

142142
class CategoricalFeature(Feature):

kdp/layers_factory.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def distribution_aware_encoder(
6262
handle_sparsity: bool = True,
6363
adaptive_binning: bool = True,
6464
mixture_components: int = 3,
65-
specified_distribution: "DistributionType" = None,
65+
prefered_distribution: "DistributionType" = None,
6666
**kwargs,
6767
) -> tf.keras.layers.Layer:
6868
"""Create a DistributionAwareEncoder layer.
@@ -89,7 +89,7 @@ def distribution_aware_encoder(
8989
handle_sparsity=handle_sparsity,
9090
adaptive_binning=adaptive_binning,
9191
mixture_components=mixture_components,
92-
specified_distribution=specified_distribution,
92+
prefered_distribution=prefered_distribution,
9393
**kwargs,
9494
)
9595

kdp/processor.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,16 @@ class instances (NumericalFeature, CategoricalFeature, TextFeature), or strings.
112112
FeatureType.FLOAT_RESCALED,
113113
FeatureType.FLOAT_DISCRETIZED,
114114
}:
115+
# Get preferred_distribution from kwargs if provided
116+
preferred_distribution = (
117+
spec.kwargs.get("preferred_distribution")
118+
if isinstance(spec, Feature)
119+
else None
120+
)
115121
feature_instance = NumericalFeature(
116-
name=name, feature_type=feature_type
122+
name=name,
123+
feature_type=feature_type,
124+
preferred_distribution=preferred_distribution,
117125
)
118126
elif feature_type in {
119127
FeatureType.INTEGER_CATEGORICAL,
@@ -182,7 +190,6 @@ def __init__(
182190
feature_selection_placement: str = FeatureSelectionPlacementOptions.NONE.value,
183191
use_distribution_aware: bool = False,
184192
distribution_aware_bins: int = 1000,
185-
specified_distribution: str = None,
186193
feature_selection_units: int = 32,
187194
feature_selection_dropout: float = 0.2,
188195
) -> None:
@@ -218,9 +225,6 @@ def __init__(
218225
feature_selection_dropout (float): Dropout rate for feature selection.
219226
use_distribution_aware (bool): Whether to use distribution-aware encoding for features.
220227
distribution_aware_bins (int): Number of bins to use for distribution-aware encoding.
221-
specified_distribution (str, optional): The specified distribution type for
222-
distribution-aware encoding. Options: 'normal', 'lognormal', 'exponential', etc.
223-
Defaults to None (automatic detection).
224228
"""
225229
self.path_data = path_data
226230
self.batch_size = batch_size or 50_000
@@ -273,8 +277,6 @@ def __init__(
273277
# initializing stats
274278
self._init_stats()
275279

276-
self.specified_distribution = specified_distribution
277-
278280
def _monitor_performance(func: Callable) -> Callable:
279281
"""Decorator to monitor the performance of a function.
280282
@@ -603,16 +605,27 @@ def _add_pipeline_numeric(
603605
layer_creator=PreprocessorLayerFactory.cast_to_float32_layer,
604606
name=f"pre_dist_cast_to_float_{feature_name}",
605607
)
608+
# Check if manually specified distribution is provided
609+
_prefered_distribution = _feature.kwargs.get("prefered_distribution")
610+
if _prefered_distribution is not None:
611+
logger.info(
612+
f"Using manually specified distribution for {feature_name}"
613+
)
614+
else:
615+
logger.info(
616+
f"Using automatic distribution detection for {feature_name}"
617+
)
618+
606619
# Apply distribution-aware encoding
607620
preprocessor.add_processing_step(
608621
layer_creator=PreprocessorLayerFactory.distribution_aware_encoder,
609-
name=f"distribution_aware_{feature_name}",
622+
name=f"distribution_aware_layer_{feature_name}",
610623
num_bins=self.distribution_aware_bins,
611624
detect_periodicity=True,
612625
handle_sparsity=True,
613626
adaptive_binning=True,
614627
mixture_components=3,
615-
specified_distribution=self.specified_distribution,
628+
prefered_distribution=_prefered_distribution,
616629
)
617630
# Cast to float32 after distribution-aware encoding
618631
preprocessor.add_processing_step(

0 commit comments

Comments
 (0)