Commit 55cbbb3

test(KDP): add tests
1 parent 8fa90e7 commit 55cbbb3

4 files changed: +310 -32 lines changed

kdp/custom_layers.py

Lines changed: 0 additions & 32 deletions
@@ -2133,35 +2133,3 @@ def get_config(self):
             }
         )
         return config
-
-
-if __name__ == "__main__":
-    tf.random.set_seed(42)
-    logger.info("Testing AdvancedNumericalEmbedding with multi-feature input.")
-    # Multi-feature test: 32 samples, 3 features.
-    x_multi = tf.random.normal((32, 3))
-    layer_multi = AdvancedNumericalEmbedding(
-        embedding_dim=8,
-        mlp_hidden_units=16,
-        num_bins=10,
-        init_min=[-3.0, -2.0, -4.0],
-        init_max=[3.0, 2.0, 4.0],
-        dropout_rate=0.1,
-        use_batch_norm=True,
-    )
-    y_multi = layer_multi(x_multi)
-    logger.info("Multi-feature output shape: {}", y_multi.shape)
-
-    # Single-feature test: 32 samples, 1 feature.
-    x_single = tf.random.normal((32, 1))
-    layer_single = AdvancedNumericalEmbedding(
-        embedding_dim=8,
-        mlp_hidden_units=16,
-        num_bins=10,
-        init_min=-3.0,
-        init_max=3.0,
-        dropout_rate=0.1,
-        use_batch_norm=True,
-    )
-    y_single = layer_single(x_single)
-    logger.info("Single-feature output shape: {}", y_single.shape)

kdp/processor.py

Lines changed: 27 additions & 0 deletions
@@ -193,6 +193,13 @@ def __init__(
         feature_selection_units: int = 32,
         feature_selection_dropout: float = 0.2,
         use_advanced_numerical_embedding: bool = False,
+        embedding_dim: int = 8,
+        mlp_hidden_units: int = 16,
+        num_bins: int = 10,
+        init_min: float = -3.0,
+        init_max: float = 3.0,
+        dropout_rate: float = 0.1,
+        use_batch_norm: bool = True,
     ) -> None:
         """Initialize a preprocessing model.
 
@@ -226,6 +233,12 @@ def __init__(
             feature_selection_dropout (float): Dropout rate for feature selection.
             use_distribution_aware (bool): Whether to use distribution-aware encoding for features.
             distribution_aware_bins (int): Number of bins to use for distribution-aware encoding.
+            use_advanced_numerical_embedding (bool): Whether to use advanced numerical embedding.
+            embedding_dim (int): Dimension of the embedding for advanced numerical embedding.
+            mlp_hidden_units (int): Number of units for the MLP in advanced numerical embedding.
+            num_bins (int): Number of bins for discretization in advanced numerical embedding.
+            init_min (float): Minimum value for the embedding in advanced numerical embedding.
+            init_max (float): Maximum value for the embedding in advanced numerical embedding.
         """
         self.path_data = path_data
         self.batch_size = batch_size or 50_000
@@ -261,6 +274,13 @@ def __init__(
 
         # advanced numerical embedding control
         self.use_advanced_numerical_embedding = use_advanced_numerical_embedding
+        self.embedding_dim = embedding_dim
+        self.mlp_hidden_units = mlp_hidden_units
+        self.num_bins = num_bins
+        self.init_min = init_min
+        self.init_max = init_max
+        self.dropout_rate = dropout_rate
+        self.use_batch_norm = use_batch_norm
 
         # PLACEHOLDERS
         self.preprocessors = {}
@@ -691,6 +711,13 @@ def _add_pipeline_numeric(
                 layer_creator=lambda **kwargs: embedding_layer,
                 layer_class="AdvancedNumericalEmbedding",
                 name=f"advanced_embedding_{feature_name}",
+                embedding_dim=self.embedding_dim,
+                mlp_hidden_units=self.mlp_hidden_units,
+                num_bins=self.num_bins,
+                init_min=self.init_min,
+                init_max=self.init_max,
+                dropout_rate=self.dropout_rate,
+                use_batch_norm=self.use_batch_norm,
             )
 
             # Process the feature
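Taken together, these hunks thread seven new knobs from the PreprocessingModel constructor down to the per-feature AdvancedNumericalEmbedding layer. As a minimal sketch only (argument names come from this diff; "data.csv" and "stats.json" are placeholder paths), enabling the feature from user code would look roughly like this:

from kdp.processor import PreprocessingModel

# Minimal sketch; "data.csv" and "stats.json" are placeholder paths,
# and statistics are computed from the CSV on first build.
ppr = PreprocessingModel(
    path_data="data.csv",
    features_stats_path="stats.json",
    use_advanced_numerical_embedding=True,  # switch the embedding on
    embedding_dim=8,       # width of each per-feature embedding
    mlp_hidden_units=16,   # hidden units of the per-feature MLP
    num_bins=10,           # bins used for discretization
    init_min=-3.0,         # initial lower bound for binning
    init_max=3.0,          # initial upper bound for binning
    dropout_rate=0.1,
    use_batch_norm=True,
)
result = ppr.build_preprocessor()  # returns a dict containing the built "model"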
[new test file; file name not shown in this capture]

Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
+import tensorflow as tf
+import numpy as np
+from kdp.custom_layers import AdvancedNumericalEmbedding
+
+
+class TestAdvancedNumericalEmbedding:
+    def test_multi_feature_input(self):
+        """Test with input having multiple features."""
+        batch_size = 32
+        num_features = 3
+        embedding_dim = 8
+
+        # Create random multi-feature input.
+        x_multi = tf.random.normal((batch_size, num_features))
+        layer = AdvancedNumericalEmbedding(
+            embedding_dim=embedding_dim,
+            mlp_hidden_units=16,
+            num_bins=10,
+            init_min=[-3.0, -2.0, -4.0],
+            init_max=[3.0, 2.0, 4.0],
+            dropout_rate=0.1,
+            use_batch_norm=True,
+        )
+        # Run in inference mode.
+        y_multi = layer(x_multi, training=False)
+        # Expected output shape: (batch_size, num_features, embedding_dim)
+        assert (
+            y_multi.shape == (batch_size, num_features, embedding_dim)
+        ), f"Expected shape {(batch_size, num_features, embedding_dim)} but got {y_multi.shape}"
+        # Ensure outputs are finite.
+        assert np.all(
+            np.isfinite(y_multi.numpy())
+        ), "Output contains non-finite values."
+
+    def test_single_feature_input(self):
+        """Test with a single numeric feature."""
+        batch_size = 32
+        num_features = 1
+        embedding_dim = 8
+
+        x_single = tf.random.normal((batch_size, num_features))
+        layer = AdvancedNumericalEmbedding(
+            embedding_dim=embedding_dim,
+            mlp_hidden_units=16,
+            num_bins=10,
+            init_min=-3.0,
+            init_max=3.0,
+            dropout_rate=0.1,
+            use_batch_norm=False,
+        )
+        y_single = layer(x_single, training=False)
+        assert (
+            y_single.shape == (batch_size, num_features, embedding_dim)
+        ), f"Expected shape {(batch_size, num_features, embedding_dim)} but got {y_single.shape}"
+        assert np.all(
+            np.isfinite(y_single.numpy())
+        ), "Output contains non-finite values."
+
+    def test_dropout_behavior(self):
+        """When dropout is 0.0 and no batch norm is used, training and inference should match."""
+        batch_size = 16
+        num_features = 2
+        embedding_dim = 8
+
+        x = tf.random.normal((batch_size, num_features))
+        layer = AdvancedNumericalEmbedding(
+            embedding_dim=embedding_dim,
+            mlp_hidden_units=16,
+            num_bins=10,
+            init_min=[-3.0, -2.0],
+            init_max=[3.0, 2.0],
+            dropout_rate=0.0,
+            use_batch_norm=False,
+        )
+        y_train = layer(x, training=True)
+        y_infer = layer(x, training=False)
+        assert np.allclose(
+            y_train.numpy(), y_infer.numpy(), atol=1e-5
+        ), "Outputs in training and inference modes should match when dropout is disabled."
+
+    def test_config_round_trip(self):
+        """Test get_config and from_config round-trip functionality."""
+        layer = AdvancedNumericalEmbedding(
+            embedding_dim=8,
+            mlp_hidden_units=16,
+            num_bins=10,
+            init_min=-3.0,
+            init_max=3.0,
+            dropout_rate=0.1,
+            use_batch_norm=True,
+            name="advanced_numeric_test",
+        )
+        config = layer.get_config()
+        new_layer = AdvancedNumericalEmbedding.from_config(config)
+        # Create a dummy input to ensure the layers are built.
+        x = tf.random.normal((10, 1))
+        y1 = layer(x, training=False)
+        y2 = new_layer(x, training=False)
+        assert (
+            y1.shape == y2.shape
+        ), "Shapes from original and reloaded layers should match."
+
+    def test_gradient_flow(self):
+        """Test that gradients can be computed through the layer."""
+        batch_size = 8
+        num_features = 3
+        embedding_dim = 8
+
+        x = tf.random.normal((batch_size, num_features))
+        layer = AdvancedNumericalEmbedding(
+            embedding_dim=embedding_dim,
+            mlp_hidden_units=16,
+            num_bins=10,
+            init_min=[-3.0, -2.0, -4.0],
+            init_max=[3.0, 2.0, 4.0],
+            dropout_rate=0.1,
+            use_batch_norm=True,
+        )
+        with tf.GradientTape() as tape:
+            tape.watch(x)
+            y = layer(x, training=True)
+            loss = tf.reduce_mean(y)
+        grads = tape.gradient(loss, layer.trainable_variables)
+        grad_not_none = [g for g in grads if g is not None]
+        assert (
+            len(grad_not_none) > 0
+        ), "Gradients should be computed for AdvancedNumericalEmbedding trainable variables."

test/test_processor.py

Lines changed: 156 additions & 0 deletions
@@ -1758,5 +1758,161 @@ def test_preprocessor_parameter_combinations(self):
         # You can add more specific checks for each feature if needed
 
 
+class TestPreprocessingModel_AdvancedNumericalEmbedding(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.temp_dir = tempfile.TemporaryDirectory()
+        cls.temp_file = Path(cls.temp_dir.name)
+        cls._path_data = cls.temp_file / "data.csv"
+        cls.features_stats_path = cls.temp_file / "features_stats.json"
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.temp_dir.cleanup()
+
+    def setUp(self):
+        if self.features_stats_path.exists():
+            self.features_stats_path.unlink()
+
+    def test_preprocessor_with_advanced_numerical_embedding(self):
+        """
+        Test that when advanced numerical embedding is enabled, the preprocessor model is
+        built successfully and produces an output with the expected 3D shape:
+        (batch_size, num_features, embedding_dim).
+        """
+        # Define a numerical feature. (No special flag is needed on the feature, as the
+        # model-level configuration controls the use of advanced numerical embedding.)
+        features = {
+            "num1": NumericalFeature(
+                name="num1",
+                feature_type=FeatureType.FLOAT_NORMALIZED,
+            )
+        }
+        # Generate fake data for training statistics.
+        df = generate_fake_data(features, num_rows=50)
+        df.to_csv(self._path_data, index=False)
+
+        # Build the PreprocessingModel with advanced numerical embedding turned on.
+        ppr = PreprocessingModel(
+            path_data=str(self._path_data),
+            features_specs=features,
+            features_stats_path=self.features_stats_path,
+            overwrite_stats=True,
+            use_advanced_numerical_embedding=True,
+            embedding_dim=8,
+            mlp_hidden_units=16,
+            num_bins=10,
+            init_min=-3.0,
+            init_max=3.0,
+            dropout_rate=0.1,
+            use_batch_norm=True,
+            output_mode=OutputModeOptions.CONCAT,
+        )
+        result = ppr.build_preprocessor()
+        self.assertIsNotNone(result["model"], "Preprocessor model should be built")
+
+        # Create a small batch of test data.
+        test_data = generate_fake_data(features, num_rows=5)
+        dataset = tf.data.Dataset.from_tensor_slices(dict(test_data)).batch(5)
+        preprocessed = result["model"].predict(dataset)
+        self.assertIsNotNone(preprocessed, "Preprocessed output should not be None")
+
+        # Check that advanced numerical embedding produces a 3D output:
+        # (batch_size, num_features, embedding_dim).
+        self.assertEqual(
+            len(preprocessed.shape),
+            3,
+            "Expected output shape to be 3D with advanced numerical embedding",
+        )
+        self.assertEqual(
+            preprocessed.shape[-1],
+            8,
+            "The output's last dimension (embedding_dim) should match the provided value (8)",
+        )
+
+    def test_advanced_embedding_if_false(self):
+        """
+        Test that the advanced numerical embedding is not used if the flag is set to False.
+        """
+        features = {
+            "num1": NumericalFeature(
+                name="num1",
+                feature_type=FeatureType.FLOAT_NORMALIZED,
+            )
+        }
+        df = generate_fake_data(features, num_rows=20)
+        df.to_csv(self._path_data, index=False)
+
+        # Build the model without advanced embedding.
+        ppr = PreprocessingModel(
+            path_data=str(self._path_data),
+            features_specs=features,
+            features_stats_path=self.features_stats_path,
+            use_advanced_numerical_embedding=False,
+            output_mode=OutputModeOptions.CONCAT,
+        )
+        result = ppr.build_preprocessor()
+        self.assertIsNotNone(result["model"])
+
+        # Get the configuration from the built model.
+        config = result["model"].get_config()
+        # Iterate over the layer configurations.
+        layers_config = config.get("layers", [])
+        found = any(
+            layer.get("class_name", "") == "AdvancedNumericalEmbedding"
+            for layer in layers_config
+        )
+        self.assertFalse(
+            found,
+            "The model config should not include an AdvancedNumericalEmbedding layer when disabled.",
+        )
+
+    def test_advanced_embedding_config_preservation(self):
+        """
+        Ensure that the advanced numerical embedding's configuration is properly saved and can be
+        reloaded with get_config/from_config.
+        """
+        features = {
+            "num1": NumericalFeature(
+                name="num1",
+                feature_type=FeatureType.FLOAT_NORMALIZED,
+            )
+        }
+        df = generate_fake_data(features, num_rows=20)
+        df.to_csv(self._path_data, index=False)
+
+        # Build the model with advanced embedding.
+        ppr = PreprocessingModel(
+            path_data=str(self._path_data),
+            features_specs=features,
+            features_stats_path=self.features_stats_path,
+            overwrite_stats=True,
+            use_advanced_numerical_embedding=True,
+            embedding_dim=8,
+            mlp_hidden_units=16,
+            num_bins=10,
+            init_min=-3.0,
+            init_max=3.0,
+            dropout_rate=0.1,
+            use_batch_norm=True,
+            output_mode=OutputModeOptions.CONCAT,
+        )
+        result = ppr.build_preprocessor()
+        self.assertIsNotNone(result["model"])
+
+        # Get the configuration from the built model.
+        config = result["model"].get_config()
+        # Iterate over the layer configurations.
+        layers_config = config.get("layers", [])
+        found = any(
+            layer.get("class_name", "") == "AdvancedNumericalEmbedding"
+            for layer in layers_config
+        )
+        self.assertTrue(
+            found,
+            "The model config should include an AdvancedNumericalEmbedding layer when enabled.",
+        )
+
 if __name__ == "__main__":
     unittest.main()
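With these additions in place, the new cases can be selected by name; for example (one of several equivalent invocations, assuming pytest is available in the environment): python -m pytest test/test_processor.py -k AdvancedNumericalEmbedding.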
