Commit 55cbbb3

test(KDP): add tests
1 parent 8fa90e7 commit 55cbbb3

4 files changed: +310 -32 lines changed

kdp/custom_layers.py

Lines changed: 0 additions & 32 deletions
@@ -2133,35 +2133,3 @@ def get_config(self):
             }
         )
         return config
-
-
-if __name__ == "__main__":
-    tf.random.set_seed(42)
-    logger.info("Testing AdvancedNumericalEmbedding with multi-feature input.")
-    # Multi-feature test: 32 samples, 3 features.
-    x_multi = tf.random.normal((32, 3))
-    layer_multi = AdvancedNumericalEmbedding(
-        embedding_dim=8,
-        mlp_hidden_units=16,
-        num_bins=10,
-        init_min=[-3.0, -2.0, -4.0],
-        init_max=[3.0, 2.0, 4.0],
-        dropout_rate=0.1,
-        use_batch_norm=True,
-    )
-    y_multi = layer_multi(x_multi)
-    logger.info("Multi-feature output shape: {}", y_multi.shape)
-
-    # Single-feature test: 32 samples, 1 feature.
-    x_single = tf.random.normal((32, 1))
-    layer_single = AdvancedNumericalEmbedding(
-        embedding_dim=8,
-        mlp_hidden_units=16,
-        num_bins=10,
-        init_min=-3.0,
-        init_max=3.0,
-        dropout_rate=0.1,
-        use_batch_norm=True,
-    )
-    y_single = layer_single(x_single)
-    logger.info("Single-feature output shape: {}", y_single.shape)

kdp/processor.py

Lines changed: 27 additions & 0 deletions
@@ -193,6 +193,13 @@ def __init__(
         feature_selection_units: int = 32,
         feature_selection_dropout: float = 0.2,
         use_advanced_numerical_embedding: bool = False,
+        embedding_dim: int = 8,
+        mlp_hidden_units: int = 16,
+        num_bins: int = 10,
+        init_min: float = -3.0,
+        init_max: float = 3.0,
+        dropout_rate: float = 0.1,
+        use_batch_norm: bool = True,
     ) -> None:
         """Initialize a preprocessing model.
 
@@ -226,6 +233,12 @@ def __init__(
             feature_selection_dropout (float): Dropout rate for feature selection.
             use_distribution_aware (bool): Whether to use distribution-aware encoding for features.
             distribution_aware_bins (int): Number of bins to use for distribution-aware encoding.
+            use_advanced_numerical_embedding (bool): Whether to use advanced numerical embedding.
+            embedding_dim (int): Dimension of the embedding for advanced numerical embedding.
+            mlp_hidden_units (int): Number of units for the MLP in advanced numerical embedding.
+            num_bins (int): Number of bins for discretization in advanced numerical embedding.
+            init_min (float): Minimum value for the embedding in advanced numerical embedding.
+            init_max (float): Maximum value for the embedding in advanced numerical embedding.
         """
         self.path_data = path_data
         self.batch_size = batch_size or 50_000
@@ -261,6 +274,13 @@ def __init__(
 
         # advanced numerical embedding control
         self.use_advanced_numerical_embedding = use_advanced_numerical_embedding
+        self.embedding_dim = embedding_dim
+        self.mlp_hidden_units = mlp_hidden_units
+        self.num_bins = num_bins
+        self.init_min = init_min
+        self.init_max = init_max
+        self.dropout_rate = dropout_rate
+        self.use_batch_norm = use_batch_norm
 
         # PLACEHOLDERS
         self.preprocessors = {}
@@ -691,6 +711,13 @@ def _add_pipeline_numeric(
                 layer_creator=lambda **kwargs: embedding_layer,
                 layer_class="AdvancedNumericalEmbedding",
                 name=f"advanced_embedding_{feature_name}",
+                embedding_dim=self.embedding_dim,
+                mlp_hidden_units=self.mlp_hidden_units,
+                num_bins=self.num_bins,
+                init_min=self.init_min,
+                init_max=self.init_max,
+                dropout_rate=self.dropout_rate,
+                use_batch_norm=self.use_batch_norm,
             )
 
             # Process the feature
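Taken together, these hunks thread seven new knobs from the PreprocessingModel constructor down to the per-feature AdvancedNumericalEmbedding layer. As a minimal sketch only (argument names come from this diff; "data.csv" and "stats.json" are placeholder paths), enabling the feature from user code would look roughly like this:

from kdp.processor import PreprocessingModel

# Minimal sketch; "data.csv" and "stats.json" are placeholder paths,
# and statistics are computed from the CSV on first build.
ppr = PreprocessingModel(
    path_data="data.csv",
    features_stats_path="stats.json",
    use_advanced_numerical_embedding=True,  # switch the embedding on
    embedding_dim=8,       # width of each per-feature embedding
    mlp_hidden_units=16,   # hidden units of the per-feature MLP
    num_bins=10,           # bins used for discretization
    init_min=-3.0,         # initial lower bound for binning
    init_max=3.0,          # initial upper bound for binning
    dropout_rate=0.1,
    use_batch_norm=True,
)
result = ppr.build_preprocessor()  # returns a dict containing the built "model"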
[new test file; file name not shown in this capture]

Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
+import tensorflow as tf
+import numpy as np
+from kdp.custom_layers import AdvancedNumericalEmbedding
+
+
+class TestAdvancedNumericalEmbedding:
+    def test_multi_feature_input(self):
+        """Test with input having multiple features."""
+        batch_size = 32
+        num_features = 3
+        embedding_dim = 8
+
+        # Create random multi-feature input.
+        x_multi = tf.random.normal((batch_size, num_features))
+        layer = AdvancedNumericalEmbedding(
+            embedding_dim=embedding_dim,
+            mlp_hidden_units=16,
+            num_bins=10,
+            init_min=[-3.0, -2.0, -4.0],
+            init_max=[3.0, 2.0, 4.0],
+            dropout_rate=0.1,
+            use_batch_norm=True,
+        )
+        # Run in inference mode.
+        y_multi = layer(x_multi, training=False)
+        # Expected output shape: (batch_size, num_features, embedding_dim)
+        assert (
+            y_multi.shape == (batch_size, num_features, embedding_dim)
+        ), f"Expected shape {(batch_size, num_features, embedding_dim)} but got {y_multi.shape}"
+        # Ensure outputs are finite.
+        assert np.all(
+            np.isfinite(y_multi.numpy())
+        ), "Output contains non-finite values."
+
+    def test_single_feature_input(self):
+        """Test with a single numeric feature."""
+        batch_size = 32
+        num_features = 1
+        embedding_dim = 8
+
+        x_single = tf.random.normal((batch_size, num_features))
+        layer = AdvancedNumericalEmbedding(
+            embedding_dim=embedding_dim,
+            mlp_hidden_units=16,
+            num_bins=10,
+            init_min=-3.0,
+            init_max=3.0,
+            dropout_rate=0.1,
+            use_batch_norm=False,
+        )
+        y_single = layer(x_single, training=False)
+        assert (
+            y_single.shape == (batch_size, num_features, embedding_dim)
+        ), f"Expected shape {(batch_size, num_features, embedding_dim)} but got {y_single.shape}"
+        assert np.all(
+            np.isfinite(y_single.numpy())
+        ), "Output contains non-finite values."
+
+    def test_dropout_behavior(self):
+        """When dropout is 0.0 and no batch norm is used, training and inference should match."""
+        batch_size = 16
+        num_features = 2
+        embedding_dim = 8
+
+        x = tf.random.normal((batch_size, num_features))
+        layer = AdvancedNumericalEmbedding(
+            embedding_dim=embedding_dim,
+            mlp_hidden_units=16,
+            num_bins=10,
+            init_min=[-3.0, -2.0],
+            init_max=[3.0, 2.0],
+            dropout_rate=0.0,
+            use_batch_norm=False,
+        )
+        y_train = layer(x, training=True)
+        y_infer = layer(x, training=False)
+        assert np.allclose(
+            y_train.numpy(), y_infer.numpy(), atol=1e-5
+        ), "Outputs in training and inference modes should match when dropout is disabled."
+
+    def test_config_round_trip(self):
+        """Test get_config and from_config round-trip functionality."""
+        layer = AdvancedNumericalEmbedding(
+            embedding_dim=8,
+            mlp_hidden_units=16,
+            num_bins=10,
+            init_min=-3.0,
+            init_max=3.0,
+            dropout_rate=0.1,
+            use_batch_norm=True,
+            name="advanced_numeric_test",
+        )
+        config = layer.get_config()
+        new_layer = AdvancedNumericalEmbedding.from_config(config)
+        # Create a dummy input to ensure the layers are built.
+        x = tf.random.normal((10, 1))
+        y1 = layer(x, training=False)
+        y2 = new_layer(x, training=False)
+        assert (
+            y1.shape == y2.shape
+        ), "Shapes from original and reloaded layers should match."
+
+    def test_gradient_flow(self):
+        """Test that gradients can be computed through the layer."""
+        batch_size = 8
+        num_features = 3
+        embedding_dim = 8
+
+        x = tf.random.normal((batch_size, num_features))
+        layer = AdvancedNumericalEmbedding(
+            embedding_dim=embedding_dim,
+            mlp_hidden_units=16,
+            num_bins=10,
+            init_min=[-3.0, -2.0, -4.0],
+            init_max=[3.0, 2.0, 4.0],
+            dropout_rate=0.1,
+            use_batch_norm=True,
+        )
+        with tf.GradientTape() as tape:
+            tape.watch(x)
+            y = layer(x, training=True)
+            loss = tf.reduce_mean(y)
+        grads = tape.gradient(loss, layer.trainable_variables)
+        grad_not_none = [g for g in grads if g is not None]
+        assert (
+            len(grad_not_none) > 0
+        ), "Gradients should be computed for AdvancedNumericalEmbedding trainable variables."

test/test_processor.py

Lines changed: 156 additions & 0 deletions
@@ -1758,5 +1758,161 @@ def test_preprocessor_parameter_combinations(self):
         # You can add more specific checks for each feature if needed
 
 
+class TestPreprocessingModel_AdvancedNumericalEmbedding(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.temp_dir = tempfile.TemporaryDirectory()
+        cls.temp_file = Path(cls.temp_dir.name)
+        cls._path_data = cls.temp_file / "data.csv"
+        cls.features_stats_path = cls.temp_file / "features_stats.json"
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.temp_dir.cleanup()
+
+    def setUp(self):
+        if self.features_stats_path.exists():
+            self.features_stats_path.unlink()
+
+    def test_preprocessor_with_advanced_numerical_embedding(self):
+        """
+        Test that when advanced numerical embedding is enabled, the preprocessor model is
+        built successfully and produces an output with the expected 3D shape:
+        (batch_size, num_features, embedding_dim).
+        """
+        # Define a numerical feature. (No special flag is needed on the feature, as the
+        # model-level configuration controls the use of advanced numerical embedding.)
+        features = {
+            "num1": NumericalFeature(
+                name="num1",
+                feature_type=FeatureType.FLOAT_NORMALIZED,
+            )
+        }
+        # Generate fake data for training statistics.
+        df = generate_fake_data(features, num_rows=50)
+        df.to_csv(self._path_data, index=False)
+
+        # Build the PreprocessingModel with advanced numerical embedding turned on.
+        ppr = PreprocessingModel(
+            path_data=str(self._path_data),
+            features_specs=features,
+            features_stats_path=self.features_stats_path,
+            overwrite_stats=True,
+            use_advanced_numerical_embedding=True,
+            embedding_dim=8,
+            mlp_hidden_units=16,
+            num_bins=10,
+            init_min=-3.0,
+            init_max=3.0,
+            dropout_rate=0.1,
+            use_batch_norm=True,
+            output_mode=OutputModeOptions.CONCAT,
+        )
+        result = ppr.build_preprocessor()
+        self.assertIsNotNone(result["model"], "Preprocessor model should be built")
+
+        # Create a small batch of test data.
+        test_data = generate_fake_data(features, num_rows=5)
+        dataset = tf.data.Dataset.from_tensor_slices(dict(test_data)).batch(5)
+        preprocessed = result["model"].predict(dataset)
+        self.assertIsNotNone(preprocessed, "Preprocessed output should not be None")
+
+        # Check that advanced numerical embedding produces a 3D output:
+        # (batch_size, num_features, embedding_dim).
+        self.assertEqual(
+            len(preprocessed.shape),
+            3,
+            "Expected output shape to be 3D with advanced numerical embedding",
+        )
+        self.assertEqual(
+            preprocessed.shape[-1],
+            8,
+            "The output's last dimension (embedding_dim) should match the provided value (8)",
+        )
+
+    def test_advanced_embedding_if_false(self):
+        """
+        Test that the advanced numerical embedding is not used if the flag is set to False.
+        """
+        features = {
+            "num1": NumericalFeature(
+                name="num1",
+                feature_type=FeatureType.FLOAT_NORMALIZED,
+            )
+        }
+        df = generate_fake_data(features, num_rows=20)
+        df.to_csv(self._path_data, index=False)
+
+        # Build the model without advanced embedding.
+        ppr = PreprocessingModel(
+            path_data=str(self._path_data),
+            features_specs=features,
+            features_stats_path=self.features_stats_path,
+            use_advanced_numerical_embedding=False,
+            output_mode=OutputModeOptions.CONCAT,
+        )
+        result = ppr.build_preprocessor()
+        self.assertIsNotNone(result["model"])
+
+        # Get the configuration from the built model.
+        config = result["model"].get_config()
+        # Iterate over the layer configurations.
+        layers_config = config.get("layers", [])
+        found = any(
+            layer.get("class_name", "") == "AdvancedNumericalEmbedding"
+            for layer in layers_config
+        )
+        self.assertFalse(
+            found,
+            "The model config should not include an AdvancedNumericalEmbedding layer when disabled.",
+        )
+
+    def test_advanced_embedding_config_preservation(self):
+        """
+        Ensure that the advanced numerical embedding's configuration is properly saved and can be
+        reloaded with get_config/from_config.
+        """
+        features = {
+            "num1": NumericalFeature(
+                name="num1",
+                feature_type=FeatureType.FLOAT_NORMALIZED,
+            )
+        }
+        df = generate_fake_data(features, num_rows=20)
+        df.to_csv(self._path_data, index=False)
+
+        # Build the model with advanced embedding.
+        ppr = PreprocessingModel(
+            path_data=str(self._path_data),
+            features_specs=features,
+            features_stats_path=self.features_stats_path,
+            overwrite_stats=True,
+            use_advanced_numerical_embedding=True,
+            embedding_dim=8,
+            mlp_hidden_units=16,
+            num_bins=10,
+            init_min=-3.0,
+            init_max=3.0,
+            dropout_rate=0.1,
+            use_batch_norm=True,
+            output_mode=OutputModeOptions.CONCAT,
+        )
+        result = ppr.build_preprocessor()
+        self.assertIsNotNone(result["model"])
+
+        # Get the configuration from the built model.
+        config = result["model"].get_config()
+        # Iterate over the layer configurations.
+        layers_config = config.get("layers", [])
+        found = any(
+            layer.get("class_name", "") == "AdvancedNumericalEmbedding"
+            for layer in layers_config
+        )
+        self.assertTrue(
+            found,
+            "The model config should include an AdvancedNumericalEmbedding layer when enabled.",
+        )
+
 if __name__ == "__main__":
     unittest.main()
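With these additions in place, the new cases can be selected by name; for example (one of several equivalent invocations, assuming pytest is available in the environment): python -m pytest test/test_processor.py -k AdvancedNumericalEmbedding.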
