Commit 147bceb

Merge branch 'main' into feat_adding_grvs
2 parents f82b788 + 4b1c510 commit 147bceb

10 files changed: +811 −44 lines

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -164,3 +164,6 @@ kdp/data/fake_data.csv
 
 # Ignore all contents of my_tests folder
 my_tests/*
+
+# derivative files
+data.csv

docs/complex_example.md

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
# 📚 Complex Example 🌟

This example shows how to create a compound model with both transformer blocks and attention mechanisms.

```python
import pandas as pd
import tensorflow as tf
from kdp.features import (
    NumericalFeature,
    CategoricalFeature,
    TextFeature,
    DateFeature,
    FeatureType
)
from kdp.processor import PreprocessingModel, OutputModeOptions

# Define features
features = {
    # Numerical features
    "price": NumericalFeature(
        name="price",
        feature_type=FeatureType.FLOAT_NORMALIZED
    ),
    "quantity": NumericalFeature(
        name="quantity",
        feature_type=FeatureType.FLOAT_RESCALED
    ),

    # Categorical features
    "category": CategoricalFeature(
        name="category",
        feature_type=FeatureType.STRING_CATEGORICAL,
        embedding_size=32
    ),
    "brand": CategoricalFeature(
        name="brand",
        feature_type=FeatureType.STRING_CATEGORICAL,
        embedding_size=16
    ),

    # Text features
    "description": TextFeature(
        name="description",
        feature_type=FeatureType.TEXT,
        max_tokens=100
    ),
    "title": TextFeature(
        name="title",
        feature_type=FeatureType.TEXT,
        max_tokens=50,  # max number of tokens to keep
    ),

    # Date features
    "sale_date": DateFeature(
        name="sale_date",
        feature_type=FeatureType.DATE,
        add_season=True,  # adds one-hot season indicator (summer, winter, etc.); defaults to False
    )
}

# Create sample data
df = pd.DataFrame({
    "price": [10.5, 20.0, 15.75, 30.25, 25.50] * 20,
    "quantity": [5, 10, 3, 8, 12] * 20,
    "category": ["electronics", "books", "clothing", "food", "toys"] * 20,
    "brand": ["brandA", "brandB", "brandC", "brandD", "brandE"] * 20,
    "description": [
        "High quality product with great features",
        "Must-read book for enthusiasts",
        "Comfortable and stylish clothing",
        "Fresh and organic produce",
        "Educational toy for children"
    ] * 20,
    "title": [
        "Premium Device",
        "Best Seller Book",
        "Fashion Item",
        "Organic Food",
        "Kids Toy"
    ] * 20,
    "sale_date": [
        "2023-01-15",
        "2023-02-20",
        "2023-03-25",
        "2023-04-30",
        "2023-05-05"
    ] * 20
})

# Save to CSV
df.to_csv("sample_data.csv", index=False)

# Create preprocessor with both transformer blocks and attention
ppr = PreprocessingModel(
    path_data="sample_data.csv",
    features_specs=features,
    output_mode=OutputModeOptions.CONCAT,

    # Transformer block configuration
    transfo_placement="all_features",  # choose between (categorical|all_features)
    transfo_nr_blocks=2,  # number of transformer blocks
    transfo_nr_heads=4,  # number of attention heads per transformer block
    transfo_ff_units=64,  # feed-forward units per transformer block
    transfo_dropout_rate=0.1,  # dropout rate for the transformer blocks

    # Tabular attention configuration
    tabular_attention=True,
    tabular_attention_placement="all_features",  # choose between (none|numeric|categorical|all_features|multi_resolution)
    tabular_attention_heads=3,  # number of attention heads
    tabular_attention_dim=32,  # attention dimension
    tabular_attention_dropout=0.1,  # attention dropout rate
    tabular_attention_embedding_dim=16,  # embedding dimension

    # Other parameters
    overwrite_stats=True,  # force stats generation; recommended to set to True
)

# Build the preprocessor
result = ppr.build_preprocessor()
```

If you then want to plot the model, use the neural network for predictions, or simply retrieve the feature statistics, use the following:

```python
# Plot the model architecture
ppr.plot_model("complex_model.png")

# Get predictions with an example test batch from the sample data
test_batch = tf.data.Dataset.from_tensor_slices(dict(df.head(3))).batch(3)
predictions = result["model"].predict(test_batch)
print("Output shape:", predictions.shape)

# Print feature statistics
print("\nFeature Statistics:")
for feature_type, features in ppr.get_feature_statistics().items():
    if isinstance(features, dict):
        print(f"\n{feature_type}:")
        for feature_name, stats in features.items():
            print(f"  {feature_name}: {list(stats.keys())}")
```

Here is the plot of the model:

![Complex Model](imgs/complex_model.png)
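
The example stops once the preprocessor is built. As a follow-up, here is a minimal sketch, not part of this commit, of how the built model could feed a downstream Keras head; it assumes only that `result["model"]` is a regular `tf.keras.Model` with a single concatenated output (as its use with `.predict()` above suggests), and the training target is made up purely for illustration.

```python
# Sketch only (assumptions noted above): stack a small prediction head on top of
# the preprocessing model and train end to end on the sample data.
import tensorflow as tf

preprocessor = result["model"]  # built by ppr.build_preprocessor() above

head = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(1),  # hypothetical single regression output
])

full_model = tf.keras.Model(
    inputs=preprocessor.inputs,
    outputs=head(preprocessor.output),  # assumes a single CONCAT output tensor
)
full_model.compile(optimizer="adam", loss="mse")

# Made-up target for illustration only: total sale value per row.
targets = df["price"] * df["quantity"]
train_ds = tf.data.Dataset.from_tensor_slices((dict(df), targets)).batch(16)
full_model.fit(train_ds, epochs=1)
```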

docs/features.md

Lines changed: 6 additions & 4 deletions
@@ -232,14 +232,16 @@ You can even process string encoded date features (format: 'YYYY-MM-DD' or 'YYYY
 
 features_specs = {
     "feat1": DateFeature(
-        name="feat2",
-        feature_type=FeatureType.FLOAT,
+        name="feat1",
+        feature_type=FeatureType.DATE,
     ),
-    "feat2": TextFeature(
+    "feat2": DateFeature(
         name="feat2",
         feature_type=FeatureType.DATE,
+        date_format="%Y-%m-%d",  # date format of the input data
+        output_format="year",  # output format of the feature
         # additional option to add season layer:
-        add_season=True,  # adds one-hot season indicator (summer, winter, etc) defaults to False
+        add_season=True,  # adds one-hot season indicator (summer, winter, autumn or spring) defaults to False
     ),
     ...
 }
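
Pieced together, a minimal sketch of the spec this hunk leaves behind (for reference only; the import assumed here mirrors the one used in docs/complex_example.md above):

```python
from kdp.features import DateFeature, FeatureType

features_specs = {
    "feat1": DateFeature(
        name="feat1",
        feature_type=FeatureType.DATE,
    ),
    "feat2": DateFeature(
        name="feat2",
        feature_type=FeatureType.DATE,
        date_format="%Y-%m-%d",  # date format of the input data
        output_format="year",    # output format of the feature
        # additional option to add season layer:
        add_season=True,  # adds one-hot season indicator (summer, winter, autumn or spring); defaults to False
    ),
    # ...
}
```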
Three binary image files (209 KB, 214 KB, 200 KB)

docs/imgs/complex_model.png

275 KB

docs/tabular_attention.md

Lines changed: 22 additions & 0 deletions
@@ -34,6 +34,26 @@ model = PreprocessingModel(
 )
 ```
 
+![Standard TabularAttention](imgs/attention_example_standard.png)
+
+### Categorical Tabular Attention
+
+```python
+from kdp.processor import PreprocessingModel, TabularAttentionPlacementOptions
+
+model = PreprocessingModel(
+    # ... other parameters ...
+    tabular_attention=True,
+    tabular_attention_heads=4,
+    tabular_attention_dim=64,
+    tabular_attention_dropout=0.1,
+    tabular_attention_embedding_dim=32,  # Dimension for categorical embeddings
+    tabular_attention_placement=TabularAttentionPlacementOptions.CATEGORICAL.value,
+)
+```
+
+![Categorical TabularAttention](imgs/attention_example_categorical.png)
+
 ### Multi-Resolution TabularAttention
 
 ```python
@@ -50,6 +70,8 @@ model = PreprocessingModel(
 )
 ```
 
+![Multi-Resolution TabularAttention](imgs/attention_example_multi_resolution.png)
+
 ## Configuration Options
 
 ### Common Options

kdp/processor.py

Lines changed: 40 additions & 40 deletions
@@ -718,7 +718,7 @@ def _add_pipeline_text(self, feature_name: str, input_layer, stats: dict) -> Non
         Args:
             feature_name (str): The name of the feature to be preprocessed.
             input_layer: The input layer for the feature.
-            stats (dict): A dictionary containing the metadata of the feature, including
+            stats (dict): A dictionary containing the metadata of the feature.
         """
         # getting feature object
         _feature = self.features_specs[feature_name]
@@ -928,45 +928,6 @@ def _prepare_outputs(self) -> None:
             else:
                 raise ValueError("No features available for concatenation")
 
-            # Add transformer blocks if specified
-            if self.transfo_nr_blocks:
-                if self.transfo_placement == TransformerBlockPlacementOptions.CATEGORICAL and concat_cat is not None:
-                    logger.info(f"Adding transformer blocks to categorical features: #{self.transfo_nr_blocks}")
-                    transformed = concat_cat
-                    for block_idx in range(self.transfo_nr_blocks):
-                        transformed = PreprocessorLayerFactory.transformer_block_layer(
-                            dim_model=transformed.shape[-1],
-                            num_heads=self.transfo_nr_heads,
-                            ff_units=self.transfo_ff_units,
-                            dropout_rate=self.transfo_dropout_rate,
-                            name=f"transformer_block_{block_idx}_{self.transfo_nr_heads}heads",
-                        )(transformed)
-                    # Reshape transformer output to remove the extra dimension
-                    transformed = tf.keras.layers.Reshape(
-                        target_shape=(-1,),  # Flatten to match numeric shape
-                        name="reshape_transformer_output",
-                    )(transformed)
-
-                    # Recombine with numeric features if they exist
-                    if concat_num is not None:
-                        self.concat_all = tf.keras.layers.Concatenate(
-                            name="ConcatenateTransformed",
-                            axis=-1,
-                        )([concat_num, transformed])
-                    else:
-                        self.concat_all = transformed
-
-                elif self.transfo_placement == TransformerBlockPlacementOptions.ALL_FEATURES:
-                    logger.info(f"Adding transformer blocks to all features: #{self.transfo_nr_blocks}")
-                    for block_idx in range(self.transfo_nr_blocks):
-                        self.concat_all = PreprocessorLayerFactory.transformer_block_layer(
-                            dim_model=self.concat_all.shape[-1],
-                            num_heads=self.transfo_nr_heads,
-                            ff_units=self.transfo_ff_units,
-                            dropout_rate=self.transfo_dropout_rate,
-                            name=f"transformer_block_{block_idx}_{self.transfo_nr_heads}heads",
-                        )(self.concat_all)
-
             # Add tabular attention if specified
             if self.tabular_attention:
                 if self.tabular_attention_placement == TabularAttentionPlacementOptions.MULTI_RESOLUTION:
@@ -1095,6 +1056,45 @@ def _prepare_outputs(self) -> None:
             else:
                 self.concat_all = concat_cat
 
+            # Add transformer blocks if specified
+            if self.transfo_nr_blocks:
+                if self.transfo_placement == TransformerBlockPlacementOptions.CATEGORICAL and concat_cat is not None:
+                    logger.info(f"Adding transformer blocks to categorical features: #{self.transfo_nr_blocks}")
+                    transformed = concat_cat
+                    for block_idx in range(self.transfo_nr_blocks):
+                        transformed = PreprocessorLayerFactory.transformer_block_layer(
+                            dim_model=transformed.shape[-1],
+                            num_heads=self.transfo_nr_heads,
+                            ff_units=self.transfo_ff_units,
+                            dropout_rate=self.transfo_dropout_rate,
+                            name=f"transformer_block_{block_idx}_{self.transfo_nr_heads}heads",
+                        )(transformed)
+                    # Reshape transformer output to remove the extra dimension
+                    transformed = tf.keras.layers.Reshape(
+                        target_shape=(-1,),  # Flatten to match numeric shape
+                        name="reshape_transformer_output",
+                    )(transformed)
+
+                    # Recombine with numeric features if they exist
+                    if concat_num is not None:
+                        self.concat_all = tf.keras.layers.Concatenate(
+                            name="ConcatenateTransformed",
+                            axis=-1,
+                        )([concat_num, transformed])
+                    else:
+                        self.concat_all = transformed
+
+                elif self.transfo_placement == TransformerBlockPlacementOptions.ALL_FEATURES:
+                    logger.info(f"Adding transformer blocks to all features: #{self.transfo_nr_blocks}")
+                    for block_idx in range(self.transfo_nr_blocks):
+                        self.concat_all = PreprocessorLayerFactory.transformer_block_layer(
+                            dim_model=self.concat_all.shape[-1],
+                            num_heads=self.transfo_nr_heads,
+                            ff_units=self.transfo_ff_units,
+                            dropout_rate=self.transfo_dropout_rate,
+                            name=f"transformer_block_{block_idx}_{self.transfo_nr_heads}heads",
+                        )(self.concat_all)
+
             logger.info("Concatenating outputs mode enabled")
         else:
             # Dictionary mode
