Commit 147bceb

Merge branch 'main' into feat_adding_grvs
2 parents f82b788 + 4b1c510 commit 147bceb

10 files changed: +811 −44 lines

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -164,3 +164,6 @@ kdp/data/fake_data.csv
 
 # Ignore all contents of my_tests folder
 my_tests/*
+
+# derivative files
+data.csv

docs/complex_example.md

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
# 📚 Complex Example 🌟

This example shows how to create a compound model with both transformer blocks and attention mechanisms.

```python
import pandas as pd
import tensorflow as tf
from kdp.features import (
    NumericalFeature,
    CategoricalFeature,
    TextFeature,
    DateFeature,
    FeatureType
)
from kdp.processor import PreprocessingModel, OutputModeOptions

# Define features
features = {
    # Numerical features
    "price": NumericalFeature(
        name="price",
        feature_type=FeatureType.FLOAT_NORMALIZED
    ),
    "quantity": NumericalFeature(
        name="quantity",
        feature_type=FeatureType.FLOAT_RESCALED
    ),

    # Categorical features
    "category": CategoricalFeature(
        name="category",
        feature_type=FeatureType.STRING_CATEGORICAL,
        embedding_size=32
    ),
    "brand": CategoricalFeature(
        name="brand",
        feature_type=FeatureType.STRING_CATEGORICAL,
        embedding_size=16
    ),

    # Text features
    "description": TextFeature(
        name="description",
        feature_type=FeatureType.TEXT,
        max_tokens=100
    ),
    "title": TextFeature(
        name="title",
        feature_type=FeatureType.TEXT,
        max_tokens=50,  # max number of tokens to keep
    ),

    # Date features
    "sale_date": DateFeature(
        name="sale_date",
        feature_type=FeatureType.DATE,
        add_season=True,  # adds one-hot season indicator (summer, winter, etc.); defaults to False
    )
}

# Create sample data
df = pd.DataFrame({
    "price": [10.5, 20.0, 15.75, 30.25, 25.50] * 20,
    "quantity": [5, 10, 3, 8, 12] * 20,
    "category": ["electronics", "books", "clothing", "food", "toys"] * 20,
    "brand": ["brandA", "brandB", "brandC", "brandD", "brandE"] * 20,
    "description": [
        "High quality product with great features",
        "Must-read book for enthusiasts",
        "Comfortable and stylish clothing",
        "Fresh and organic produce",
        "Educational toy for children"
    ] * 20,
    "title": [
        "Premium Device",
        "Best Seller Book",
        "Fashion Item",
        "Organic Food",
        "Kids Toy"
    ] * 20,
    "sale_date": [
        "2023-01-15",
        "2023-02-20",
        "2023-03-25",
        "2023-04-30",
        "2023-05-05"
    ] * 20
})

# Save to CSV
df.to_csv("sample_data.csv", index=False)

# Create preprocessor with both transformer blocks and attention
ppr = PreprocessingModel(
    path_data="sample_data.csv",
    features_specs=features,
    output_mode=OutputModeOptions.CONCAT,

    # Transformer block configuration
    transfo_placement="all_features",  # choose between (categorical|all_features)
    transfo_nr_blocks=2,  # number of transformer blocks
    transfo_nr_heads=4,  # number of attention heads per transformer block
    transfo_ff_units=64,  # feed-forward units per transformer block
    transfo_dropout_rate=0.1,  # dropout rate for the transformer blocks

    # Tabular attention configuration
    tabular_attention=True,
    tabular_attention_placement="all_features",  # choose between (none|numeric|categorical|all_features|multi_resolution)
    tabular_attention_heads=3,  # number of attention heads
    tabular_attention_dim=32,  # attention dimension
    tabular_attention_dropout=0.1,  # attention dropout rate
    tabular_attention_embedding_dim=16,  # embedding dimension

    # Other parameters
    overwrite_stats=True,  # force stats generation; recommended to set to True
)

# Build the preprocessor
result = ppr.build_preprocessor()
```

If you then want to plot the model, use the neural network for predictions, or simply retrieve the feature statistics, use the following:

```python
# Plot the model architecture
ppr.plot_model("complex_model.png")

# Get predictions with an example test batch from the sample data
test_batch = tf.data.Dataset.from_tensor_slices(dict(df.head(3))).batch(3)
predictions = result["model"].predict(test_batch)
print("Output shape:", predictions.shape)

# Print feature statistics
print("\nFeature Statistics:")
for feature_type, features in ppr.get_feature_statistics().items():
    if isinstance(features, dict):
        print(f"\n{feature_type}:")
        for feature_name, stats in features.items():
            print(f"  {feature_name}: {list(stats.keys())}")
```

Here is the plot of the model:

![Complex Model](imgs/complex_model.png)
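
The example stops once the preprocessor is built. As a follow-up, here is a minimal sketch, not part of this commit, of how the built model could feed a downstream Keras head; it assumes only that `result["model"]` is a regular `tf.keras.Model` with a single concatenated output (as its use with `.predict()` above suggests), and the training target is made up purely for illustration.

```python
# Sketch only (assumptions noted above): stack a small prediction head on top of
# the preprocessing model and train end to end on the sample data.
import tensorflow as tf

preprocessor = result["model"]  # built by ppr.build_preprocessor() above

head = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(1),  # hypothetical single regression output
])

full_model = tf.keras.Model(
    inputs=preprocessor.inputs,
    outputs=head(preprocessor.output),  # assumes a single CONCAT output tensor
)
full_model.compile(optimizer="adam", loss="mse")

# Made-up target for illustration only: total sale value per row.
targets = df["price"] * df["quantity"]
train_ds = tf.data.Dataset.from_tensor_slices((dict(df), targets)).batch(16)
full_model.fit(train_ds, epochs=1)
```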

docs/features.md

Lines changed: 6 additions & 4 deletions
@@ -232,14 +232,16 @@ You can even process string encoded date features (format: 'YYYY-MM-DD' or 'YYYY
 
 features_specs = {
     "feat1": DateFeature(
-        name="feat2",
-        feature_type=FeatureType.FLOAT,
+        name="feat1",
+        feature_type=FeatureType.DATE,
     ),
-    "feat2": TextFeature(
+    "feat2": DateFeature(
         name="feat2",
         feature_type=FeatureType.DATE,
+        date_format="%Y-%m-%d",  # date format of the input data
+        output_format="year",  # output format of the feature
         # additional option to add season layer:
-        add_season=True,  # adds one-hot season indicator (summer, winter, etc) defaults to False
+        add_season=True,  # adds one-hot season indicator (summer, winter, autumn or spring) defaults to False
     ),
     ...
 }
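
Pieced together, a minimal sketch of the spec this hunk leaves behind (for reference only; the import assumed here mirrors the one used in docs/complex_example.md above):

```python
from kdp.features import DateFeature, FeatureType

features_specs = {
    "feat1": DateFeature(
        name="feat1",
        feature_type=FeatureType.DATE,
    ),
    "feat2": DateFeature(
        name="feat2",
        feature_type=FeatureType.DATE,
        date_format="%Y-%m-%d",  # date format of the input data
        output_format="year",    # output format of the feature
        # additional option to add season layer:
        add_season=True,  # adds one-hot season indicator (summer, winter, autumn or spring); defaults to False
    ),
    # ...
}
```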
Three binary image files (209 KB, 214 KB, 200 KB)

docs/imgs/complex_model.png

275 KB

docs/tabular_attention.md

Lines changed: 22 additions & 0 deletions
@@ -34,6 +34,26 @@ model = PreprocessingModel(
 )
 ```
 
+![Standard TabularAttention](imgs/attention_example_standard.png)
+
+### Categorical Tabular Attention
+
+```python
+from kdp.processor import PreprocessingModel, TabularAttentionPlacementOptions
+
+model = PreprocessingModel(
+    # ... other parameters ...
+    tabular_attention=True,
+    tabular_attention_heads=4,
+    tabular_attention_dim=64,
+    tabular_attention_dropout=0.1,
+    tabular_attention_embedding_dim=32,  # Dimension for categorical embeddings
+    tabular_attention_placement=TabularAttentionPlacementOptions.CATEGORICAL.value,
+)
+```
+
+![Categorical TabularAttention](imgs/attention_example_categorical.png)
+
 ### Multi-Resolution TabularAttention
 
 ```python
@@ -50,6 +70,8 @@ model = PreprocessingModel(
 )
 ```
 
+![Multi-Resolution TabularAttention](imgs/attention_example_multi_resolution.png)
+
 ## Configuration Options
 
 ### Common Options

kdp/processor.py

Lines changed: 40 additions & 40 deletions
@@ -718,7 +718,7 @@ def _add_pipeline_text(self, feature_name: str, input_layer, stats: dict) -> Non
         Args:
             feature_name (str): The name of the feature to be preprocessed.
             input_layer: The input layer for the feature.
-            stats (dict): A dictionary containing the metadata of the feature, including
+            stats (dict): A dictionary containing the metadata of the feature.
         """
         # getting feature object
         _feature = self.features_specs[feature_name]
@@ -928,45 +928,6 @@ def _prepare_outputs(self) -> None:
             else:
                 raise ValueError("No features available for concatenation")
 
-            # Add transformer blocks if specified
-            if self.transfo_nr_blocks:
-                if self.transfo_placement == TransformerBlockPlacementOptions.CATEGORICAL and concat_cat is not None:
-                    logger.info(f"Adding transformer blocks to categorical features: #{self.transfo_nr_blocks}")
-                    transformed = concat_cat
-                    for block_idx in range(self.transfo_nr_blocks):
-                        transformed = PreprocessorLayerFactory.transformer_block_layer(
-                            dim_model=transformed.shape[-1],
-                            num_heads=self.transfo_nr_heads,
-                            ff_units=self.transfo_ff_units,
-                            dropout_rate=self.transfo_dropout_rate,
-                            name=f"transformer_block_{block_idx}_{self.transfo_nr_heads}heads",
-                        )(transformed)
-                    # Reshape transformer output to remove the extra dimension
-                    transformed = tf.keras.layers.Reshape(
-                        target_shape=(-1,),  # Flatten to match numeric shape
-                        name="reshape_transformer_output",
-                    )(transformed)
-
-                    # Recombine with numeric features if they exist
-                    if concat_num is not None:
-                        self.concat_all = tf.keras.layers.Concatenate(
-                            name="ConcatenateTransformed",
-                            axis=-1,
-                        )([concat_num, transformed])
-                    else:
-                        self.concat_all = transformed
-
-                elif self.transfo_placement == TransformerBlockPlacementOptions.ALL_FEATURES:
-                    logger.info(f"Adding transformer blocks to all features: #{self.transfo_nr_blocks}")
-                    for block_idx in range(self.transfo_nr_blocks):
-                        self.concat_all = PreprocessorLayerFactory.transformer_block_layer(
-                            dim_model=self.concat_all.shape[-1],
-                            num_heads=self.transfo_nr_heads,
-                            ff_units=self.transfo_ff_units,
-                            dropout_rate=self.transfo_dropout_rate,
-                            name=f"transformer_block_{block_idx}_{self.transfo_nr_heads}heads",
-                        )(self.concat_all)
-
             # Add tabular attention if specified
             if self.tabular_attention:
                 if self.tabular_attention_placement == TabularAttentionPlacementOptions.MULTI_RESOLUTION:
@@ -1095,6 +1056,45 @@ def _prepare_outputs(self) -> None:
             else:
                 self.concat_all = concat_cat
 
+            # Add transformer blocks if specified
+            if self.transfo_nr_blocks:
+                if self.transfo_placement == TransformerBlockPlacementOptions.CATEGORICAL and concat_cat is not None:
+                    logger.info(f"Adding transformer blocks to categorical features: #{self.transfo_nr_blocks}")
+                    transformed = concat_cat
+                    for block_idx in range(self.transfo_nr_blocks):
+                        transformed = PreprocessorLayerFactory.transformer_block_layer(
+                            dim_model=transformed.shape[-1],
+                            num_heads=self.transfo_nr_heads,
+                            ff_units=self.transfo_ff_units,
+                            dropout_rate=self.transfo_dropout_rate,
+                            name=f"transformer_block_{block_idx}_{self.transfo_nr_heads}heads",
+                        )(transformed)
+                    # Reshape transformer output to remove the extra dimension
+                    transformed = tf.keras.layers.Reshape(
+                        target_shape=(-1,),  # Flatten to match numeric shape
+                        name="reshape_transformer_output",
+                    )(transformed)
+
+                    # Recombine with numeric features if they exist
+                    if concat_num is not None:
+                        self.concat_all = tf.keras.layers.Concatenate(
+                            name="ConcatenateTransformed",
+                            axis=-1,
+                        )([concat_num, transformed])
+                    else:
+                        self.concat_all = transformed
+
+                elif self.transfo_placement == TransformerBlockPlacementOptions.ALL_FEATURES:
+                    logger.info(f"Adding transformer blocks to all features: #{self.transfo_nr_blocks}")
+                    for block_idx in range(self.transfo_nr_blocks):
+                        self.concat_all = PreprocessorLayerFactory.transformer_block_layer(
+                            dim_model=self.concat_all.shape[-1],
+                            num_heads=self.transfo_nr_heads,
+                            ff_units=self.transfo_ff_units,
+                            dropout_rate=self.transfo_dropout_rate,
+                            name=f"transformer_block_{block_idx}_{self.transfo_nr_heads}heads",
+                        )(self.concat_all)
+
             logger.info("Concatenating outputs mode enabled")
         else:
             # Dictionary mode
