@@ -49,24 +49,14 @@ features = {
4949 name = " occupation" ,
5050 feature_type = FeatureType.STRING_CATEGORICAL ,
5151 category_encoding = CategoryEncodingOptions.EMBEDDING , # Use embeddings
52- embedding_dim = 16 , # Custom embedding dimension
53- vocabulary_size = 1000 # Limit vocabulary size
54- ),
55-
56- # High-cardinality feature with hashing
57- " product_id" : CategoricalFeature(
58- name = " product_id" ,
59- feature_type = FeatureType.STRING_CATEGORICAL ,
60- category_encoding = CategoryEncodingOptions.HASHING , # Use hashing for high cardinality
61- num_hash_bins = 10000 , # Number of hash buckets
62- embedding_dim = 32 # Embedding dimension after hashing
52+ embedding_size = 16 # Custom embedding size
6353 ),
6454
6555 # One-hot encoding for low-cardinality feature
6656 " day_of_week" : CategoricalFeature(
6757 name = " day_of_week" ,
6858 feature_type = FeatureType.STRING_CATEGORICAL ,
69- category_encoding = CategoryEncodingOptions.ONE_HOT , # One-hot encoding
59+ category_encoding = CategoryEncodingOptions.ONE_HOT_ENCODING , # One-hot encoding
7060 vocabulary = [" Mon" , " Tue" , " Wed" , " Thu" , " Fri" , " Sat" , " Sun" ] # Pre-defined vocabulary
7161 )
7262}
@@ -77,16 +67,13 @@ features = {
7767| Parameter | Description | Default | Suggested Range |
7868| -----------| -------------| ---------| ----------------|
7969| ` feature_type ` | Base feature type | Based on data | ` STRING_CATEGORICAL ` , ` INTEGER_CATEGORICAL ` |
80- | ` category_encoding ` | Encoding method | ` EMBEDDING ` | ` EMBEDDING ` , ` ONE_HOT ` , ` HASHING ` |
81- | ` embedding_dim ` | Dimensionality of embedding | Auto-scaled | 8-128 |
82- | ` vocabulary_size ` | Maximum vocabulary size | 10,000 | 100-1,000,000 |
70+ | ` category_encoding ` | Encoding method | ` EMBEDDING ` | ` EMBEDDING ` , ` ONE_HOT_ENCODING ` |
71+ | ` embedding_size ` | Dimensionality of embedding | Auto-scaled | 8-128 |
8372| ` vocabulary ` | Pre-defined vocabulary | ` None ` | List of categories |
84- | ` num_hash_bins ` | Number of hash buckets | 10,000 | 1,000-100,000 |
85- | ` hash_key ` | Hash seed for deterministic hashing | ` None ` | Integer seed |
8673
8774## 🔥 Power Features
8875
89- ### Automatic Vocabulary Sizing
76+ ### Automatic Embedding Sizing
9077
9178KDP automatically determines optimal embedding sizes based on cardinality:
9279
@@ -100,41 +87,22 @@ preprocessor = PreprocessingModel(
10087
10188### Handling High-Cardinality Features
10289
103- When dealing with millions of categories:
90+ When dealing with many categories, KDP automatically adjusts the embedding size:
10491
10592``` python
106- # For features with huge numbers of categories
93+ # For features with many categories
10794preprocessor = PreprocessingModel(
10895 features_specs = {
10996 " user_id" : CategoricalFeature(
11097 name = " user_id" ,
11198 feature_type = FeatureType.STRING_CATEGORICAL ,
112- category_encoding = CategoryEncodingOptions.HASHING ,
113- num_hash_bins = 100000 , # Large number of buckets
114- embedding_dim = 64 # Rich representation
99+ category_encoding = CategoryEncodingOptions.EMBEDDING ,
100+ # embedding size will be automatically determined based on cardinality
115101 )
116102 }
117103)
118104```
119105
120- ### Cross-Category Features
121-
122- Capture interactions between categorical features:
123-
124- ``` python
125- # Create interactions between categories
126- preprocessor = PreprocessingModel(
127- features_specs = {
128- " product_category" : FeatureType.STRING_CATEGORICAL ,
129- " user_country" : FeatureType.STRING_CATEGORICAL
130- },
131- # Define cross features
132- feature_crosses = [
133- (" product_category" , " user_country" , 32 ) # Names and embedding dimension
134- ]
135- )
136- ```
137-
138106## 📊 Model Architecture
139107
140108Below are visualizations of categorical feature processing in KDP:
@@ -156,18 +124,16 @@ For more control, you can use the `CategoricalFeature` class:
156124## 💡 Pro Tips
157125
1581261 . ** Choose the Right Encoding**
159- - Use ` ONE_HOT ` for very low cardinality (< 10 categories)
160- - Use ` EMBEDDING ` for medium cardinality (10-10,000 categories)
161- - Use ` HASHING ` for high cardinality (> 10,000 categories)
127+ - Use ` ONE_HOT_ENCODING ` for very low cardinality (< 10 categories)
128+ - Use ` EMBEDDING ` for medium to high cardinality (≥ 10 categories)
162129
1631302 . ** Embedding Dimension Rules of Thumb**
164- - A good starting point: ` embedding_dim = 1.6 * num_categories^0.56 `
165- - For very important features, increase this by 50%
166- - Cap around 512 dimensions even for extremely high cardinality
131+ - KDP automatically calculates the optimal embedding size using the rule: ` min(500, 1.6 * num_categories^0.56) `
132+ - For very important features, you can override this with a custom ` embedding_size `
167133
1681343 . ** Vocabulary Management**
169- - Limit vocabulary size for memory efficiency
170- - Consider the "minimum_frequency" parameter to drop rare categories
135+ - For low-cardinality features, consider providing a pre-defined vocabulary
136+ - This ensures consistent encoding across different datasets
171137
1721384 . ** Cross Features for Interactions**
173139 - Use cross features when combinations have special meaning
0 commit comments