Commit 3540a70

fix(KDP): implementing real passthrough (skipping all processing but adding inputs and outputs)
1 parent cc3d533 commit 3540a70

File tree: 1 file changed, +154 −40 lines

kdp/processor.py

Lines changed: 154 additions & 40 deletions
@@ -314,6 +314,7 @@ def __init__(
         feature_moe_dropout: float = 0.1,
         feature_moe_freeze_experts: bool = False,
         feature_moe_use_residual: bool = True,
+        include_passthrough_in_output: bool = True,
     ) -> None:
         """Initialize a preprocessing model.
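
For context, a minimal usage sketch of the new constructor flag. This is illustrative only: the feature names are made up, and the exact PreprocessingModel/FeatureType signatures are assumed from the rest of the KDP API rather than taken from this commit.

# Hypothetical sketch -- feature names and some argument names are assumptions.
from kdp import PreprocessingModel, FeatureType

preprocessor = PreprocessingModel(
    path_data="data.csv",
    features_specs={
        "amount": FeatureType.FLOAT_NORMALIZED,   # processed normally
        "user_id": FeatureType.PASSTHROUGH,       # carried through untouched
    },
    include_passthrough_in_output=False,  # new flag: keep user_id out of the processed output
)
result = preprocessor.build_preprocessor()
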
@@ -419,6 +420,9 @@ def __init__(
         self.feature_moe_freeze_experts = feature_moe_freeze_experts
         self.feature_moe_use_residual = feature_moe_use_residual
 
+        # Passthrough features control
+        self.include_passthrough_in_output = include_passthrough_in_output
+
         # Initialize feature type lists
         self.numeric_features = []
         self.categorical_features = []
@@ -433,6 +437,7 @@ def __init__(
         self.signature = {}
         self.outputs = {}  # Final outputs for DICT mode
         self.processed_features = {}  # All processed features before final output
+        self.passthrough_outputs = {}  # Passthrough features (unprocessed)
         self.concat_all = None  # Final concatenated output for CONCAT mode
         self._preprocessed_cache = {} if use_caching else None
 
@@ -1400,7 +1405,11 @@ def _add_pipeline_date(self, feature_name: str, input_layer) -> None:
 
     @_monitor_performance
     def _add_pipeline_passthrough(self, feature_name: str, input_layer) -> None:
-        """Add a passthrough feature to the pipeline without preprocessing.
+        """Add a passthrough feature to the pipeline.
+
+        Depending on include_passthrough_in_output setting:
+        - If True: Process minimally and include in main output (legacy behavior)
+        - If False: Store unprocessed for separate access (recommended for IDs/metadata)
 
         Args:
             feature_name (str): The name of the feature to be passed through.
@@ -1409,6 +1418,17 @@ def _add_pipeline_passthrough(self, feature_name: str, input_layer) -> None:
         # getting feature object
         _feature = self.features_specs[feature_name]
 
+        if self.include_passthrough_in_output:
+            # Legacy behavior: minimal processing and include in main output
+            self._process_passthrough_for_output(feature_name, input_layer, _feature)
+        else:
+            # New behavior: store unprocessed for separate access
+            self._store_passthrough_unprocessed(feature_name, input_layer, _feature)
+
+    def _process_passthrough_for_output(
+        self, feature_name: str, input_layer, _feature
+    ) -> None:
+        """Process passthrough feature minimally for inclusion in main output."""
         # initializing preprocessor
         preprocessor = FeaturePreprocessor(name=feature_name)
 
@@ -1452,6 +1472,17 @@ def _add_pipeline_passthrough(self, feature_name: str, input_layer) -> None:
 
         self.processed_features[feature_name] = _output_pipeline
 
+    def _store_passthrough_unprocessed(
+        self, feature_name: str, input_layer, _feature
+    ) -> None:
+        """Store passthrough feature unprocessed for separate access."""
+        logger.info(
+            f"Storing passthrough feature '{feature_name}' unprocessed for separate access"
+        )
+        # Store the raw input layer for this passthrough feature
+        # This will be available in the model outputs but not processed by KDP
+        self.passthrough_outputs[feature_name] = input_layer
+
     @_monitor_performance
     def _add_pipeline_time_series(
         self, feature_name: str, input_layer, feature
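
The stored value here is just the raw Keras input tensor, so the feature reaches the model outputs without any layers in between. A standalone sketch of that pattern (plain tf.keras, not KDP code; feature names are made up):

import tensorflow as tf

# "amount" gets a trivial transformation; "user_id" is exposed as-is.
amount = tf.keras.Input(shape=(1,), dtype=tf.float32, name="amount")
user_id = tf.keras.Input(shape=(1,), dtype=tf.string, name="user_id")
scaled = tf.keras.layers.Rescaling(0.01, name="scale_amount")(amount)

model = tf.keras.Model(
    inputs={"amount": amount, "user_id": user_id},
    outputs={"processed": scaled, "user_id": user_id},
)
# user_id flows from input to output untouched, exactly like a stored passthrough tensor.
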
@@ -1650,6 +1681,37 @@ def _prepare_concat_mode_outputs(self) -> None:
 
         logger.info("Concatenating outputs mode enabled")
 
+    def _combine_all_features(
+        self, concat_num: Optional[tf.Tensor], concat_cat: Optional[tf.Tensor]
+    ) -> None:
+        """Combine numeric and categorical features.
+
+        Args:
+            concat_num: Concatenated numeric features tensor
+            concat_cat: Concatenated categorical features tensor
+
+        Raises:
+            ValueError: If no features are available for concatenation
+        """
+        if concat_num is not None and concat_cat is not None:
+            self.concat_all = tf.keras.layers.Concatenate(
+                name="ConcatenateAll",
+                axis=-1,
+            )([concat_num, concat_cat])
+        elif concat_num is not None:
+            self.concat_all = concat_num
+        elif concat_cat is not None:
+            self.concat_all = concat_cat
+        else:
+            # Check if we have passthrough features that are stored separately
+            if self.passthrough_outputs and not self.include_passthrough_in_output:
+                logger.info(
+                    "No processed features to concatenate - only passthrough features exist"
+                )
+                self.concat_all = None  # Will be handled in model building
+            else:
+                raise ValueError("No features available for concatenation")
+
     def _group_features_by_type(self) -> Tuple[List, List]:
         """Group processed features by type for concatenation.
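
For illustration, the concatenation step in isolation with toy shapes (not KDP code): two feature blocks are joined along the last axis.

import tensorflow as tf

concat_num = tf.keras.Input(shape=(3,), name="numeric_block")      # e.g. 3 scaled numeric features
concat_cat = tf.keras.Input(shape=(8,), name="categorical_block")  # e.g. 8-dim embedded categoricals

concat_all = tf.keras.layers.Concatenate(name="ConcatenateAll", axis=-1)(
    [concat_num, concat_cat]
)
print(concat_all.shape)  # (None, 11): both blocks side by side
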
@@ -1659,7 +1721,8 @@ def _group_features_by_type(self) -> Tuple[List, List]:
         # Initialize lists for features of different types
         numeric_features = []
         categorical_features = []
-        passthrough_features = []
+        passthrough_features_numeric = []
+        passthrough_features_string = []
 
         # Group processed features by type
         for feature_name, feature in self.processed_features.items():
@@ -1688,14 +1751,36 @@ def _group_features_by_type(self) -> Tuple[List, List]:
                 logger.debug(f"Adding {feature_name} to categorical features")
                 categorical_features.append(feature)
             elif feature_name in self.passthrough_features:
-                logger.debug(f"Adding {feature_name} to passthrough features")
-                passthrough_features.append(feature)
+                # Only include passthrough features in concatenation if they're meant to be in output
+                # When include_passthrough_in_output=False, they should be stored separately
+                if self.include_passthrough_in_output:
+                    # Separate passthrough features by dtype to avoid concatenation issues
+                    feature_dtype = getattr(feature_spec, "dtype", tf.float32)
+                    if feature_dtype == tf.string:
+                        logger.debug(
+                            f"Adding {feature_name} to string passthrough features"
+                        )
+                        passthrough_features_string.append(feature)
+                    else:
+                        logger.debug(
+                            f"Adding {feature_name} to numeric passthrough features"
+                        )
+                        passthrough_features_numeric.append(feature)
+                else:
+                    logger.debug(
+                        f"Skipping {feature_name} from concatenation (stored separately)"
+                    )
             else:
                 logger.warning(f"Unknown feature type for {feature_name}")
 
-        # For concatenation purposes, add passthrough features to numeric features
-        if passthrough_features:
-            numeric_features.extend(passthrough_features)
+        # Add numeric passthrough features to numeric features (only if include_passthrough_in_output=True)
+        if passthrough_features_numeric:
+            numeric_features.extend(passthrough_features_numeric)
+
+        # Add string passthrough features to categorical features (only if include_passthrough_in_output=True)
+        # (since categorical features are typically strings and handled separately)
+        if passthrough_features_string:
+            categorical_features.extend(passthrough_features_string)
 
         return numeric_features, categorical_features
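
The dtype split above exists because concatenation requires matching dtypes; a tf.string passthrough cannot be joined with float features. A tiny illustration (toy tensors, not KDP code):

import tensorflow as tf

floats = tf.constant([[1.0, 2.0]])    # tf.float32
strings = tf.constant([["id_42"]])    # tf.string

ok = tf.concat([floats, tf.constant([[3.0]])], axis=-1)  # works: dtypes match
# tf.concat([floats, strings], axis=-1)  # fails: float32 cannot be concatenated with string
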

@@ -1753,30 +1838,6 @@ def _concatenate_categorical_features(
 
         return concat_cat
 
-    def _combine_all_features(
-        self, concat_num: Optional[tf.Tensor], concat_cat: Optional[tf.Tensor]
-    ) -> None:
-        """Combine numeric and categorical features.
-
-        Args:
-            concat_num: Concatenated numeric features tensor
-            concat_cat: Concatenated categorical features tensor
-
-        Raises:
-            ValueError: If no features are available for concatenation
-        """
-        if concat_num is not None and concat_cat is not None:
-            self.concat_all = tf.keras.layers.Concatenate(
-                name="ConcatenateAll",
-                axis=-1,
-            )([concat_num, concat_cat])
-        elif concat_num is not None:
-            self.concat_all = concat_num
-        elif concat_cat is not None:
-            self.concat_all = concat_cat
-        else:
-            raise ValueError("No features available for concatenation")
-
     def _apply_multi_resolution_attention(
         self, concat_num: tf.Tensor, concat_cat: tf.Tensor
     ) -> None:
@@ -2388,24 +2449,77 @@ def build_preprocessor(self) -> dict:
         # Build the model based on output mode
         logger.info("Building preprocessor Model")
         if self.output_mode == OutputModeOptions.CONCAT.value:
-            if self.concat_all is None:
+            # Handle case where only passthrough features exist
+            if (
+                self.concat_all is None
+                and self.passthrough_outputs
+                and not self.include_passthrough_in_output
+            ):
+                logger.info(
+                    "Only passthrough features detected - creating passthrough-only model"
+                )
+                self.model = tf.keras.Model(
+                    inputs=self.inputs,
+                    outputs=self.passthrough_outputs,
+                    name="preprocessor",
+                )
+                _output_dims = "passthrough_only"
+            elif self.concat_all is None:
                 raise ValueError(
                     "No features were concatenated. Check if features were properly processed."
                 )
-            self.model = tf.keras.Model(
-                inputs=self.inputs,
-                outputs=self.concat_all,  # Use concat_all for CONCAT mode
-                name="preprocessor",
-            )
-            _output_dims = self.model.output_shape[1]
+            else:
+                # Determine outputs based on passthrough settings
+                if (
+                    self.passthrough_outputs
+                    and not self.include_passthrough_in_output
+                ):
+                    # Include both processed (concat) and passthrough outputs
+                    model_outputs = {
+                        "processed": self.concat_all,
+                        "passthrough": self.passthrough_outputs,
+                    }
+                    logger.info(
+                        f"Creating model with separate passthrough outputs: {list(self.passthrough_outputs.keys())}"
+                    )
+                else:
+                    # Standard concat output
+                    model_outputs = self.concat_all
+
+                self.model = tf.keras.Model(
+                    inputs=self.inputs,
+                    outputs=model_outputs,
+                    name="preprocessor",
+                )
+                _output_dims = (
+                    self.model.output_shape[1]
+                    if isinstance(model_outputs, tf.Tensor)
+                    else "mixed"
+                )
+
         else:  # DICT mode
-            if not self.outputs:
+            if not self.outputs and not self.passthrough_outputs:
                 raise ValueError(
                     "No outputs were created. Check if features were properly processed."
                 )
+
+            # Include passthrough outputs in dict mode if they exist
+            final_outputs = self.outputs.copy() if self.outputs else {}
+            if self.passthrough_outputs and not self.include_passthrough_in_output:
+                final_outputs.update(self.passthrough_outputs)
+                logger.info(
+                    f"Adding passthrough outputs to dict mode: {list(self.passthrough_outputs.keys())}"
+                )
+            elif not final_outputs:
+                # Only passthrough features exist
+                final_outputs = self.passthrough_outputs
+                logger.info(
+                    "Only passthrough features detected - creating passthrough-only dict model"
+                )
+
             self.model = tf.keras.Model(
                 inputs=self.inputs,
-                outputs=self.outputs,  # Use outputs dict for DICT mode
+                outputs=final_outputs,
                 name="preprocessor",
            )
             _output_dims = self.model.output_shape
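
Taken together, a caller would see outputs shaped roughly as below. This continues the constructor sketch near the top and assumes CONCAT mode, include_passthrough_in_output=False, and at least one processed feature; the "processed"/"passthrough" key names follow the code in this hunk, but the call pattern itself is an assumption, not taken from the commit.

import tensorflow as tf

# Hypothetical call on the model built by the earlier sketch.
outputs = preprocessor.model(
    {"amount": tf.constant([[12.5]]), "user_id": tf.constant([["u_01"]])}
)
processed = outputs["processed"]      # concatenated, fully preprocessed features
passthrough = outputs["passthrough"]  # untouched passthrough tensors, e.g. {"user_id": ...}
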

Comments (0)