47 changes: 47 additions & 0 deletions feathr_project/feathr/client.py
@@ -4,6 +4,7 @@
import tempfile
from typing import Dict, List, Union
from feathr.definition.feature import FeatureBase
import copy

import redis
from azure.identity import DefaultAzureCredential
@@ -568,13 +569,59 @@ def monitor_features(self, settings: MonitoringSettings, execution_configuration
"""
self.materialize_features(settings, execution_configurations, verbose)

# Get the feature keys given the name of a feature.
# Searches both 'derived_feature_list' and 'anchor_list'.
# Returns the related keys (a set of key_column names), or None if the feature cannot be found.
def _get_feature_key(self, feature_name: str):
    features = []
    if hasattr(self, 'derived_feature_list'):
        features += self.derived_feature_list
    if hasattr(self, 'anchor_list'):
        for anchor in self.anchor_list:
            features += anchor.features
    for feature in features:
        if feature.name == feature_name:
            keys = feature.key
            return set(key.key_column for key in keys)
    self.logger.warning(f"Invalid feature name: {feature_name}. Please call FeathrClient.build_features() first so the features can be materialized.")
    return None

# Validate feature keys:
# Features within the same aggregation, or features planned to be merged, must share the same keys.
# The param "allow_empty_key" indicates whether empty keys are acceptable.
def _valid_materialize_keys(self, features: List[str], allow_empty_key=False):
    keys = None
    for feature in features:
        new_keys = self._get_feature_key(feature)
        if new_keys is None:
            self.logger.error(f"Key of feature {feature} cannot be resolved. If this feature is not from INPUT_CONTEXT, double-check the feature definition to confirm whether its key is empty.")
            return False
        # If the only key is "NOT_NEEDED", the feature has an empty key.
        if ','.join(new_keys) == "NOT_NEEDED" and not allow_empty_key:
            self.logger.error(f"Empty feature key is not allowed for features: {features}")
            return False
        if keys is None:
            keys = copy.deepcopy(new_keys)
        elif keys != new_keys:
            self.logger.error(f"Inconsistent feature keys. Current keys are {str(keys)}")
            return False
    return True

def materialize_features(self, settings: MaterializationSettings, execution_configurations: Union[SparkExecutionConfiguration, Dict[str, str]] = {}, verbose: bool = False):
    """Materialize feature data

    Args:
        settings: Feature materialization settings
        execution_configurations: a dict passed to the Spark job when it starts up, i.e. the "spark configurations". Note that not all configurations will be honored, since some are managed by the Spark platform such as Databricks or Azure Synapse. Refer to the [spark documentation](https://spark.apache.org/docs/latest/configuration.html) for a complete list of Spark configurations.
    """
    feature_list = settings.feature_names
    if len(feature_list) > 0 and not self._valid_materialize_keys(feature_list):
        raise RuntimeError(f"Invalid materialization features: {feature_list}, since they have different keys. Currently Feathr only supports materializing features that share the same keys.")

    # Collect secrets from sinks
    secrets = []
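For readers skimming the diff: the rule `_valid_materialize_keys` enforces is simply that every requested feature resolves to the same set of key columns. A minimal, self-contained sketch of that rule (the feature-to-key mapping below is hypothetical, standing in for what `_get_feature_key` returns):

    # Sketch of the key-consistency rule; the mapping is illustrative only.
    feature_keys = {
        "f_location_avg_fare": {"DOLocationID"},
        "f_location_max_fare": {"DOLocationID"},
        "f_trip_distance": {"trip_id"},
    }

    def keys_consistent(features):
        key_sets = [feature_keys[f] for f in features]
        return all(ks == key_sets[0] for ks in key_sets)

    assert keys_consistent(["f_location_avg_fare", "f_location_max_fare"])  # same key: accepted
    assert not keys_consistent(["f_location_avg_fare", "f_trip_distance"])  # mixed keys: rejected
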
5 changes: 4 additions & 1 deletion feathr_project/feathr/definition/materialization_settings.py
@@ -1,6 +1,6 @@
from datetime import datetime, timedelta
from typing import List, Optional
from feathr.definition.sink import Sink
from feathr.definition.sink import HdfsSink, RedisSink, Sink
import math


@@ -31,6 +31,9 @@ def __init__(self, name: str, sinks: List[Sink], feature_names: List[str], backf
    self.name = name
    now = datetime.now()
    self.backfill_time = backfill_time if backfill_time else BackfillTime(start=now, end=now, step=timedelta(days=1))
    for sink in sinks:
        if isinstance(sink, RedisSink):
            sink.aggregation_features = feature_names
    self.sinks = sinks
    self.feature_names = feature_names

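A hedged usage sketch of the propagation added above, assuming the usual top-level feathr exports: constructing `MaterializationSettings` with a `RedisSink` should now copy `feature_names` onto the sink's `aggregation_features` (table and feature names are illustrative, borrowed from the tests below):

    from feathr import MaterializationSettings, RedisSink

    sink = RedisSink(table_name="nycTaxiDemoFeature")
    settings = MaterializationSettings(name="nycTaxiTable",
                                       sinks=[sink],
                                       feature_names=["f_location_avg_fare", "f_location_max_fare"])
    # The constructor tags Redis sinks with the features being materialized:
    assert sink.aggregation_features == ["f_location_avg_fare", "f_location_max_fare"]
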
6 changes: 6 additions & 0 deletions feathr_project/feathr/definition/sink.py
@@ -82,6 +82,9 @@ def to_feature_config(self) -> str:
            {% if source.streamingTimeoutMs %}
            timeoutMs: {{source.streamingTimeoutMs}}
            {% endif %}
            {% if source.aggregation_features %}
            features: [{{','.join(source.aggregation_features)}}]
            {% endif %}
        }
    }
    """)
@@ -131,6 +134,9 @@ def to_feature_config(self) -> str:
        name: HDFS
        params: {
            path: "{{sink.output_path}}"
            {% if sink.aggregation_features %}
            features: [{{','.join(sink.aggregation_features)}}]
            {% endif %}
        }
    }
    """)
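To see what the new template branch produces, here is a small sketch that renders just the HDFS `params` block with Jinja2; the sink object is a stand-in (`SimpleNamespace`), not the real `HdfsSink`, and the output path is made up:

    from types import SimpleNamespace
    from jinja2 import Template

    # Reduced version of the params block from the diff above.
    params_tm = Template("""path: "{{sink.output_path}}"
    {% if sink.aggregation_features %}features: [{{','.join(sink.aggregation_features)}}]
    {% endif %}""")
    sink = SimpleNamespace(output_path="abfss://container/out",
                           aggregation_features=["f_location_avg_fare", "f_location_max_fare"])
    print(params_tm.render(sink=sink))
    # path: "abfss://container/out"
    # features: [f_location_avg_fare,f_location_max_fare]
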
1 change: 1 addition & 0 deletions feathr_project/test/test_feature_materialization.py
@@ -38,6 +38,7 @@ def test_feature_materialization_config():
        name: REDIS
        params: {
            table_name: "nycTaxiDemoFeature"
            features: [f_location_avg_fare,f_location_max_fare]
        }
    }
]
1 change: 1 addition & 0 deletions feathr_project/test/test_fixture.py
@@ -69,6 +69,7 @@ def basic_test_setup(config_path: str):
            transform=WindowAggTransformation(agg_expr="cast_float(fare_amount)",
                                              agg_func="AVG",
                                              window="90d",
                                              filter="fare_amount > 0"
                                              )),
    Feature(name="f_location_max_fare",
            key=location_id,
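The fixture change above attaches a row-level `filter` to the window aggregation. A hedged, self-contained sketch of the feature definition it corresponds to, assuming the `filter` parameter shown in the fixture (the key, types, and column names are assumptions modeled on the NYC taxi sample):

    from feathr import Feature, TypedKey, ValueType, FLOAT, WindowAggTransformation

    # Hypothetical key, standing in for the fixture's location_id.
    location_id = TypedKey(key_column="DOLocationID",
                           key_column_type=ValueType.INT32,
                           description="location id in NYC",
                           full_name="nyc_taxi.location_id")

    f_location_avg_fare = Feature(name="f_location_avg_fare",
                                  key=location_id,
                                  feature_type=FLOAT,
                                  transform=WindowAggTransformation(
                                      agg_expr="cast_float(fare_amount)",
                                      agg_func="AVG",
                                      window="90d",
                                      filter="fare_amount > 0"))  # drop non-positive fares before aggregating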