From 8a924def903927b6a6da5b7cd3f0af2d5798b1da Mon Sep 17 00:00:00 2001 From: Enya-Yx Date: Wed, 14 Sep 2022 16:26:51 +0800 Subject: [PATCH 1/9] Apply 'aggregation_features' parameter to merge dataframes --- feathr_project/feathr/definition/sink.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/feathr_project/feathr/definition/sink.py b/feathr_project/feathr/definition/sink.py index c287ba714..ae70ee453 100644 --- a/feathr_project/feathr/definition/sink.py +++ b/feathr_project/feathr/definition/sink.py @@ -64,10 +64,11 @@ class RedisSink(Sink): streaming: whether it is used in streaming mode streamingTimeoutMs: maximum running time for streaming mode. It is not used in batch mode. """ - def __init__(self, table_name: str, streaming: bool=False, streamingTimeoutMs: Optional[int]=None) -> None: + def __init__(self, table_name: str, streaming: bool=False, streamingTimeoutMs: Optional[int]=None, aggregation_features: Optional[List[str]]=None) -> None: self.table_name = table_name self.streaming = streaming self.streamingTimeoutMs = streamingTimeoutMs + self.aggregation_features = aggregation_features def to_feature_config(self) -> str: """Produce the config used in feature materialization""" @@ -82,6 +83,9 @@ def to_feature_config(self) -> str: {% if source.streamingTimeoutMs %} timeoutMs: {{source.streamingTimeoutMs}} {% endif %} + {% if source.aggregation_features %} + features: [{{','.join(source.aggregation_features)}}] + {% endif %} } } """) From e0cf66d20f5db2e2f25333b11c00a78b45207968 Mon Sep 17 00:00:00 2001 From: Enya-Yx Date: Wed, 14 Sep 2022 18:03:32 +0800 Subject: [PATCH 2/9] modify test cases --- feathr_project/test/test_feature_materialization.py | 3 ++- feathr_project/test/test_fixture.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/feathr_project/test/test_feature_materialization.py b/feathr_project/test/test_feature_materialization.py index c0545d1cd..54ccc7957 100644 --- 
a/feathr_project/test/test_feature_materialization.py +++ b/feathr_project/test/test_feature_materialization.py @@ -21,7 +21,7 @@ def test_feature_materialization_config(): backfill_time = BackfillTime(start=datetime(2020, 5, 20), end=datetime(2020, 5,20), step=timedelta(days=1)) - redisSink = RedisSink(table_name="nycTaxiDemoFeature") + redisSink = RedisSink(table_name="nycTaxiDemoFeature", aggregation_features=["f_location_avg_fare", "f_location_max_fare"]) settings = MaterializationSettings("nycTaxiTable", sinks=[redisSink], feature_names=["f_location_avg_fare", "f_location_max_fare"], @@ -38,6 +38,7 @@ def test_feature_materialization_config(): name: REDIS params: { table_name: "nycTaxiDemoFeature" + features: [f_location_avg_fare,f_location_max_fare] } } ] diff --git a/feathr_project/test/test_fixture.py b/feathr_project/test/test_fixture.py index af088b65d..3c9b78d38 100644 --- a/feathr_project/test/test_fixture.py +++ b/feathr_project/test/test_fixture.py @@ -69,6 +69,7 @@ def basic_test_setup(config_path: str): transform=WindowAggTransformation(agg_expr="cast_float(fare_amount)", agg_func="AVG", window="90d", + filter="fare_amount > 20" )), Feature(name="f_location_max_fare", key=location_id, From 08d59b47f0eda6691d972198b13dc004bbe7ece4 Mon Sep 17 00:00:00 2001 From: enya-yx Date: Wed, 14 Sep 2022 18:17:00 +0800 Subject: [PATCH 3/9] modify test case filter rule to keep same results as before --- feathr_project/test/test_fixture.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feathr_project/test/test_fixture.py b/feathr_project/test/test_fixture.py index 3c9b78d38..d13a261ef 100644 --- a/feathr_project/test/test_fixture.py +++ b/feathr_project/test/test_fixture.py @@ -69,7 +69,7 @@ def basic_test_setup(config_path: str): transform=WindowAggTransformation(agg_expr="cast_float(fare_amount)", agg_func="AVG", window="90d", - filter="fare_amount > 20" + filter="fare_amount > 0" )), Feature(name="f_location_max_fare", key=location_id, From 
2d7fcddb3ed8efc5ba548345ed8d9fbf80b7a113 Mon Sep 17 00:00:00 2001 From: enya-yx Date: Thu, 15 Sep 2022 17:34:38 +0800 Subject: [PATCH 4/9] add typekey check and improve previous changes --- feathr_project/feathr/client.py | 40 +++++++++++++++++++ .../definition/materialization_settings.py | 7 +++- feathr_project/feathr/definition/sink.py | 3 +- .../test/test_feature_materialization.py | 2 +- 4 files changed, 47 insertions(+), 5 deletions(-) diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index 069417fb4..37d156139 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -2,9 +2,12 @@ import logging import os import tempfile +import copy from datetime import datetime, timedelta from pathlib import Path from typing import Dict, List, Optional, Union +from xmlrpc.client import Boolean, boolean +from feathr.definition.typed_key import TypedKey from numpy import isin from feathr.definition.feature import FeatureBase @@ -610,6 +613,40 @@ def monitor_features(self, settings: MonitoringSettings, execution_configuration """ self.materialize_features(settings, execution_configurations, verbose) + def _get_feature_key(self, feature_name: str): + features = [] + if 'derived_feature_list' in dir(self): + features += self.derived_feature_list + if 'anchor_list' in dir(self): + for anchor in self.anchor_list: + features += anchor.features + for feature in features: + if feature.name == feature_name: + keys = feature.key + return [key.key_column for key in keys] + self.logger.warning(f"Invalid feature name: {feature_name}. 
Please call FeathrClient.build_features() first in order to materialize the features.") + return None + + def _valid_materialize_keys(self, features: List[str], allow_empty_key=False)->boolean: + keys = None + for feature in features: + new_keys = self._get_feature_key(feature) + if new_keys is None: + self.logger.error(f"Failed to get feature key for feature: {feature}") + return False + if len(new_keys) == 1 and new_keys[0] == "NOT_NEEDED" and not allow_empty_key: + self.logger.error(f"Empty feature key is not allowed for features: {features}") + return False + new_keys = sorted(new_keys) + if keys is None: + keys = copy.deepcopy(new_keys) + else: + for key, new_key in zip(keys, new_keys): + if key != new_key: + self.logger.error("Inconsistent feature keys.") + return False + return True + def materialize_features(self, settings: MaterializationSettings, execution_configurations: Union[SparkExecutionConfiguration ,Dict[str,str]] = {}, verbose: bool = False): """Materialize feature data @@ -617,6 +654,9 @@ def materialize_features(self, settings: MaterializationSettings, execution_conf settings: Feature materialization settings execution_configurations: a dict that will be passed to spark job when the job starts up, i.e. the "spark configurations". Note that not all of the configuration will be honored since some of the configurations are managed by the Spark platform, such as Databricks or Azure Synapse. Refer to the [spark documentation](https://spark.apache.org/docs/latest/configuration.html) for a complete list of spark configurations. 
""" + feature_list = settings.feature_names + if len(feature_list) > 0 and not self._valid_materialize_keys(feature_list): + raise RuntimeError(f"Invalid materialization feature keys: {feature_list}") # Collect secrets from sinks secrets = [] diff --git a/feathr_project/feathr/definition/materialization_settings.py b/feathr_project/feathr/definition/materialization_settings.py index 4aa0c5870..35b4aec2b 100644 --- a/feathr_project/feathr/definition/materialization_settings.py +++ b/feathr_project/feathr/definition/materialization_settings.py @@ -1,11 +1,11 @@ from datetime import datetime, timedelta from typing import List, Optional -from feathr.definition.sink import Sink +from feathr.definition.sink import RedisSink, Sink import math class BackfillTime: - """Time range to materialize/backfill feature data. Please refer to https://feathr-ai.github.io/feathr/concepts/materializing-features.html#feature-backfill for a more detailed explanation. + """Time range to materialize/backfill feature data. Please refer to https://linkedin.github.io/feathr/concepts/materializing-features.html#feature-backfill for a more detailed explanation. Attributes: start: start time of the backfill, inclusive. @@ -31,6 +31,9 @@ def __init__(self, name: str, sinks: List[Sink], feature_names: List[str], backf self.name = name now = datetime.now() self.backfill_time = backfill_time if backfill_time else BackfillTime(start=now, end=now, step=timedelta(days=1)) + for sink in sinks: + if isinstance(sink, RedisSink): + sink.aggregation_features = feature_names self.sinks = sinks self.feature_names = feature_names diff --git a/feathr_project/feathr/definition/sink.py b/feathr_project/feathr/definition/sink.py index ae70ee453..1f6bf5ea2 100644 --- a/feathr_project/feathr/definition/sink.py +++ b/feathr_project/feathr/definition/sink.py @@ -64,11 +64,10 @@ class RedisSink(Sink): streaming: whether it is used in streaming mode streamingTimeoutMs: maximum running time for streaming mode. 
It is not used in batch mode. """ - def __init__(self, table_name: str, streaming: bool=False, streamingTimeoutMs: Optional[int]=None, aggregation_features: Optional[List[str]]=None) -> None: + def __init__(self, table_name: str, streaming: bool=False, streamingTimeoutMs: Optional[int]=None) -> None: self.table_name = table_name self.streaming = streaming self.streamingTimeoutMs = streamingTimeoutMs - self.aggregation_features = aggregation_features def to_feature_config(self) -> str: """Produce the config used in feature materialization""" diff --git a/feathr_project/test/test_feature_materialization.py b/feathr_project/test/test_feature_materialization.py index 54ccc7957..62b84d367 100644 --- a/feathr_project/test/test_feature_materialization.py +++ b/feathr_project/test/test_feature_materialization.py @@ -21,7 +21,7 @@ def test_feature_materialization_config(): backfill_time = BackfillTime(start=datetime(2020, 5, 20), end=datetime(2020, 5,20), step=timedelta(days=1)) - redisSink = RedisSink(table_name="nycTaxiDemoFeature", aggregation_features=["f_location_avg_fare", "f_location_max_fare"]) + redisSink = RedisSink(table_name="nycTaxiDemoFeature") settings = MaterializationSettings("nycTaxiTable", sinks=[redisSink], feature_names=["f_location_avg_fare", "f_location_max_fare"], From 51ff0e2d5bbea9c96c28ba1d87eb785944cc9629 Mon Sep 17 00:00:00 2001 From: enya-yx Date: Thu, 15 Sep 2022 18:07:47 +0800 Subject: [PATCH 5/9] merge to main and quick change --- feathr_project/feathr/client.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index 37d156139..1f7bd6e34 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -6,8 +6,6 @@ from datetime import datetime, timedelta from pathlib import Path from typing import Dict, List, Optional, Union -from xmlrpc.client import Boolean, boolean -from feathr.definition.typed_key import TypedKey from numpy import 
isin from feathr.definition.feature import FeatureBase @@ -627,7 +625,7 @@ def _get_feature_key(self, feature_name: str): self.logger.warning(f"Invalid feature name: {feature_name}. Please call FeathrClient.build_features() first in order to materialize the features.") return None - def _valid_materialize_keys(self, features: List[str], allow_empty_key=False)->boolean: + def _valid_materialize_keys(self, features: List[str], allow_empty_key=False): keys = None for feature in features: new_keys = self._get_feature_key(feature) From ebf113b4952a6ba4d3973b251b926f5e15c0a7d2 Mon Sep 17 00:00:00 2001 From: enya-yx Date: Fri, 16 Sep 2022 17:19:38 +0800 Subject: [PATCH 6/9] revert change by mistake --- feathr_project/feathr/definition/materialization_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feathr_project/feathr/definition/materialization_settings.py b/feathr_project/feathr/definition/materialization_settings.py index 35b4aec2b..80a02fcdd 100644 --- a/feathr_project/feathr/definition/materialization_settings.py +++ b/feathr_project/feathr/definition/materialization_settings.py @@ -5,7 +5,7 @@ class BackfillTime: - """Time range to materialize/backfill feature data. Please refer to https://linkedin.github.io/feathr/concepts/materializing-features.html#feature-backfill for a more detailed explanation. + """Time range to materialize/backfill feature data. Please refer to https://feathr-ai.github.io/feathr/concepts/materializing-features.html#feature-backfill for a more detailed explanation. Attributes: start: start time of the backfill, inclusive. 
From 0cdbde9665110e4a1211b0f7432d3999c40c577b Mon Sep 17 00:00:00 2001 From: enya-yx Date: Tue, 20 Sep 2022 20:03:09 +0800 Subject: [PATCH 7/9] Apply this parameter to HDFS sink and add comments --- feathr_project/feathr/client.py | 9 +++++++++ .../feathr/definition/materialization_settings.py | 4 ++-- feathr_project/feathr/definition/sink.py | 3 +++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index 2c7c83f7d..660f9066a 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -4,6 +4,7 @@ import tempfile from typing import Dict, List, Union from feathr.definition.feature import FeatureBase +import copy import redis from azure.identity import DefaultAzureCredential @@ -568,6 +569,9 @@ def monitor_features(self, settings: MonitoringSettings, execution_configuration """ self.materialize_features(settings, execution_configurations, verbose) + # Get feature keys givin the name of a feature + # Should search in both 'derived_feature_list' and 'anchor_list' + # Return related keys(key_column list) or None if cannot find the feature def _get_feature_key(self, feature_name: str): features = [] if 'derived_feature_list' in dir(self): @@ -582,6 +586,9 @@ def _get_feature_key(self, feature_name: str): self.logger.warning(f"Invalid feature name: {feature_name}.
Please call FeathrClient.build_features() first in order to materialize the features.") return None + # Validation on feature keys: + # Features within a set of aggregation or planned to be merged should have same keys + # The param "allow_empty_key" shows if empty keys are acceptable def _valid_materialize_keys(self, features: List[str], allow_empty_key=False): keys = None for feature in features: @@ -589,9 +596,11 @@ def _valid_materialize_keys(self, features: List[str], allow_empty_key=False): if new_keys is None: self.logger.error(f"Failed to get feature key for feature: {feature}") return False + # If only get one key are it's "NOT_NEEDED", it means the feature has an empty key. if len(new_keys) == 1 and new_keys[0] == "NOT_NEEDED" and not allow_empty_key: self.logger.error(f"Empty feature key is not allowed for features: {features}") return False + # Sorted keys to make it easier to compare new_keys = sorted(new_keys) if keys is None: keys = copy.deepcopy(new_keys) diff --git a/feathr_project/feathr/definition/materialization_settings.py b/feathr_project/feathr/definition/materialization_settings.py index 80a02fcdd..9ece26876 100644 --- a/feathr_project/feathr/definition/materialization_settings.py +++ b/feathr_project/feathr/definition/materialization_settings.py @@ -1,6 +1,6 @@ from datetime import datetime, timedelta from typing import List, Optional -from feathr.definition.sink import RedisSink, Sink +from feathr.definition.sink import HdfsSink, RedisSink, Sink import math @@ -32,7 +32,7 @@ def __init__(self, name: str, sinks: List[Sink], feature_names: List[str], backf now = datetime.now() self.backfill_time = backfill_time if backfill_time else BackfillTime(start=now, end=now, step=timedelta(days=1)) for sink in sinks: - if isinstance(sink, RedisSink): + if isinstance(sink, RedisSink) or isinstance(sink, HdfsSink): sink.aggregation_features = feature_names self.sinks = sinks self.feature_names = feature_names diff --git 
a/feathr_project/feathr/definition/sink.py b/feathr_project/feathr/definition/sink.py index 60a9f8e20..a23718a44 100644 --- a/feathr_project/feathr/definition/sink.py +++ b/feathr_project/feathr/definition/sink.py @@ -134,6 +134,9 @@ def to_feature_config(self) -> str: name: HDFS params: { path: "{{sink.output_path}}" + {% if sink.aggregation_features %} + features: [{{','.join(sink.aggregation_features)}}] + {% endif %} } } """) From 7bf8b5ed74dc30b2ea1ce2aff67fffa1335129ab Mon Sep 17 00:00:00 2001 From: enya-yx Date: Tue, 20 Sep 2022 20:44:04 +0800 Subject: [PATCH 8/9] quick fix --- feathr_project/feathr/definition/materialization_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feathr_project/feathr/definition/materialization_settings.py b/feathr_project/feathr/definition/materialization_settings.py index 9ece26876..8cdc2fc71 100644 --- a/feathr_project/feathr/definition/materialization_settings.py +++ b/feathr_project/feathr/definition/materialization_settings.py @@ -32,7 +32,7 @@ def __init__(self, name: str, sinks: List[Sink], feature_names: List[str], backf now = datetime.now() self.backfill_time = backfill_time if backfill_time else BackfillTime(start=now, end=now, step=timedelta(days=1)) for sink in sinks: - if isinstance(sink, RedisSink) or isinstance(sink, HdfsSink): + if isinstance(sink, RedisSink): sink.aggregation_features = feature_names self.sinks = sinks self.feature_names = feature_names From 43772b8dce7b5c5cb918405a1ad73da641bfc1e4 Mon Sep 17 00:00:00 2001 From: enya-yx Date: Wed, 21 Sep 2022 11:49:37 +0800 Subject: [PATCH 9/9] quick improve --- feathr_project/feathr/client.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index 660f9066a..f21d37d23 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -569,7 +569,7 @@ def monitor_features(self, settings: MonitoringSettings, 
execution_configuration """ self.materialize_features(settings, execution_configurations, verbose) - # Get feature keys givin the name of a feature + # Get feature keys given the name of a feature # Should search in both 'derived_feature_list' and 'anchor_list' # Return related keys(key_column list) or None if cannot find the feature def _get_feature_key(self, feature_name: str): @@ -582,7 +582,7 @@ def _get_feature_key(self, feature_name: str): for feature in features: if feature.name == feature_name: keys = feature.key - return [key.key_column for key in keys] + return set(key.key_column for key in keys) self.logger.warning(f"Invalid feature name: {feature_name}. Please call FeathrClient.build_features() first in order to materialize the features.") return None @@ -594,20 +594,21 @@ def _valid_materialize_keys(self, features: List[str], allow_empty_key=False): for feature in features: new_keys = self._get_feature_key(feature) if new_keys is None: - self.logger.error(f"Failed to get feature key for feature: {feature}") + self.logger.error(f"Key of feature: {feature} is empty. If this feature is not from INPUT_CONTEXT, you might want to double check on the feature definition to see whether the key is empty or not.") return False - # If only get one key are it's "NOT_NEEDED", it means the feature has an empty key. - if len(new_keys) == 1 and new_keys[0] == "NOT_NEEDED" and not allow_empty_key: + # If only get one key and it's "NOT_NEEDED", it means the feature has an empty key. + if ','.join(new_keys) == "NOT_NEEDED" and not allow_empty_key: self.logger.error(f"Empty feature key is not allowed for features: {features}") return False - # Sorted keys to make it easier to compare - new_keys = sorted(new_keys) if keys is None: keys = copy.deepcopy(new_keys) else: - for key, new_key in zip(keys, new_keys): - if key != new_key: - self.logger.error("Inconsistent feature keys.") + if len(keys) != len(new_keys): + self.logger.error(f"Inconsistent feature keys. 
Current keys are {str(keys)}") + return False + for new_key in new_keys: + if new_key not in keys: + self.logger.error(f"Inconsistent feature keys. Current keys are {str(keys)}") return False return True @@ -620,7 +621,7 @@ def materialize_features(self, settings: MaterializationSettings, execution_conf """ feature_list = settings.feature_names if len(feature_list) > 0 and not self._valid_materialize_keys(feature_list): - raise RuntimeError(f"Invalid materialization feature keys: {feature_list}") + raise RuntimeError(f"Invalid materialization features: {feature_list}, since they have different keys. Currently Feathr only supports materializing features of the same keys.") # Collect secrets from sinks secrets = []