2 changes: 1 addition & 1 deletion feathr_project/feathr/definition/_materialization_utils.py
@@ -9,7 +9,7 @@ def _to_materialization_config(settings: MaterializationSettings):
name: {{ settings.name }}
endTime: "{{ settings.backfill_time.end.strftime('%Y-%m-%d %H:%M:%S') }}"
endTimeFormat: "yyyy-MM-dd HH:mm:ss"
resolution: DAILY
resolution: {{ settings.resolution }}
{% if settings.has_hdfs_sink == True %}
enableIncremental = true
{% endif %}
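To make the template change concrete, here is a minimal rendering sketch. It is illustrative only: it calls `jinja2` directly and uses a `SimpleNamespace` stand-in for a `MaterializationSettings` object rather than the project's real code path.

```python
# Illustrative sketch: how the new `resolution` attribute flows into the rendered config.
# SimpleNamespace stands in for a MaterializationSettings instance; not Feathr's actual code.
from types import SimpleNamespace

from jinja2 import Template

settings = SimpleNamespace(name="nycTaxiTable", resolution="HOURLY")

snippet = Template(
    "name: {{ settings.name }}\n"
    "resolution: {{ settings.resolution }}"
)
print(snippet.render(settings=settings))
# name: nycTaxiTable
# resolution: HOURLY
```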
6 changes: 5 additions & 1 deletion feathr_project/feathr/definition/materialization_settings.py
@@ -27,7 +27,11 @@ class MaterializationSettings:
feature_names: list of feature names to be materialized
backfill_time: time range and frequency for the materialization. Default to now().
"""
def __init__(self, name: str, sinks: List[Sink], feature_names: List[str], backfill_time: Optional[BackfillTime] = None):
def __init__(self, name: str, sinks: List[Sink], feature_names: List[str], backfill_time: Optional[BackfillTime] = None, resolution: str = "DAILY"):
if resolution not in ["DAILY", "HOURLY"]:
raise RuntimeError(
f'{resolution} is not supported. Only \'DAILY\' and \'HOURLY\' are currently supported.')
self.resolution = resolution
self.name = name
now = datetime.now()
self.backfill_time = backfill_time if backfill_time else BackfillTime(start=now, end=now, step=timedelta(days=1))
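A minimal usage sketch of the new argument. The sink path and feature names below are placeholders, and the imports assume the top-level `feathr` package used elsewhere in this PR's tests.

```python
from datetime import datetime, timedelta

# Assumes the top-level feathr imports used by the test suite in this PR.
from feathr import BackfillTime, HdfsSink, MaterializationSettings

backfill_time = BackfillTime(start=datetime(2020, 5, 20),
                             end=datetime(2020, 5, 20),
                             step=timedelta(days=1))

settings = MaterializationSettings(
    "nycTaxiTable",
    # Placeholder output path, not taken from the PR.
    sinks=[HdfsSink(output_path="abfss://container@account.dfs.core.windows.net/output")],
    feature_names=["f_location_avg_fare", "f_location_max_fare"],
    backfill_time=backfill_time,
    resolution="HOURLY",  # new in this change; defaults to "DAILY"
)

# Any value other than "DAILY" or "HOURLY" raises RuntimeError, per the validation above.
```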
62 changes: 59 additions & 3 deletions feathr_project/test/test_azure_spark_e2e.py
@@ -42,7 +42,7 @@ def test_feathr_materialize_to_offline():
if client.spark_runtime == 'databricks':
output_path = ''.join(['dbfs:/feathrazure_cijob_materialize_offline_','_', str(now.minute), '_', str(now.second), ""])
else:
output_path = ''.join(['abfss://[email protected]/demo_data/feathrazure_cijob_materialize_offline_','_', str(now.minute), '_', str(now.second), ""])
output_path = ''.join(['abfss://[email protected]/demo_data/feathrazure_cijob_materialize_offline_','_', str(now.minute), '_', str(now.second), ""])
offline_sink = HdfsSink(output_path=output_path)
settings = MaterializationSettings("nycTaxiTable",
sinks=[offline_sink],
@@ -430,7 +430,7 @@ def test_feathr_materialize_with_time_partition_pattern():
client_consumer: FeathrClient = time_partition_pattern_test_setup(os.path.join(test_workspace_dir, "feathr_config.yaml"), output_path+'/df0/daily')

backfill_time_tpp = BackfillTime(start=datetime(
2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))
2020, 5, 21), end=datetime(2020, 5, 21), step=timedelta(days=1))

now = datetime.now()
if client_consumer.spark_runtime == 'databricks':
@@ -449,9 +449,65 @@ def test_feathr_materialize_with_time_partition_pattern():

# download result and just assert the returned result is not empty
# by default, it will write to a folder appended with date
res_df = get_result_df(client_consumer, "avro", output_path_tpp + "/df0/daily/2020/05/20")
res_df = get_result_df(client_consumer, "avro", output_path_tpp + "/df0/daily/2020/05/21")
assert res_df.shape[0] > 0

def test_feathr_materialize_with_time_partition_pattern_hourly():
"""
Test FeathrClient() using HdfsSource with 'timePartitionPattern'.
"""
test_workspace_dir = Path(
__file__).parent.resolve() / "test_user_workspace"
# os.chdir(test_workspace_dir)
# Create data source first
client_producer: FeathrClient = basic_test_setup(os.path.join(test_workspace_dir, "feathr_config.yaml"))

backfill_time = BackfillTime(start=datetime(
2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))

if client_producer.spark_runtime == 'databricks':
output_path = 'dbfs:/timePartitionPattern_hourly_sample'
Review comment (Member): It would be better to add some time signature to the output path so each job's output is unique.

Review comment (Collaborator, PR author): This path is used as the data source, not as the output we check, so we probably don't need to take extra space to save a unique copy.
else:
output_path = 'abfss://[email protected]/timePartitionPattern_hourly_sample'

offline_sink = HdfsSink(output_path=output_path)
settings = MaterializationSettings("nycTaxiTable",
sinks=[offline_sink],
feature_names=[
"f_location_avg_fare", "f_location_max_fare"],
backfill_time=backfill_time, resolution='HOURLY')
client_producer.materialize_features(settings)
# assuming the job can successfully run; otherwise it will throw exception
client_producer.wait_job_to_finish(timeout_sec=Constants.SPARK_JOB_TIMEOUT_SECONDS)

res_df = get_result_df(client_producer, "avro", output_path + "/df0/daily/2020/05/20/00")
assert res_df.shape[0] > 0

client_consumer: FeathrClient = time_partition_pattern_test_setup(os.path.join(test_workspace_dir, "feathr_config.yaml"), output_path+'/df0/daily', 'HOURLY')

backfill_time_tpp = BackfillTime(start=datetime(
2020, 5, 21), end=datetime(2020, 5, 21), step=timedelta(days=1))

now = datetime.now()
if client_consumer.spark_runtime == 'databricks':
output_path_tpp = ''.join(['dbfs:/feathrazure_cijob_materialize_offline_','_', str(now.minute), '_', str(now.second), ""])
else:
output_path_tpp = ''.join(['abfss://[email protected]/demo_data/feathrazure_cijob_materialize_offline_','_', str(now.minute), '_', str(now.second), ""])
offline_sink_tpp = HdfsSink(output_path=output_path_tpp)
settings_tpp = MaterializationSettings("nycTaxiTable",
sinks=[offline_sink_tpp],
feature_names=[
"f_loc_avg_output", "f_loc_max_output"],
backfill_time=backfill_time_tpp,
resolution='HOURLY')
client_consumer.materialize_features(settings_tpp, allow_materialize_non_agg_feature=True)
# assuming the job can successfully run; otherwise it will throw exception
client_consumer.wait_job_to_finish(timeout_sec=Constants.SPARK_JOB_TIMEOUT_SECONDS)

# download result and just assert the returned result is not empty
# by default, it will write to a folder appended with date
res_df = get_result_df(client_consumer, "avro", output_path_tpp + "/df0/daily/2020/05/21/00")
assert res_df.shape[0] > 0

if __name__ == "__main__":
test_feathr_materialize_to_aerospike()
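The new hourly test reads its result back from an hour-level partition folder. The helper below is a purely illustrative sketch (not part of the Feathr API) of how the `yyyy/MM/dd` versus `yyyy/MM/dd/HH` layouts asserted above map onto a backfill timestamp.

```python
from datetime import datetime

def partition_folder(base: str, ts: datetime, resolution: str = "DAILY") -> str:
    """Build the partition path the tests above read back (illustrative helper only)."""
    pattern = "%Y/%m/%d/%H" if resolution == "HOURLY" else "%Y/%m/%d"
    return f"{base}/{ts.strftime(pattern)}"

print(partition_folder("dbfs:/timePartitionPattern_hourly_sample/df0/daily",
                       datetime(2020, 5, 20), resolution="HOURLY"))
# dbfs:/timePartitionPattern_hourly_sample/df0/daily/2020/05/20/00
```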
18 changes: 14 additions & 4 deletions feathr_project/test/test_fixture.py
@@ -380,28 +380,38 @@ def get_online_test_table_name(table_name: str):
print("The online Redis table is", res_table)
return res_table

def time_partition_pattern_test_setup(config_path: str, data_source_path: str):
def time_partition_pattern_test_setup(config_path: str, data_source_path: str, resolution: str = 'DAILY'):
now = datetime.now()
# set workspace folder by time; make sure we don't have write conflict if there are many CI tests running
os.environ['SPARK_CONFIG__DATABRICKS__WORK_DIR'] = ''.join(['dbfs:/feathrazure_cijob','_', str(now.minute), '_', str(now.second), '_', str(now.microsecond)])
os.environ['SPARK_CONFIG__AZURE_SYNAPSE__WORKSPACE_DIR'] = ''.join(['abfss://[email protected]/feathr_github_ci','_', str(now.minute), '_', str(now.second) ,'_', str(now.microsecond)])
client = FeathrClient(config_path=config_path)

batch_source = HdfsSource(name="testTimePartitionSource",
if resolution == 'DAILY':
batch_source = HdfsSource(name="testTimePartitionSource",
path=data_source_path,
time_partition_pattern="yyyy/MM/dd"
)
else:
batch_source = HdfsSource(name="testTimePartitionSource",
path=data_source_path,
time_partition_pattern="yyyy/MM/dd/HH"
)
key = TypedKey(key_column="key0",
key_column_type=ValueType.INT32)
agg_features = [
Feature(name="f_loc_avg_output",
key=[key],
feature_type=FLOAT,
transform="f_location_avg_fare"),
transform=WindowAggTransformation(agg_expr="f_location_avg_fare",
agg_func="AVG",
window="3d")),
Feature(name="f_loc_max_output",
feature_type=FLOAT,
key=[key],
transform="f_location_max_fare"),
transform=WindowAggTransformation(agg_expr="f_location_max_fare",
agg_func="MAX",
window="3d")),
]

agg_anchor = FeatureAnchor(name="testTimePartitionFeatures",
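The if/else in the fixture only varies the `time_partition_pattern` string. A behavior-equivalent, more compact sketch (assuming `HdfsSource` is importable from the top-level `feathr` package, as in the other tests):

```python
from feathr import HdfsSource

def make_time_partition_source(data_source_path: str, resolution: str = "DAILY") -> HdfsSource:
    # Collapses the DAILY/HOURLY branch above into a single expression.
    pattern = "yyyy/MM/dd" if resolution == "DAILY" else "yyyy/MM/dd/HH"
    return HdfsSource(name="testTimePartitionSource",
                      path=data_source_path,
                      time_partition_pattern=pattern)
```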