2 changes: 1 addition & 1 deletion feathr_project/feathr/definition/_materialization_utils.py
@@ -9,7 +9,7 @@ def _to_materialization_config(settings: MaterializationSettings):
name: {{ settings.name }}
endTime: "{{ settings.backfill_time.end.strftime('%Y-%m-%d %H:%M:%S') }}"
endTimeFormat: "yyyy-MM-dd HH:mm:ss"
resolution: DAILY
resolution: {{ settings.resolution }}
{% if settings.has_hdfs_sink == True %}
enableIncremental = true
{% endif %}
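To make the template change concrete, here is a minimal rendering sketch. It is illustrative only: it calls `jinja2` directly and uses a `SimpleNamespace` stand-in for a `MaterializationSettings` object rather than the project's real code path.

```python
# Illustrative sketch: how the new `resolution` attribute flows into the rendered config.
# SimpleNamespace stands in for a MaterializationSettings instance; not Feathr's actual code.
from types import SimpleNamespace

from jinja2 import Template

settings = SimpleNamespace(name="nycTaxiTable", resolution="HOURLY")

snippet = Template(
    "name: {{ settings.name }}\n"
    "resolution: {{ settings.resolution }}"
)
print(snippet.render(settings=settings))
# name: nycTaxiTable
# resolution: HOURLY
```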
6 changes: 5 additions & 1 deletion feathr_project/feathr/definition/materialization_settings.py
@@ -27,7 +27,11 @@ class MaterializationSettings:
feature_names: list of feature names to be materialized
backfill_time: time range and frequency for the materialization. Default to now().
"""
def __init__(self, name: str, sinks: List[Sink], feature_names: List[str], backfill_time: Optional[BackfillTime] = None):
def __init__(self, name: str, sinks: List[Sink], feature_names: List[str], backfill_time: Optional[BackfillTime] = None, resolution: str = "DAILY"):
if resolution not in ["DAILY", "HOURLY"]:
raise RuntimeError(
f'{resolution} is not supported. Only \'DAILY\' and \'HOURLY\' are currently supported.')
self.resolution = resolution
self.name = name
now = datetime.now()
self.backfill_time = backfill_time if backfill_time else BackfillTime(start=now, end=now, step=timedelta(days=1))
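A minimal usage sketch of the new argument. The sink path and feature names below are placeholders, and the imports assume the top-level `feathr` package used elsewhere in this PR's tests.

```python
from datetime import datetime, timedelta

# Assumes the top-level feathr imports used by the test suite in this PR.
from feathr import BackfillTime, HdfsSink, MaterializationSettings

backfill_time = BackfillTime(start=datetime(2020, 5, 20),
                             end=datetime(2020, 5, 20),
                             step=timedelta(days=1))

settings = MaterializationSettings(
    "nycTaxiTable",
    # Placeholder output path, not taken from the PR.
    sinks=[HdfsSink(output_path="abfss://container@account.dfs.core.windows.net/output")],
    feature_names=["f_location_avg_fare", "f_location_max_fare"],
    backfill_time=backfill_time,
    resolution="HOURLY",  # new in this change; defaults to "DAILY"
)

# Any value other than "DAILY" or "HOURLY" raises RuntimeError, per the validation above.
```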
62 changes: 59 additions & 3 deletions feathr_project/test/test_azure_spark_e2e.py
@@ -42,7 +42,7 @@ def test_feathr_materialize_to_offline():
if client.spark_runtime == 'databricks':
output_path = ''.join(['dbfs:/feathrazure_cijob_materialize_offline_','_', str(now.minute), '_', str(now.second), ""])
else:
output_path = ''.join(['abfss://[email protected]/demo_data/feathrazure_cijob_materialize_offline_','_', str(now.minute), '_', str(now.second), ""])
output_path = ''.join(['abfss://[email protected]/demo_data/feathrazure_cijob_materialize_offline_','_', str(now.minute), '_', str(now.second), ""])
offline_sink = HdfsSink(output_path=output_path)
settings = MaterializationSettings("nycTaxiTable",
sinks=[offline_sink],
@@ -430,7 +430,7 @@ def test_feathr_materialize_with_time_partition_pattern():
client_consumer: FeathrClient = time_partition_pattern_test_setup(os.path.join(test_workspace_dir, "feathr_config.yaml"), output_path+'/df0/daily')

backfill_time_tpp = BackfillTime(start=datetime(
2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))
2020, 5, 21), end=datetime(2020, 5, 21), step=timedelta(days=1))

now = datetime.now()
if client_consumer.spark_runtime == 'databricks':
@@ -449,9 +449,65 @@ def test_feathr_materialize_with_time_partition_pattern():

# download result and just assert the returned result is not empty
# by default, it will write to a folder appended with date
res_df = get_result_df(client_consumer, "avro", output_path_tpp + "/df0/daily/2020/05/20")
res_df = get_result_df(client_consumer, "avro", output_path_tpp + "/df0/daily/2020/05/21")
assert res_df.shape[0] > 0

def test_feathr_materialize_with_time_partition_pattern_hourly():
"""
Test FeathrClient() using HdfsSource with 'timePartitionPattern'.
"""
test_workspace_dir = Path(
__file__).parent.resolve() / "test_user_workspace"
# os.chdir(test_workspace_dir)
# Create data source first
client_producer: FeathrClient = basic_test_setup(os.path.join(test_workspace_dir, "feathr_config.yaml"))

backfill_time = BackfillTime(start=datetime(
2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))

if client_producer.spark_runtime == 'databricks':
output_path = 'dbfs:/timePartitionPattern_hourly_sample'
Review comment (Member): It would be better to add some time signature to the output path so each job's output is unique.

Review comment (Collaborator, PR author): This path is used as the data source, not as the output we check, so we probably don't need to take extra space to save a unique copy.
else:
output_path = 'abfss://[email protected]/timePartitionPattern_hourly_sample'

offline_sink = HdfsSink(output_path=output_path)
settings = MaterializationSettings("nycTaxiTable",
sinks=[offline_sink],
feature_names=[
"f_location_avg_fare", "f_location_max_fare"],
backfill_time=backfill_time, resolution='HOURLY')
client_producer.materialize_features(settings)
# assuming the job can successfully run; otherwise it will throw exception
client_producer.wait_job_to_finish(timeout_sec=Constants.SPARK_JOB_TIMEOUT_SECONDS)

res_df = get_result_df(client_producer, "avro", output_path + "/df0/daily/2020/05/20/00")
assert res_df.shape[0] > 0

client_consumer: FeathrClient = time_partition_pattern_test_setup(os.path.join(test_workspace_dir, "feathr_config.yaml"), output_path+'/df0/daily', 'HOURLY')

backfill_time_tpp = BackfillTime(start=datetime(
2020, 5, 21), end=datetime(2020, 5, 21), step=timedelta(days=1))

now = datetime.now()
if client_consumer.spark_runtime == 'databricks':
output_path_tpp = ''.join(['dbfs:/feathrazure_cijob_materialize_offline_','_', str(now.minute), '_', str(now.second), ""])
else:
output_path_tpp = ''.join(['abfss://[email protected]/demo_data/feathrazure_cijob_materialize_offline_','_', str(now.minute), '_', str(now.second), ""])
offline_sink_tpp = HdfsSink(output_path=output_path_tpp)
settings_tpp = MaterializationSettings("nycTaxiTable",
sinks=[offline_sink_tpp],
feature_names=[
"f_loc_avg_output", "f_loc_max_output"],
backfill_time=backfill_time_tpp,
resolution='HOURLY')
client_consumer.materialize_features(settings_tpp, allow_materialize_non_agg_feature=True)
# assuming the job can successfully run; otherwise it will throw exception
client_consumer.wait_job_to_finish(timeout_sec=Constants.SPARK_JOB_TIMEOUT_SECONDS)

# download result and just assert the returned result is not empty
# by default, it will write to a folder appended with date
res_df = get_result_df(client_consumer, "avro", output_path_tpp + "/df0/daily/2020/05/21/00")
assert res_df.shape[0] > 0

if __name__ == "__main__":
test_feathr_materialize_to_aerospike()
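The new hourly test reads its result back from an hour-level partition folder. The helper below is a purely illustrative sketch (not part of the Feathr API) of how the `yyyy/MM/dd` versus `yyyy/MM/dd/HH` layouts asserted above map onto a backfill timestamp.

```python
from datetime import datetime

def partition_folder(base: str, ts: datetime, resolution: str = "DAILY") -> str:
    """Build the partition path the tests above read back (illustrative helper only)."""
    pattern = "%Y/%m/%d/%H" if resolution == "HOURLY" else "%Y/%m/%d"
    return f"{base}/{ts.strftime(pattern)}"

print(partition_folder("dbfs:/timePartitionPattern_hourly_sample/df0/daily",
                       datetime(2020, 5, 20), resolution="HOURLY"))
# dbfs:/timePartitionPattern_hourly_sample/df0/daily/2020/05/20/00
```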
18 changes: 14 additions & 4 deletions feathr_project/test/test_fixture.py
@@ -380,28 +380,38 @@ def get_online_test_table_name(table_name: str):
print("The online Redis table is", res_table)
return res_table

def time_partition_pattern_test_setup(config_path: str, data_source_path: str):
def time_partition_pattern_test_setup(config_path: str, data_source_path: str, resolution: str = 'DAILY'):
now = datetime.now()
# set workspace folder by time; make sure we don't have write conflict if there are many CI tests running
os.environ['SPARK_CONFIG__DATABRICKS__WORK_DIR'] = ''.join(['dbfs:/feathrazure_cijob','_', str(now.minute), '_', str(now.second), '_', str(now.microsecond)])
os.environ['SPARK_CONFIG__AZURE_SYNAPSE__WORKSPACE_DIR'] = ''.join(['abfss://[email protected]/feathr_github_ci','_', str(now.minute), '_', str(now.second) ,'_', str(now.microsecond)])
client = FeathrClient(config_path=config_path)

batch_source = HdfsSource(name="testTimePartitionSource",
if resolution == 'DAILY':
batch_source = HdfsSource(name="testTimePartitionSource",
path=data_source_path,
time_partition_pattern="yyyy/MM/dd"
)
else:
batch_source = HdfsSource(name="testTimePartitionSource",
path=data_source_path,
time_partition_pattern="yyyy/MM/dd/HH"
)
key = TypedKey(key_column="key0",
key_column_type=ValueType.INT32)
agg_features = [
Feature(name="f_loc_avg_output",
key=[key],
feature_type=FLOAT,
transform="f_location_avg_fare"),
transform=WindowAggTransformation(agg_expr="f_location_avg_fare",
agg_func="AVG",
window="3d")),
Feature(name="f_loc_max_output",
feature_type=FLOAT,
key=[key],
transform="f_location_max_fare"),
transform=WindowAggTransformation(agg_expr="f_location_max_fare",
agg_func="MAX",
window="3d")),
]

agg_anchor = FeatureAnchor(name="testTimePartitionFeatures",
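The if/else in the fixture only varies the `time_partition_pattern` string. A behavior-equivalent, more compact sketch (assuming `HdfsSource` is importable from the top-level `feathr` package, as in the other tests):

```python
from feathr import HdfsSource

def make_time_partition_source(data_source_path: str, resolution: str = "DAILY") -> HdfsSource:
    # Collapses the DAILY/HOURLY branch above into a single expression.
    pattern = "yyyy/MM/dd" if resolution == "DAILY" else "yyyy/MM/dd/HH"
    return HdfsSource(name="testTimePartitionSource",
                      path=data_source_path,
                      time_partition_pattern=pattern)
```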