1 change: 1 addition & 0 deletions feathr_project/feathr/constants.py
@@ -24,3 +24,4 @@
TYPEDEF_ARRAY_DERIVED_FEATURE=f"array<feathr_derived_feature_{REGISTRY_TYPEDEF_VERSION}>"
TYPEDEF_ARRAY_ANCHOR_FEATURE=f"array<feathr_anchor_feature_{REGISTRY_TYPEDEF_VERSION}>"

FEATHR_MAVEN_ARTIFACT="com.linkedin.feathr:feathr_2.12:0.4.0"
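For context, the new constant is a standard Maven coordinate of the form `groupId:artifactId:version`; a minimal illustrative sketch (not part of the PR) of how it decomposes:

```python
# Illustrative only: decompose the Maven coordinate added above.
FEATHR_MAVEN_ARTIFACT = "com.linkedin.feathr:feathr_2.12:0.4.0"

group_id, artifact_id, version = FEATHR_MAVEN_ARTIFACT.split(":")
print(group_id)     # com.linkedin.feathr
print(artifact_id)  # feathr_2.12 -> the Scala 2.12 build of the Feathr runtime
print(version)      # 0.4.0
```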
1 change: 1 addition & 0 deletions feathr_project/feathr/spark_provider/.gitignore
@@ -0,0 +1 @@
!noop-1.0.jar
feathr_project/feathr/spark_provider/_databricks_submission.py
@@ -143,7 +143,11 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name:
submission_params['new_cluster']['spark_conf'] = configuration
submission_params['new_cluster']['custom_tags'] = job_tags
# the feathr main jar file is needed regardless of whether it's a pyspark or scala spark job
submission_params['libraries'][0]['jar'] = self.upload_or_get_cloud_path(main_jar_path)
if not main_jar_path:
logger.info(f"Main JAR file is not set, using default package '{FEATHR_MAVEN_ARTIFACT}' from Maven")
submission_params['libraries'][0]['maven'] = { "coordinates": FEATHR_MAVEN_ARTIFACT }
else:
submission_params['libraries'][0]['jar'] = self.upload_or_get_cloud_path(main_jar_path)
# see here for the submission parameter definition https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--request-structure-6
if python_files:
# this is a pyspark job. definition here: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--sparkpythontask
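To summarize the Databricks change above, here is a minimal sketch (assuming the Jobs 2.0 `libraries` schema already used in the diff; the helper name and jar path are hypothetical) of the two shapes the first `libraries` entry can take:

```python
FEATHR_MAVEN_ARTIFACT = "com.linkedin.feathr:feathr_2.12:0.4.0"

def first_library_entry(main_jar_cloud_path: str = None) -> dict:
    """Return the library spec passed to Databricks for the Feathr runtime."""
    if main_jar_cloud_path:
        # A runtime JAR was supplied and uploaded to cloud storage
        return {"jar": main_jar_cloud_path}
    # No JAR configured: let Databricks resolve the runtime from Maven
    return {"maven": {"coordinates": FEATHR_MAVEN_ARTIFACT}}

print(first_library_entry())
# {'maven': {'coordinates': 'com.linkedin.feathr:feathr_2.12:0.4.0'}}
print(first_library_entry("dbfs:/feathr_getting_started/feathr-assembly.jar"))
# {'jar': 'dbfs:/feathr_getting_started/feathr-assembly.jar'}
```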
86 changes: 64 additions & 22 deletions feathr_project/feathr/spark_provider/_synapse_submission.py
@@ -1,4 +1,6 @@
from copy import deepcopy
import os
import pathlib
import re
import time
import urllib.request
@@ -43,7 +45,8 @@ class _FeathrSynapseJobLauncher(SparkJobLauncher):
"""
Submits spark jobs to a Synapse spark cluster.
"""
def __init__(self, synapse_dev_url: str, pool_name: str, datalake_dir: str, executor_size: str, executors: int, credential = None):

def __init__(self, synapse_dev_url: str, pool_name: str, datalake_dir: str, executor_size: str, executors: int, credential=None):
# use DeviceCodeCredential if EnvironmentCredential is not available
self.credential = credential
# use the same credential for authentication to avoid further login.
@@ -60,9 +63,11 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str):
Supports transferring a file from an HTTP path to cloud working storage, or uploading directly from local storage.
"""
logger.info('Uploading {} to cloud..', local_path_or_http_path)
res_path = self._datalake.upload_file_to_workdir(local_path_or_http_path)
res_path = self._datalake.upload_file_to_workdir(
local_path_or_http_path)

logger.info('{} is uploaded to location: {}', local_path_or_http_path, res_path)
logger.info('{} is uploaded to location: {}',
local_path_or_http_path, res_path)
return res_path

def download_result(self, result_path: str, local_folder: str):
@@ -73,7 +78,7 @@ def download_result(self, result_path: str, local_folder: str):
return self._datalake.download_file(result_path, local_folder)

def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_class_name: str = None, arguments: List[str] = None,
python_files: List[str]= None, reference_files_path: List[str] = None, job_tags: Dict[str, str] = None,
python_files: List[str] = None, reference_files_path: List[str] = None, job_tags: Dict[str, str] = None,
configuration: Dict[str, str] = None):
"""
Submits the feathr job
@@ -92,21 +97,53 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_clas
job_name (str): name of the job
main_jar_path (str): main file paths, usually your main jar file
main_class_name (str): name of your main class
arguments (str): all the arugments you want to pass into the spark job
job_tags (str): tags of the job, for exmaple you might want to put your user ID, or a tag with a certain information
arguments (str): all the arguments you want to pass into the spark job
job_tags (str): tags of the job, for example you might want to put your user ID, or a tag with a certain information
configuration (Dict[str, str]): Additional configs for the spark job
"""
assert main_jar_path, 'main_jar_path should not be none or empty but it is none or empty.'
if main_jar_path.startswith('abfs'):
main_jar_cloud_path = main_jar_path
logger.info(
'Cloud path {} is used for running the job: {}', main_jar_path, job_name)

if configuration:
cfg = configuration.copy() # We don't want to mess up input parameters
else:
cfg = {}
if not main_jar_path:
# We don't have the main jar, use Maven
# Add Maven dependency to the job configuration
if "spark.jars.packages" in cfg:
cfg["spark.jars.packages"] = ",".join(
[cfg["spark.jars.packages"], FEATHR_MAVEN_ARTIFACT])
else:
cfg["spark.jars.packages"] = FEATHR_MAVEN_ARTIFACT

if not python_files:
# This is a JAR job
# Azure Synapse/Livy doesn't allow a JAR job to start directly from Maven, so a JAR file must be uploaded;
# hence we use a dummy jar as the main file.
logger.info(f"Main JAR file is not set, using default package '{FEATHR_MAVEN_ARTIFACT}' from Maven")
# Use the no-op jar as the main file
# This is a dummy jar containing a single `org.example.Noop` class whose empty `main` function does nothing
current_dir = pathlib.Path(__file__).parent.resolve()
main_jar_path = os.path.join(current_dir, "noop-1.0.jar")
else:
# This is a PySpark job, no more things to do
pass
main_jar_cloud_path = None
if main_jar_path:
# Now we have a main jar, either feathr or noop
if main_jar_path.startswith('abfs'):
main_jar_cloud_path = main_jar_path
logger.info(
'Cloud path {} is used for running the job: {}', main_jar_path, job_name)
else:
logger.info('Uploading jar from {} to cloud for running job: {}',
main_jar_path, job_name)
main_jar_cloud_path = self._datalake.upload_file_to_workdir(main_jar_path)
logger.info('{} is uploaded to {} for running job: {}',
main_jar_path, main_jar_cloud_path, job_name)
else:
logger.info('Uploading jar from {} to cloud for running job: {}',
main_jar_path, job_name)
main_jar_cloud_path = self._datalake.upload_file_to_workdir(main_jar_path)
logger.info('{} is uploaded to {} for running job: {}',
main_jar_path, main_jar_cloud_path, job_name)
# We don't have the main JAR, and this is a PySpark job, so we don't use `noop.jar` either
# Keep `main_jar_cloud_path` as `None` since we already added the Maven package to cfg
pass

reference_file_paths = []
for file_path in reference_files_path:
Expand All @@ -120,7 +157,7 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_clas
arguments=arguments,
reference_files=reference_files_path,
tags=job_tags,
configuration=configuration)
configuration=cfg)
logger.info('See submitted job here: https://web.azuresynapse.net/en-us/monitoring/sparkapplication')
return self.current_job_info
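A condensed, illustrative sketch of the Synapse-side decision added above (the helper is hypothetical; it only mirrors the configuration merge and the noop-jar fallback shown in the diff):

```python
import os
import pathlib

FEATHR_MAVEN_ARTIFACT = "com.linkedin.feathr:feathr_2.12:0.4.0"

def resolve_runtime(configuration: dict, main_jar_path: str = None, python_files=None):
    """Add the Maven package to spark.jars.packages when no main JAR is given,
    and fall back to the bundled noop JAR for pure-JAR (non-PySpark) jobs."""
    cfg = dict(configuration or {})  # don't mutate the caller's dict
    if not main_jar_path:
        existing = cfg.get("spark.jars.packages")
        cfg["spark.jars.packages"] = (
            ",".join([existing, FEATHR_MAVEN_ARTIFACT]) if existing else FEATHR_MAVEN_ARTIFACT
        )
        if not python_files:
            # Livy can't start a JAR job directly from Maven, so a placeholder main JAR is needed
            current_dir = pathlib.Path(__file__).parent.resolve()
            main_jar_path = os.path.join(current_dir, "noop-1.0.jar")
    return cfg, main_jar_path
```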

@@ -247,8 +284,13 @@ def create_spark_batch_job(self, job_name, main_file, class_name=None,
executor_cores = self.EXECUTOR_SIZE[self._executor_size]['Cores']
executor_memory = self.EXECUTOR_SIZE[self._executor_size]['Memory']

# need to put the jar in as dependencies for pyspark job
jars = jars + [main_file]
# If we have a main jar, it needs to be added as dependencies for pyspark job
# Otherwise it's a PySpark job with Feathr JAR from Maven
if main_file:
jars = jars + [main_file]
elif not python_files:
# These 2 parameters should not be empty at the same time
raise ValueError("Main JAR is not set for the Spark job")

# If file=main_file, then it's using only Scala Spark
# If file=python_files[0], then it's using Pyspark
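As a quick illustration of the guard above, a hypothetical helper sketching how the Livy batch payload's `file` and `jars` fields are chosen (names are illustrative, not the module's API):

```python
from typing import List, Optional, Tuple

def pick_file_and_jars(main_file: Optional[str] = None,
                       python_files: Optional[List[str]] = None,
                       jars: Optional[List[str]] = None) -> Tuple[str, List[str]]:
    jars = list(jars or [])
    if main_file:
        # The runtime JAR (feathr or noop) must also be listed as a dependency for PySpark jobs
        jars.append(main_file)
    elif not python_files:
        # Neither a main JAR nor Python files: nothing to run
        raise ValueError("Main JAR is not set for the Spark job")
    # file=python_files[0] -> PySpark job; file=main_file -> Scala Spark job
    entry_file = python_files[0] if python_files else main_file
    return entry_file, jars
```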
@@ -319,7 +361,7 @@ def __init__(self, datalake_dir, credential=None):
self.dir_client = self.file_system_client.get_directory_client('/')

self.datalake_dir = datalake_dir + \
'/' if datalake_dir[-1] != '/' else datalake_dir
'/' if datalake_dir[-1] != '/' else datalake_dir

def upload_file_to_workdir(self, src_file_path: str) -> str:
"""
@@ -394,7 +436,7 @@ def download_file(self, target_adls_directory: str, local_dir_cache: str):
for folder in result_folders:
folder_name = basename(folder)
file_in_folder = [os.path.join(folder_name, basename(file_path.name)) for file_path in self.file_system_client.get_paths(
path=folder, recursive=False) if not file_path.is_directory]
path=folder, recursive=False) if not file_path.is_directory]
local_paths = [os.path.join(local_dir_cache, file_name)
for file_name in file_in_folder]
self._download_file_list(local_paths, file_in_folder, directory_client)
@@ -405,7 +447,7 @@ def download_file(self, target_adls_directory: str, local_dir_cache: str):
self._download_file_list(local_paths, result_paths, directory_client)

logger.info('Finish downloading files from {} to {}.',
target_adls_directory,local_dir_cache)
target_adls_directory, local_dir_cache)

def _download_file_list(self, local_paths: List[str], result_paths, directory_client):
'''
Binary file added feathr_project/feathr/spark_provider/noop-1.0.jar
63 changes: 63 additions & 0 deletions feathr_project/test/test_azure_spark_maven_e2e.py
@@ -0,0 +1,63 @@
import os
from datetime import datetime, timedelta
from pathlib import Path

from click.testing import CliRunner
from feathr import BOOLEAN, FLOAT, INT32, ValueType
from feathr import FeathrClient
from feathr import ValueType
from feathr.utils.job_utils import get_result_df
from feathr import (BackfillTime, MaterializationSettings)
from feathr import FeatureQuery
from feathr import ObservationSettings
from feathr import RedisSink, HdfsSink
from feathr import TypedKey
from feathrcli.cli import init
import pytest

from test_fixture import (basic_test_setup, get_online_test_table_name)

def test_feathr_online_store_agg_features():
"""
Test that FeathrClient() get_online_features and multi_get_online_features can get data correctly when the Feathr runtime is resolved from Maven.
"""

online_test_table = get_online_test_table_name("nycTaxiCITable")
test_workspace_dir = Path(
__file__).parent.resolve() / "test_user_workspace"
# os.chdir(test_workspace_dir)

# The `feathr_runtime_location` was commented out in this config file, so feathr should use the
# Maven package as the dependency and `noop.jar` as the main file
client = basic_test_setup(os.path.join(test_workspace_dir, "feathr_config_maven.yaml"))

backfill_time = BackfillTime(start=datetime(
2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))
redisSink = RedisSink(table_name=online_test_table)
settings = MaterializationSettings("nycTaxiTable",
sinks=[redisSink],
feature_names=[
"f_location_avg_fare", "f_location_max_fare"],
backfill_time=backfill_time)
client.materialize_features(settings)
# just assume the job is successful without validating the actual result in Redis. Might need to consolidate
# this part with the test_feathr_online_store test case
client.wait_job_to_finish(timeout_sec=900)

res = client.get_online_features(online_test_table, '265', [
'f_location_avg_fare', 'f_location_max_fare'])
# just assume there are values; we don't hard-code the values for now.
# The correctness of the feature generation should be guaranteed by the feathr runtime.
# IDs 239 and 265 are available in the `DOLocationID` column in this file:
# https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2020-04.csv
# View more details on this dataset: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
assert len(res) == 2
assert res[0] != None
assert res[1] != None
res = client.multi_get_online_features(online_test_table,
['239', '265'],
['f_location_avg_fare', 'f_location_max_fare'])
assert res['239'][0] != None
assert res['239'][1] != None
assert res['265'][0] != None
assert res['265'][1] != None
118 changes: 118 additions & 0 deletions feathr_project/test/test_user_workspace/feathr_config_maven.yaml
@@ -0,0 +1,118 @@
# DO NOT MOVE OR DELETE THIS FILE

# This file contains the configurations that are used by Feathr
# All the configurations can be overridden by environment variables whose names concatenate the layers of this config file with `__` (see the sketch after this file).
# For example, `feathr_runtime_location` for databricks can be overridden by setting this environment variable:
# SPARK_CONFIG__DATABRICKS__FEATHR_RUNTIME_LOCATION
# Another example would be overriding the Redis host with this config: `ONLINE_STORE__REDIS__HOST`
# For example, if you want to override this setting in a shell environment:
# export ONLINE_STORE__REDIS__HOST=feathrazure.redis.cache.windows.net

# version of API settings
api_version: 1
project_config:
project_name: 'project_feathr_integration_test'
# Information that is required to be set via environment variables.
required_environment_variables:
# these environment variables are required to run Feathr
# Redis password for your online store
- 'REDIS_PASSWORD'
# client IDs and client Secret for the service principal. Read the getting started docs on how to get those information.
- 'AZURE_CLIENT_ID'
- 'AZURE_TENANT_ID'
- 'AZURE_CLIENT_SECRET'
optional_environment_variables:
# these environment variables are optional; however, you will need them if you want to use some of the services:
- ADLS_ACCOUNT
- ADLS_KEY
- WASB_ACCOUNT
- WASB_KEY
- S3_ACCESS_KEY
- S3_SECRET_KEY
- JDBC_TABLE
- JDBC_USER
- JDBC_PASSWORD
- KAFKA_SASL_JAAS_CONFIG

offline_store:
# paths starting with abfss:// or abfs://
# ADLS_ACCOUNT and ADLS_KEY should be set in environment variable if this is set to true
adls:
adls_enabled: true

# paths starting with wasb:// or wasbs://
# WASB_ACCOUNT and WASB_KEY should be set in environment variable
wasb:
wasb_enabled: true

# paths starting with s3a://
# S3_ACCESS_KEY and S3_SECRET_KEY should be set in environment variable
s3:
s3_enabled: true
# S3 endpoint. If you use an S3 endpoint, you need to provide the access key and secret key in environment variables as well.
s3_endpoint: 's3.amazonaws.com'

# jdbc endpoint
jdbc:
jdbc_enabled: true
jdbc_database: 'feathrtestdb'
jdbc_table: 'feathrtesttable'

# snowflake endpoint
snowflake:
url: "dqllago-ol19457.snowflakecomputing.com"
user: "feathrintegration"
role: "ACCOUNTADMIN"

spark_config:
# choice for spark runtime. Currently support: azure_synapse, databricks
# The `databricks` configs will be ignored if `azure_synapse` is set and vice versa.
spark_cluster: 'azure_synapse'
# configure number of parts for the spark output for feature generation job
spark_result_output_parts: '1'

azure_synapse:
dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'
pool_name: 'spark3'
# workspace dir for storing all the required configuration files and the jar resources
workspace_dir: 'abfss://[email protected]/feathr_test_workspace'
executor_size: 'Small'
executor_num: 4

# Feathr Job configuration. Supports local paths, paths starting with http(s)://, and paths starting with abfs(s)://
# this is the default location so end users don't have to compile the runtime again.
# feathr_runtime_location: wasbs://[email protected]/feathr-assembly-0.5.0-SNAPSHOT.jar
# Unsetting this value will use the default package from Maven
# feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.5.0.jar"
databricks:
# workspace instance
workspace_instance_url: 'https://adb-5638037984879289.9.azuredatabricks.net/'
workspace_token_value: ''
# config string including run time information, spark version, machine size, etc.
# the config follows the format in the databricks documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs
config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_F4s','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}
# Feathr Job location. Supports local paths, paths starting with http(s)://, and paths starting with dbfs:/
work_dir: 'dbfs:/feathr_getting_started'

# this is the default location so end users don't have to compile the runtime again.
# Unsetting this value will use the default package from Maven
# feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.5.0.jar"

online_store:
redis:
# Redis configs to access Redis cluster
host: 'feathrazuretest3redis.redis.cache.windows.net'
port: 6380
ssl_enabled: True

feature_registry:
purview:
# Registry configs
# register the type system in Purview during feathr client initialization. This only needs to be executed once.
type_system_initialization: true
# configure the name of the purview endpoint
purview_name: 'feathrazuretest3-purview1'
# delimiter indicates how the project/workspace name, feature names, etc. are delimited. By default it is '__'
# this is for global reference (mainly for feature sharing). For example, when we set up a project called foo with an anchor called 'taxi_driver' and a feature called 'f_daily_trips',
# the feature will have a globally unique name: 'foo__taxi_driver__f_daily_trips'
delimiter: '__'
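A minimal sketch (assuming only the `__` override convention described at the top of this config file; the helper name is hypothetical) of how a nested setting maps to its overriding environment variable:

```python
import os

def env_override_key(*path_segments: str) -> str:
    """Build the environment-variable name that overrides a nested config value."""
    return "__".join(segment.upper() for segment in path_segments)

# spark_config.databricks.feathr_runtime_location
print(env_override_key("spark_config", "databricks", "feathr_runtime_location"))
# -> SPARK_CONFIG__DATABRICKS__FEATHR_RUNTIME_LOCATION

# Overriding the Redis host for a test run
os.environ["ONLINE_STORE__REDIS__HOST"] = "feathrazure.redis.cache.windows.net"
```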