diff --git a/feathr_project/feathr/constants.py b/feathr_project/feathr/constants.py index 13adb785b..8884753f5 100644 --- a/feathr_project/feathr/constants.py +++ b/feathr_project/feathr/constants.py @@ -24,3 +24,4 @@ TYPEDEF_ARRAY_DERIVED_FEATURE=f"array" TYPEDEF_ARRAY_ANCHOR_FEATURE=f"array" +FEATHR_MAVEN_ARTIFACT="com.linkedin.feathr:feathr_2.12:0.4.0" \ No newline at end of file diff --git a/feathr_project/feathr/spark_provider/.gitignore b/feathr_project/feathr/spark_provider/.gitignore new file mode 100644 index 000000000..ba64b52e6 --- /dev/null +++ b/feathr_project/feathr/spark_provider/.gitignore @@ -0,0 +1 @@ +!noop-1.0.jar \ No newline at end of file diff --git a/feathr_project/feathr/spark_provider/_databricks_submission.py b/feathr_project/feathr/spark_provider/_databricks_submission.py index 3eca8a3a1..b5368c4f3 100644 --- a/feathr_project/feathr/spark_provider/_databricks_submission.py +++ b/feathr_project/feathr/spark_provider/_databricks_submission.py @@ -143,7 +143,11 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: submission_params['new_cluster']['spark_conf'] = configuration submission_params['new_cluster']['custom_tags'] = job_tags # the feathr main jar file is anyway needed regardless it's pyspark or scala spark - submission_params['libraries'][0]['jar'] = self.upload_or_get_cloud_path(main_jar_path) + if not main_jar_path: + logger.info(f"Main JAR file is not set, using default package '{FEATHR_MAVEN_ARTIFACT}' from Maven") + submission_params['libraries'][0]['maven'] = { "coordinates": FEATHR_MAVEN_ARTIFACT } + else: + submission_params['libraries'][0]['jar'] = self.upload_or_get_cloud_path(main_jar_path) # see here for the submission parameter definition https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--request-structure-6 if python_files: # this is a pyspark job. definition here: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--sparkpythontask diff --git a/feathr_project/feathr/spark_provider/_synapse_submission.py b/feathr_project/feathr/spark_provider/_synapse_submission.py index adfa8e973..98d4d05fd 100644 --- a/feathr_project/feathr/spark_provider/_synapse_submission.py +++ b/feathr_project/feathr/spark_provider/_synapse_submission.py @@ -1,4 +1,6 @@ +from copy import deepcopy import os +import pathlib import re import time import urllib.request @@ -43,7 +45,8 @@ class _FeathrSynapseJobLauncher(SparkJobLauncher): """ Submits spark jobs to a Synapse spark cluster. """ - def __init__(self, synapse_dev_url: str, pool_name: str, datalake_dir: str, executor_size: str, executors: int, credential = None): + + def __init__(self, synapse_dev_url: str, pool_name: str, datalake_dir: str, executor_size: str, executors: int, credential=None): # use DeviceCodeCredential if EnvironmentCredential is not available self.credential = credential # use the same credential for authentication to avoid further login. @@ -60,9 +63,11 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str): Supports transferring file from an http path to cloud working storage, or upload directly from a local storage. 
""" logger.info('Uploading {} to cloud..', local_path_or_http_path) - res_path = self._datalake.upload_file_to_workdir(local_path_or_http_path) + res_path = self._datalake.upload_file_to_workdir( + local_path_or_http_path) - logger.info('{} is uploaded to location: {}', local_path_or_http_path, res_path) + logger.info('{} is uploaded to location: {}', + local_path_or_http_path, res_path) return res_path def download_result(self, result_path: str, local_folder: str): @@ -73,7 +78,7 @@ def download_result(self, result_path: str, local_folder: str): return self._datalake.download_file(result_path, local_folder) def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_class_name: str = None, arguments: List[str] = None, - python_files: List[str]= None, reference_files_path: List[str] = None, job_tags: Dict[str, str] = None, + python_files: List[str] = None, reference_files_path: List[str] = None, job_tags: Dict[str, str] = None, configuration: Dict[str, str] = None): """ Submits the feathr job @@ -92,21 +97,53 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_clas job_name (str): name of the job main_jar_path (str): main file paths, usually your main jar file main_class_name (str): name of your main class - arguments (str): all the arugments you want to pass into the spark job - job_tags (str): tags of the job, for exmaple you might want to put your user ID, or a tag with a certain information + arguments (str): all the arguments you want to pass into the spark job + job_tags (str): tags of the job, for example you might want to put your user ID, or a tag with a certain information configuration (Dict[str, str]): Additional configs for the spark job """ - assert main_jar_path, 'main_jar_path should not be none or empty but it is none or empty.' - if main_jar_path.startswith('abfs'): - main_jar_cloud_path = main_jar_path - logger.info( - 'Cloud path {} is used for running the job: {}', main_jar_path, job_name) + + if configuration: + cfg = configuration.copy() # We don't want to mess up input parameters + else: + cfg = {} + if not main_jar_path: + # We don't have the main jar, use Maven + # Add Maven dependency to the job configuration + if "spark.jars.packages" in cfg: + cfg["spark.jars.packages"] = ",".join( + [cfg["spark.jars.packages"], FEATHR_MAVEN_ARTIFACT]) + else: + cfg["spark.jars.packages"] = FEATHR_MAVEN_ARTIFACT + + if not python_files: + # This is a JAR job + # Azure Synapse/Livy doesn't allow JAR job starts from Maven directly, we must have a jar file uploaded. + # so we have to use a dummy jar as the main file. 
+ logger.info(f"Main JAR file is not set, using default package '{FEATHR_MAVEN_ARTIFACT}' from Maven") + # Use the no-op jar as the main file + # This is a dummy jar which contains only one `org.example.Noop` class with one empty `main` function which does nothing + current_dir = pathlib.Path(__file__).parent.resolve() + main_jar_path = os.path.join(current_dir, "noop-1.0.jar") + else: + # This is a PySpark job, no more things to do + pass + main_jar_cloud_path = None + if main_jar_path: + # Now we have a main jar, either feathr or noop + if main_jar_path.startswith('abfs'): + main_jar_cloud_path = main_jar_path + logger.info( + 'Cloud path {} is used for running the job: {}', main_jar_path, job_name) + else: + logger.info('Uploading jar from {} to cloud for running job: {}', + main_jar_path, job_name) + main_jar_cloud_path = self._datalake.upload_file_to_workdir(main_jar_path) + logger.info('{} is uploaded to {} for running job: {}', + main_jar_path, main_jar_cloud_path, job_name) else: - logger.info('Uploading jar from {} to cloud for running job: {}', - main_jar_path, job_name) - main_jar_cloud_path = self._datalake.upload_file_to_workdir(main_jar_path) - logger.info('{} is uploaded to {} for running job: {}', - main_jar_path, main_jar_cloud_path, job_name) + # We don't have the main Jar, and this is a PySpark job so we don't use `noop.jar` either + # Keep `main_jar_cloud_path` as `None` as we already added maven package into cfg + pass reference_file_paths = [] for file_path in reference_files_path: @@ -120,7 +157,7 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_clas arguments=arguments, reference_files=reference_files_path, tags=job_tags, - configuration=configuration) + configuration=cfg) logger.info('See submitted job here: https://web.azuresynapse.net/en-us/monitoring/sparkapplication') return self.current_job_info @@ -247,8 +284,13 @@ def create_spark_batch_job(self, job_name, main_file, class_name=None, executor_cores = self.EXECUTOR_SIZE[self._executor_size]['Cores'] executor_memory = self.EXECUTOR_SIZE[self._executor_size]['Memory'] - # need to put the jar in as dependencies for pyspark job - jars = jars + [main_file] + # If we have a main jar, it needs to be added as dependencies for pyspark job + # Otherwise it's a PySpark job with Feathr JAR from Maven + if main_file: + jars = jars + [main_file] + elif not python_files: + # These 2 parameters should not be empty at the same time + raise ValueError("Main JAR is not set for the Spark job") # If file=main_file, then it's using only Scala Spark # If file=python_files[0], then it's using Pyspark @@ -319,7 +361,7 @@ def __init__(self, datalake_dir, credential=None): self.dir_client = self.file_system_client.get_directory_client('/') self.datalake_dir = datalake_dir + \ - '/' if datalake_dir[-1] != '/' else datalake_dir + '/' if datalake_dir[-1] != '/' else datalake_dir def upload_file_to_workdir(self, src_file_path: str) -> str: """ @@ -394,7 +436,7 @@ def download_file(self, target_adls_directory: str, local_dir_cache: str): for folder in result_folders: folder_name = basename(folder) file_in_folder = [os.path.join(folder_name, basename(file_path.name)) for file_path in self.file_system_client.get_paths( - path=folder, recursive=False) if not file_path.is_directory] + path=folder, recursive=False) if not file_path.is_directory] local_paths = [os.path.join(local_dir_cache, file_name) for file_name in file_in_folder] self._download_file_list(local_paths, file_in_folder, directory_client) @@ -405,7 
+447,7 @@ def download_file(self, target_adls_directory: str, local_dir_cache: str): self._download_file_list(local_paths, result_paths, directory_client) logger.info('Finish downloading files from {} to {}.', - target_adls_directory,local_dir_cache) + target_adls_directory, local_dir_cache) def _download_file_list(self, local_paths: List[str], result_paths, directory_client): ''' diff --git a/feathr_project/feathr/spark_provider/noop-1.0.jar b/feathr_project/feathr/spark_provider/noop-1.0.jar new file mode 100644 index 000000000..6b3b9ba56 Binary files /dev/null and b/feathr_project/feathr/spark_provider/noop-1.0.jar differ diff --git a/feathr_project/test/test_azure_spark_maven_e2e.py b/feathr_project/test/test_azure_spark_maven_e2e.py new file mode 100644 index 000000000..5aa51b4ab --- /dev/null +++ b/feathr_project/test/test_azure_spark_maven_e2e.py @@ -0,0 +1,63 @@ +import os +from datetime import datetime, timedelta +from pathlib import Path + +from click.testing import CliRunner +from feathr import BOOLEAN, FLOAT, INT32, ValueType +from feathr import FeathrClient +from feathr import ValueType +from feathr.utils.job_utils import get_result_df +from feathr import (BackfillTime, MaterializationSettings) +from feathr import FeatureQuery +from feathr import ObservationSettings +from feathr import RedisSink, HdfsSink +from feathr import TypedKey +from feathrcli.cli import init +import pytest + +from test_fixture import (basic_test_setup, get_online_test_table_name) + +def test_feathr_online_store_agg_features(): + """ + Test that FeathrClient() get_online_features and batch_get can get data correctly. + """ + + online_test_table = get_online_test_table_name("nycTaxiCITable") + test_workspace_dir = Path( + __file__).parent.resolve() / "test_user_workspace" + # os.chdir(test_workspace_dir) + + # The `feathr_runtime_location` was commented out in this config file, so feathr should use + # the Maven package as the dependency and `noop.jar` as the main file + client = basic_test_setup(os.path.join(test_workspace_dir, "feathr_config_maven.yaml")) + + backfill_time = BackfillTime(start=datetime( + 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1)) + redisSink = RedisSink(table_name=online_test_table) + settings = MaterializationSettings("nycTaxiTable", + sinks=[redisSink], + feature_names=[ + "f_location_avg_fare", "f_location_max_fare"], + backfill_time=backfill_time) + client.materialize_features(settings) + # just assume the job is successful without validating the actual result in Redis. Might need to consolidate + # this part with the test_feathr_online_store test case + client.wait_job_to_finish(timeout_sec=900) + + res = client.get_online_features(online_test_table, '265', [ + 'f_location_avg_fare', 'f_location_max_fare']) + # just assume there are values. We don't hard code the values for now for testing, + # as the correctness of the feature generation should be guaranteed by the feathr runtime. 
+ # ID 239 and 265 are available in the `DOLocationID` column in this file: + # https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2020-04.csv + # View more details on this dataset: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page + assert len(res) == 2 + assert res[0] != None + assert res[1] != None + res = client.multi_get_online_features(online_test_table, + ['239', '265'], + ['f_location_avg_fare', 'f_location_max_fare']) + assert res['239'][0] != None + assert res['239'][1] != None + assert res['265'][0] != None + assert res['265'][1] != None \ No newline at end of file diff --git a/feathr_project/test/test_user_workspace/feathr_config_maven.yaml b/feathr_project/test/test_user_workspace/feathr_config_maven.yaml new file mode 100644 index 000000000..ed3af5826 --- /dev/null +++ b/feathr_project/test/test_user_workspace/feathr_config_maven.yaml @@ -0,0 +1,118 @@ +# DO NOT MOVE OR DELETE THIS FILE + +# This file contains the configurations that are used by Feathr +# All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of this config file. +# For example, `feathr_runtime_location` for databricks can be overwritten by setting this environment variable: +# SPARK_CONFIG__DATABRICKS__FEATHR_RUNTIME_LOCATION +# Another example would be overwriting Redis host with this config: `ONLINE_STORE__REDIS__HOST` +# For example if you want to override this setting in a shell environment: +# export ONLINE_STORE__REDIS__HOST=feathrazure.redis.cache.windows.net + +# version of API settings +api_version: 1 +project_config: + project_name: 'project_feathr_integration_test' + # Information that is required to be set via environment variables. + required_environment_variables: + # these environment variables are required to run Feathr + # Redis password for your online store + - 'REDIS_PASSWORD' + # client IDs and client Secret for the service principal. Read the getting started docs on how to get this information. + - 'AZURE_CLIENT_ID' + - 'AZURE_TENANT_ID' + - 'AZURE_CLIENT_SECRET' + optional_environment_variables: + # these environment variables are optional; however, you will need them if you want to use some of the services: + - ADLS_ACCOUNT + - ADLS_KEY + - WASB_ACCOUNT + - WASB_KEY + - S3_ACCESS_KEY + - S3_SECRET_KEY + - JDBC_TABLE + - JDBC_USER + - JDBC_PASSWORD + - KAFKA_SASL_JAAS_CONFIG + +offline_store: + # paths starting with abfss:// or abfs:// + # ADLS_ACCOUNT and ADLS_KEY should be set as environment variables if this is set to true + adls: + adls_enabled: true + + # paths starting with wasb:// or wasbs:// + # WASB_ACCOUNT and WASB_KEY should be set as environment variables + wasb: + wasb_enabled: true + + # paths starting with s3a:// + # S3_ACCESS_KEY and S3_SECRET_KEY should be set as environment variables + s3: + s3_enabled: true + # S3 endpoint. If you use an S3 endpoint, then you need to provide the access key and secret key in the environment variables as well. + s3_endpoint: 's3.amazonaws.com' + + # jdbc endpoint + jdbc: + jdbc_enabled: true + jdbc_database: 'feathrtestdb' + jdbc_table: 'feathrtesttable' + + # snowflake endpoint + snowflake: + url: "dqllago-ol19457.snowflakecomputing.com" + user: "feathrintegration" + role: "ACCOUNTADMIN" + +spark_config: + # choice of spark runtime. Currently supported: azure_synapse, databricks + # The `databricks` configs will be ignored if `azure_synapse` is set and vice versa. 
spark_cluster: 'azure_synapse' + # configure the number of parts for the spark output of the feature generation job + spark_result_output_parts: '1' + + azure_synapse: + dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net' + pool_name: 'spark3' + # workspace dir for storing all the required configuration files and the jar resources + workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_test_workspace' + executor_size: 'Small' + executor_num: 4 + + # Feathr Job configuration. Supports local paths, paths starting with http(s)://, and paths starting with abfs(s):// + # this is the default location so end users don't have to compile the runtime again. + # feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-0.5.0-SNAPSHOT.jar + # Unsetting this value will use the default package from Maven + # feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.5.0.jar" + databricks: + # workspace instance + workspace_instance_url: 'https://adb-5638037984879289.9.azuredatabricks.net/' + workspace_token_value: '' + # config string including run time information, spark version, machine size, etc. + # the config follows the format in the databricks documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs + config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_F4s','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}} + # Feathr Job location. Supports local paths, paths starting with http(s)://, and paths starting with dbfs:/ + work_dir: 'dbfs:/feathr_getting_started' + + # this is the default location so end users don't have to compile the runtime again. + # Unsetting this value will use the default package from Maven + # feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.5.0.jar" + +online_store: + redis: + # Redis configs to access Redis cluster + host: 'feathrazuretest3redis.redis.cache.windows.net' + port: 6380 + ssl_enabled: True + +feature_registry: + purview: + # Registry configs + # register type system in purview during feathr client initialization. This only needs to be executed once. + type_system_initialization: true + # configure the name of the purview endpoint + purview_name: 'feathrazuretest3-purview1' + # delimiter indicates how the project/workspace name, feature names, etc. are delimited. By default it will be '__' + # this is for global reference (mainly for feature sharing). For example, when we set up a project called foo, and we have an anchor called 'taxi_driver' and the feature name is 'f_daily_trips' + # the feature will have a globally unique name called 'foo__taxi_driver__f_daily_trips' + delimiter: '__' \ No newline at end of file
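The sketch below is not part of the diff; it illustrates the Synapse-side fallback these changes introduce, assuming a hypothetical helper name `resolve_synapse_dependencies`: when no main JAR is configured, FEATHR_MAVEN_ARTIFACT is appended to `spark.jars.packages`, and for non-PySpark jobs the bundled noop-1.0.jar stands in as the main file (the Databricks launcher instead swaps its `jar` library entry for a `maven` coordinate).

import os
import pathlib
from typing import Dict, List, Optional, Tuple

# Mirrors the constant added to feathr/constants.py in this diff.
FEATHR_MAVEN_ARTIFACT = "com.linkedin.feathr:feathr_2.12:0.4.0"


def resolve_synapse_dependencies(
        main_jar_path: Optional[str],
        python_files: Optional[List[str]],
        configuration: Optional[Dict[str, str]]) -> Tuple[Optional[str], Dict[str, str]]:
    """Hypothetical helper: pick the main file and Spark config for a Synapse job."""
    cfg = dict(configuration or {})  # copy so the caller's dict is never mutated
    if not main_jar_path:
        # No user-provided runtime JAR: resolve Feathr from Maven instead.
        packages = [p for p in (cfg.get("spark.jars.packages"), FEATHR_MAVEN_ARTIFACT) if p]
        cfg["spark.jars.packages"] = ",".join(packages)
        if not python_files:
            # Livy cannot start a JAR job from a Maven coordinate alone, so the bundled
            # no-op JAR supplies the main file; the Feathr classes still come from Maven.
            main_jar_path = os.path.join(
                pathlib.Path(__file__).parent.resolve(), "noop-1.0.jar")
    return main_jar_path, cfg

Called with main_jar_path=None and python_files=None, this returns the noop JAR path plus a config whose spark.jars.packages includes FEATHR_MAVEN_ARTIFACT, mirroring the branch added to submit_feathr_job above; when a main JAR is given, the path and configuration pass through unchanged.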