1 change: 1 addition & 0 deletions feathr_project/feathr/constants.py
@@ -24,3 +24,4 @@
TYPEDEF_ARRAY_DERIVED_FEATURE=f"array<feathr_derived_feature_{REGISTRY_TYPEDEF_VERSION}>"
TYPEDEF_ARRAY_ANCHOR_FEATURE=f"array<feathr_anchor_feature_{REGISTRY_TYPEDEF_VERSION}>"

FEATHR_MAVEN_ARTIFACT="com.linkedin.feathr:feathr_2.12:0.4.0"
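For context, the new constant is a standard Maven coordinate of the form `groupId:artifactId:version`; a minimal illustrative sketch (not part of the PR) of how it decomposes:

```python
# Illustrative only: decompose the Maven coordinate added above.
FEATHR_MAVEN_ARTIFACT = "com.linkedin.feathr:feathr_2.12:0.4.0"

group_id, artifact_id, version = FEATHR_MAVEN_ARTIFACT.split(":")
print(group_id)     # com.linkedin.feathr
print(artifact_id)  # feathr_2.12 -> the Scala 2.12 build of the Feathr runtime
print(version)      # 0.4.0
```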
1 change: 1 addition & 0 deletions feathr_project/feathr/spark_provider/.gitignore
@@ -0,0 +1 @@
!noop-1.0.jar
feathr_project/feathr/spark_provider/_databricks_submission.py
@@ -143,7 +143,11 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name:
submission_params['new_cluster']['spark_conf'] = configuration
submission_params['new_cluster']['custom_tags'] = job_tags
# the feathr main jar file is needed regardless of whether it's a pyspark or scala spark job
submission_params['libraries'][0]['jar'] = self.upload_or_get_cloud_path(main_jar_path)
if not main_jar_path:
logger.info(f"Main JAR file is not set, using default package '{FEATHR_MAVEN_ARTIFACT}' from Maven")
submission_params['libraries'][0]['maven'] = { "coordinates": FEATHR_MAVEN_ARTIFACT }
else:
submission_params['libraries'][0]['jar'] = self.upload_or_get_cloud_path(main_jar_path)
# see here for the submission parameter definition https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--request-structure-6
if python_files:
# this is a pyspark job. definition here: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--sparkpythontask
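To summarize the Databricks change above, here is a minimal sketch (assuming the Jobs 2.0 `libraries` schema already used in the diff; the helper name and jar path are hypothetical) of the two shapes the first `libraries` entry can take:

```python
FEATHR_MAVEN_ARTIFACT = "com.linkedin.feathr:feathr_2.12:0.4.0"

def first_library_entry(main_jar_cloud_path: str = None) -> dict:
    """Return the library spec passed to Databricks for the Feathr runtime."""
    if main_jar_cloud_path:
        # A runtime JAR was supplied and uploaded to cloud storage
        return {"jar": main_jar_cloud_path}
    # No JAR configured: let Databricks resolve the runtime from Maven
    return {"maven": {"coordinates": FEATHR_MAVEN_ARTIFACT}}

print(first_library_entry())
# {'maven': {'coordinates': 'com.linkedin.feathr:feathr_2.12:0.4.0'}}
print(first_library_entry("dbfs:/feathr_getting_started/feathr-assembly.jar"))
# {'jar': 'dbfs:/feathr_getting_started/feathr-assembly.jar'}
```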
86 changes: 64 additions & 22 deletions feathr_project/feathr/spark_provider/_synapse_submission.py
@@ -1,4 +1,6 @@
from copy import deepcopy
import os
import pathlib
import re
import time
import urllib.request
@@ -43,7 +45,8 @@ class _FeathrSynapseJobLauncher(SparkJobLauncher):
"""
Submits spark jobs to a Synapse spark cluster.
"""
def __init__(self, synapse_dev_url: str, pool_name: str, datalake_dir: str, executor_size: str, executors: int, credential = None):

def __init__(self, synapse_dev_url: str, pool_name: str, datalake_dir: str, executor_size: str, executors: int, credential=None):
# use DeviceCodeCredential if EnvironmentCredential is not available
self.credential = credential
# use the same credential for authentication to avoid further login.
@@ -60,9 +63,11 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str):
Supports transferring a file from an HTTP path to cloud working storage, or uploading directly from local storage.
"""
logger.info('Uploading {} to cloud..', local_path_or_http_path)
res_path = self._datalake.upload_file_to_workdir(local_path_or_http_path)
res_path = self._datalake.upload_file_to_workdir(
local_path_or_http_path)

logger.info('{} is uploaded to location: {}', local_path_or_http_path, res_path)
logger.info('{} is uploaded to location: {}',
local_path_or_http_path, res_path)
return res_path

def download_result(self, result_path: str, local_folder: str):
@@ -73,7 +78,7 @@ def download_result(self, result_path: str, local_folder: str):
return self._datalake.download_file(result_path, local_folder)

def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_class_name: str = None, arguments: List[str] = None,
python_files: List[str]= None, reference_files_path: List[str] = None, job_tags: Dict[str, str] = None,
python_files: List[str] = None, reference_files_path: List[str] = None, job_tags: Dict[str, str] = None,
configuration: Dict[str, str] = None):
"""
Submits the feathr job
@@ -92,21 +97,53 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_clas
job_name (str): name of the job
main_jar_path (str): main file paths, usually your main jar file
main_class_name (str): name of your main class
arguments (str): all the arugments you want to pass into the spark job
job_tags (str): tags of the job, for exmaple you might want to put your user ID, or a tag with a certain information
arguments (str): all the arguments you want to pass into the spark job
job_tags (str): tags of the job, for example you might want to put your user ID, or a tag with a certain information
configuration (Dict[str, str]): Additional configs for the spark job
"""
assert main_jar_path, 'main_jar_path should not be none or empty but it is none or empty.'
if main_jar_path.startswith('abfs'):
main_jar_cloud_path = main_jar_path
logger.info(
'Cloud path {} is used for running the job: {}', main_jar_path, job_name)

if configuration:
cfg = configuration.copy() # We don't want to mess up input parameters
else:
cfg = {}
if not main_jar_path:
# We don't have the main jar, use Maven
# Add Maven dependency to the job configuration
if "spark.jars.packages" in cfg:
cfg["spark.jars.packages"] = ",".join(
[cfg["spark.jars.packages"], FEATHR_MAVEN_ARTIFACT])
else:
cfg["spark.jars.packages"] = FEATHR_MAVEN_ARTIFACT

if not python_files:
# This is a JAR job
# Azure Synapse/Livy doesn't allow a JAR job to start directly from Maven, so a JAR file must be uploaded;
# hence we use a dummy jar as the main file.
logger.info(f"Main JAR file is not set, using default package '{FEATHR_MAVEN_ARTIFACT}' from Maven")
# Use the no-op jar as the main file
# This is a dummy jar containing a single `org.example.Noop` class whose empty `main` function does nothing
current_dir = pathlib.Path(__file__).parent.resolve()
main_jar_path = os.path.join(current_dir, "noop-1.0.jar")
else:
# This is a PySpark job, no more things to do
pass
main_jar_cloud_path = None
if main_jar_path:
# Now we have a main jar, either feathr or noop
if main_jar_path.startswith('abfs'):
main_jar_cloud_path = main_jar_path
logger.info(
'Cloud path {} is used for running the job: {}', main_jar_path, job_name)
else:
logger.info('Uploading jar from {} to cloud for running job: {}',
main_jar_path, job_name)
main_jar_cloud_path = self._datalake.upload_file_to_workdir(main_jar_path)
logger.info('{} is uploaded to {} for running job: {}',
main_jar_path, main_jar_cloud_path, job_name)
else:
logger.info('Uploading jar from {} to cloud for running job: {}',
main_jar_path, job_name)
main_jar_cloud_path = self._datalake.upload_file_to_workdir(main_jar_path)
logger.info('{} is uploaded to {} for running job: {}',
main_jar_path, main_jar_cloud_path, job_name)
# We don't have the main JAR, and this is a PySpark job, so we don't use `noop.jar` either
# Keep `main_jar_cloud_path` as `None` since we already added the Maven package to cfg
pass

reference_file_paths = []
for file_path in reference_files_path:
Expand All @@ -120,7 +157,7 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_clas
arguments=arguments,
reference_files=reference_files_path,
tags=job_tags,
configuration=configuration)
configuration=cfg)
logger.info('See submitted job here: https://web.azuresynapse.net/en-us/monitoring/sparkapplication')
return self.current_job_info
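A condensed, illustrative sketch of the Synapse-side decision added above (the helper is hypothetical; it only mirrors the configuration merge and the noop-jar fallback shown in the diff):

```python
import os
import pathlib

FEATHR_MAVEN_ARTIFACT = "com.linkedin.feathr:feathr_2.12:0.4.0"

def resolve_runtime(configuration: dict, main_jar_path: str = None, python_files=None):
    """Add the Maven package to spark.jars.packages when no main JAR is given,
    and fall back to the bundled noop JAR for pure-JAR (non-PySpark) jobs."""
    cfg = dict(configuration or {})  # don't mutate the caller's dict
    if not main_jar_path:
        existing = cfg.get("spark.jars.packages")
        cfg["spark.jars.packages"] = (
            ",".join([existing, FEATHR_MAVEN_ARTIFACT]) if existing else FEATHR_MAVEN_ARTIFACT
        )
        if not python_files:
            # Livy can't start a JAR job directly from Maven, so a placeholder main JAR is needed
            current_dir = pathlib.Path(__file__).parent.resolve()
            main_jar_path = os.path.join(current_dir, "noop-1.0.jar")
    return cfg, main_jar_path
```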

@@ -247,8 +284,13 @@ def create_spark_batch_job(self, job_name, main_file, class_name=None,
executor_cores = self.EXECUTOR_SIZE[self._executor_size]['Cores']
executor_memory = self.EXECUTOR_SIZE[self._executor_size]['Memory']

# need to put the jar in as dependencies for pyspark job
jars = jars + [main_file]
# If we have a main jar, it needs to be added as dependencies for pyspark job
# Otherwise it's a PySpark job with Feathr JAR from Maven
if main_file:
jars = jars + [main_file]
elif not python_files:
# These 2 parameters should not be empty at the same time
raise ValueError("Main JAR is not set for the Spark job")

# If file=main_file, then it's using only Scala Spark
# If file=python_files[0], then it's using Pyspark
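As a quick illustration of the guard above, a hypothetical helper sketching how the Livy batch payload's `file` and `jars` fields are chosen (names are illustrative, not the module's API):

```python
from typing import List, Optional, Tuple

def pick_file_and_jars(main_file: Optional[str] = None,
                       python_files: Optional[List[str]] = None,
                       jars: Optional[List[str]] = None) -> Tuple[str, List[str]]:
    jars = list(jars or [])
    if main_file:
        # The runtime JAR (feathr or noop) must also be listed as a dependency for PySpark jobs
        jars.append(main_file)
    elif not python_files:
        # Neither a main JAR nor Python files: nothing to run
        raise ValueError("Main JAR is not set for the Spark job")
    # file=python_files[0] -> PySpark job; file=main_file -> Scala Spark job
    entry_file = python_files[0] if python_files else main_file
    return entry_file, jars
```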
@@ -319,7 +361,7 @@ def __init__(self, datalake_dir, credential=None):
self.dir_client = self.file_system_client.get_directory_client('/')

self.datalake_dir = datalake_dir + \
'/' if datalake_dir[-1] != '/' else datalake_dir
'/' if datalake_dir[-1] != '/' else datalake_dir

def upload_file_to_workdir(self, src_file_path: str) -> str:
"""
@@ -394,7 +436,7 @@ def download_file(self, target_adls_directory: str, local_dir_cache: str):
for folder in result_folders:
folder_name = basename(folder)
file_in_folder = [os.path.join(folder_name, basename(file_path.name)) for file_path in self.file_system_client.get_paths(
path=folder, recursive=False) if not file_path.is_directory]
path=folder, recursive=False) if not file_path.is_directory]
local_paths = [os.path.join(local_dir_cache, file_name)
for file_name in file_in_folder]
self._download_file_list(local_paths, file_in_folder, directory_client)
@@ -405,7 +447,7 @@ def download_file(self, target_adls_directory: str, local_dir_cache: str):
self._download_file_list(local_paths, result_paths, directory_client)

logger.info('Finish downloading files from {} to {}.',
target_adls_directory,local_dir_cache)
target_adls_directory, local_dir_cache)

def _download_file_list(self, local_paths: List[str], result_paths, directory_client):
'''
Binary file added feathr_project/feathr/spark_provider/noop-1.0.jar
63 changes: 63 additions & 0 deletions feathr_project/test/test_azure_spark_maven_e2e.py
@@ -0,0 +1,63 @@
import os
from datetime import datetime, timedelta
from pathlib import Path

from click.testing import CliRunner
from feathr import BOOLEAN, FLOAT, INT32, ValueType
from feathr import FeathrClient
from feathr import ValueType
from feathr.utils.job_utils import get_result_df
from feathr import (BackfillTime, MaterializationSettings)
from feathr import FeatureQuery
from feathr import ObservationSettings
from feathr import RedisSink, HdfsSink
from feathr import TypedKey
from feathrcli.cli import init
import pytest

from test_fixture import (basic_test_setup, get_online_test_table_name)

def test_feathr_online_store_agg_features():
"""
Test that FeathrClient() get_online_features and multi_get_online_features can get data correctly when the Feathr runtime is resolved from Maven.
"""

online_test_table = get_online_test_table_name("nycTaxiCITable")
test_workspace_dir = Path(
__file__).parent.resolve() / "test_user_workspace"
# os.chdir(test_workspace_dir)

# The `feathr_runtime_location` was commented out in this config file, so feathr should use the
# Maven package as the dependency and `noop.jar` as the main file
client = basic_test_setup(os.path.join(test_workspace_dir, "feathr_config_maven.yaml"))

backfill_time = BackfillTime(start=datetime(
2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))
redisSink = RedisSink(table_name=online_test_table)
settings = MaterializationSettings("nycTaxiTable",
sinks=[redisSink],
feature_names=[
"f_location_avg_fare", "f_location_max_fare"],
backfill_time=backfill_time)
client.materialize_features(settings)
# just assume the job is successful without validating the actual result in Redis. Might need to consolidate
# this part with the test_feathr_online_store test case
client.wait_job_to_finish(timeout_sec=900)

res = client.get_online_features(online_test_table, '265', [
'f_location_avg_fare', 'f_location_max_fare'])
# just assume there are values; we don't hard-code the values for now.
# The correctness of the feature generation should be guaranteed by the feathr runtime.
# IDs 239 and 265 are available in the `DOLocationID` column in this file:
# https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2020-04.csv
# View more details on this dataset: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
assert len(res) == 2
assert res[0] != None
assert res[1] != None
res = client.multi_get_online_features(online_test_table,
['239', '265'],
['f_location_avg_fare', 'f_location_max_fare'])
assert res['239'][0] != None
assert res['239'][1] != None
assert res['265'][0] != None
assert res['265'][1] != None
118 changes: 118 additions & 0 deletions feathr_project/test/test_user_workspace/feathr_config_maven.yaml
@@ -0,0 +1,118 @@
# DO NOT MOVE OR DELETE THIS FILE

# This file contains the configurations that are used by Feathr
# All the configurations can be overridden by environment variables whose names concatenate the layers of this config file with `__` (see the sketch after this file).
# For example, `feathr_runtime_location` for databricks can be overridden by setting this environment variable:
# SPARK_CONFIG__DATABRICKS__FEATHR_RUNTIME_LOCATION
# Another example would be overriding the Redis host with this config: `ONLINE_STORE__REDIS__HOST`
# For example, if you want to override this setting in a shell environment:
# export ONLINE_STORE__REDIS__HOST=feathrazure.redis.cache.windows.net

# version of API settings
api_version: 1
project_config:
project_name: 'project_feathr_integration_test'
# Information that is required to be set via environment variables.
required_environment_variables:
# these environment variables are required to run Feathr
# Redis password for your online store
- 'REDIS_PASSWORD'
# client IDs and client Secret for the service principal. Read the getting started docs on how to get those information.
- 'AZURE_CLIENT_ID'
- 'AZURE_TENANT_ID'
- 'AZURE_CLIENT_SECRET'
optional_environment_variables:
# these environment variables are optional; however, you will need them if you want to use some of the services:
- ADLS_ACCOUNT
- ADLS_KEY
- WASB_ACCOUNT
- WASB_KEY
- S3_ACCESS_KEY
- S3_SECRET_KEY
- JDBC_TABLE
- JDBC_USER
- JDBC_PASSWORD
- KAFKA_SASL_JAAS_CONFIG

offline_store:
# paths starting with abfss:// or abfs://
# ADLS_ACCOUNT and ADLS_KEY should be set in environment variable if this is set to true
adls:
adls_enabled: true

# paths starting with wasb:// or wasbs://
# WASB_ACCOUNT and WASB_KEY should be set in environment variable
wasb:
wasb_enabled: true

# paths starting with s3a://
# S3_ACCESS_KEY and S3_SECRET_KEY should be set in environment variable
s3:
s3_enabled: true
# S3 endpoint. If you use an S3 endpoint, you need to provide the access key and secret key in environment variables as well.
s3_endpoint: 's3.amazonaws.com'

# jdbc endpoint
jdbc:
jdbc_enabled: true
jdbc_database: 'feathrtestdb'
jdbc_table: 'feathrtesttable'

# snowflake endpoint
snowflake:
url: "dqllago-ol19457.snowflakecomputing.com"
user: "feathrintegration"
role: "ACCOUNTADMIN"

spark_config:
# choice for spark runtime. Currently support: azure_synapse, databricks
# The `databricks` configs will be ignored if `azure_synapse` is set and vice versa.
spark_cluster: 'azure_synapse'
# configure number of parts for the spark output for feature generation job
spark_result_output_parts: '1'

azure_synapse:
dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'
pool_name: 'spark3'
# workspace dir for storing all the required configuration files and the jar resources
workspace_dir: 'abfss://[email protected]/feathr_test_workspace'
executor_size: 'Small'
executor_num: 4

# Feathr Job configuration. Supports local paths, paths starting with http(s)://, and paths starting with abfs(s)://
# this is the default location so end users don't have to compile the runtime again.
# feathr_runtime_location: wasbs://[email protected]/feathr-assembly-0.5.0-SNAPSHOT.jar
# Unsetting this value will use the default package from Maven
# feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.5.0.jar"
databricks:
# workspace instance
workspace_instance_url: 'https://adb-5638037984879289.9.azuredatabricks.net/'
workspace_token_value: ''
# config string including run time information, spark version, machine size, etc.
# the config follows the format in the databricks documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs
config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_F4s','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}
# Feathr Job location. Supports local paths, paths starting with http(s)://, and paths starting with dbfs:/
work_dir: 'dbfs:/feathr_getting_started'

# this is the default location so end users don't have to compile the runtime again.
# Unsetting this value will use the default package from Maven
# feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.5.0.jar"

online_store:
redis:
# Redis configs to access Redis cluster
host: 'feathrazuretest3redis.redis.cache.windows.net'
port: 6380
ssl_enabled: True

feature_registry:
purview:
# Registry configs
# register the type system in Purview during feathr client initialization. This only needs to be executed once.
type_system_initialization: true
# configure the name of the purview endpoint
purview_name: 'feathrazuretest3-purview1'
# delimiter indicates how the project/workspace name, feature names, etc. are delimited. By default it is '__'
# this is for global reference (mainly for feature sharing). For example, when we set up a project called foo with an anchor called 'taxi_driver' and a feature called 'f_daily_trips',
# the feature will have a globally unique name: 'foo__taxi_driver__f_daily_trips'
delimiter: '__'
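A minimal sketch (assuming only the `__` override convention described at the top of this config file; the helper name is hypothetical) of how a nested setting maps to its overriding environment variable:

```python
import os

def env_override_key(*path_segments: str) -> str:
    """Build the environment-variable name that overrides a nested config value."""
    return "__".join(segment.upper() for segment in path_segments)

# spark_config.databricks.feathr_runtime_location
print(env_override_key("spark_config", "databricks", "feathr_runtime_location"))
# -> SPARK_CONFIG__DATABRICKS__FEATHR_RUNTIME_LOCATION

# Overriding the Redis host for a test run
os.environ["ONLINE_STORE__REDIS__HOST"] = "feathrazure.redis.cache.windows.net"
```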