From 2edc68833fa729d1ca950e100902592ee6313659 Mon Sep 17 00:00:00 2001
From: Richin Jain
Date: Wed, 25 Jan 2023 11:36:16 -0800
Subject: [PATCH 1/6] Removing double quotes

---
 feathr_project/feathr/spark_provider/_databricks_submission.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feathr_project/feathr/spark_provider/_databricks_submission.py b/feathr_project/feathr/spark_provider/_databricks_submission.py
index 36abef3fb..cddac775b 100644
--- a/feathr_project/feathr/spark_provider/_databricks_submission.py
+++ b/feathr_project/feathr/spark_provider/_databricks_submission.py
@@ -231,7 +231,7 @@ def submit_feathr_job(
                 "coordinates": get_maven_artifact_fullname()}
             # Add json-schema dependency
             # TODO: find a proper way deal with unresolved dependencies
-            submission_params["libraries"][1]["maven"]= {"coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1","repo":"https://repository.mulesoft.org/nexus/content/repositories/public/"}
+            submission_params['libraries'][1]['maven']= {"coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1","repo":"https://repository.mulesoft.org/nexus/content/repositories/public/"}
         else:
             submission_params["libraries"][0]["jar"] = self.upload_or_get_cloud_path(
                 main_jar_path)

From d9c57e5558740d29533c895e791c2c7c52f29ae8 Mon Sep 17 00:00:00 2001
From: Richin Jain
Date: Wed, 25 Jan 2023 15:00:55 -0800
Subject: [PATCH 2/6] Commenting out

---
 feathr_project/feathr/spark_provider/_databricks_submission.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feathr_project/feathr/spark_provider/_databricks_submission.py b/feathr_project/feathr/spark_provider/_databricks_submission.py
index cddac775b..49cca904b 100644
--- a/feathr_project/feathr/spark_provider/_databricks_submission.py
+++ b/feathr_project/feathr/spark_provider/_databricks_submission.py
@@ -231,7 +231,7 @@ def submit_feathr_job(
                 "coordinates": get_maven_artifact_fullname()}
             # Add json-schema dependency
             # TODO: find a proper way deal with unresolved dependencies
-            submission_params['libraries'][1]['maven']= {"coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1","repo":"https://repository.mulesoft.org/nexus/content/repositories/public/"}
+            # submission_params['libraries'][1]['maven']= {"coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1","repo":"https://repository.mulesoft.org/nexus/content/repositories/public/"}
         else:
             submission_params["libraries"][0]["jar"] = self.upload_or_get_cloud_path(
                 main_jar_path)

From 2bcd4c0e7caf815ffa9a54adac87056da633a15b Mon Sep 17 00:00:00 2001
From: Richin Jain
Date: Wed, 25 Jan 2023 21:03:09 -0800
Subject: [PATCH 3/6] Bringing maven jar back

---
 .../feathr/spark_provider/_databricks_submission.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/feathr_project/feathr/spark_provider/_databricks_submission.py b/feathr_project/feathr/spark_provider/_databricks_submission.py
index 49cca904b..2d583d3f4 100644
--- a/feathr_project/feathr/spark_provider/_databricks_submission.py
+++ b/feathr_project/feathr/spark_provider/_databricks_submission.py
@@ -231,7 +231,10 @@ def submit_feathr_job(
                 "coordinates": get_maven_artifact_fullname()}
             # Add json-schema dependency
             # TODO: find a proper way deal with unresolved dependencies
-            # submission_params['libraries'][1]['maven']= {"coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1","repo":"https://repository.mulesoft.org/nexus/content/repositories/public/"}
+            submission_params['libraries'][1]['maven']= {
+                "coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1",
+                "repo":"https://repository.mulesoft.org/nexus/content/repositories/public/"
+            }
         else:
             submission_params["libraries"][0]["jar"] = self.upload_or_get_cloud_path(
                 main_jar_path)
From 9f0c1528cd79b0b27d5695efdedb550de4894eda Mon Sep 17 00:00:00 2001
From: Richin Jain
Date: Wed, 25 Jan 2023 21:28:54 -0800
Subject: [PATCH 4/6] Adding double quotes

---
 feathr_project/feathr/spark_provider/_databricks_submission.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feathr_project/feathr/spark_provider/_databricks_submission.py b/feathr_project/feathr/spark_provider/_databricks_submission.py
index 2d583d3f4..e8511fd18 100644
--- a/feathr_project/feathr/spark_provider/_databricks_submission.py
+++ b/feathr_project/feathr/spark_provider/_databricks_submission.py
@@ -231,7 +231,7 @@ def submit_feathr_job(
                 "coordinates": get_maven_artifact_fullname()}
             # Add json-schema dependency
             # TODO: find a proper way deal with unresolved dependencies
-            submission_params['libraries'][1]['maven']= {
+            submission_params["libraries"][1]["maven"]= {
                 "coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1",
                 "repo":"https://repository.mulesoft.org/nexus/content/repositories/public/"
             }

From ffc058ee5411ebb4c8f8cb31d856be6f55780de6 Mon Sep 17 00:00:00 2001
From: Richin Jain
Date: Thu, 26 Jan 2023 14:09:16 -0800
Subject: [PATCH 5/6] Reverting changes

---
 .../feathr/spark_provider/_databricks_submission.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/feathr_project/feathr/spark_provider/_databricks_submission.py b/feathr_project/feathr/spark_provider/_databricks_submission.py
index e8511fd18..36abef3fb 100644
--- a/feathr_project/feathr/spark_provider/_databricks_submission.py
+++ b/feathr_project/feathr/spark_provider/_databricks_submission.py
@@ -231,10 +231,7 @@ def submit_feathr_job(
                 "coordinates": get_maven_artifact_fullname()}
             # Add json-schema dependency
             # TODO: find a proper way deal with unresolved dependencies
-            submission_params["libraries"][1]["maven"]= {
-                "coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1",
-                "repo":"https://repository.mulesoft.org/nexus/content/repositories/public/"
-            }
+            submission_params["libraries"][1]["maven"]= {"coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1","repo":"https://repository.mulesoft.org/nexus/content/repositories/public/"}
         else:
             submission_params["libraries"][0]["jar"] = self.upload_or_get_cloud_path(
                 main_jar_path)
From b17e4185d4a00194300fcd1f9d597ec977c394fd Mon Sep 17 00:00:00 2001
From: Richin Jain
Date: Thu, 26 Jan 2023 15:54:54 -0800
Subject: [PATCH 6/6] Adding instructions to add additional entries to the
 Databricks Spark job config, and also adding additional entries to the
 reference feathr_config.yaml file so the array is initialized as size 2 if
 users are just copying from it. Also made a note of the error in the code.

---
 .../feathr/spark_provider/_databricks_submission.py | 10 +++++++++-
 .../data/feathr_user_workspace/feathr_config.yaml   |  2 +-
 .../test/test_user_workspace/feathr_config.yaml     |  2 +-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/feathr_project/feathr/spark_provider/_databricks_submission.py b/feathr_project/feathr/spark_provider/_databricks_submission.py
index 36abef3fb..87fe81dbd 100644
--- a/feathr_project/feathr/spark_provider/_databricks_submission.py
+++ b/feathr_project/feathr/spark_provider/_databricks_submission.py
@@ -231,7 +231,15 @@ def submit_feathr_job(
                 "coordinates": get_maven_artifact_fullname()}
             # Add json-schema dependency
             # TODO: find a proper way deal with unresolved dependencies
-            submission_params["libraries"][1]["maven"]= {"coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1","repo":"https://repository.mulesoft.org/nexus/content/repositories/public/"}
+            # Since we are adding another entry to the config, make sure that the Spark config passed as part of
+            # execution also contains a libraries array of at least size 2, else you will get an IndexError
+            # (list index out of range). Example from feathr_config.yaml -
+            # config_template: {"run_name":"FEATHR_FILL_IN",.....,"libraries":[{}, {}],".......}
+
+            submission_params["libraries"][1]["maven"]= {
+                "coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1",
+                "repo":"https://repository.mulesoft.org/nexus/content/repositories/public/"
+            }
         else:
             submission_params["libraries"][0]["jar"] = self.upload_or_get_cloud_path(
                 main_jar_path)
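The comment added in this hunk captures the real constraint: submission_params is built from the config_template string in feathr_config.yaml, so submission_params["libraries"][1] exists only if the template already declares two entries in its "libraries" array. Below is a minimal sketch of the failure mode, assuming the template is parsed with json.loads (the actual loading code is outside this diff) and using shortened templates for illustration:

    import json

    # Hypothetical one-entry template, as older copies of feathr_config.yaml had it.
    old_template = '{"run_name":"FEATHR_FILL_IN","libraries":[{"jar":"FEATHR_FILL_IN"}]}'
    submission_params = json.loads(old_template)
    try:
        # The indexing the patched code performs; index 1 does not exist yet.
        submission_params["libraries"][1]["maven"] = {
            "coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1",
            "repo": "https://repository.mulesoft.org/nexus/content/repositories/public/",
        }
    except IndexError as err:
        print(err)  # list index out of range

    # With the two-entry template from this patch, the same assignment succeeds.
    new_template = '{"run_name":"FEATHR_FILL_IN","libraries":[{"jar":"FEATHR_FILL_IN"}, {"maven":"FEATHR_FILL_IN"}]}'
    submission_params = json.loads(new_template)
    submission_params["libraries"][1]["maven"] = {
        "coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1",
        "repo": "https://repository.mulesoft.org/nexus/content/repositories/public/",
    }

The two feathr_config.yaml changes below keep the reference templates in sync with this assumption: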
diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml b/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml
index 3e5f6cb40..65910057c 100644
--- a/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml
+++ b/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml
@@ -92,7 +92,7 @@ spark_config:
   # config string including run time information, spark version, machine size, etc.
   # the config follows the format in the databricks documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--request-structure-6
   # The fields marked as "FEATHR_FILL_IN" will be managed by Feathr. Other parameters can be customizable. For example, you can customize the node type, spark version, number of workers, instance pools, timeout, etc.
-  config_template: '{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","node_type_id":"Standard_D3_v2","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"}},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}'
+  config_template: '{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","node_type_id":"Standard_D3_v2","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"}},"libraries":[{"jar":"FEATHR_FILL_IN"}, {"maven":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}'
   # workspace dir for storing all the required configuration files and the jar resources. All the feature definitions will be uploaded here
   work_dir: "dbfs:/feathr_getting_started"
   # This is the location of the runtime jar for Spark job submission. If you have compiled the runtime yourself, you need to specify this location.

diff --git a/feathr_project/test/test_user_workspace/feathr_config.yaml b/feathr_project/test/test_user_workspace/feathr_config.yaml
index 865d745f8..e0c979e28 100644
--- a/feathr_project/test/test_user_workspace/feathr_config.yaml
+++ b/feathr_project/test/test_user_workspace/feathr_config.yaml
@@ -90,7 +90,7 @@ spark_config:
   workspace_token_value: ''
   # config string including run time information, spark version, machine size, etc.
   # the config follows the format in the databricks documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs
-  config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}
+  config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}, {"maven":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}
 # Feathr Job location. Support local paths, path start with http(s)://, and paths start with dbfs:/
 work_dir: 'dbfs:/feathr_getting_started'
 # this is the default location so end users don't have to compile the runtime again.
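The hard-coded [1] index is what makes the template changes above load-bearing, and it is the fragility the new in-code comment warns about. One possible hardening, not part of this patch series, is to grow the libraries list on demand instead of assuming its size; ensure_min_length below is a hypothetical helper, not an existing Feathr function:

    from typing import Any, Dict, List

    def ensure_min_length(items: List[Dict[str, Any]], size: int) -> None:
        # Pad the list with empty dicts until it has at least `size` entries.
        while len(items) < size:
            items.append({})

    # Usage sketch: a one-entry config_template no longer raises IndexError.
    submission_params: Dict[str, Any] = {"libraries": [{"jar": "FEATHR_FILL_IN"}]}
    ensure_min_length(submission_params["libraries"], 2)
    submission_params["libraries"][1]["maven"] = {
        "coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1",
        "repo": "https://repository.mulesoft.org/nexus/content/repositories/public/",
    }

With a guard like this, the two-entry "libraries" template remains a sensible default, but a user copying an older single-entry feathr_config.yaml would degrade gracefully instead of failing at job submission.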