diff --git a/feathr_project/feathr/spark_provider/_databricks_submission.py b/feathr_project/feathr/spark_provider/_databricks_submission.py
index 36abef3fb..87fe81dbd 100644
--- a/feathr_project/feathr/spark_provider/_databricks_submission.py
+++ b/feathr_project/feathr/spark_provider/_databricks_submission.py
@@ -231,7 +231,15 @@ def submit_feathr_job(
                 "coordinates": get_maven_artifact_fullname()}
             # Add json-schema dependency
             # TODO: find a proper way deal with unresolved dependencies
-            submission_params["libraries"][1]["maven"]= {"coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1","repo":"https://repository.mulesoft.org/nexus/content/repositories/public/"}
+            # Since we are adding another entry to the config, make sure that the Spark config passed as part of execution
+            # also contains a "libraries" array of at least size 2; otherwise this raises IndexError: list index out of range.
+            # Example from feathr_config.yaml:
+            # config_template: {"run_name":"FEATHR_FILL_IN",.....,"libraries":[{}, {}],".......}
+
+            submission_params["libraries"][1]["maven"] = {
+                "coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1",
+                "repo": "https://repository.mulesoft.org/nexus/content/repositories/public/",
+            }
         else:
             submission_params["libraries"][0]["jar"] = self.upload_or_get_cloud_path(
                 main_jar_path)
diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml b/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml
index 3e5f6cb40..65910057c 100644
--- a/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml
+++ b/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml
@@ -92,7 +92,7 @@ spark_config:
     # config string including run time information, spark version, machine size, etc.
     # the config follows the format in the databricks documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--request-structure-6
     # The fields marked as "FEATHR_FILL_IN" will be managed by Feathr. Other parameters can be customizable. For example, you can customize the node type, spark version, number of workers, instance pools, timeout, etc.
-    config_template: '{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","node_type_id":"Standard_D3_v2","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"}},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}'
+    config_template: '{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","node_type_id":"Standard_D3_v2","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"}},"libraries":[{"jar":"FEATHR_FILL_IN"}, {"maven":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}'
     # workspace dir for storing all the required configuration files and the jar resources. All the feature definitions will be uploaded here
     work_dir: "dbfs:/feathr_getting_started"
     # This is the location of the runtime jar for Spark job submission. If you have compiled the runtime yourself, you need to specify this location.
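Reviewer note: the hard-coded "[1]" index above works only because the config templates in this PR now reserve a second "libraries" slot. A minimal sketch of a more defensive alternative, illustrative only and not part of this change; the single-entry template below is a stand-in for a user-supplied config_template:

    # Illustrative only: a user template whose "libraries" array has a single entry.
    submission_params = {"libraries": [{"jar": "FEATHR_FILL_IN"}]}

    # Append the json-schema dependency instead of writing to a fixed slot, so no
    # IndexError is raised when the template carries fewer than two entries.
    submission_params.setdefault("libraries", []).append({
        "maven": {
            "coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1",
            "repo": "https://repository.mulesoft.org/nexus/content/repositories/public/",
        }
    })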
diff --git a/feathr_project/test/test_user_workspace/feathr_config.yaml b/feathr_project/test/test_user_workspace/feathr_config.yaml
index 865d745f8..e0c979e28 100644
--- a/feathr_project/test/test_user_workspace/feathr_config.yaml
+++ b/feathr_project/test/test_user_workspace/feathr_config.yaml
@@ -90,7 +90,7 @@ spark_config:
    workspace_token_value: ''
    # config string including run time information, spark version, machine size, etc.
    # the config follows the format in the databricks documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs
-   config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}
+   config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}, {"maven":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}
    # Feathr Job location. Support local paths, path start with http(s)://, and paths start with dbfs:/
    work_dir: 'dbfs:/feathr_getting_started'
    # this is the default location so end users don't have to compile the runtime again.
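For context, a minimal sketch of why both YAML templates need the second placeholder entry; the template string here is trimmed for brevity, and Feathr's actual template loading may differ:

    import json

    # Trimmed stand-in for the config_template above (not the full template).
    config_template = '{"run_name":"FEATHR_FILL_IN","libraries":[{"jar":"FEATHR_FILL_IN"}, {"maven":"FEATHR_FILL_IN"}]}'
    submission_params = json.loads(config_template)

    # The assignment from _databricks_submission.py: overwrite slot 1 with the
    # real maven dependency. With the old single-entry "libraries" array this
    # line raised IndexError: list index out of range.
    submission_params["libraries"][1]["maven"] = {
        "coordinates": "com.github.everit-org.json-schema:org.everit.json.schema:1.9.1",
        "repo": "https://repository.mulesoft.org/nexus/content/repositories/public/",
    }
    print(submission_params["libraries"])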