diff --git a/.github/workflows/pull_request_push_test.yml b/.github/workflows/pull_request_push_test.yml
index 670e16a0f..137a810ee 100644
--- a/.github/workflows/pull_request_push_test.yml
+++ b/.github/workflows/pull_request_push_test.yml
@@ -108,7 +108,7 @@ jobs:
       SPARK_CONFIG__SPARK_CLUSTER: databricks
       SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL: ${{secrets.DATABRICKS_HOST}}
       DATABRICKS_WORKSPACE_TOKEN_VALUE: ${{secrets.DATABRICKS_WORKSPACE_TOKEN_VALUE}}
-      SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE: '{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"${{secrets.DATABRICKS_INSTANCE_POOL_ID}}"},"libraries":[{"maven": {"coordinates": "com.azure.cosmos.spark:azure-cosmos-spark_3-1_2-12:4.16.0"}}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}'
+      SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE: '{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"11.3.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"${{secrets.DATABRICKS_INSTANCE_POOL_ID}}"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}'
       REDIS_PASSWORD: ${{secrets.REDIS_PASSWORD}}
       AZURE_CLIENT_ID: ${{secrets.AZURE_CLIENT_ID}}
       AZURE_TENANT_ID: ${{secrets.AZURE_TENANT_ID}}
@@ -311,4 +311,4 @@ jobs:
         run: echo "NOW=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
       - name: Notification
         run: |
-          curl -H 'Content-Type: application/json' -d '{"text": "${{env.NOW}} Daily Report: 1. Gradle Test ${{needs.gradle_test.result}}, 2. Python Lint Test ${{needs.python_lint.result}}, 3. Databricks Test ${{needs.databricks_test.result}}, 4. Synapse Test ${{needs.azure_synapse_test.result}} , 5. LOCAL SPARK TEST ${{needs.local_spark_test.result}}. Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"}' ${{ secrets.TEAMS_WEBHOOK }}
\ No newline at end of file
+          curl -H 'Content-Type: application/json' -d '{"text": "${{env.NOW}} Daily Report: 1. Gradle Test ${{needs.gradle_test.result}}, 2. Python Lint Test ${{needs.python_lint.result}}, 3. Databricks Test ${{needs.databricks_test.result}}, 4. Synapse Test ${{needs.azure_synapse_test.result}}, 5. Local Spark Test ${{needs.local_spark_test.result}}. Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"}' ${{ secrets.TEAMS_WEBHOOK }}
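For context, the new template requests a Databricks 11.3 LTS runtime (Spark 3.3, versus Spark 3.1 on 9.1 LTS) and attaches the job jar directly instead of resolving the Cosmos connector from Maven at cluster start, matching the `provided` scope change in `build.gradle` below. A minimal sketch of how such a template could be filled in before a runs-submit call; the helper and all placeholder values are hypothetical, not Feathr's actual submission code:

```python
import json
import os

def render_template(template: str, run_name: str, jar: str, main_class: str, params: list) -> dict:
    """Hypothetical helper: replace the FEATHR_FILL_IN placeholders in the job template."""
    cfg = json.loads(template)
    cfg["run_name"] = run_name
    cfg["libraries"] = [{"jar": jar}]  # an uploaded jar path now replaces the Maven coordinates
    cfg["spark_jar_task"]["main_class_name"] = main_class
    cfg["spark_jar_task"]["parameters"] = params
    return cfg

cfg = render_template(
    template=os.environ["SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE"],
    run_name="feathr-ci-run",                     # placeholder
    jar="dbfs:/feathr-jobs/feathr-assembly.jar",  # placeholder
    main_class="com.example.Main",                # placeholder
    params=["--help"],                            # placeholder
)
print(json.dumps(cfg, indent=2))
```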
diff --git a/build.gradle b/build.gradle
index b8c550258..19b9d62a7 100644
--- a/build.gradle
+++ b/build.gradle
@@ -73,7 +73,7 @@ dependencies {
     implementation 'net.snowflake:snowflake-jdbc:3.13.18'
     implementation 'net.snowflake:spark-snowflake_2.12:2.10.0-spark_3.2'
     provided 'com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21'
-    implementation 'com.azure.cosmos.spark:azure-cosmos-spark_3-1_2-12:4.16.0'
+    provided 'com.azure.cosmos.spark:azure-cosmos-spark_3-2_2-12:4.11.1'
     provided 'com.microsoft.sqlserver:mssql-jdbc:10.2.0.jre8'
     provided 'org.eclipse.jetty:jetty-util:9.3.24.v20180605'
     provided 'org.apache.kafka:kafka-clients:3.1.0'
@@ -130,7 +130,6 @@ project.ext.spec = [
     'avro' : "org.apache.avro:avro:1.10.2",
     "avroUtil": "com.linkedin.avroutil1:helper-all:0.2.100",
     "azure": "com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21",
-    "spark_cosmos": "com.azure.cosmos.spark:azure-cosmos-spark_3-1_2-12:4.16.0",
     'fastutil' : "it.unimi.dsi:fastutil:8.1.1",
     'mvel' : "org.mvel:mvel2:2.2.8.Final",
     'protobuf' : "com.google.protobuf:protobuf-java:2.6.1",
diff --git a/docs/samples/feature_embedding.ipynb b/docs/samples/feature_embedding.ipynb
index 27498b1f5..ad58252dc 100644
--- a/docs/samples/feature_embedding.ipynb
+++ b/docs/samples/feature_embedding.ipynb
@@ -341,7 +341,6 @@
     "client = FeathrClient(\n",
     "    config_path=config_path,\n",
     "    credential=credential,\n",
-    "    use_env_vars=False,\n",
     ")"
    ]
  },
diff --git a/feathr-impl/build.gradle b/feathr-impl/build.gradle
index 7f9894727..055fcd4c0 100644
--- a/feathr-impl/build.gradle
+++ b/feathr-impl/build.gradle
@@ -48,7 +48,6 @@ dependencies {
     implementation spec.product.jackson.dataformat_hocon
     implementation spec.product.jackson.jackson_core
     implementation spec.product.spark_redis
-    implementation spec.product.spark_cosmos
     implementation spec.product.fastutil
     implementation spec.product.hadoop.mapreduce_client_core
     implementation spec.product.mvel
@@ -78,7 +77,6 @@ dependencies {
     testImplementation spec.product.equalsverifier
     testImplementation spec.product.spark.spark_catalyst
 
-    testImplementation spec.product.spark_cosmos
     testImplementation spec.product.mockito
     testImplementation spec.product.scala.scalatest
     testImplementation spec.product.testing
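With `use_env_vars` removed, environment variables always take precedence over `feathr_config.yaml`, so a notebook that previously opted out with `use_env_vars=False` should manage the variables directly instead. A minimal sketch of the adjusted pattern; the variable name is one from the workflow file above, and the config path is assumed to point at a valid config:

```python
import os

from feathr import FeathrClient

# Environment variables now always override the yaml file: to fall back to the
# yaml value, remove the variable instead of passing use_env_vars=False.
os.environ.pop("SPARK_CONFIG__SPARK_CLUSTER", None)
client = FeathrClient(config_path="./feathr_config.yaml")
```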
diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py
index eb5e4ca80..6fafe9560 100644
--- a/feathr_project/feathr/client.py
+++ b/feathr_project/feathr/client.py
@@ -65,7 +65,6 @@ def __init__(
         local_workspace_dir: str = None,
         credential: Any = None,
         project_registry_tag: Dict[str, str] = None,
-        use_env_vars: bool = True,
     ):
         """Initialize Feathr Client.
 
@@ -74,13 +73,12 @@
             local_workspace_dir (optional): Set where is the local work space dir. If not set, Feathr will create a temporary folder to store local workspace related files.
             credential (optional): Azure credential to access cloud resources, most likely to be the returned result of DefaultAzureCredential(). If not set, Feathr will initialize DefaultAzureCredential() inside the __init__ function to get credentials.
             project_registry_tag (optional): Adding tags for project in Feathr registry. This might be useful if you want to tag your project as deprecated, or allow certain customizations on project level. Default is empty
-            use_env_vars (optional): Whether to use environment variables to set up the client. If set to False, the client will not use environment variables to set up the client. Defaults to True.
         """
         self.logger = logging.getLogger(__name__)
         # Redis key separator
         self._KEY_SEPARATOR = ':'
         self._COMPOSITE_KEY_SEPARATOR = '#'
-        self.env_config = EnvConfigReader(config_path=config_path, use_env_vars=use_env_vars)
+        self.env_config = EnvConfigReader(config_path=config_path)
         if local_workspace_dir:
             self.local_workspace_dir = local_workspace_dir
         else:
diff --git a/feathr_project/feathr/spark_provider/_localspark_submission.py b/feathr_project/feathr/spark_provider/_localspark_submission.py
index 2c1a598e7..c609be945 100644
--- a/feathr_project/feathr/spark_provider/_localspark_submission.py
+++ b/feathr_project/feathr/spark_provider/_localspark_submission.py
@@ -86,7 +86,6 @@ def submit_feathr_job(
         spark_args = self._init_args(job_name=job_name, confs=cfg)
         # Add additional repositories
         spark_args.extend(["--repositories", "https://repository.mulesoft.org/nexus/content/repositories/public/,https://linkedin.jfrog.io/artifactory/open-source/"])
-        # spark_args.extend(["--repositories", "https://linkedin.jfrog.io/artifactory/open-source/"])
 
         if not main_jar_path:
             # We don't have the main jar, use Maven
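The client now constructs its `EnvConfigReader` unconditionally, so every config lookup follows the same fixed precedence. A small sketch of using the reader directly; the key name follows the `section__subsection__key` convention seen elsewhere in this patch, but treat the exact key and default as assumptions:

```python
from feathr.utils._env_config_reader import EnvConfigReader

# Precedence is now always: os environment -> yaml file -> Azure Key Vault -> default.
reader = EnvConfigReader(config_path="./feathr_config.yaml")
cluster = reader.get("spark_config__spark_cluster", default="local")
```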
""" - res_env = (self._get_variable_from_env(key) if self.use_env_vars else None) - res_file = (self._get_variable_from_file(key) if self.yaml_config else None) - res_keyvault = (self._get_variable_from_akv(key) if self.akv_name else None) + res_env = self._get_variable_from_env(key) + res_file = (self._get_variable_from_file(key) if self.yaml_config and res_env is None else None) + res_keyvault = (self._get_variable_from_akv(key) if self.akv_name and res_env is None and res_file is None else None) # rewrite the logic below to make sure: # First we have the order (i.e. res1 > res2 > res3 > default) @@ -67,8 +68,7 @@ def get(self, key: str, default: str = None) -> str: return default def get_from_env_or_akv(self, key: str) -> str: - """Gets the Feathr config variable for the given key. This function ignores `use_env_vars` attribute and force to - look up environment variables or Azure Key Vault. + """Gets the Feathr config variable for the given key. This function will look up environment variables or Azure Key Vault. It will retrieve the value in the following order: - From the environment variable if the key is set in the os environment variables. - From the Azure Key Vault. @@ -80,8 +80,8 @@ def get_from_env_or_akv(self, key: str) -> str: Returns: Feathr client's config value. """ - res_env = (self._get_variable_from_env(key) if self.use_env_vars else None) - res_keyvault = (self._get_variable_from_akv(key) if self.akv_name else None) + res_env = self._get_variable_from_env(key) + res_keyvault = (self._get_variable_from_akv(key) if self.akv_name and res_env is None else None) # rewrite the logic below to make sure: # First we have the order (i.e. res1 > res2 > res3 > default) diff --git a/feathr_project/setup.py b/feathr_project/setup.py index d5f13b31c..98937933c 100644 --- a/feathr_project/setup.py +++ b/feathr_project/setup.py @@ -65,6 +65,7 @@ "py4j<=0.10.9.7", "loguru<=0.6.0", "pandas", + "numpy<=1.20.3", # pin numpy due to pyspark's deprecated np.bool access "redis<=4.4.0", "requests<=2.28.1", "tqdm<=4.64.1", diff --git a/feathr_project/test/unit/utils/test_env_config_reader.py b/feathr_project/test/unit/utils/test_env_config_reader.py index 98e591808..14489e3cc 100644 --- a/feathr_project/test/unit/utils/test_env_config_reader.py +++ b/feathr_project/test/unit/utils/test_env_config_reader.py @@ -18,21 +18,18 @@ @pytest.mark.parametrize( - "use_env_vars, env_value, expected_value", + "env_value, expected_value", [ - (True, TEST_CONFIG_ENV_VAL, TEST_CONFIG_ENV_VAL), - (True, None, TEST_CONFIG_FILE_VAL), - (False, TEST_CONFIG_ENV_VAL, TEST_CONFIG_FILE_VAL), + (TEST_CONFIG_ENV_VAL, TEST_CONFIG_ENV_VAL), + ( None, TEST_CONFIG_FILE_VAL), ] ) def test__envvariableutil__get( mocker: MockerFixture, - use_env_vars: bool, env_value: str, expected_value: str, ): """Test `get` method if it returns the correct value - along with `use_env_vars` argument. 
""" if env_value: mocker.patch.object(feathr.utils._env_config_reader.os, "environ", {TEST_CONFIG_KEY: env_value}) @@ -40,29 +37,27 @@ def test__envvariableutil__get( f = NamedTemporaryFile(delete=True) f.write(TEST_CONFIG_FILE_CONTENT.encode()) f.seek(0) - env_config = EnvConfigReader(config_path=f.name, use_env_vars=use_env_vars) + env_config = EnvConfigReader(config_path=f.name) assert env_config.get(TEST_CONFIG_KEY) == expected_value @pytest.mark.parametrize( - "use_env_vars, env_value, expected_value", + "env_value, expected_value", [ - (True, TEST_CONFIG_ENV_VAL, TEST_CONFIG_ENV_VAL), - (True, None, None), - (False, TEST_CONFIG_ENV_VAL, TEST_CONFIG_ENV_VAL), + (TEST_CONFIG_ENV_VAL, TEST_CONFIG_ENV_VAL), + (None, None), + (TEST_CONFIG_ENV_VAL, TEST_CONFIG_ENV_VAL), ] ) def test__envvariableutil__get_from_env_or_akv( mocker: MockerFixture, - use_env_vars: bool, env_value: str, expected_value: str, ): - """Test `get_from_env_or_akv` method if it returns the environment variable regardless of `use_env_vars` argument. + """Test `get_from_env_or_akv` method if it returns the environment variable Args: mocker (MockerFixture): _description_ - use_env_vars (bool): _description_ env_value (str): _description_ expected_value (str): _description_ """ @@ -72,5 +67,5 @@ def test__envvariableutil__get_from_env_or_akv( f = NamedTemporaryFile(delete=True) f.write(TEST_CONFIG_FILE_CONTENT.encode()) f.seek(0) - env_config = EnvConfigReader(config_path=f.name, use_env_vars=use_env_vars) + env_config = EnvConfigReader(config_path=f.name) assert env_config.get_from_env_or_akv(TEST_CONFIG_KEY) == expected_value