From be0c05d391864b4883a7ebed891e7b9cbdb44e0e Mon Sep 17 00:00:00 2001
From: Xiaoyong Zhu
Date: Tue, 17 Jan 2023 19:42:05 -0800
Subject: [PATCH 01/15] Update docs

---
 docs/README.md                   | 20 ++++++++++++++++++++
 docs/quickstart_local_sandbox.md | 14 ++++++++++++--
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index acfae7e62..10ee0a710 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -74,6 +74,26 @@ If you want to set up everything manually, you can check out the [Feathr CLI depl
 | [Fraud Detection Demo](./samples/fraud_detection_demo.ipynb) | An example to demonstrate Feature Store using multiple data sources such as user account and transaction data. | Azure Synapse, Databricks, Local Spark |
 | [Product Recommendation Demo](./samples/product_recommendation_demo_advanced.ipynb) | Feathr Feature Store example notebook with a product recommendation scenario | Azure Synapse, Databricks, Local Spark |
 
+## πŸƒ Getting Started with Feathr
+
+The best way to try out Feathr is to use the [Feathr Sandbox](https://feathr-ai.github.io/feathr/quickstart_local_sandbox.html), which is a self-contained container with most of Feathr's capabilities. To use it, simply run this command:
+
+```bash
+# 80: Feathr UI 8000: Feathr API 8888: Jupyter 8080: VsCode 7080: Interpret
+docker run -it --rm -p 8888:8888 -p 8000:8000 -p 80:80 -p 8080:8080 -p 7080:7080 --env API_BASE="api/v1" --env FEATHR_SANDBOX=True -e GRANT_SUDO=yes feathrfeaturestore/feathr-sandbox
+```
+
+And you can view the default Jupyter notebook at:
+```bash
+http://localhost:8888/lab/workspaces/auto-w/tree/local_quickstart_notebook.ipynb
+```
+
+After running the notebooks, all the features will be registered in the UI, and you can visit the Feathr UI at:
+
+```bash
+http://localhost:80
+```
+
 ## πŸ› οΈ Install Feathr Client Locally
 
 If you want to install Feathr client in a python environment, use this:

diff --git a/docs/quickstart_local_sandbox.md b/docs/quickstart_local_sandbox.md
index e72b3a0c4..7b721ecbf 100644
--- a/docs/quickstart_local_sandbox.md
+++ b/docs/quickstart_local_sandbox.md
@@ -21,15 +21,25 @@ The Sandbox is ideal for:
 
 ## Getting Started
 
-To get started, simply run the command below. Note that the image is around 5GB so it might take a while to pull it from DockerHub.
+First, make sure Docker is installed on your machine by running this command:
+
+```bash
+docker run hello-world
+```
+
+To get started using Feathr, simply run the command below. Note that the image is around 5GB so it might take a while to pull it from DockerHub.
 
 ```bash
 # 80: Feathr UI 8000: Feathr API 8888: Jupyter 8080: VsCode 7080: Interpret
-docker run -it --rm -p 8888:8888 -p 8000:8000 -p 80:80 -p 8080:8080 -p 7080:7080 --env CONNECTION_STR="Server=" --env API_BASE="api/v1" --env FEATHR_SANDBOX=True -e GRANT_SUDO=yes feathrfeaturestore/feathr-sandbox
+docker run -it --rm -p 8888:8888 -p 8000:8000 -p 80:80 -p 8080:8080 -p 7080:7080 --env API_BASE="api/v1" --env FEATHR_SANDBOX=True -e GRANT_SUDO=yes feathrfeaturestore/feathr-sandbox
 ```
 
 It should pop up a Jupyter link at `http://127.0.0.1:8888/`. Double-click on the notebook file to start the Jupyter Notebook, and you should be able to see the Feathr sample notebook. Click the triangle button on the Jupyter notebook and the whole notebook will run locally.
 
+If you see some errors like below, simply change the `-p 80:80` part to `-p <new port>:80` so the Feathr UI will be redirected to the new port.
+
+`docker: Error response from daemon: driver failed programming external connectivity on endpoint hardcore_bose (ae107e924cddce6b942f96f2654369345b027ac82e5e44929a9f132e2af71746): Bind for 0.0.0.0:80 failed: port is already allocated.`
+
 The default Jupyter notebook is here:
 ```bash
 http://localhost:8888/lab/workspaces/auto-w/tree/local_quickstart_notebook.ipynb

From 31306b66878764bc9a7280639da0b32ed7e21b4a Mon Sep 17 00:00:00 2001
From: Xiaoyong Zhu
Date: Tue, 17 Jan 2023 19:50:05 -0800
Subject: [PATCH 02/15] Update README.md

---
 docs/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/README.md b/docs/README.md
index 10ee0a710..979dad5ac 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -76,7 +76,7 @@ If you want to set up everything manually, you can check out the [Feathr CLI depl
 
 ## πŸƒ Getting Started with Feathr
 
-The best way to try out Feathr is to use the [Feathr Sandbox](https://feathr-ai.github.io/feathr/quickstart_local_sandbox.html), which is a self-contained container with most of Feathr's capabilities. To use it, simply run this command:
+The easiest way to try out Feathr is to use the [Feathr Sandbox](https://feathr-ai.github.io/feathr/quickstart_local_sandbox.html), which is a self-contained container with most of Feathr's capabilities. To use it, simply run this command:
 
 ```bash
 # 80: Feathr UI 8000: Feathr API 8888: Jupyter 8080: VsCode 7080: Interpret

From 6f2c59b23896651654e1d341d073d9a1b2cfd633 Mon Sep 17 00:00:00 2001
From: Xiaoyong Zhu
Date: Tue, 17 Jan 2023 22:11:01 -0800
Subject: [PATCH 03/15] Update README.md

---
 docs/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/README.md b/docs/README.md
index 979dad5ac..503435f5d 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -76,7 +76,7 @@ If you want to set up everything manually, you can check out the [Feathr CLI depl
 
 ## πŸƒ Getting Started with Feathr
 
-The easiest way to try out Feathr is to use the [Feathr Sandbox](https://feathr-ai.github.io/feathr/quickstart_local_sandbox.html), which is a self-contained container with most of Feathr's capabilities. To use it, simply run this command:
+The easiest way to try out Feathr is to use the [Feathr Sandbox](https://feathr-ai.github.io/feathr/quickstart_local_sandbox.html), which is a self-contained container with most of Feathr's capabilities, and you should be productive in 5 minutes. To use it, simply run this command:
 
 ```bash
 # 80: Feathr UI 8000: Feathr API 8888: Jupyter 8080: VsCode 7080: Interpret

From 305f6c78dcda50d2ebe9e84f21c925783db5ab Mon Sep 17 00:00:00 2001
From: Xiaoyong Zhu
Date: Tue, 17 Jan 2023 23:47:47 -0800
Subject: [PATCH 04/15] update URLs

---
 build.gradle                           | 4 ++--
 docs/dev_guide/creating_bacpac_file.md | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/build.gradle b/build.gradle
index 82aa48514..19b9d62a7 100644
--- a/build.gradle
+++ b/build.gradle
@@ -174,7 +174,7 @@ allprojects {
             pom {
                 name = 'Feathr'
                 description = 'An Enterprise-Grade, High Performance Feature Store'
-                url = 'https://github.com/linkedin/feathr'
+                url = 'https://github.com/feathr-ai/feathr'
                 licenses {
                     license {
                         name = 'APL2'
@@ -190,7 +190,7 @@ allprojects {
                 }
                 scm {
                     connection = 'scm:git@github.com:linkedin/feathr.git'
-                    url = 'https://github.com/linkedin/feathr'
+                    url = 'https://github.com/feathr-ai/feathr'
                 }
             }
         }

diff --git a/docs/dev_guide/creating_bacpac_file.md b/docs/dev_guide/creating_bacpac_file.md
index 02754184c..8bcb3f26c 100644
--- a/docs/dev_guide/creating_bacpac_file.md
+++ b/docs/dev_guide/creating_bacpac_file.md
@@ -13,7 +13,7 @@ In case you need to re-create the BACPAC file, follow these steps:
 
 1. Create a new, empty SQL database on Azure
    ![Create Empty Database](../images/bacpac-sql-database.png)
-2. Connect to the database with a SQL client, such as Azure Data Studio, and run the SQL script at https://github.com/linkedin/feathr/blob/main/registry/sql-registry/scripts/schema.sql
+2. Connect to the database with a SQL client, such as Azure Data Studio, and run the SQL script at https://github.com/feathr-ai/feathr/blob/main/registry/sql-registry/scripts/schema.sql
 3. Now that we have created the tables required by the registry service, we can use the β€œExport” function to create the BACPAC file
    ![Bacpac Export UI](../images/bacpac-export.png)

From 2c59810168809d227c75588c28b9dce438260986 Mon Sep 17 00:00:00 2001
From: Xiaoyong Zhu
Date: Wed, 18 Jan 2023 00:29:04 -0800
Subject: [PATCH 05/15] fix comments

---
 docs/README.md                   | 4 ++--
 docs/quickstart_local_sandbox.md | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index 503435f5d..05f61f169 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -80,7 +80,7 @@ The easiest way to try out Feathr is to use the [Feathr Sandbox](https://feathr-
 
 ```bash
 # 80: Feathr UI 8000: Feathr API 8888: Jupyter 8080: VsCode 7080: Interpret
-docker run -it --rm -p 8888:8888 -p 8000:8000 -p 80:80 -p 8080:8080 -p 7080:7080 --env API_BASE="api/v1" --env FEATHR_SANDBOX=True -e GRANT_SUDO=yes feathrfeaturestore/feathr-sandbox
+docker run -it --rm -p 8888:8888 -p 8000:8000 -p 8081:80 -p 8080:8080 -p 7080:7080 --env API_BASE="api/v1" --env FEATHR_SANDBOX=True -e GRANT_SUDO=yes feathrfeaturestore/feathr-sandbox
 ```
 
 And you can view the default Jupyter notebook at:
@@ -91,7 +91,7 @@ http://localhost:8888/lab/workspaces/auto-w/tree/local_quickstart_notebook.ipynb
 ```
 
 After running the notebooks, all the features will be registered in the UI, and you can visit the Feathr UI at:
 
 ```bash
-http://localhost:80
+http://localhost:8081
 ```

diff --git a/docs/quickstart_local_sandbox.md b/docs/quickstart_local_sandbox.md
index 7b721ecbf..c964f744b 100644
--- a/docs/quickstart_local_sandbox.md
+++ b/docs/quickstart_local_sandbox.md
@@ -31,14 +31,14 @@ To get started using Feathr, simply run the command below. Note that the image is around 5GB so it might take a while to pull it from DockerHub.
 ```bash
 # 80: Feathr UI 8000: Feathr API 8888: Jupyter 8080: VsCode 7080: Interpret
-docker run -it --rm -p 8888:8888 -p 8000:8000 -p 80:80 -p 8080:8080 -p 7080:7080 --env API_BASE="api/v1" --env FEATHR_SANDBOX=True -e GRANT_SUDO=yes feathrfeaturestore/feathr-sandbox
+docker run -it --rm -p 8888:8888 -p 8000:8000 -p 8081:80 -p 8080:8080 -p 7080:7080 --env API_BASE="api/v1" --env FEATHR_SANDBOX=True -e GRANT_SUDO=yes feathrfeaturestore/feathr-sandbox
 ```
 
 It should pop up a Jupyter link at `http://127.0.0.1:8888/`. Double-click on the notebook file to start the Jupyter Notebook, and you should be able to see the Feathr sample notebook. Click the triangle button on the Jupyter notebook and the whole notebook will run locally.
 
-If you see some errors like below, simply change the `-p 80:80` part to `-p <new port>:80` so the Feathr UI will be redirected to the new port.
+If you see some errors like below, simply change the `-p 8081:80` part to `-p <new port>:80` so the Feathr UI will be redirected to the new port.
 
-`docker: Error response from daemon: driver failed programming external connectivity on endpoint hardcore_bose (ae107e924cddce6b942f96f2654369345b027ac82e5e44929a9f132e2af71746): Bind for 0.0.0.0:80 failed: port is already allocated.`
+`docker: Error response from daemon: driver failed programming external connectivity on endpoint hardcore_bose (ae107e924cddce6b942f96f2654369345b027ac82e5e44929a9f132e2af71746): Bind for 0.0.0.0:8081 failed: port is already allocated.`
 
 The default Jupyter notebook is here:
 ```bash
@@ -51,7 +51,7 @@ http://localhost:8888/lab/workspaces/auto-w/tree/local_quickstart_notebook.ipynb
 ```
 
 After running the notebooks, all the features will be registered in the UI, and you can visit the Feathr UI at:
 
 ```bash
-http://localhost:80
+http://localhost:8081
 ```

From 5c081839aeaec46e45ba0f42f3a585345bf4ded1 Mon Sep 17 00:00:00 2001
From: Xiaoyong Zhu
Date: Wed, 18 Jan 2023 01:14:53 -0800
Subject: [PATCH 06/15] Update _env_config_reader.py

---
 feathr_project/feathr/utils/_env_config_reader.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/feathr_project/feathr/utils/_env_config_reader.py b/feathr_project/feathr/utils/_env_config_reader.py
index fd4555433..a1975cfd6 100644
--- a/feathr_project/feathr/utils/_env_config_reader.py
+++ b/feathr_project/feathr/utils/_env_config_reader.py
@@ -52,8 +52,8 @@ def get(self, key: str, default: str = None) -> str:
             Feathr client's config value.
         """
         res_env = (self._get_variable_from_env(key) if self.use_env_vars else None)
-        res_file = (self._get_variable_from_file(key) if self.yaml_config else None)
-        res_keyvault = (self._get_variable_from_akv(key) if self.akv_name else None)
+        res_file = (self._get_variable_from_file(key) if self.yaml_config and res_env is None else None)
+        res_keyvault = (self._get_variable_from_akv(key) if self.akv_name and res_env is None and res_file is None else None)
 
         # rewrite the logic below to make sure:
         # First we have the order (i.e. res1 > res2 > res3 > default)
@@ -81,7 +81,7 @@ def get_from_env_or_akv(self, key: str) -> str:
             Feathr client's config value.
         """
         res_env = (self._get_variable_from_env(key) if self.use_env_vars else None)
-        res_keyvault = (self._get_variable_from_akv(key) if self.akv_name else None)
+        res_keyvault = (self._get_variable_from_akv(key) if self.akv_name and res_env is None else None)
 
         # rewrite the logic below to make sure:
         # First we have the order (i.e. res1 > res2 > res3 > default)
From cbc1e017acc84e6fc4ffff9b27ce1dc057318a6a Mon Sep 17 00:00:00 2001
From: Xiaoyong Zhu
Date: Wed, 18 Jan 2023 02:59:10 -0800
Subject: [PATCH 07/15] Update client.py

---
 feathr_project/feathr/client.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py
index 39c6202b9..eb5e4ca80 100644
--- a/feathr_project/feathr/client.py
+++ b/feathr_project/feathr/client.py
@@ -469,9 +469,9 @@ def _construct_redis_key(self, feature_table, key):
 
     def _str_to_bool(self, s: str, variable_name = None):
         """Define a function to convert a string to bool, since the Redis client sometimes requires a bool and sometimes a str
        """
-        if s.casefold() == 'True'.casefold() or s == True:
+        if (isinstance(s, str) and s.casefold() == 'True'.casefold()) or s == True:
             return True
-        elif s.casefold() == 'False'.casefold() or s == False:
+        elif (isinstance(s, str) and s.casefold() == 'False'.casefold()) or s == False:
             return False
         else:
             self.logger.warning(f'{s} is not a valid Bool value. Maybe you want to double check if it is set correctly for {variable_name}.')

From 9c4f3f73cfebcdcabf69c058fa80bd765e89f5ca Mon Sep 17 00:00:00 2001
From: james
Date: Wed, 18 Jan 2023 12:22:28 +0000
Subject: [PATCH 08/15] update

---
 FeathrSandbox.Dockerfile                      | 24 +++++++++++++++++++
 feathr-sandbox/feathr_init_script.py          |  5 ++--
 .../spark_provider/_localspark_submission.py  | 18 +++++++++++++-
 3 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/FeathrSandbox.Dockerfile b/FeathrSandbox.Dockerfile
index 219cf97af..1472ca250 100644
--- a/FeathrSandbox.Dockerfile
+++ b/FeathrSandbox.Dockerfile
@@ -10,8 +10,31 @@ RUN echo 'REACT_APP_API_ENDPOINT=http://localhost:8000' >> .env.production
 
 RUN npm install && npm run build
 
+
+# Stage: build the Feathr jar
+FROM gradle:7.6.0-jdk8 as gradle-build
+WORKDIR /usr/src/feathr
+
+# for folders, we need to specify the dest folder name
+# COPY feathr-compute/ ./feathr-compute/
+# COPY feathr-config/ ./feathr-config/
+# COPY feathr-data-models/ ./feathr-data-models/
+# COPY feathr-impl/ ./feathr-impl/
+# COPY gradle/ ./gradle/
+# COPY gradle.properties .
+# COPY gradlew .
+# COPY gradlew.bat .
+# COPY repositories.gradle .
+# COPY settings.gradle .
+# COPY ["feathr-compute/", "feathr-config/", "feathr-data-models/", "feathr-impl/", "gradle/","gradle.properties", "gradlew", "gradlew.bat", "build.gradle", "repositories.gradle", "settings.gradle", "./"]
+COPY . .
+RUN ./gradlew build
+
+
 FROM jupyter/pyspark-notebook
 
+
 USER root
 
 ## Install dependencies
@@ -53,6 +76,7 @@ USER jovyan
 # UID is like this: uid=1000(jovyan) gid=100(users) groups=100(users)
 COPY --chown=1000:100 ./docs/samples/local_quickstart_notebook.ipynb .
 COPY --chown=1000:100 ./feathr-sandbox/feathr_init_script.py .
+COPY --chown=1000:100 --from=gradle-build /usr/src/feathr/build/libs .
 
 # Run the script so that maven cache can be added for better experience. Otherwise users might have to wait for some time for the maven cache to be ready.
 RUN python feathr_init_script.py

diff --git a/feathr-sandbox/feathr_init_script.py b/feathr-sandbox/feathr_init_script.py
index 3e0d37d2b..dbc0c4c3b 100644
--- a/feathr-sandbox/feathr_init_script.py
+++ b/feathr-sandbox/feathr_init_script.py
@@ -18,7 +18,6 @@
 os.environ['SPARK_LOCAL_IP'] = "127.0.0.1"
 os.environ['REDIS_PASSWORD'] = "foobared" # default password for Redis
-
 yaml_config = f"""
 api_version: 1
 project_config:
@@ -30,7 +29,7 @@
   spark_result_output_parts: '1'
   local:
     master: 'local[*]'
-    feathr_runtime_location:
+    feathr_runtime_location: "./feathr_2.12-{feathr.__version__}.jar"
 
 online_store:
   redis:
@@ -44,6 +43,8 @@
   api_endpoint: "http://127.0.0.1:8000/api/v1"
 """
 
+print(yaml_config)
+
 tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
 with open(tmp.name, "w") as text_file:
     text_file.write(yaml_config)

diff --git a/feathr_project/feathr/spark_provider/_localspark_submission.py b/feathr_project/feathr/spark_provider/_localspark_submission.py
index e946f636b..012e5523c 100644
--- a/feathr_project/feathr/spark_provider/_localspark_submission.py
+++ b/feathr_project/feathr/spark_provider/_localspark_submission.py
@@ -106,7 +106,23 @@ def submit_feathr_job(
             print(python_files)
             spark_args.append(python_files[0])
         else:
-            spark_args.extend(["--class", main_class_name, main_jar_path])
+            if not python_files:
+                # This is a JAR job
+                # Azure Synapse/Livy doesn't allow a JAR job to start from Maven directly; we must have a jar file uploaded,
+                # so we have to use a dummy jar as the main file.
+                # Use the no-op jar as the main file
+                # This is a dummy jar which contains only one `org.example.Noop` class with one empty `main` function
+                # which does nothing
+                main_jar_path = main_jar_path
+                spark_args.extend(["--packages", maven_dependency, "--class", main_class_name, main_jar_path])
+            else:
+                spark_args.extend(["--packages", maven_dependency])
+                # This is a PySpark job, nothing more to do
+                if python_files.__len__() > 1:
+                    spark_args.extend(["--py-files", ",".join(python_files[1:])])
+                print(python_files)
+                spark_args.append(python_files[0])
+
 
         if arguments:
             spark_args.extend(arguments)

From c6d9fef4eee760fbe4f8d7d8ba7f0bbd5745784d Mon Sep 17 00:00:00 2001
From: Xiaoyong Zhu
Date: Wed, 18 Jan 2023 06:00:23 -0800
Subject: [PATCH 09/15] Update setup.py

---
 feathr_project/setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/feathr_project/setup.py b/feathr_project/setup.py
index d5f13b31c..98937933c 100644
--- a/feathr_project/setup.py
+++ b/feathr_project/setup.py
@@ -65,6 +65,7 @@
         "py4j<=0.10.9.7",
         "loguru<=0.6.0",
         "pandas",
+        "numpy<=1.20.3", # pin numpy due to pyspark's deprecated np.bool access
        "redis<=4.4.0",
        "requests<=2.28.1",
        "tqdm<=4.64.1",

From 55ed806a48b4ff343ffb93a6de3fb5e6fd0f16bb Mon Sep 17 00:00:00 2001
From: Xiaoyong Zhu
Date: Wed, 18 Jan 2023 18:59:46 -0800
Subject: [PATCH 10/15] Revert "update"

This reverts commit 9c4f3f73cfebcdcabf69c058fa80bd765e89f5ca.
---
 FeathrSandbox.Dockerfile             | 24 ------------------------
 feathr-sandbox/feathr_init_script.py |  5 ++---
 2 files changed, 2 insertions(+), 27 deletions(-)

diff --git a/FeathrSandbox.Dockerfile b/FeathrSandbox.Dockerfile
index 1472ca250..219cf97af 100644
--- a/FeathrSandbox.Dockerfile
+++ b/FeathrSandbox.Dockerfile
@@ -10,31 +10,8 @@ RUN echo 'REACT_APP_API_ENDPOINT=http://localhost:8000' >> .env.production
 
 RUN npm install && npm run build
 
-
-# Stage: build the Feathr jar
-FROM gradle:7.6.0-jdk8 as gradle-build
-WORKDIR /usr/src/feathr
-
-# for folders, we need to specify the dest folder name
-# COPY feathr-compute/ ./feathr-compute/
-# COPY feathr-config/ ./feathr-config/
-# COPY feathr-data-models/ ./feathr-data-models/
-# COPY feathr-impl/ ./feathr-impl/
-# COPY gradle/ ./gradle/
-# COPY gradle.properties .
-# COPY gradlew .
-# COPY gradlew.bat .
-# COPY repositories.gradle .
-# COPY settings.gradle .
-# COPY ["feathr-compute/", "feathr-config/", "feathr-data-models/", "feathr-impl/", "gradle/","gradle.properties", "gradlew", "gradlew.bat", "build.gradle", "repositories.gradle", "settings.gradle", "./"]
-COPY . .
-RUN ./gradlew build
-
-
 FROM jupyter/pyspark-notebook
-
-
 USER root
 
 ## Install dependencies
@@ -76,7 +53,6 @@ USER jovyan
 # UID is like this: uid=1000(jovyan) gid=100(users) groups=100(users)
 COPY --chown=1000:100 ./docs/samples/local_quickstart_notebook.ipynb .
 COPY --chown=1000:100 ./feathr-sandbox/feathr_init_script.py .
-COPY --chown=1000:100 --from=gradle-build /usr/src/feathr/build/libs .
 
 # Run the script so that maven cache can be added for better experience. Otherwise users might have to wait for some time for the maven cache to be ready.
 RUN python feathr_init_script.py

diff --git a/feathr-sandbox/feathr_init_script.py b/feathr-sandbox/feathr_init_script.py
index dbc0c4c3b..3e0d37d2b 100644
--- a/feathr-sandbox/feathr_init_script.py
+++ b/feathr-sandbox/feathr_init_script.py
@@ -18,6 +18,7 @@
 os.environ['SPARK_LOCAL_IP'] = "127.0.0.1"
 os.environ['REDIS_PASSWORD'] = "foobared" # default password for Redis
+
 yaml_config = f"""
 api_version: 1
 project_config:
@@ -29,7 +30,7 @@
   spark_result_output_parts: '1'
   local:
     master: 'local[*]'
-    feathr_runtime_location: "./feathr_2.12-{feathr.__version__}.jar"
+    feathr_runtime_location:
 
 online_store:
   redis:
@@ -43,8 +44,6 @@
   api_endpoint: "http://127.0.0.1:8000/api/v1"
 """
 
-print(yaml_config)
-
 tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
 with open(tmp.name, "w") as text_file:
     text_file.write(yaml_config)

From fa84161f944d10d03afd08c57df80e4fc3cee5a9 Mon Sep 17 00:00:00 2001
From: Xiaoyong Zhu
Date: Wed, 18 Jan 2023 19:31:13 -0800
Subject: [PATCH 11/15] Fix env bugs

---
 docs/samples/feature_embedding.ipynb           |  1 -
 feathr_project/feathr/client.py                |  4 +--
 .../spark_provider/_localspark_submission.py   |  1 -
 .../feathr/utils/_env_config_reader.py         | 18 ++++++-------
 .../test/unit/utils/test_env_config_reader.py  | 25 ++++++++-----------
 5 files changed, 20 insertions(+), 29 deletions(-)

diff --git a/docs/samples/feature_embedding.ipynb b/docs/samples/feature_embedding.ipynb
index 27498b1f5..ad58252dc 100644
--- a/docs/samples/feature_embedding.ipynb
+++ b/docs/samples/feature_embedding.ipynb
@@ -341,7 +341,6 @@
    "client = FeathrClient(\n",
    "    config_path=config_path,\n",
    "    credential=credential,\n",
-   "    use_env_vars=False,\n",
    ")"
   ]
  },

diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py
index eb5e4ca80..6fafe9560 100644
--- a/feathr_project/feathr/client.py
+++ b/feathr_project/feathr/client.py
@@ -65,7 +65,6 @@ def __init__(
         local_workspace_dir: str = None,
         credential: Any = None,
         project_registry_tag: Dict[str, str] = None,
-        use_env_vars: bool = True,
     ):
         """Initialize Feathr Client.
 
@@ -74,13 +73,12 @@ def __init__(
             local_workspace_dir (optional): Set where is the local work space dir. If not set, Feathr will create a temporary folder to store local workspace related files.
             credential (optional): Azure credential to access cloud resources, most likely to be the returned result of DefaultAzureCredential(). If not set, Feathr will initialize DefaultAzureCredential() inside the __init__ function to get credentials.
             project_registry_tag (optional): Adding tags for project in Feathr registry. This might be useful if you want to tag your project as deprecated, or allow certain customizations on project level. Default is empty
-            use_env_vars (optional): Whether to use environment variables to set up the client. If set to False, the client will not use environment variables to set up the client. Defaults to True.
         """
         self.logger = logging.getLogger(__name__)
         # Redis key separator
         self._KEY_SEPARATOR = ':'
         self._COMPOSITE_KEY_SEPARATOR = '#'
-        self.env_config = EnvConfigReader(config_path=config_path, use_env_vars=use_env_vars)
+        self.env_config = EnvConfigReader(config_path=config_path)
         if local_workspace_dir:
             self.local_workspace_dir = local_workspace_dir
         else:

diff --git a/feathr_project/feathr/spark_provider/_localspark_submission.py b/feathr_project/feathr/spark_provider/_localspark_submission.py
index 2c1a598e7..c609be945 100644
--- a/feathr_project/feathr/spark_provider/_localspark_submission.py
+++ b/feathr_project/feathr/spark_provider/_localspark_submission.py
@@ -86,7 +86,6 @@ def submit_feathr_job(
         spark_args = self._init_args(job_name=job_name, confs=cfg)
         # Add additional repositories
         spark_args.extend(["--repositories", "https://repository.mulesoft.org/nexus/content/repositories/public/,https://linkedin.jfrog.io/artifactory/open-source/"])
-        # spark_args.extend(["--repositories", "https://linkedin.jfrog.io/artifactory/open-source/"])
 
         if not main_jar_path:
             # We don't have the main jar, use Maven

diff --git a/feathr_project/feathr/utils/_env_config_reader.py b/feathr_project/feathr/utils/_env_config_reader.py
index a1975cfd6..ff0229861 100644
--- a/feathr_project/feathr/utils/_env_config_reader.py
+++ b/feathr_project/feathr/utils/_env_config_reader.py
@@ -11,17 +11,19 @@ class EnvConfigReader(object):
     """A utility class to read Feathr environment variables either from os environment variables, the config yaml file or Azure Key Vault.
-    If a key is set in the environment variable, ConfigReader will return the value of that environment variable
-    unless use_env_vars set to False.
+    It will retrieve the value in the following order:
+    - From the environment variable if the key is set in the os environment variables.
+    - From the config yaml file if the key exists.
+    - From the Azure Key Vault.
     """
     akv_name: str = None  # Azure Key Vault name to use for retrieving config values.
     yaml_config: dict = None  # YAML config file content.
 
-    def __init__(self, config_path: str, use_env_vars: bool = True):
+    def __init__(self, config_path: str):
         """Initialize the utility class.
 
         Args:
             config_path: Config file path.
-            use_env_vars (optional): Whether to use os environment variables instead of config file. Defaults to True.
""" if config_path is not None: config_path = Path(config_path) @@ -31,7 +33,6 @@ def __init__(self, config_path: str, use_env_vars: bool = True): except yaml.YAMLError as e: logger.warning(e) - self.use_env_vars = use_env_vars self.akv_name = self.get("secrets__azure_key_vault__name") self.akv_client = AzureKeyVaultClient(self.akv_name) if self.akv_name else None @@ -39,7 +40,7 @@ def __init__(self, config_path: str, use_env_vars: bool = True): def get(self, key: str, default: str = None) -> str: """Gets the Feathr config variable for the given key. It will retrieve the value in the following order: - - From the environment variable if `use_env_vars == True` and the key is set in the os environment variables. + - From the environment variable if the key is set in the os environment variables. - From the config yaml file if the key exists. - From the Azure Key Vault. If the key is not found in any of the above, it will return `default`. @@ -51,7 +52,7 @@ def get(self, key: str, default: str = None) -> str: Returns: Feathr client's config value. """ - res_env = (self._get_variable_from_env(key) if self.use_env_vars else None) + res_env = self._get_variable_from_env(key)) res_file = (self._get_variable_from_file(key) if self.yaml_config and res_env is None else None) res_keyvault = (self._get_variable_from_akv(key) if self.akv_name and res_env is None and res_file is None else None) @@ -67,8 +68,7 @@ def get(self, key: str, default: str = None) -> str: return default def get_from_env_or_akv(self, key: str) -> str: - """Gets the Feathr config variable for the given key. This function ignores `use_env_vars` attribute and force to - look up environment variables or Azure Key Vault. + """Gets the Feathr config variable for the given key. This function will look up environment variables or Azure Key Vault. It will retrieve the value in the following order: - From the environment variable if the key is set in the os environment variables. - From the Azure Key Vault. @@ -80,7 +80,7 @@ def get_from_env_or_akv(self, key: str) -> str: Returns: Feathr client's config value. """ - res_env = (self._get_variable_from_env(key) if self.use_env_vars else None) + res_env = self._get_variable_from_env(key) res_keyvault = (self._get_variable_from_akv(key) if self.akv_name and res_env is None else None) # rewrite the logic below to make sure: diff --git a/feathr_project/test/unit/utils/test_env_config_reader.py b/feathr_project/test/unit/utils/test_env_config_reader.py index 98e591808..14489e3cc 100644 --- a/feathr_project/test/unit/utils/test_env_config_reader.py +++ b/feathr_project/test/unit/utils/test_env_config_reader.py @@ -18,21 +18,18 @@ @pytest.mark.parametrize( - "use_env_vars, env_value, expected_value", + "env_value, expected_value", [ - (True, TEST_CONFIG_ENV_VAL, TEST_CONFIG_ENV_VAL), - (True, None, TEST_CONFIG_FILE_VAL), - (False, TEST_CONFIG_ENV_VAL, TEST_CONFIG_FILE_VAL), + (TEST_CONFIG_ENV_VAL, TEST_CONFIG_ENV_VAL), + ( None, TEST_CONFIG_FILE_VAL), ] ) def test__envvariableutil__get( mocker: MockerFixture, - use_env_vars: bool, env_value: str, expected_value: str, ): """Test `get` method if it returns the correct value - along with `use_env_vars` argument. 
""" if env_value: mocker.patch.object(feathr.utils._env_config_reader.os, "environ", {TEST_CONFIG_KEY: env_value}) @@ -40,29 +37,27 @@ def test__envvariableutil__get( f = NamedTemporaryFile(delete=True) f.write(TEST_CONFIG_FILE_CONTENT.encode()) f.seek(0) - env_config = EnvConfigReader(config_path=f.name, use_env_vars=use_env_vars) + env_config = EnvConfigReader(config_path=f.name) assert env_config.get(TEST_CONFIG_KEY) == expected_value @pytest.mark.parametrize( - "use_env_vars, env_value, expected_value", + "env_value, expected_value", [ - (True, TEST_CONFIG_ENV_VAL, TEST_CONFIG_ENV_VAL), - (True, None, None), - (False, TEST_CONFIG_ENV_VAL, TEST_CONFIG_ENV_VAL), + (TEST_CONFIG_ENV_VAL, TEST_CONFIG_ENV_VAL), + (None, None), + (TEST_CONFIG_ENV_VAL, TEST_CONFIG_ENV_VAL), ] ) def test__envvariableutil__get_from_env_or_akv( mocker: MockerFixture, - use_env_vars: bool, env_value: str, expected_value: str, ): - """Test `get_from_env_or_akv` method if it returns the environment variable regardless of `use_env_vars` argument. + """Test `get_from_env_or_akv` method if it returns the environment variable Args: mocker (MockerFixture): _description_ - use_env_vars (bool): _description_ env_value (str): _description_ expected_value (str): _description_ """ @@ -72,5 +67,5 @@ def test__envvariableutil__get_from_env_or_akv( f = NamedTemporaryFile(delete=True) f.write(TEST_CONFIG_FILE_CONTENT.encode()) f.seek(0) - env_config = EnvConfigReader(config_path=f.name, use_env_vars=use_env_vars) + env_config = EnvConfigReader(config_path=f.name) assert env_config.get_from_env_or_akv(TEST_CONFIG_KEY) == expected_value From a7d445034bc9410df5772fda93d93b339a47fabc Mon Sep 17 00:00:00 2001 From: Xiaoyong Zhu Date: Wed, 18 Jan 2023 21:28:08 -0800 Subject: [PATCH 12/15] Update _env_config_reader.py --- feathr_project/feathr/utils/_env_config_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feathr_project/feathr/utils/_env_config_reader.py b/feathr_project/feathr/utils/_env_config_reader.py index ff0229861..bf4a4ae8d 100644 --- a/feathr_project/feathr/utils/_env_config_reader.py +++ b/feathr_project/feathr/utils/_env_config_reader.py @@ -52,7 +52,7 @@ def get(self, key: str, default: str = None) -> str: Returns: Feathr client's config value. 
""" - res_env = self._get_variable_from_env(key)) + res_env = self._get_variable_from_env(key) res_file = (self._get_variable_from_file(key) if self.yaml_config and res_env is None else None) res_keyvault = (self._get_variable_from_akv(key) if self.akv_name and res_env is None and res_file is None else None) From cb90ca355a76b6cf8c18efecd39a0936a92c1cd1 Mon Sep 17 00:00:00 2001 From: Xiaoyong Zhu Date: Wed, 18 Jan 2023 23:42:28 -0800 Subject: [PATCH 13/15] Update pull_request_push_test.yml --- .github/workflows/pull_request_push_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pull_request_push_test.yml b/.github/workflows/pull_request_push_test.yml index 670e16a0f..3cf4c8dc3 100644 --- a/.github/workflows/pull_request_push_test.yml +++ b/.github/workflows/pull_request_push_test.yml @@ -108,7 +108,7 @@ jobs: SPARK_CONFIG__SPARK_CLUSTER: databricks SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL: ${{secrets.DATABRICKS_HOST}} DATABRICKS_WORKSPACE_TOKEN_VALUE: ${{secrets.DATABRICKS_WORKSPACE_TOKEN_VALUE}} - SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE: '{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"${{secrets.DATABRICKS_INSTANCE_POOL_ID}}"},"libraries":[{"maven": {"coordinates": "com.azure.cosmos.spark:azure-cosmos-spark_3-1_2-12:4.16.0"}}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}' + SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE: '{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"${{secrets.DATABRICKS_INSTANCE_POOL_ID}}"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}' REDIS_PASSWORD: ${{secrets.REDIS_PASSWORD}} AZURE_CLIENT_ID: ${{secrets.AZURE_CLIENT_ID}} AZURE_TENANT_ID: ${{secrets.AZURE_TENANT_ID}} From 53ccf2af716126dc955954954cefc9d04e512daa Mon Sep 17 00:00:00 2001 From: Xiaoyong Zhu Date: Thu, 19 Jan 2023 00:55:24 -0800 Subject: [PATCH 14/15] revert changes --- .github/workflows/pull_request_push_test.yml | 2 +- build.gradle | 3 +-- feathr-impl/build.gradle | 2 -- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pull_request_push_test.yml b/.github/workflows/pull_request_push_test.yml index 3cf4c8dc3..5cb10fa1c 100644 --- a/.github/workflows/pull_request_push_test.yml +++ b/.github/workflows/pull_request_push_test.yml @@ -108,7 +108,7 @@ jobs: SPARK_CONFIG__SPARK_CLUSTER: databricks SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL: ${{secrets.DATABRICKS_HOST}} DATABRICKS_WORKSPACE_TOKEN_VALUE: ${{secrets.DATABRICKS_WORKSPACE_TOKEN_VALUE}} - SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE: '{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"${{secrets.DATABRICKS_INSTANCE_POOL_ID}}"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}' + SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE: 
'{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"${{secrets.DATABRICKS_INSTANCE_POOL_ID}}"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}' REDIS_PASSWORD: ${{secrets.REDIS_PASSWORD}} AZURE_CLIENT_ID: ${{secrets.AZURE_CLIENT_ID}} AZURE_TENANT_ID: ${{secrets.AZURE_TENANT_ID}} diff --git a/build.gradle b/build.gradle index b8c550258..19b9d62a7 100644 --- a/build.gradle +++ b/build.gradle @@ -73,7 +73,7 @@ dependencies { implementation 'net.snowflake:snowflake-jdbc:3.13.18' implementation 'net.snowflake:spark-snowflake_2.12:2.10.0-spark_3.2' provided 'com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21' - implementation 'com.azure.cosmos.spark:azure-cosmos-spark_3-1_2-12:4.16.0' + provided 'com.azure.cosmos.spark:azure-cosmos-spark_3-2_2-12:4.11.1' provided 'com.microsoft.sqlserver:mssql-jdbc:10.2.0.jre8' provided 'org.eclipse.jetty:jetty-util:9.3.24.v20180605' provided 'org.apache.kafka:kafka-clients:3.1.0' @@ -130,7 +130,6 @@ project.ext.spec = [ 'avro' : "org.apache.avro:avro:1.10.2", "avroUtil": "com.linkedin.avroutil1:helper-all:0.2.100", "azure": "com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21", - "spark_cosmos": "com.azure.cosmos.spark:azure-cosmos-spark_3-1_2-12:4.16.0", 'fastutil' : "it.unimi.dsi:fastutil:8.1.1", 'mvel' : "org.mvel:mvel2:2.2.8.Final", 'protobuf' : "com.google.protobuf:protobuf-java:2.6.1", diff --git a/feathr-impl/build.gradle b/feathr-impl/build.gradle index 7f9894727..055fcd4c0 100644 --- a/feathr-impl/build.gradle +++ b/feathr-impl/build.gradle @@ -48,7 +48,6 @@ dependencies { implementation spec.product.jackson.dataformat_hocon implementation spec.product.jackson.jackson_core implementation spec.product.spark_redis - implementation spec.product.spark_cosmos implementation spec.product.fastutil implementation spec.product.hadoop.mapreduce_client_core implementation spec.product.mvel @@ -78,7 +77,6 @@ dependencies { testImplementation spec.product.equalsverifier testImplementation spec.product.spark.spark_catalyst - testImplementation spec.product.spark_cosmos testImplementation spec.product.mockito testImplementation spec.product.scala.scalatest testImplementation spec.product.testing From 2f74271c362fc87d97e8bbff5d6cf12739e0c493 Mon Sep 17 00:00:00 2001 From: Xiaoyong Zhu Date: Thu, 19 Jan 2023 08:35:17 -0500 Subject: [PATCH 15/15] Update pull_request_push_test.yml --- .github/workflows/pull_request_push_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pull_request_push_test.yml b/.github/workflows/pull_request_push_test.yml index 5cb10fa1c..137a810ee 100644 --- a/.github/workflows/pull_request_push_test.yml +++ b/.github/workflows/pull_request_push_test.yml @@ -108,7 +108,7 @@ jobs: SPARK_CONFIG__SPARK_CLUSTER: databricks SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL: ${{secrets.DATABRICKS_HOST}} DATABRICKS_WORKSPACE_TOKEN_VALUE: ${{secrets.DATABRICKS_WORKSPACE_TOKEN_VALUE}} - SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE: '{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"${{secrets.DATABRICKS_INSTANCE_POOL_ID}}"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}' + SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE: 
'{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"11.3.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"${{secrets.DATABRICKS_INSTANCE_POOL_ID}}"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}' REDIS_PASSWORD: ${{secrets.REDIS_PASSWORD}} AZURE_CLIENT_ID: ${{secrets.AZURE_CLIENT_ID}} AZURE_TENANT_ID: ${{secrets.AZURE_TENANT_ID}} @@ -311,4 +311,4 @@ jobs: run: echo "NOW=$(date +'%Y-%m-%d')" >> $GITHUB_ENV - name: Notification run: | - curl -H 'Content-Type: application/json' -d '{"text": "${{env.NOW}} Daily Report: 1. Gradle Test ${{needs.gradle_test.result}}, 2. Python Lint Test ${{needs.python_lint.result}}, 3. Databricks Test ${{needs.databricks_test.result}}, 4. Synapse Test ${{needs.azure_synapse_test.result}} , 5. LOCAL SPARK TEST ${{needs.local_spark_test.result}}. Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"}' ${{ secrets.TEAMS_WEBHOOK }} \ No newline at end of file + curl -H 'Content-Type: application/json' -d '{"text": "${{env.NOW}} Daily Report: 1. Gradle Test ${{needs.gradle_test.result}}, 2. Python Lint Test ${{needs.python_lint.result}}, 3. Databricks Test ${{needs.databricks_test.result}}, 4. Synapse Test ${{needs.azure_synapse_test.result}} , 5. LOCAL SPARK TEST ${{needs.local_spark_test.result}}. Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"}' ${{ secrets.TEAMS_WEBHOOK }}