From 9a25c14dd9e2d260945ef540a530b7df52ed9d2d Mon Sep 17 00:00:00 2001 From: Xiaoyong Zhu Date: Tue, 17 Jan 2023 19:20:49 -0800 Subject: [PATCH 1/4] Fixing env read issues and setup packages --- feathr_project/feathr/client.py | 12 ++++---- .../feathr/utils/_env_config_reader.py | 20 +++++++++---- feathr_project/setup.py | 30 ++++++++++--------- 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index ec22274fe..213713f65 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -71,7 +71,7 @@ def __init__( config_path (optional): Config yaml file path. See [Feathr Config Template](https://github.com/feathr-ai/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for more details. Defaults to "./feathr_config.yaml". local_workspace_dir (optional): Set where is the local work space dir. If not set, Feathr will create a temporary folder to store local workspace related files. credential (optional): Azure credential to access cloud resources, most likely to be the returned result of DefaultAzureCredential(). If not set, Feathr will initialize DefaultAzureCredential() inside the __init__ function to get credentials. - project_registry_tag (optional): Adding tags for project in Feathr registry. This might be useful if you want to tag your project as deprecated, or allow certain customizations on project leve. Default is empty + project_registry_tag (optional): Adding tags for project in Feathr registry. This might be useful if you want to tag your project as deprecated, or allow certain customizations on project level. Default is empty use_env_vars (optional): Whether to use environment variables to set up the client. If set to False, the client will not use environment variables to set up the client. Defaults to True. 
""" self.logger = logging.getLogger(__name__) @@ -459,7 +459,7 @@ def _construct_redis_key(self, feature_table, key): key = self._COMPOSITE_KEY_SEPARATOR.join(key) return feature_table + self._KEY_SEPARATOR + key - def _str_to_bool(self, s): + def _str_to_bool(self, s, variable_name = None): """Define a function to detect convert string to bool, since Redis client sometimes require a bool and sometimes require a str """ if s == 'True' or s == True: @@ -467,7 +467,8 @@ def _str_to_bool(self, s): elif s == 'False' or s == False: return False else: - raise ValueError # evil ValueError that doesn't tell you what the wrong value was + self.logger.warning(f'{s} is not a valid Bool value. Maybe you want to double check if it is set correctly for {variable_name}.') + return s def _construct_redis_client(self): """Constructs the Redis client. The host, port, credential and other parameters can be set via environment @@ -477,13 +478,12 @@ def _construct_redis_client(self): host = self.redis_host port = self.redis_port ssl_enabled = self.redis_ssl_enabled - redis_client = redis.Redis( + self.redis_client = redis.Redis( host=host, port=port, password=password, - ssl=self._str_to_bool(ssl_enabled)) + ssl=self._str_to_bool(ssl_enabled, "ssl_enabled")) self.logger.info('Redis connection is successful and completed.') - self.redis_client = redis_client def get_offline_features(self, diff --git a/feathr_project/feathr/utils/_env_config_reader.py b/feathr_project/feathr/utils/_env_config_reader.py index 25f51be04..fd4555433 100644 --- a/feathr_project/feathr/utils/_env_config_reader.py +++ b/feathr_project/feathr/utils/_env_config_reader.py @@ -62,7 +62,7 @@ def get(self, key: str, default: str = None) -> str: if res is not None: return res - logger.info(f"Config {key} is not found in the environment variable, configuration file, or the remote key value store.") + logger.info(f"Config {key} is not found in the environment variable, configuration file, or the remote key value store. 
Using default value which is {default}.") return default @@ -80,12 +80,20 @@ def get_from_env_or_akv(self, key: str) -> str: Returns: Feathr client's config value. """ - conf_var = ( - self._get_variable_from_env(key) or - (self._get_variable_from_akv(key) if self.akv_name else None) - ) + res_env = (self._get_variable_from_env(key) if self.use_env_vars else None) + res_keyvault = (self._get_variable_from_akv(key) if self.akv_name else None) + + # rewrite the logic below to make sure: + # First we have the order (i.e. res1 > res2 > res3 > default) + # Also previously we use OR for the result, which will yield a bug where say res1=None, res2=False, res3=None. Using OR will result to None result, although res2 actually have value + for res in [res_env, res_keyvault]: + if res is not None: + return res + + logger.warning(f"Config {key} is not found in the environment variable or the remote key value store.") + return None + - return conf_var def _get_variable_from_env(self, key: str) -> str: # make it work for lower case and upper case. diff --git a/feathr_project/setup.py b/feathr_project/setup.py index 2e741860e..d9c7c178a 100644 --- a/feathr_project/setup.py +++ b/feathr_project/setup.py @@ -61,26 +61,28 @@ include_package_data=True, # consider install_requires=[ - "click<=8.1.3", - "py4j<=0.10.9.7", - "loguru<=0.6.0", + # packages below are not critical and we are using its basic functions, so any version should be fine + "click", + "py4j", + "loguru", "pandas", - "redis<=4.4.0", - "requests<=2.28.1", - "tqdm<=4.64.1", + "redis", + "requests", + "tqdm", + "graphlib_backport", + "pyyaml", + "Jinja2", + "databricks-cli", + "pyhocon", + "pandavro", + "pyarrow", + "python-snappy", + # we do have some sort of requirements for the packages below. 
"pyapacheatlas<=0.14.0", - "pyhocon<=0.3.59", - "pandavro<=1.7.1", - "pyyaml<=6.0", - "Jinja2<=3.1.2", - "pyarrow<=9.0.0", "pyspark>=3.1.2", - "python-snappy<=0.6.1", "deltalake>=0.6.2", - "graphlib_backport<=1.0.3", "protobuf<=3.19.4,>=3.0.0", "confluent-kafka<=1.9.2", - "databricks-cli<=0.17.3", "avro<=1.11.1", "azure-storage-file-datalake<=12.5.0", "azure-synapse-spark", From 4643cafc6c1d351a82d05014dac1233180f07d2e Mon Sep 17 00:00:00 2001 From: Xiaoyong Zhu Date: Tue, 17 Jan 2023 22:00:30 -0800 Subject: [PATCH 2/4] revert changes in version fix --- feathr_project/setup.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/feathr_project/setup.py b/feathr_project/setup.py index d9c7c178a..d5f13b31c 100644 --- a/feathr_project/setup.py +++ b/feathr_project/setup.py @@ -61,28 +61,26 @@ include_package_data=True, # consider install_requires=[ - # packages below are not critical and we are using its basic functions, so any version should be fine - "click", - "py4j", - "loguru", + "click<=8.1.3", + "py4j<=0.10.9.7", + "loguru<=0.6.0", "pandas", - "redis", - "requests", - "tqdm", - "graphlib_backport", - "pyyaml", - "Jinja2", - "databricks-cli", - "pyhocon", - "pandavro", - "pyarrow", - "python-snappy", - # we do have some sort of requirements for the packages below. 
+ "redis<=4.4.0", + "requests<=2.28.1", + "tqdm<=4.64.1", "pyapacheatlas<=0.14.0", + "pyhocon<=0.3.59", + "pandavro<=1.7.1", + "pyyaml<=6.0", + "Jinja2<=3.1.2", + "pyarrow<=9.0.0", "pyspark>=3.1.2", + "python-snappy<=0.6.1", "deltalake>=0.6.2", + "graphlib_backport<=1.0.3", "protobuf<=3.19.4,>=3.0.0", "confluent-kafka<=1.9.2", + "databricks-cli<=0.17.3", "avro<=1.11.1", "azure-storage-file-datalake<=12.5.0", "azure-synapse-spark", @@ -115,4 +113,4 @@ "Operating System :: OS Independent", ], python_requires=">=3.7" -) +) \ No newline at end of file From f840ab4793104023c6b0dc5e4f948203dce36988 Mon Sep 17 00:00:00 2001 From: Xiaoyong Zhu Date: Wed, 18 Jan 2023 00:45:23 -0800 Subject: [PATCH 3/4] Decouple Redis from Feathr client --- feathr_project/feathr/client.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index 213713f65..f195ea683 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -94,13 +94,15 @@ def __init__( self.project_name = self.env_config.get( 'project_config__project_name') - # Redis configs - self.redis_host = self.env_config.get( - 'online_store__redis__host') - self.redis_port = self.env_config.get( - 'online_store__redis__port') - self.redis_ssl_enabled = self.env_config.get( - 'online_store__redis__ssl_enabled') + # Redis configs. This is optional unless users have configured Redis host. 
+ if self.env_config.get('online_store__redis__host'): + self.redis_host = self.env_config.get( + 'online_store__redis__host') + self.redis_port = self.env_config.get( + 'online_store__redis__port') + self.redis_ssl_enabled = self.env_config.get( + 'online_store__redis__ssl_enabled') + self._construct_redis_client() # Offline store enabled configs; false by default self.s3_enabled = self.env_config.get( @@ -182,7 +184,6 @@ def __init__( master = self.env_config.get('spark_config__local__master') ) - self._construct_redis_client() self.secret_names = [] @@ -459,12 +460,12 @@ def _construct_redis_key(self, feature_table, key): key = self._COMPOSITE_KEY_SEPARATOR.join(key) return feature_table + self._KEY_SEPARATOR + key - def _str_to_bool(self, s, variable_name = None): + def _str_to_bool(self, s: str, variable_name = None): """Define a function to detect convert string to bool, since Redis client sometimes require a bool and sometimes require a str """ - if s == 'True' or s == True: + if s.casefold() == 'True'.casefold() or s == True: return True - elif s == 'False' or s == False: + elif s.casefold() == 'False'.casefold() or s == False: return False else: self.logger.warning(f'{s} is not a valid Bool value. 
Maybe you want to double check if it is set correctly for {variable_name}.') From 6d72fddbbe0627af2360dbdf7ce0262c022cc801 Mon Sep 17 00:00:00 2001 From: Xiaoyong Zhu Date: Wed, 18 Jan 2023 00:48:22 -0800 Subject: [PATCH 4/4] Update client.py --- feathr_project/feathr/client.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index f195ea683..39c6202b9 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -39,6 +39,8 @@ from feathr.utils.feature_printer import FeaturePrinter from feathr.utils.spark_job_params import FeatureGenerationJobParams, FeatureJoinJobParams from feathr.version import get_version +import importlib.util + class FeathrClient(object): @@ -96,6 +98,10 @@ def __init__( # Redis configs. This is optional unless users have configured Redis host. if self.env_config.get('online_store__redis__host'): + # For illustrative purposes. + spec = importlib.util.find_spec("redis") + if spec is None: + self.logger.warning('You have configured Redis host, but there is no local Redis client package. Install the package using "pip install redis". ') self.redis_host = self.env_config.get( 'online_store__redis__host') self.redis_port = self.env_config.get(