diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py
index 7c8784bda..3a94e0c73 100644
--- a/feathr_project/feathr/client.py
+++ b/feathr_project/feathr/client.py
@@ -245,7 +245,7 @@ def build_features(self, anchor_list: List[FeatureAnchor] = [], derived_feature_
 
         # Pretty print anchor_list
         if verbose and self.anchor_list:
-            FeaturePrinter.pretty_print_anchors(self.anchor_list)
+            FeaturePrinter.pretty_print_anchors(self.anchor_list)
 
     def list_registered_features(self, project_name: str = None) -> List[str]:
         """List all the already registered features under the given project.
diff --git a/feathr_project/feathr/registry/_feature_registry_purview.py b/feathr_project/feathr/registry/_feature_registry_purview.py
index ef30b7be0..884d49a94 100644
--- a/feathr_project/feathr/registry/_feature_registry_purview.py
+++ b/feathr_project/feathr/registry/_feature_registry_purview.py
@@ -357,7 +357,7 @@ def _add_all_derived_features(self, derived_features: List[DerivedFeature], ts:T
             ts (TopologicalSorter): a topological sorter by python
 
         Returns:
-            None. The topo sorter will maitain a static topo sorted order.
+            None. The topological sorter will maintain a static topologically sorted order.
         """
         # return if the list is empty
         if derived_features is None:
@@ -366,7 +366,7 @@
         for derived_feature in derived_features:
             # make sure the input is derived feature
             if isinstance(derived_feature, DerivedFeature):
-                # add this derived feature in the topo sort graph without any precessesors
+                # add this derived feature to the topological sort graph without any predecessors
                 # since regardless we need it
                 ts.add(derived_feature)
                 for input_feature in derived_feature.input_features:
@@ -394,12 +394,12 @@ def _parse_derived_features(self, derived_features: List[DerivedFeature]) -> Lis
         ts = TopologicalSorter()
         self._add_all_derived_features(derived_features, ts)
 
-        # topo sort the derived features to make sure that we can correctly refer to them later in the registry
+        # topologically sort the derived features to make sure that we can correctly refer to them later in the registry
         toposorted_derived_feature_list: List[DerivedFeature] = list(ts.static_order())
 
         for derived_feature in toposorted_derived_feature_list:
             # get the corresponding Atlas entity by searching feature name
-            # Since this list is topo sorted, so you can always find the corresponding name
+            # Since this list is topologically sorted, you can always find the corresponding name
             input_feature_entity_list: List[AtlasEntity] = [
                 self.global_feature_entity_dict[f.name] for f in derived_feature.input_features]
             key_list = []
@@ -713,9 +713,9 @@ def _create_project(self) -> UUID:
 
     def upload_single_entity_to_purview(self,entity:Union[AtlasEntity,AtlasProcess]):
         '''
-        Upload a single entity to purview, could be a process entity or atlasentity.
+        Upload a single entity to Purview; it can be a process entity or an AtlasEntity.
         Since this is used for migration existing project, ignore Atlas PreconditionFail (412)
-        If the eneity already exists, return the existing entity's GUID.
+        If the entity already exists, return the existing entity's GUID.
         Otherwise, return the new entity GUID.
         The entity itself will also be modified, fill the GUID with real GUID in Purview.
         In order to avoid having concurrency issue, and provide clear guidance, this method only allows entity uploading once at a time.
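Reviewer note on the topological-sort hunks above: `TopologicalSorter` is Python's standard `graphlib.TopologicalSorter`, and `static_order()` is what guarantees every input feature is processed before any derived feature that consumes it. A minimal, self-contained sketch of the pattern; the feature names and dependency graph below are made up for illustration, not taken from Feathr:

```python
from graphlib import TopologicalSorter

# Hypothetical dependency graph: each derived feature maps to the features it is built from.
dependencies = {
    "trip_distance_rounded": ["trip_distance"],         # built on a raw (anchored) feature
    "fare_per_mile": ["total_fare", "trip_distance"],   # built on two raw features
    "fare_per_mile_rounded": ["fare_per_mile"],         # built on another derived feature
}

ts = TopologicalSorter()
for feature, inputs in dependencies.items():
    # ts.add(node, *predecessors) registers the node even when it has no predecessors,
    # mirroring the unconditional ts.add(derived_feature) call in the hunk above.
    ts.add(feature, *inputs)

# static_order() yields every node only after all of its predecessors, so by the time a
# derived feature is handled, its inputs can already be looked up by name.
print(list(ts.static_order()))
# e.g. ['trip_distance', 'total_fare', 'trip_distance_rounded', 'fare_per_mile', 'fare_per_mile_rounded']
```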
@@ -809,7 +809,7 @@ def _create_anchor(self, s: FeatureAnchor) -> UUID:
 
         anchor_entity = self.purview_client.get_entity(anchor_id)['entities'][0]
 
-        # project contians anchor, anchor belongs to project.
+        # project contains anchor, anchor belongs to project.
         project_contains_anchor_relation = self._generate_relation_pairs(
             project_entity, anchor_entity, RELATION_CONTAINS)
         anchor_consumes_source_relation = self._generate_relation_pairs(
@@ -946,7 +946,7 @@ def _purge_feathr_registry(self):
 
     def _delete_all_feathr_types(self):
         """
-        Delete all the corresonding type definitions for feathr registry. For internal use only
+        Delete all the corresponding type definitions for the feathr registry. For internal use only
         """
 
         typedefs = self.purview_client.get_all_typedefs()
@@ -967,18 +967,18 @@ def _delete_all_feathr_types(self):
 
     def _delete_all_feathr_entities(self):
         """
-        Delete all the corresonding entity for feathr registry. For internal use only
+        Delete all the corresponding entities for the feathr registry. For internal use only
 
        :param guid: The guid or guids you want to remove.
        """
 
        # should not be large than this, otherwise the backend might throw out error
-        batch_delte_size = 100
+        batch_delete_size = 100
 
-        # use the `query` API so that it can return immediatelly (don't use the search_entity API as it will try to return all the results in a single request)
+        # use the `query` API so that it can return immediately (don't use the search_entity API as it will try to return all the results in a single request)
        while True:
            result = self.purview_client.discovery.query(
-                "feathr", limit=batch_delte_size)
+                "feathr", limit=batch_delete_size)
            logger.info("Total number of entities:",result['@search.count'] )
 
            # if no results, break:
@@ -987,7 +987,7 @@ def _delete_all_feathr_entities(self):
            entities = result['value']
            guid_list = [entity["id"] for entity in entities]
            self.purview_client.delete_entity(guid=guid_list)
-            logger.info("{} feathr entities deleted", batch_delte_size)
+            logger.info("{} feathr entities deleted", batch_delete_size)
            # sleep here, otherwise backend might throttle
            # process the next batch after sleep
            sleep(1)
@@ -1237,7 +1237,7 @@ def search_for_input_feature(elem, full_relations,full_entities):
                    name=source_entity["attributes"]["name"],
                    event_timestamp_column=source_entity["attributes"]["event_timestamp_column"],
                    timestamp_format=source_entity["attributes"]["timestamp_format"],
-                    preprocessing=self._correct_function_identation(source_entity["attributes"]["preprocessing"]),
+                    preprocessing=self._correct_function_indentation(source_entity["attributes"]["preprocessing"]),
                    path=source_entity["attributes"]["path"],
                    registry_tags=source_entity["attributes"]["tags"]
                ),
@@ -1264,9 +1264,9 @@ def search_input_anchor_features(self,derived_guids,feature_entity_guid_mapping)
        return result
 
-    def _correct_function_identation(self, user_func: str) -> str:
+    def _correct_function_indentation(self, user_func: str) -> str:
        """
-        The function read from registry might have the wrong identation. We need to correct those identations.
+        The function read from the registry might have the wrong indentation. We need to correct that indentation.
        More specifically, we are using the inspect module to copy the function body for UDF for further submission.
        In that case, there will be situations like this:
 
            def feathr_udf1(df)
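Reviewer note on `_correct_function_indentation`: the docstring above describes UDF source copied via the `inspect` module that keeps the indentation of its original nesting level. A rough, standalone illustration of that problem and one way to normalize it with `textwrap.dedent`; this is only a sketch of the general idea, not the registry's actual implementation:

```python
import inspect
import textwrap

def make_udf():
    # A UDF defined in a nested scope: inspect.getsource() returns it
    # with the enclosing function's indentation still attached.
    def feathr_udf1(df):
        return df
    return feathr_udf1

raw_source = inspect.getsource(make_udf())
print(raw_source)                    # every line carries four extra leading spaces

# textwrap.dedent strips the common leading whitespace, so the definition
# starts at column 0 and can be written out and submitted as-is.
print(textwrap.dedent(raw_source))
```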
@@ -1302,7 +1302,7 @@ def _get_source_by_guid(self, guid, entity_list) -> Source:
        return HdfsSource(name=source_entity["attributes"]["name"],
                        event_timestamp_column=source_entity["attributes"]["event_timestamp_column"],
                        timestamp_format=source_entity["attributes"]["timestamp_format"],
-                        preprocessing=self._correct_function_identation(source_entity["attributes"]["preprocessing"]),
+                        preprocessing=self._correct_function_indentation(source_entity["attributes"]["preprocessing"]),
                        path=source_entity["attributes"]["path"],
                        registry_tags=source_entity["attributes"]["tags"]
                        )
diff --git a/feathr_project/feathr/spark_provider/_databricks_submission.py b/feathr_project/feathr/spark_provider/_databricks_submission.py
index 8cb135e26..cc54260d7 100644
--- a/feathr_project/feathr/spark_provider/_databricks_submission.py
+++ b/feathr_project/feathr/spark_provider/_databricks_submission.py
@@ -1,25 +1,23 @@
-import base64
+import copy
 import json
 import os
 import time
-
 from collections import namedtuple
 from os.path import basename
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Union
 from urllib.parse import urlparse
 from urllib.request import urlopen
 
 import requests
+from databricks_cli.dbfs.api import DbfsApi
+from databricks_cli.runs.api import RunsApi
+from databricks_cli.sdk.api_client import ApiClient
+from feathr.constants import *
+from feathr.spark_provider._abc import SparkJobLauncher
 from loguru import logger
 from requests.structures import CaseInsensitiveDict
-from tqdm import tqdm
 
-from feathr.spark_provider._abc import SparkJobLauncher
-from feathr.constants import *
-from databricks_cli.dbfs.api import DbfsApi
-from databricks_cli.sdk.api_client import ApiClient
-from databricks_cli.runs.api import RunsApi
 
 class _FeathrDatabricksJobLauncher(SparkJobLauncher):
     """Class to interact with Databricks Spark cluster
@@ -137,9 +135,10 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name:
            # if the input is a string, load it directly
            submission_params = json.loads(self.config_template)
        else:
-            # otherwise users might have missed the quotes in the config.
-            submission_params = self.config_template
-            logger.warning("Databricks config template loaded in a non-string fashion. Please consider providing the config template in a string fashion.")
+            # otherwise users might have missed the quotes in the config, so treat it as a dict.
+            # Note that we need a deep copy here so that `self.config_template` stays unmodified:
+            # `submission_params` is changed later, and without the copy those changes would leak back into `self.config_template` and cause unexpected behavior.
+            submission_params = copy.deepcopy(self.config_template)
 
        submission_params['run_name'] = job_name
        if 'existing_cluster_id' not in submission_params:
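Reviewer note on the `copy.deepcopy` change above: the old branch shared the user's template dict with the per-job parameters, so later mutations of `submission_params` silently rewrote `self.config_template` for every subsequent submission. A minimal standalone sketch of that failure mode; the template contents below are invented for illustration:

```python
import copy

# A made-up stand-in for the user's Databricks config template.
config_template = {"run_name": "", "new_cluster": {"num_workers": 2}}

# Plain assignment shares the same underlying objects ...
submission_params = config_template
submission_params["run_name"] = "feathr_job_1"
print(config_template["run_name"])        # 'feathr_job_1' -- the template was silently rewritten

# ... whereas a deep copy keeps the template pristine for the next submission.
config_template = {"run_name": "", "new_cluster": {"num_workers": 2}}
submission_params = copy.deepcopy(config_template)
submission_params["run_name"] = "feathr_job_2"
submission_params["new_cluster"]["num_workers"] = 8
print(config_template)                    # {'run_name': '', 'new_cluster': {'num_workers': 2}}
```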
@@ -161,6 +160,8 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name:
            # this is a pyspark job. definition here: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--sparkpythontask
            # the first file is the pyspark driver code. we only need the driver code to execute pyspark
            param_and_file_dict = {"parameters": arguments, "python_file": self.upload_or_get_cloud_path(python_files[0])}
+            # indicates this is a pyspark job
+            # `setdefault` returns the existing "spark_python_task" value if the key is already present; otherwise it inserts "spark_python_task" with the value `param_and_file_dict`
            submission_params.setdefault('spark_python_task',param_and_file_dict)
        else:
            # this is a scala spark job
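Reviewer note on the `setdefault` call above: it keeps whatever `spark_python_task` the user already put in their config template and only falls back to the generated task when the key is absent. A small, self-contained demonstration; the dict contents are made up, not the real Databricks payload:

```python
generated_task = {"python_file": "dbfs:/generated/feathr_pyspark_driver.py",
                  "parameters": ["--job-name", "feathr_job"]}

# Case 1: the template has no "spark_python_task" -> setdefault inserts the generated one.
params = {"run_name": "feathr_job"}
params.setdefault("spark_python_task", generated_task)
print(params["spark_python_task"] is generated_task)     # True

# Case 2: the key is already present -> setdefault keeps it and ignores the second argument.
params = {"run_name": "feathr_job",
          "spark_python_task": {"python_file": "dbfs:/custom/driver.py"}}
params.setdefault("spark_python_task", generated_task)
print(params["spark_python_task"]["python_file"])         # 'dbfs:/custom/driver.py'
```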
@@ -256,4 +257,4 @@ def download_result(self, result_path: str, local_folder: str):
        if not result_path.startswith('dbfs'):
            raise RuntimeError('Currently only paths starting with dbfs is supported for downloading results from a databricks cluster. The path should start with \"dbfs:\" .')
 
-        DbfsApi(self.api_client).cp(recursive=True, overwrite=True, src=result_path, dst=local_folder)
\ No newline at end of file
+        DbfsApi(self.api_client).cp(recursive=True, overwrite=True, src=result_path, dst=local_folder)
diff --git a/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py b/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py
index 142ae8032..ca7114343 100644
--- a/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py
+++ b/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 from typing import List, Optional, Union
 import pickle
+from feathr.definition.anchor import FeatureAnchor
 from jinja2 import Template
 from feathr.definition.source import HdfsSource
 import ast
@@ -24,7 +25,7 @@ class _PreprocessingPyudfManager(object):
     """This class manages Pyspark UDF preprocessing related artifacts, like UDFs from users, the pyspark_client etc.
     """
     @staticmethod
-    def build_anchor_preprocessing_metadata(anchor_list, local_workspace_dir):
+    def build_anchor_preprocessing_metadata(anchor_list: List[FeatureAnchor], local_workspace_dir):
        """When the client build features, UDFs and features that need preprocessing will be stored as metadata.
        Those metadata will later be used when uploading the Pyspark jobs.
        """
@@ -35,8 +36,14 @@ def build_anchor_preprocessing_metadata(anchor_list, local_workspace_dir):
        # preprocessing for requested features.
        features_with_preprocessing = []
        client_udf_repo_path = os.path.join(local_workspace_dir, FEATHR_CLIENT_UDF_FILE_NAME)
+        metadata_path = os.path.join(local_workspace_dir, FEATHR_PYSPARK_METADATA)
+        pyspark_driver_path = os.path.join(local_workspace_dir, FEATHR_PYSPARK_DRIVER_FILE_NAME)
+
        # delete the file if it already exists to avoid caching previous results
-        os.remove(client_udf_repo_path) if os.path.exists(client_udf_repo_path) else None
+        for f in [client_udf_repo_path, metadata_path, pyspark_driver_path]:
+            if os.path.exists(f):
+                os.remove(f)
+
        for anchor in anchor_list:
            # only support batch source preprocessing for now.
            if not hasattr(anchor.source, "preprocessing"):
@@ -105,17 +112,11 @@ def persist_pyspark_udf_to_file(user_func, local_workspace_dir):
        client_udf_repo_path = os.path.join(local_workspace_dir, FEATHR_CLIENT_UDF_FILE_NAME)
 
        # the directory may actually not exist yet, so create the directory first
-        file_name_start = client_udf_repo_path.rfind("/")
-        if file_name_start > 0:
-            dir_name = client_udf_repo_path[:file_name_start]
-            Path(dir_name).mkdir(parents=True, exist_ok=True)
+        Path(local_workspace_dir).mkdir(parents=True, exist_ok=True)
 
-        if Path(client_udf_repo_path).is_file():
-            with open(client_udf_repo_path, "a") as handle:
-                print("".join(lines), file=handle)
-        else:
-            with open(client_udf_repo_path, "w") as handle:
-                print("".join(lines), file=handle)
+        # Append to the file, creating it if it doesn't exist
+        with open(client_udf_repo_path, "a+") as handle:
+            print("".join(lines), file=handle)
 
    @staticmethod
    def write_feature_names_to_udf_name_file(feature_names_to_func_mapping, local_workspace_dir):
@@ -134,7 +135,8 @@ def write_feature_names_to_udf_name_file(feature_names_to_func_mapping, local_wo
        new_file = tm.render(func_maps=feature_names_to_func_mapping)
 
        full_file_name = os.path.join(local_workspace_dir, FEATHR_CLIENT_UDF_FILE_NAME)
-        with open(full_file_name, "a") as text_file:
+        # Append to the file, creating it if it doesn't exist
+        with open(full_file_name, "a+") as text_file:
            print(new_file, file=text_file)
 
    @staticmethod
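Reviewer note on the two append-mode hunks above: they rely on `"a+"` creating a missing file and on `Path.mkdir(parents=True, exist_ok=True)` being safe to rerun, which is what lets the old `is_file()` branching go away. A tiny standalone check; the workspace and file names are made up:

```python
import tempfile
from pathlib import Path

# Stand-in for local_workspace_dir, created under a throwaway temp directory.
workspace = Path(tempfile.mkdtemp()) / "feathr" / "workspace"
# parents=True creates intermediate directories; exist_ok=True makes reruns a no-op.
workspace.mkdir(parents=True, exist_ok=True)

udf_file = workspace / "client_udf_repo.py"
# "a+" creates the file when it is missing and appends when it already exists,
# so no separate "w" branch is needed.
with open(udf_file, "a+") as handle:
    print("def feathr_udf1(df): ...", file=handle)
with open(udf_file, "a+") as handle:
    print("def feathr_udf2(df): ...", file=handle)

print(udf_file.read_text())   # both definitions, in the order they were appended
```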
@@ -158,7 +160,7 @@ def prepare_pyspark_udf_files(feature_names: List[str], local_workspace_dir):
        if not features_with_preprocessing:
            return py_udf_files
 
-        # Figure out if we need to preprocessing via UDFs for requested features.
+        # Figure out if we need to preprocess via UDFs for requested features.
        # Only if the requested features contain preprocessing logic, we will load Pyspark. Otherwise just use Scala
        # spark.
        has_py_udf_preprocessing = False
@@ -172,6 +174,7 @@ def prepare_pyspark_udf_files(feature_names: List[str], local_workspace_dir):
        client_udf_repo_path = os.path.join(local_workspace_dir, FEATHR_CLIENT_UDF_FILE_NAME)
        # write pyspark_driver_template_abs_path and then client_udf_repo_path
        filenames = [pyspark_driver_template_abs_path, client_udf_repo_path]
+
        with open(pyspark_driver_path, 'w') as outfile:
            for fname in filenames:
                with open(fname) as infile:
diff --git a/feathr_project/feathr/utils/_file_utils.py b/feathr_project/feathr/utils/_file_utils.py
index bdca9772f..c81208e79 100644
--- a/feathr_project/feathr/utils/_file_utils.py
+++ b/feathr_project/feathr/utils/_file_utils.py
@@ -8,9 +8,7 @@ def write_to_file(content: str, full_file_name: str):
        content: content to write into the file
        full_file_name: full file path
    """
-    file_name_start = full_file_name.rfind("/")
-    if file_name_start > 0:
-        dir_name = full_file_name[:file_name_start]
-        Path(dir_name).mkdir(parents=True, exist_ok=True)
+    dir_name = os.path.dirname(full_file_name)
+    Path(dir_name).mkdir(parents=True, exist_ok=True)
    with open(full_file_name, "w") as handle:
        print(content, file=handle)
\ No newline at end of file
diff --git a/src/main/scala/com/linkedin/feathr/offline/generation/outputProcessor/RedisOutputUtils.scala b/src/main/scala/com/linkedin/feathr/offline/generation/outputProcessor/RedisOutputUtils.scala
index 87fd9eab3..3c8a56f5b 100644
--- a/src/main/scala/com/linkedin/feathr/offline/generation/outputProcessor/RedisOutputUtils.scala
+++ b/src/main/scala/com/linkedin/feathr/offline/generation/outputProcessor/RedisOutputUtils.scala
@@ -24,7 +24,6 @@ object RedisOutputUtils {
    val outputKeyColumnName = "feature_key"
    val decoratedDf = encodedDf.withColumn(outputKeyColumnName, newColExpr)
      .drop(keyColumns: _*)
 
-    // set the host/post/auth/ssl configs in Redis again in the output directly
    // otherwise, in some environment (like databricks), the configs from the active spark session is not passed here.
    decoratedDf.write
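Reviewer note on the `_file_utils.py` hunk above: `os.path.dirname` also covers Windows-style separators and bare file names, which the old `rfind("/")` slicing did not. A small sketch of the resulting pattern; the helper name and example paths are hypothetical:

```python
import os
from pathlib import Path

def write_to_file_sketch(content: str, full_file_name: str) -> None:
    """Create the parent directory if needed, then write the file."""
    dir_name = os.path.dirname(full_file_name)   # '' when given a bare file name
    if dir_name:
        Path(dir_name).mkdir(parents=True, exist_ok=True)
    with open(full_file_name, "w") as handle:
        print(content, file=handle)

print(os.path.dirname("workspace/conf/feathr.conf"))    # 'workspace/conf'
print(os.path.dirname("feathr.conf"))                   # ''
# On Windows, 'workspace\\conf\\feathr.conf'.rfind("/") is -1, so the old code never
# created the directory; os.path.dirname handles that separator as well.
write_to_file_sketch("demo content", "feathr_demo_workspace/feathr.conf")
```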