Merged
2 changes: 1 addition & 1 deletion feathr_project/feathr/client.py
@@ -245,7 +245,7 @@ def build_features(self, anchor_list: List[FeatureAnchor] = [], derived_feature_

# Pretty print anchor_list
if verbose and self.anchor_list:
FeaturePrinter.pretty_print_anchors(self.anchor_list)
FeaturePrinter.pretty_print_anchors(self.anchor_list)

def list_registered_features(self, project_name: str = None) -> List[str]:
"""List all the already registered features under the given project.
34 changes: 17 additions & 17 deletions feathr_project/feathr/registry/_feature_registry_purview.py
@@ -357,7 +357,7 @@ def _add_all_derived_features(self, derived_features: List[DerivedFeature], ts:T
ts (TopologicalSorter): a topological sorter by python

Returns:
None. The topo sorter will maitain a static topo sorted order.
None. The topological sorter will maintain a static topological sorted order.
"""
# return if the list is empty
if derived_features is None:
@@ -366,7 +366,7 @@ def _add_all_derived_features(self, derived_features: List[DerivedFeature], ts:T
for derived_feature in derived_features:
# make sure the input is derived feature
if isinstance(derived_feature, DerivedFeature):
# add this derived feature in the topo sort graph without any precessesors
# add this derived feature in the topological sort graph without any predecessors
# since regardless we need it
ts.add(derived_feature)
for input_feature in derived_feature.input_features:
@@ -394,12 +394,12 @@ def _parse_derived_features(self, derived_features: List[DerivedFeature]) -> Lis
ts = TopologicalSorter()

self._add_all_derived_features(derived_features, ts)
# topo sort the derived features to make sure that we can correctly refer to them later in the registry
# topological sort the derived features to make sure that we can correctly refer to them later in the registry
toposorted_derived_feature_list: List[DerivedFeature] = list(ts.static_order())

for derived_feature in toposorted_derived_feature_list:
# get the corresponding Atlas entity by searching feature name
# Since this list is topo sorted, so you can always find the corresponding name
# Since this list is topological sorted, so you can always find the corresponding name
input_feature_entity_list: List[AtlasEntity] = [
self.global_feature_entity_dict[f.name] for f in derived_feature.input_features]
key_list = []
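The two hunks above rely on Python's standard-library topological sorter: each derived feature is added as a node with its input features as predecessors, and static_order() then yields a feature only after everything it is derived from. A minimal standalone sketch of that behavior (the feature names are hypothetical, not taken from this PR):

from graphlib import TopologicalSorter  # Python 3.9+

ts = TopologicalSorter()
# Register each derived feature, declaring its inputs as predecessors.
ts.add("derived_feature_b", "anchor_feature_a")    # derived from an anchored feature
ts.add("derived_feature_c", "derived_feature_b")   # derived from another derived feature

# static_order() returns every node after all of its predecessors, which is why the
# registry code can always look up the Atlas entity for each input feature by name.
print(list(ts.static_order()))
# ['anchor_feature_a', 'derived_feature_b', 'derived_feature_c']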
@@ -713,9 +713,9 @@ def _create_project(self) -> UUID:

def upload_single_entity_to_purview(self,entity:Union[AtlasEntity,AtlasProcess]):
'''
Upload a single entity to purview, could be a process entity or atlasentity.
Upload a single entity to purview, could be a process entity or AtlasEntity.
Since this is used for migration existing project, ignore Atlas PreconditionFail (412)
If the eneity already exists, return the existing entity's GUID.
If the entity already exists, return the existing entity's GUID.
Otherwise, return the new entity GUID.
The entity itself will also be modified, fill the GUID with real GUID in Purview.
In order to avoid having concurrency issue, and provide clear guidance, this method only allows entity uploading once at a time.
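The docstring above describes upsert-like behavior: attempt the upload, treat Atlas's 412 PreconditionFail as "entity already exists", and return the existing entity's GUID instead of failing. A self-contained sketch of that control flow (the names and the in-memory store are hypothetical stand-ins for the real Purview client calls):

class PreconditionFailedError(Exception):
    """Stands in for the Atlas 412 PreconditionFail response."""

_registry = {"feathr_project__source_a": "guid-123"}   # pretend this entity already exists

def _create_entity(qualified_name: str) -> str:
    if qualified_name in _registry:
        raise PreconditionFailedError(qualified_name)
    _registry[qualified_name] = f"guid-{len(_registry) + 1}"
    return _registry[qualified_name]

def upload_single_entity(qualified_name: str) -> str:
    try:
        return _create_entity(qualified_name)     # new entity: fresh GUID
    except PreconditionFailedError:
        return _registry[qualified_name]          # already registered: reuse its GUID

print(upload_single_entity("feathr_project__source_a"))   # guid-123
print(upload_single_entity("feathr_project__source_b"))   # guid-2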
@@ -809,7 +809,7 @@ def _create_anchor(self, s: FeatureAnchor) -> UUID:
anchor_entity = self.purview_client.get_entity(anchor_id)['entities'][0]


# project contians anchor, anchor belongs to project.
# project contains anchor, anchor belongs to project.
project_contains_anchor_relation = self._generate_relation_pairs(
project_entity, anchor_entity, RELATION_CONTAINS)
anchor_consumes_source_relation = self._generate_relation_pairs(
@@ -946,7 +946,7 @@ def _purge_feathr_registry(self):

def _delete_all_feathr_types(self):
"""
Delete all the corresonding type definitions for feathr registry. For internal use only
Delete all the corresponding type definitions for feathr registry. For internal use only
"""
typedefs = self.purview_client.get_all_typedefs()

@@ -967,18 +967,18 @@ def _delete_all_feathr_types(self):

def _delete_all_feathr_entities(self):
"""
Delete all the corresonding entity for feathr registry. For internal use only
Delete all the corresponding entity for feathr registry. For internal use only

:param guid: The guid or guids you want to remove.
"""
# should not be large than this, otherwise the backend might throw out error
batch_delte_size = 100
batch_delete_size = 100

# use the `query` API so that it can return immediatelly (don't use the search_entity API as it will try to return all the results in a single request)
# use the `query` API so that it can return immediately (don't use the search_entity API as it will try to return all the results in a single request)

while True:
result = self.purview_client.discovery.query(
"feathr", limit=batch_delte_size)
"feathr", limit=batch_delete_size)
logger.info("Total number of entities:",result['@search.count'] )

# if no results, break:
@@ -987,7 +987,7 @@ def _delete_all_feathr_entities(self):
entities = result['value']
guid_list = [entity["id"] for entity in entities]
self.purview_client.delete_entity(guid=guid_list)
logger.info("{} feathr entities deleted", batch_delte_size)
logger.info("{} feathr entities deleted", batch_delete_size)
# sleep here, otherwise backend might throttle
# process the next batch after sleep
sleep(1)
@@ -1237,7 +1237,7 @@ def search_for_input_feature(elem, full_relations,full_entities):
name=source_entity["attributes"]["name"],
event_timestamp_column=source_entity["attributes"]["event_timestamp_column"],
timestamp_format=source_entity["attributes"]["timestamp_format"],
preprocessing=self._correct_function_identation(source_entity["attributes"]["preprocessing"]),
preprocessing=self._correct_function_indentation(source_entity["attributes"]["preprocessing"]),
path=source_entity["attributes"]["path"],
registry_tags=source_entity["attributes"]["tags"]
),
@@ -1264,9 +1264,9 @@ def search_input_anchor_features(self,derived_guids,feature_entity_guid_mapping)
return result


def _correct_function_identation(self, user_func: str) -> str:
def _correct_function_indentation(self, user_func: str) -> str:
"""
The function read from registry might have the wrong identation. We need to correct those identations.
The function read from registry might have the wrong indentation. We need to correct those indentation.
More specifically, we are using the inspect module to copy the function body for UDF for further submission. In that case, there will be situations like this:

def feathr_udf1(df)
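The docstring is truncated here, but the scenario is that inspect copies a UDF together with the indentation of the scope it was defined in, so the source read back from the registry may not start at column 0. One way to normalize it, shown only as a sketch under the assumption that a plain dedent is enough (this is not necessarily the exact logic inside _correct_function_indentation):

import textwrap

# A UDF as it might come back from the registry, still carrying the indentation of
# the enclosing scope it was captured from (the UDF body itself is hypothetical).
indented_src = (
    "    def feathr_udf1(df):\n"
    "        return df.withColumn('fare_amount_cents', df.fare_amount * 100)\n"
)

# Strip the common leading whitespace so the function can be written into the
# generated PySpark driver file as top-level code.
print(textwrap.dedent(indented_src))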
@@ -1302,7 +1302,7 @@ def _get_source_by_guid(self, guid, entity_list) -> Source:
return HdfsSource(name=source_entity["attributes"]["name"],
event_timestamp_column=source_entity["attributes"]["event_timestamp_column"],
timestamp_format=source_entity["attributes"]["timestamp_format"],
preprocessing=self._correct_function_identation(source_entity["attributes"]["preprocessing"]),
preprocessing=self._correct_function_indentation(source_entity["attributes"]["preprocessing"]),
path=source_entity["attributes"]["path"],
registry_tags=source_entity["attributes"]["tags"]
)
27 changes: 14 additions & 13 deletions feathr_project/feathr/spark_provider/_databricks_submission.py
@@ -1,25 +1,23 @@
import base64
import copy
import json
import os
import time

from collections import namedtuple
from os.path import basename
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Union
from urllib.parse import urlparse
from urllib.request import urlopen

import requests
from databricks_cli.dbfs.api import DbfsApi
from databricks_cli.runs.api import RunsApi
from databricks_cli.sdk.api_client import ApiClient
from feathr.constants import *
from feathr.spark_provider._abc import SparkJobLauncher
from loguru import logger
from requests.structures import CaseInsensitiveDict
from tqdm import tqdm

from feathr.spark_provider._abc import SparkJobLauncher
from feathr.constants import *
from databricks_cli.dbfs.api import DbfsApi
from databricks_cli.sdk.api_client import ApiClient
from databricks_cli.runs.api import RunsApi

class _FeathrDatabricksJobLauncher(SparkJobLauncher):
"""Class to interact with Databricks Spark cluster
@@ -137,9 +135,10 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name:
# if the input is a string, load it directly
submission_params = json.loads(self.config_template)
else:
# otherwise users might have missed the quotes in the config.
submission_params = self.config_template
logger.warning("Databricks config template loaded in a non-string fashion. Please consider providing the config template in a string fashion.")
# otherwise users might have missed the quotes in the config. Treat them as dict
# Note that we need to use deep copy here, in order to make `self.config_template` immutable
# Otherwise, since we need to change submission_params later, which will modify `self.config_template` and cause unexpected behaviors
submission_params = copy.deepcopy(self.config_template)

submission_params['run_name'] = job_name
if 'existing_cluster_id' not in submission_params:
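The replacement comment explains why copy.deepcopy is needed: assigning the template directly would alias it, so the per-job mutations below (run_name, cluster settings, task definitions) would leak back into self.config_template and affect later submissions. A small self-contained illustration of that aliasing problem:

import copy

config_template = {"new_cluster": {"num_workers": 2}}

# Plain assignment aliases the template, so mutating the job params pollutes it.
params = config_template
params["run_name"] = "job_1"
print("run_name" in config_template)   # True -- the shared template was modified

# deepcopy keeps the template pristine for the next submission.
config_template = {"new_cluster": {"num_workers": 2}}
params = copy.deepcopy(config_template)
params["run_name"] = "job_2"
print("run_name" in config_template)   # False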
@@ -161,6 +160,8 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name:
# this is a pyspark job. definition here: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--sparkpythontask
# the first file is the pyspark driver code. we only need the driver code to execute pyspark
param_and_file_dict = {"parameters": arguments, "python_file": self.upload_or_get_cloud_path(python_files[0])}
# indicates this is a pyspark job
# `setdefault` method will get the value of the "spark_python_task" item, if the "spark_python_task" item does not exist, insert "spark_python_task" with the value "param_and_file_dict":
submission_params.setdefault('spark_python_task',param_and_file_dict)
else:
# this is a scala spark job
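The new comment describes dict.setdefault: it inserts spark_python_task only when the key is absent, so a task already present in a user-supplied config template wins over the generated one. For example (the values are hypothetical):

params = {"run_name": "demo_job"}
task = {"python_file": "dbfs:/feathr_pyspark_driver.py", "parameters": []}

params.setdefault("spark_python_task", task)
print(params["spark_python_task"] is task)   # True -- key was missing, so it was inserted

# A second call with a different value is a no-op because the key now exists.
params.setdefault("spark_python_task", {"python_file": "dbfs:/other.py"})
print(params["spark_python_task"] is task)   # True -- original value kept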
@@ -256,4 +257,4 @@ def download_result(self, result_path: str, local_folder: str):
if not result_path.startswith('dbfs'):
raise RuntimeError('Currently only paths starting with dbfs is supported for downloading results from a databricks cluster. The path should start with \"dbfs:\" .')

DbfsApi(self.api_client).cp(recursive=True, overwrite=True, src=result_path, dst=local_folder)
DbfsApi(self.api_client).cp(recursive=True, overwrite=True, src=result_path, dst=local_folder)
31 changes: 17 additions & 14 deletions feathr_project/feathr/udf/_preprocessing_pyudf_manager.py
@@ -3,6 +3,7 @@
from pathlib import Path
from typing import List, Optional, Union
import pickle
from feathr.definition.anchor import FeatureAnchor
from jinja2 import Template
from feathr.definition.source import HdfsSource
import ast
@@ -24,7 +25,7 @@ class _PreprocessingPyudfManager(object):
"""This class manages Pyspark UDF preprocessing related artifacts, like UDFs from users, the pyspark_client etc.
"""
@staticmethod
def build_anchor_preprocessing_metadata(anchor_list, local_workspace_dir):
def build_anchor_preprocessing_metadata(anchor_list: List[FeatureAnchor], local_workspace_dir):
"""When the client build features, UDFs and features that need preprocessing will be stored as metadata. Those
metadata will later be used when uploading the Pyspark jobs.
"""
@@ -35,8 +36,14 @@ def build_anchor_preprocessing_metadata(anchor_list, local_workspace_dir):
# preprocessing for requested features.
features_with_preprocessing = []
client_udf_repo_path = os.path.join(local_workspace_dir, FEATHR_CLIENT_UDF_FILE_NAME)
metadata_path = os.path.join(local_workspace_dir, FEATHR_PYSPARK_METADATA)
pyspark_driver_path = os.path.join(local_workspace_dir, FEATHR_PYSPARK_DRIVER_FILE_NAME)

# delete the file if it already exists to avoid caching previous results
os.remove(client_udf_repo_path) if os.path.exists(client_udf_repo_path) else None
for f in [client_udf_repo_path, metadata_path, pyspark_driver_path]:
if os.path.exists(f):
os.remove(f)

for anchor in anchor_list:
# only support batch source preprocessing for now.
if not hasattr(anchor.source, "preprocessing"):
@@ -105,17 +112,11 @@ def persist_pyspark_udf_to_file(user_func, local_workspace_dir):
client_udf_repo_path = os.path.join(local_workspace_dir, FEATHR_CLIENT_UDF_FILE_NAME)

# the directory may actually not exist yet, so create the directory first
file_name_start = client_udf_repo_path.rfind("/")
if file_name_start > 0:
dir_name = client_udf_repo_path[:file_name_start]
Path(dir_name).mkdir(parents=True, exist_ok=True)
Path(local_workspace_dir).mkdir(parents=True, exist_ok=True)

if Path(client_udf_repo_path).is_file():
with open(client_udf_repo_path, "a") as handle:
print("".join(lines), file=handle)
else:
with open(client_udf_repo_path, "w") as handle:
print("".join(lines), file=handle)
# Append to file, Create it if doesn't exist
with open(client_udf_repo_path, "a+") as handle:
print("".join(lines), file=handle)

@staticmethod
def write_feature_names_to_udf_name_file(feature_names_to_func_mapping, local_workspace_dir):
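The rewrite above leans on two idioms: Path.mkdir(parents=True, exist_ok=True) to create the workspace directory only when it is missing, and append mode so a single open() call both creates the UDF file on first use and appends on later calls, replacing the old is_file() branch. A compact standalone illustration (paths are hypothetical):

import os
from pathlib import Path

workspace = "/tmp/feathr_demo_workspace"                 # hypothetical local workspace dir
udf_file = os.path.join(workspace, "client_udf_repo.py")

# Safe whether or not the directory already exists.
Path(workspace).mkdir(parents=True, exist_ok=True)

# "a+" creates the file if needed and appends otherwise, so repeated calls simply
# accumulate UDF definitions instead of overwriting them.
with open(udf_file, "a+") as handle:
    print("def feathr_udf_demo(df):\n    return df", file=handle)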
@@ -134,7 +135,8 @@ def write_feature_names_to_udf_name_file(feature_names_to_func_mapping, local_wo
new_file = tm.render(func_maps=feature_names_to_func_mapping)

full_file_name = os.path.join(local_workspace_dir, FEATHR_CLIENT_UDF_FILE_NAME)
with open(full_file_name, "a") as text_file:
# Append to file, Create it if doesn't exist
with open(full_file_name, "a+") as text_file:
print(new_file, file=text_file)

@staticmethod
@@ -158,7 +160,7 @@ def prepare_pyspark_udf_files(feature_names: List[str], local_workspace_dir):
if not features_with_preprocessing:
return py_udf_files

# Figure out if we need to preprocessing via UDFs for requested features.
# Figure out if we need to preprocess via UDFs for requested features.
# Only if the requested features contain preprocessing logic, we will load Pyspark. Otherwise just use Scala
# spark.
has_py_udf_preprocessing = False
Expand All @@ -172,6 +174,7 @@ def prepare_pyspark_udf_files(feature_names: List[str], local_workspace_dir):
client_udf_repo_path = os.path.join(local_workspace_dir, FEATHR_CLIENT_UDF_FILE_NAME)
# write pyspark_driver_template_abs_path and then client_udf_repo_path
filenames = [pyspark_driver_template_abs_path, client_udf_repo_path]

with open(pyspark_driver_path, 'w') as outfile:
for fname in filenames:
with open(fname) as infile:
6 changes: 2 additions & 4 deletions feathr_project/feathr/utils/_file_utils.py
@@ -8,9 +8,7 @@ def write_to_file(content: str, full_file_name: str):
content: content to write into the file
full_file_name: full file path
"""
file_name_start = full_file_name.rfind("/")
if file_name_start > 0:
dir_name = full_file_name[:file_name_start]
Path(dir_name).mkdir(parents=True, exist_ok=True)
dir_name = os.path.dirname(full_file_name)
Path(dir_name).mkdir(parents=True, exist_ok=True)
with open(full_file_name, "w") as handle:
print(content, file=handle)
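The simplified write_to_file builds the parent directory with os.path.dirname plus mkdir(parents=True, exist_ok=True). For a bare filename, dirname returns an empty string, and on the CPython versions Feathr supported at the time Path('') is treated as the current directory, so the mkdir call remains a harmless no-op and the old rfind guard is not needed. A quick check of both cases (paths are hypothetical):

import os
from pathlib import Path

for name in ("/tmp/feathr_demo/output/features.conf", "features.conf"):
    dir_name = os.path.dirname(name)                    # "" when there is no directory part
    Path(dir_name).mkdir(parents=True, exist_ok=True)   # Path("") behaves like Path("."), already present
    print(repr(name), "-> dirname:", repr(dir_name))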
@@ -24,7 +24,6 @@ object RedisOutputUtils {
val outputKeyColumnName = "feature_key"
val decoratedDf = encodedDf.withColumn(outputKeyColumnName, newColExpr)
.drop(keyColumns: _*)

// set the host/post/auth/ssl configs in Redis again in the output directly
// otherwise, in some environment (like databricks), the configs from the active spark session is not passed here.
decoratedDf.write