Merged
2 changes: 1 addition & 1 deletion feathr_project/feathr/client.py
@@ -245,7 +245,7 @@ def build_features(self, anchor_list: List[FeatureAnchor] = [], derived_feature_

# Pretty print anchor_list
if verbose and self.anchor_list:
FeaturePrinter.pretty_print_anchors(self.anchor_list)
FeaturePrinter.pretty_print_anchors(self.anchor_list)

def list_registered_features(self, project_name: str = None) -> List[str]:
"""List all the already registered features under the given project.
34 changes: 17 additions & 17 deletions feathr_project/feathr/registry/_feature_registry_purview.py
@@ -357,7 +357,7 @@ def _add_all_derived_features(self, derived_features: List[DerivedFeature], ts:T
ts (TopologicalSorter): a topological sorter by python

Returns:
None. The topo sorter will maitain a static topo sorted order.
None. The topological sorter will maintain a static topological sorted order.
"""
# return if the list is empty
if derived_features is None:
@@ -366,7 +366,7 @@ def _add_all_derived_features(self, derived_features: List[DerivedFeature], ts:T
for derived_feature in derived_features:
# make sure the input is derived feature
if isinstance(derived_feature, DerivedFeature):
# add this derived feature in the topo sort graph without any precessesors
# add this derived feature in the topological sort graph without any predecessors
# since regardless we need it
ts.add(derived_feature)
for input_feature in derived_feature.input_features:
@@ -394,12 +394,12 @@ def _parse_derived_features(self, derived_features: List[DerivedFeature]) -> Lis
ts = TopologicalSorter()

self._add_all_derived_features(derived_features, ts)
# topo sort the derived features to make sure that we can correctly refer to them later in the registry
# topological sort the derived features to make sure that we can correctly refer to them later in the registry
toposorted_derived_feature_list: List[DerivedFeature] = list(ts.static_order())

for derived_feature in toposorted_derived_feature_list:
# get the corresponding Atlas entity by searching feature name
# Since this list is topo sorted, so you can always find the corresponding name
# Since this list is topological sorted, so you can always find the corresponding name
input_feature_entity_list: List[AtlasEntity] = [
self.global_feature_entity_dict[f.name] for f in derived_feature.input_features]
key_list = []
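The two hunks above rely on Python's standard-library topological sorter: each derived feature is added as a node with its input features as predecessors, and static_order() then yields a feature only after everything it is derived from. A minimal standalone sketch of that behavior (the feature names are hypothetical, not taken from this PR):

from graphlib import TopologicalSorter  # Python 3.9+

ts = TopologicalSorter()
# Register each derived feature, declaring its inputs as predecessors.
ts.add("derived_feature_b", "anchor_feature_a")    # derived from an anchored feature
ts.add("derived_feature_c", "derived_feature_b")   # derived from another derived feature

# static_order() returns every node after all of its predecessors, which is why the
# registry code can always look up the Atlas entity for each input feature by name.
print(list(ts.static_order()))
# ['anchor_feature_a', 'derived_feature_b', 'derived_feature_c']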
@@ -713,9 +713,9 @@ def _create_project(self) -> UUID:

def upload_single_entity_to_purview(self,entity:Union[AtlasEntity,AtlasProcess]):
'''
Upload a single entity to purview, could be a process entity or atlasentity.
Upload a single entity to purview, could be a process entity or AtlasEntity.
Since this is used for migration existing project, ignore Atlas PreconditionFail (412)
If the eneity already exists, return the existing entity's GUID.
If the entity already exists, return the existing entity's GUID.
Otherwise, return the new entity GUID.
The entity itself will also be modified, fill the GUID with real GUID in Purview.
In order to avoid having concurrency issue, and provide clear guidance, this method only allows entity uploading once at a time.
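The docstring above describes upsert-like behavior: attempt the upload, treat Atlas's 412 PreconditionFail as "entity already exists", and return the existing entity's GUID instead of failing. A self-contained sketch of that control flow (the names and the in-memory store are hypothetical stand-ins for the real Purview client calls):

class PreconditionFailedError(Exception):
    """Stands in for the Atlas 412 PreconditionFail response."""

_registry = {"feathr_project__source_a": "guid-123"}   # pretend this entity already exists

def _create_entity(qualified_name: str) -> str:
    if qualified_name in _registry:
        raise PreconditionFailedError(qualified_name)
    _registry[qualified_name] = f"guid-{len(_registry) + 1}"
    return _registry[qualified_name]

def upload_single_entity(qualified_name: str) -> str:
    try:
        return _create_entity(qualified_name)     # new entity: fresh GUID
    except PreconditionFailedError:
        return _registry[qualified_name]          # already registered: reuse its GUID

print(upload_single_entity("feathr_project__source_a"))   # guid-123
print(upload_single_entity("feathr_project__source_b"))   # guid-2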
@@ -809,7 +809,7 @@ def _create_anchor(self, s: FeatureAnchor) -> UUID:
anchor_entity = self.purview_client.get_entity(anchor_id)['entities'][0]


# project contians anchor, anchor belongs to project.
# project contains anchor, anchor belongs to project.
project_contains_anchor_relation = self._generate_relation_pairs(
project_entity, anchor_entity, RELATION_CONTAINS)
anchor_consumes_source_relation = self._generate_relation_pairs(
@@ -946,7 +946,7 @@ def _purge_feathr_registry(self):

def _delete_all_feathr_types(self):
"""
Delete all the corresonding type definitions for feathr registry. For internal use only
Delete all the corresponding type definitions for feathr registry. For internal use only
"""
typedefs = self.purview_client.get_all_typedefs()

@@ -967,18 +967,18 @@ def _delete_all_feathr_types(self):

def _delete_all_feathr_entities(self):
"""
Delete all the corresonding entity for feathr registry. For internal use only
Delete all the corresponding entity for feathr registry. For internal use only

:param guid: The guid or guids you want to remove.
"""
# should not be large than this, otherwise the backend might throw out error
batch_delte_size = 100
batch_delete_size = 100

# use the `query` API so that it can return immediatelly (don't use the search_entity API as it will try to return all the results in a single request)
# use the `query` API so that it can return immediately (don't use the search_entity API as it will try to return all the results in a single request)

while True:
result = self.purview_client.discovery.query(
"feathr", limit=batch_delte_size)
"feathr", limit=batch_delete_size)
logger.info("Total number of entities:",result['@search.count'] )

# if no results, break:
@@ -987,7 +987,7 @@ def _delete_all_feathr_entities(self):
entities = result['value']
guid_list = [entity["id"] for entity in entities]
self.purview_client.delete_entity(guid=guid_list)
logger.info("{} feathr entities deleted", batch_delte_size)
logger.info("{} feathr entities deleted", batch_delete_size)
# sleep here, otherwise backend might throttle
# process the next batch after sleep
sleep(1)
@@ -1237,7 +1237,7 @@ def search_for_input_feature(elem, full_relations,full_entities):
name=source_entity["attributes"]["name"],
event_timestamp_column=source_entity["attributes"]["event_timestamp_column"],
timestamp_format=source_entity["attributes"]["timestamp_format"],
preprocessing=self._correct_function_identation(source_entity["attributes"]["preprocessing"]),
preprocessing=self._correct_function_indentation(source_entity["attributes"]["preprocessing"]),
path=source_entity["attributes"]["path"],
registry_tags=source_entity["attributes"]["tags"]
),
@@ -1264,9 +1264,9 @@ def search_input_anchor_features(self,derived_guids,feature_entity_guid_mapping)
return result


def _correct_function_identation(self, user_func: str) -> str:
def _correct_function_indentation(self, user_func: str) -> str:
"""
The function read from registry might have the wrong identation. We need to correct those identations.
The function read from registry might have the wrong indentation. We need to correct those indentation.
More specifically, we are using the inspect module to copy the function body for UDF for further submission. In that case, there will be situations like this:

def feathr_udf1(df)
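The docstring is truncated here, but the scenario is that inspect copies a UDF together with the indentation of the scope it was defined in, so the source read back from the registry may not start at column 0. One way to normalize it, shown only as a sketch under the assumption that a plain dedent is enough (this is not necessarily the exact logic inside _correct_function_indentation):

import textwrap

# A UDF as it might come back from the registry, still carrying the indentation of
# the enclosing scope it was captured from (the UDF body itself is hypothetical).
indented_src = (
    "    def feathr_udf1(df):\n"
    "        return df.withColumn('fare_amount_cents', df.fare_amount * 100)\n"
)

# Strip the common leading whitespace so the function can be written into the
# generated PySpark driver file as top-level code.
print(textwrap.dedent(indented_src))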
@@ -1302,7 +1302,7 @@ def _get_source_by_guid(self, guid, entity_list) -> Source:
return HdfsSource(name=source_entity["attributes"]["name"],
event_timestamp_column=source_entity["attributes"]["event_timestamp_column"],
timestamp_format=source_entity["attributes"]["timestamp_format"],
preprocessing=self._correct_function_identation(source_entity["attributes"]["preprocessing"]),
preprocessing=self._correct_function_indentation(source_entity["attributes"]["preprocessing"]),
path=source_entity["attributes"]["path"],
registry_tags=source_entity["attributes"]["tags"]
)
27 changes: 14 additions & 13 deletions feathr_project/feathr/spark_provider/_databricks_submission.py
@@ -1,25 +1,23 @@
import base64
import copy
import json
import os
import time

from collections import namedtuple
from os.path import basename
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Union
from urllib.parse import urlparse
from urllib.request import urlopen

import requests
from databricks_cli.dbfs.api import DbfsApi
from databricks_cli.runs.api import RunsApi
from databricks_cli.sdk.api_client import ApiClient
from feathr.constants import *
from feathr.spark_provider._abc import SparkJobLauncher
from loguru import logger
from requests.structures import CaseInsensitiveDict
from tqdm import tqdm

from feathr.spark_provider._abc import SparkJobLauncher
from feathr.constants import *
from databricks_cli.dbfs.api import DbfsApi
from databricks_cli.sdk.api_client import ApiClient
from databricks_cli.runs.api import RunsApi

class _FeathrDatabricksJobLauncher(SparkJobLauncher):
"""Class to interact with Databricks Spark cluster
@@ -137,9 +135,10 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name:
# if the input is a string, load it directly
submission_params = json.loads(self.config_template)
else:
# otherwise users might have missed the quotes in the config.
submission_params = self.config_template
logger.warning("Databricks config template loaded in a non-string fashion. Please consider providing the config template in a string fashion.")
# otherwise users might have missed the quotes in the config. Treat them as dict
# Note that we need to use deep copy here, in order to make `self.config_template` immutable
# Otherwise, since we need to change submission_params later, which will modify `self.config_template` and cause unexpected behaviors
submission_params = copy.deepcopy(self.config_template)

submission_params['run_name'] = job_name
if 'existing_cluster_id' not in submission_params:
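The replacement comment explains why copy.deepcopy is needed: assigning the template directly would alias it, so the per-job mutations below (run_name, cluster settings, task definitions) would leak back into self.config_template and affect later submissions. A small self-contained illustration of that aliasing problem:

import copy

config_template = {"new_cluster": {"num_workers": 2}}

# Plain assignment aliases the template, so mutating the job params pollutes it.
params = config_template
params["run_name"] = "job_1"
print("run_name" in config_template)   # True -- the shared template was modified

# deepcopy keeps the template pristine for the next submission.
config_template = {"new_cluster": {"num_workers": 2}}
params = copy.deepcopy(config_template)
params["run_name"] = "job_2"
print("run_name" in config_template)   # False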
@@ -161,6 +160,8 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name:
# this is a pyspark job. definition here: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--sparkpythontask
# the first file is the pyspark driver code. we only need the driver code to execute pyspark
param_and_file_dict = {"parameters": arguments, "python_file": self.upload_or_get_cloud_path(python_files[0])}
# indicates this is a pyspark job
# `setdefault` method will get the value of the "spark_python_task" item, if the "spark_python_task" item does not exist, insert "spark_python_task" with the value "param_and_file_dict":
submission_params.setdefault('spark_python_task',param_and_file_dict)
else:
# this is a scala spark job
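The new comment describes dict.setdefault: it inserts spark_python_task only when the key is absent, so a task already present in a user-supplied config template wins over the generated one. For example (the values are hypothetical):

params = {"run_name": "demo_job"}
task = {"python_file": "dbfs:/feathr_pyspark_driver.py", "parameters": []}

params.setdefault("spark_python_task", task)
print(params["spark_python_task"] is task)   # True -- key was missing, so it was inserted

# A second call with a different value is a no-op because the key now exists.
params.setdefault("spark_python_task", {"python_file": "dbfs:/other.py"})
print(params["spark_python_task"] is task)   # True -- original value kept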
@@ -256,4 +257,4 @@ def download_result(self, result_path: str, local_folder: str):
if not result_path.startswith('dbfs'):
raise RuntimeError('Currently only paths starting with dbfs is supported for downloading results from a databricks cluster. The path should start with \"dbfs:\" .')

DbfsApi(self.api_client).cp(recursive=True, overwrite=True, src=result_path, dst=local_folder)
DbfsApi(self.api_client).cp(recursive=True, overwrite=True, src=result_path, dst=local_folder)
31 changes: 17 additions & 14 deletions feathr_project/feathr/udf/_preprocessing_pyudf_manager.py
@@ -3,6 +3,7 @@
from pathlib import Path
from typing import List, Optional, Union
import pickle
from feathr.definition.anchor import FeatureAnchor
from jinja2 import Template
from feathr.definition.source import HdfsSource
import ast
@@ -24,7 +25,7 @@ class _PreprocessingPyudfManager(object):
"""This class manages Pyspark UDF preprocessing related artifacts, like UDFs from users, the pyspark_client etc.
"""
@staticmethod
def build_anchor_preprocessing_metadata(anchor_list, local_workspace_dir):
def build_anchor_preprocessing_metadata(anchor_list: List[FeatureAnchor], local_workspace_dir):
"""When the client build features, UDFs and features that need preprocessing will be stored as metadata. Those
metadata will later be used when uploading the Pyspark jobs.
"""
@@ -35,8 +36,14 @@ def build_anchor_preprocessing_metadata(anchor_list, local_workspace_dir):
# preprocessing for requested features.
features_with_preprocessing = []
client_udf_repo_path = os.path.join(local_workspace_dir, FEATHR_CLIENT_UDF_FILE_NAME)
metadata_path = os.path.join(local_workspace_dir, FEATHR_PYSPARK_METADATA)
pyspark_driver_path = os.path.join(local_workspace_dir, FEATHR_PYSPARK_DRIVER_FILE_NAME)

# delete the file if it already exists to avoid caching previous results
os.remove(client_udf_repo_path) if os.path.exists(client_udf_repo_path) else None
for f in [client_udf_repo_path, metadata_path, pyspark_driver_path]:
if os.path.exists(f):
os.remove(f)

for anchor in anchor_list:
# only support batch source preprocessing for now.
if not hasattr(anchor.source, "preprocessing"):
@@ -105,17 +112,11 @@ def persist_pyspark_udf_to_file(user_func, local_workspace_dir):
client_udf_repo_path = os.path.join(local_workspace_dir, FEATHR_CLIENT_UDF_FILE_NAME)

# the directory may actually not exist yet, so create the directory first
file_name_start = client_udf_repo_path.rfind("/")
if file_name_start > 0:
dir_name = client_udf_repo_path[:file_name_start]
Path(dir_name).mkdir(parents=True, exist_ok=True)
Path(local_workspace_dir).mkdir(parents=True, exist_ok=True)

if Path(client_udf_repo_path).is_file():
with open(client_udf_repo_path, "a") as handle:
print("".join(lines), file=handle)
else:
with open(client_udf_repo_path, "w") as handle:
print("".join(lines), file=handle)
# Append to file, Create it if doesn't exist
with open(client_udf_repo_path, "a+") as handle:
print("".join(lines), file=handle)

@staticmethod
def write_feature_names_to_udf_name_file(feature_names_to_func_mapping, local_workspace_dir):
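The rewrite above leans on two idioms: Path.mkdir(parents=True, exist_ok=True) to create the workspace directory only when it is missing, and append mode so a single open() call both creates the UDF file on first use and appends on later calls, replacing the old is_file() branch. A compact standalone illustration (paths are hypothetical):

import os
from pathlib import Path

workspace = "/tmp/feathr_demo_workspace"                 # hypothetical local workspace dir
udf_file = os.path.join(workspace, "client_udf_repo.py")

# Safe whether or not the directory already exists.
Path(workspace).mkdir(parents=True, exist_ok=True)

# "a+" creates the file if needed and appends otherwise, so repeated calls simply
# accumulate UDF definitions instead of overwriting them.
with open(udf_file, "a+") as handle:
    print("def feathr_udf_demo(df):\n    return df", file=handle)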
@@ -134,7 +135,8 @@ def write_feature_names_to_udf_name_file(feature_names_to_func_mapping, local_wo
new_file = tm.render(func_maps=feature_names_to_func_mapping)

full_file_name = os.path.join(local_workspace_dir, FEATHR_CLIENT_UDF_FILE_NAME)
with open(full_file_name, "a") as text_file:
# Append to file, Create it if doesn't exist
with open(full_file_name, "a+") as text_file:
print(new_file, file=text_file)

@staticmethod
@@ -158,7 +160,7 @@ def prepare_pyspark_udf_files(feature_names: List[str], local_workspace_dir):
if not features_with_preprocessing:
return py_udf_files

# Figure out if we need to preprocessing via UDFs for requested features.
# Figure out if we need to preprocess via UDFs for requested features.
# Only if the requested features contain preprocessing logic, we will load Pyspark. Otherwise just use Scala
# spark.
has_py_udf_preprocessing = False
Expand All @@ -172,6 +174,7 @@ def prepare_pyspark_udf_files(feature_names: List[str], local_workspace_dir):
client_udf_repo_path = os.path.join(local_workspace_dir, FEATHR_CLIENT_UDF_FILE_NAME)
# write pyspark_driver_template_abs_path and then client_udf_repo_path
filenames = [pyspark_driver_template_abs_path, client_udf_repo_path]

with open(pyspark_driver_path, 'w') as outfile:
for fname in filenames:
with open(fname) as infile:
6 changes: 2 additions & 4 deletions feathr_project/feathr/utils/_file_utils.py
@@ -8,9 +8,7 @@ def write_to_file(content: str, full_file_name: str):
content: content to write into the file
full_file_name: full file path
"""
file_name_start = full_file_name.rfind("/")
if file_name_start > 0:
dir_name = full_file_name[:file_name_start]
Path(dir_name).mkdir(parents=True, exist_ok=True)
dir_name = os.path.dirname(full_file_name)
Path(dir_name).mkdir(parents=True, exist_ok=True)
with open(full_file_name, "w") as handle:
print(content, file=handle)
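The simplified write_to_file builds the parent directory with os.path.dirname plus mkdir(parents=True, exist_ok=True). For a bare filename, dirname returns an empty string, and on the CPython versions Feathr supported at the time Path('') is treated as the current directory, so the mkdir call remains a harmless no-op and the old rfind guard is not needed. A quick check of both cases (paths are hypothetical):

import os
from pathlib import Path

for name in ("/tmp/feathr_demo/output/features.conf", "features.conf"):
    dir_name = os.path.dirname(name)                    # "" when there is no directory part
    Path(dir_name).mkdir(parents=True, exist_ok=True)   # Path("") behaves like Path("."), already present
    print(repr(name), "-> dirname:", repr(dir_name))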
@@ -24,7 +24,6 @@ object RedisOutputUtils {
val outputKeyColumnName = "feature_key"
val decoratedDf = encodedDf.withColumn(outputKeyColumnName, newColExpr)
.drop(keyColumns: _*)

// set the host/post/auth/ssl configs in Redis again in the output directly
// otherwise, in some environment (like databricks), the configs from the active spark session is not passed here.
decoratedDf.write