From 9b65040c291a617d71493f94bc1ac9f93199437e Mon Sep 17 00:00:00 2001 From: Chen Xu Date: Wed, 21 Sep 2022 22:17:07 +0800 Subject: [PATCH 1/2] Allow recreating entities for PurView registry --- .../registry/_feature_registry_purview.py | 46 +++++++++++++++++ .../registry/purview_registry.py | 51 +++++++++++++++---- 2 files changed, 88 insertions(+), 9 deletions(-) diff --git a/feathr_project/feathr/registry/_feature_registry_purview.py b/feathr_project/feathr/registry/_feature_registry_purview.py index beb696b26..9ef74cc92 100644 --- a/feathr_project/feathr/registry/_feature_registry_purview.py +++ b/feathr_project/feathr/registry/_feature_registry_purview.py @@ -3,6 +3,7 @@ import inspect import itertools import os +import re import sys import ast import types @@ -44,6 +45,25 @@ from feathr.constants import * +def _to_snake(d, level: int = 0): + """ + Convert `string`, `list[string]`, or all keys in a `dict` into snake case + The maximum length of input string or list is 100, or it will be truncated before being processed, for dict, the exception will be thrown if it has more than 100 keys. + the maximum nested level is 10, otherwise the exception will be thrown + """ + if level >= 10: + raise ValueError("Too many nested levels") + if isinstance(d, str): + d = d[:100] + return re.sub(r'(?<!^)(?=[A-Z])', '_', d).lower() + if isinstance(d, list): + d = d[:100] + return [_to_snake(i, level + 1) if isinstance(i, (dict, list)) else i for i in d] + if len(d) > 100: + raise ValueError("Dict has too many keys") + return {_to_snake(a, level + 1): _to_snake(b, level + 1) if isinstance(b, (dict, list)) else b for a, b in d.items()} + + class _PurviewRegistry(FeathrRegistry): """ Initializes the feature registry, doing the following: @@ -720,6 +740,32 @@ def upload_single_entity_to_purview(self,entity:Union[AtlasEntity,AtlasProcess]) The entity itself will also be modified, fill the GUID with real GUID in Purview. In order to avoid having concurrency issue, and provide clear guidance, this method only allows entity uploading once at a time. 
''' + try: + """ + Try to find existing entity/process first, if found, return the existing entity's GUID + """ + id = self.get_entity_id(entity.qualifiedName) + response = self.purview_client.get_entity(id)['entities'][0] + j = entity.to_json() + if j["typeName"] == response["typeName"]: + if j["typeName"] == "Process": + if response["attributes"]["qualifiedName"] != j["attributes"]["qualifiedName"]: + raise RuntimeError("The requested entity conflicts with the existing entity in PurView") + else: + if "type" in response['attributes'] and response["typeName"] in ("feathr_anchor_feature_v1", "feathr_derived_feature_v1"): + conf = ConfigFactory.parse_string(response['attributes']['type']) + response['attributes']['type'] = dict(conf) + keys = set([_to_snake(key) for key in j["attributes"].keys()]) - set(["qualified_name"]) + keys.add("qualifiedName") + for k in keys: + if response["attributes"][k] != j["attributes"][k]: + raise RuntimeError("The requested entity conflicts with the existing entity in PurView") + return response["guid"] + else: + raise RuntimeError("The requested entity conflicts with the existing entity in PurView") + except AtlasException as e: + pass + try: entity.lastModifiedTS="0" result = self.purview_client.upload_entities([entity]) diff --git a/registry/purview-registry/registry/purview_registry.py b/registry/purview-registry/registry/purview_registry.py index 9f5f47560..eb95f7a75 100644 --- a/registry/purview-registry/registry/purview_registry.py +++ b/registry/purview-registry/registry/purview_registry.py @@ -5,6 +5,9 @@ from urllib.error import HTTPError from uuid import UUID +from registry.models import to_snake +from pyapacheatlas.core.util import AtlasException + from azure.identity import DefaultAzureCredential from loguru import logger from pyapacheatlas.auth.azcredential import AzCredentialWrapper @@ -568,17 +571,47 @@ def _register_feathr_feature_types(self): def _upload_entity_batch(self, entity_batch:list[AtlasEntity]): # we only 
support entity creation, update is not supported. # setting lastModifiedTS ==0 will ensure this, if another entity with ts>=1 exist - # upload funtion will fail with 412 Precondition fail. + # upload function will fail with 412 Precondition fail. for entity in entity_batch: - entity.lastModifiedTS="0" - results = self.purview_client.upload_entities( - batch=entity) - if results: - dict = {x.guid: x for x in entity_batch} - for k, v in results['guidAssignments'].items(): - dict[k].guid = v + self._upload_single_entity(entity) + + def _upload_single_entity(self, entity:AtlasEntity): + try: + """ + Try to find existing entity/process first, if found, return the existing entity's GUID + """ + id = self.get_entity_id(entity.qualifiedName) + response = self.purview_client.get_entity(id)['entities'][0] + j = entity.to_json() + if j["typeName"] == response["typeName"]: + if j["typeName"] == "Process": + if response["attributes"]["qualifiedName"] != j["attributes"]["qualifiedName"]: + raise RuntimeError("The requested entity conflicts with the existing entity in PurView") + else: + if "type" in response['attributes'] and response["typeName"] in ("feathr_anchor_feature_v1", "feathr_derived_feature_v1"): + conf = ConfigFactory.parse_string(response['attributes']['type']) + response['attributes']['type'] = dict(conf) + keys = set([to_snake(key) for key in j["attributes"].keys()]) - set(["qualified_name"]) + keys.add("qualifiedName") + for k in keys: + if response["attributes"][k] != j["attributes"][k]: + raise RuntimeError("The requested entity conflicts with the existing entity in PurView") + entity.guid = response["guid"] + return else: - raise RuntimeError("Feature registration failed.", results) + raise RuntimeError("The requested entity conflicts with the existing entity in PurView") + except AtlasException as e: + pass + + entity.lastModifiedTS="0" + results = self.purview_client.upload_entities( + batch=entity) + if results: + d = {x.guid: x for x in [entity]} + for k, 
v in results['guidAssignments'].items(): + d[k].guid = v + else: + raise RuntimeError("Feature registration failed.", results) def _generate_fully_qualified_name(self, segments): return self.registry_delimiter.join(segments) From f1d423da01bd8fa56a3cb99f3b0352a98fa656d5 Mon Sep 17 00:00:00 2001 From: Chen Xu Date: Fri, 23 Sep 2022 15:23:34 +0800 Subject: [PATCH 2/2] Use constants --- .../feathr/registry/_feature_registry_purview.py | 8 ++++---- .../purview-registry/registry/purview_registry.py | 11 +++++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/feathr_project/feathr/registry/_feature_registry_purview.py b/feathr_project/feathr/registry/_feature_registry_purview.py index 9ef74cc92..df6fcd188 100644 --- a/feathr_project/feathr/registry/_feature_registry_purview.py +++ b/feathr_project/feathr/registry/_feature_registry_purview.py @@ -750,19 +750,19 @@ def upload_single_entity_to_purview(self,entity:Union[AtlasEntity,AtlasProcess]) if j["typeName"] == response["typeName"]: if j["typeName"] == "Process": if response["attributes"]["qualifiedName"] != j["attributes"]["qualifiedName"]: - raise RuntimeError("The requested entity conflicts with the existing entity in PurView") + raise RuntimeError("The requested entity %s conflicts with the existing entity in PurView" % j["attributes"]["qualifiedName"]) else: - if "type" in response['attributes'] and response["typeName"] in ("feathr_anchor_feature_v1", "feathr_derived_feature_v1"): + if "type" in response['attributes'] and response["typeName"] in (TYPEDEF_ANCHOR_FEATURE, TYPEDEF_DERIVED_FEATURE): conf = ConfigFactory.parse_string(response['attributes']['type']) response['attributes']['type'] = dict(conf) keys = set([_to_snake(key) for key in j["attributes"].keys()]) - set(["qualified_name"]) keys.add("qualifiedName") for k in keys: if response["attributes"][k] != j["attributes"][k]: - raise RuntimeError("The requested entity conflicts with the existing entity in PurView") + raise RuntimeError("The 
requested entity %s conflicts with the existing entity in PurView" % j["attributes"]["qualifiedName"]) return response["guid"] else: - raise RuntimeError("The requested entity conflicts with the existing entity in PurView") + raise RuntimeError("The requested entity %s conflicts with the existing entity in PurView" % j["attributes"]["qualifiedName"]) except AtlasException as e: pass diff --git a/registry/purview-registry/registry/purview_registry.py b/registry/purview-registry/registry/purview_registry.py index eb95f7a75..15a650167 100644 --- a/registry/purview-registry/registry/purview_registry.py +++ b/registry/purview-registry/registry/purview_registry.py @@ -23,6 +23,9 @@ Label_BelongsTo = "BELONGSTO" Label_Consumes = "CONSUMES" Label_Produces = "PRODUCES" +TYPEDEF_DERIVED_FEATURE="feathr_derived_feature_v1" +TYPEDEF_ANCHOR_FEATURE="feathr_anchor_feature_v1" + TYPEDEF_ARRAY_ANCHOR=f"array<feathr_anchor_v1>" TYPEDEF_ARRAY_DERIVED_FEATURE=f"array<feathr_derived_feature_v1>" TYPEDEF_ARRAY_ANCHOR_FEATURE=f"array<feathr_anchor_feature_v1>" @@ -586,20 +589,20 @@ def _upload_single_entity(self, entity:AtlasEntity): if j["typeName"] == response["typeName"]: if j["typeName"] == "Process": if response["attributes"]["qualifiedName"] != j["attributes"]["qualifiedName"]: - raise RuntimeError("The requested entity conflicts with the existing entity in PurView") + raise RuntimeError("The requested entity %s conflicts with the existing entity in PurView" % j["attributes"]["qualifiedName"]) else: - if "type" in response['attributes'] and response["typeName"] in ("feathr_anchor_feature_v1", "feathr_derived_feature_v1"): + if "type" in response['attributes'] and response["typeName"] in (TYPEDEF_ANCHOR_FEATURE, TYPEDEF_DERIVED_FEATURE): conf = ConfigFactory.parse_string(response['attributes']['type']) response['attributes']['type'] = dict(conf) keys = set([to_snake(key) for key in j["attributes"].keys()]) - set(["qualified_name"]) keys.add("qualifiedName") for k in keys: if response["attributes"][k] != j["attributes"][k]: - raise RuntimeError("The 
requested entity conflicts with the existing entity in PurView") + raise RuntimeError("The requested entity %s conflicts with the existing entity in PurView" % j["attributes"]["qualifiedName"]) entity.guid = response["guid"] return else: - raise RuntimeError("The requested entity conflicts with the existing entity in PurView") + raise RuntimeError("The requested entity %s conflicts with the existing entity in PurView" % j["attributes"]["qualifiedName"]) except AtlasException as e: pass