From 3178a1a5072d383e98a58b12134432ad16347fa2 Mon Sep 17 00:00:00 2001 From: Yihui Guo Date: Mon, 27 Jun 2022 21:21:44 +0800 Subject: [PATCH 1/6] Purview Registry --- registry/purview-registry/.dockerignore | 3 + registry/purview-registry/.gitignore | 4 + registry/purview-registry/Dockerfile | 9 + registry/purview-registry/README.md | 5 + registry/purview-registry/api-spec.md | 366 +++++++++ registry/purview-registry/main.py | 116 +++ .../purview-registry/registry/__init__.py | 0 .../purview-registry/registry/interface.py | 87 ++ registry/purview-registry/registry/models.py | 752 ++++++++++++++++++ .../registry/purview_registry.py | 451 +++++++++++ registry/purview-registry/requirements.txt | 2 + .../purview-registry/test/test_creation.py | 23 + registry/purview-registry/test/test_get.py | 54 ++ 13 files changed, 1872 insertions(+) create mode 100644 registry/purview-registry/.dockerignore create mode 100644 registry/purview-registry/.gitignore create mode 100644 registry/purview-registry/Dockerfile create mode 100644 registry/purview-registry/README.md create mode 100644 registry/purview-registry/api-spec.md create mode 100644 registry/purview-registry/main.py create mode 100644 registry/purview-registry/registry/__init__.py create mode 100644 registry/purview-registry/registry/interface.py create mode 100644 registry/purview-registry/registry/models.py create mode 100644 registry/purview-registry/registry/purview_registry.py create mode 100644 registry/purview-registry/requirements.txt create mode 100644 registry/purview-registry/test/test_creation.py create mode 100644 registry/purview-registry/test/test_get.py diff --git a/registry/purview-registry/.dockerignore b/registry/purview-registry/.dockerignore new file mode 100644 index 000000000..bc0ed1f7a --- /dev/null +++ b/registry/purview-registry/.dockerignore @@ -0,0 +1,3 @@ +__pycache__ +.env +.vscode diff --git a/registry/purview-registry/.gitignore b/registry/purview-registry/.gitignore new file mode 100644 
index 000000000..ed2a6faed --- /dev/null +++ b/registry/purview-registry/.gitignore @@ -0,0 +1,4 @@ +__pycache__ +.env +.vscode +.idea diff --git a/registry/purview-registry/Dockerfile b/registry/purview-registry/Dockerfile new file mode 100644 index 000000000..d2647021d --- /dev/null +++ b/registry/purview-registry/Dockerfile @@ -0,0 +1,9 @@ +FROM python:3.9 + +COPY ./ /usr/src + +WORKDIR /usr/src +RUN pip install -r requirements.txt + +# Start web server +CMD [ "uvicorn","main:app","--host", "0.0.0.0", "--port", "80" ] diff --git a/registry/purview-registry/README.md b/registry/purview-registry/README.md new file mode 100644 index 000000000..f06ca7def --- /dev/null +++ b/registry/purview-registry/README.md @@ -0,0 +1,5 @@ +# SQL-Based Registry for Feathr + +This is the reference implementation of [the Feathr API spec](./api-spec.md), based on SQL databases instead of PurView. + +Please note that this implementation uses iterations of `select` to retrieve graph lineages; this approach is very inefficient and should **not** be considered production-ready. We suggest using this implementation only for testing/research purposes. 
\ No newline at end of file diff --git a/registry/purview-registry/api-spec.md b/registry/purview-registry/api-spec.md new file mode 100644 index 000000000..1b14cae8b --- /dev/null +++ b/registry/purview-registry/api-spec.md @@ -0,0 +1,366 @@ +# Feathr Registry API Specifications + +## Data Models + +### EntityType +Type: Enum + +| Value | +|-----------------------------| +| `feathr_workspace_v1` | +| `feathr_source_v1` | +| `feathr_anchor_v1` | +| `feathr_anchor_feature_v1` | +| `feathr_derived_feature_v1` | + +### ValueType +Type: Enum + +| Value | +|---------------| +| `UNSPECIFIED` | +| `BOOL` | +| `INT32` | +| `INT64` | +| `FLOAT` | +| `DOUBLE` | +| `STRING` | +| `BYTES` | + +### VectorType +Type: Enum + +| Value | +|----------| +| `TENSOR` | + +### TensorCategory +Type: Enum + +| Value | +|----------| +| `DENSE` | +| `SPARSE` | + +### FeatureType +Type: Object + +| Field | Type | +|----------------|-------------------------------------| +| type | [`VectorType`](#valuetype) | +| tensorCategory | [`TensorCategory`](#tensorcategory) | +| dimensionType | [`array`](#valuetype) | +| valType | [`ValueType`](#valuetype) | + +### TypedKey +Type: Object + +| Field | Type | +|------------------|-----------------------------| +| key_column | `string` | +| key_column_type | [`ValueType`](#valuetype) | +| full_name | `string`, optional | +| description | `string`, optional | +| key_column_alias | `string`, optional | + +### ExpressionTransformation +Type: Object + +| Field | Type | +|----------------|----------| +| transform_expr | `string` | + +### WindowAggregationTransformation +Type: Object + +| Field | Type | +|----------|--------------------| +| def_expr | `string` | +| agg_func | `string`, optional | +| window | `string`, optional | +| group_by | `string`, optional | +| filter | `string`, optional | +| limit | `number`, optional | + +### UdfTransformation +Type: Object + +| Field | Type | +|-------|----------| +| name | `string` | + +### EntityReference +Type: 
Object + +| Field | Type | Comments | +|------------------|-----------------------------|--------------------------------------| +| guid | `Guid` | | +| typeName | [`EntityType`](#entitytype) | | +| uniqueAttributes | `map` | Contains `qualifiedName` only so far | + +### ProjectAttributes +Type: Object + +| Field | Type | +|------------------|----------------------------------------------| +| qualifiedName | `string` | +| name | `string` | +| anchors | [`array`](#entityreference) | +| sources | [`array`](#entityreference) | +| anchor_features | [`array`](#entityreference) | +| derived_features | [`array`](#entityreference) | +| tags | `map` | + +### SourceAttributes +Type: Object + +| Field | Type | +|----------------------|-----------------------| +| qualifiedName | `string` | +| name | `string` | +| path | `string` | +| preprocessing | `string`, optional | +| eventTimestampColumn | `string`, optional | +| timestampFormat | `string`, optional | +| type | `string` | +| tags | `map` | + +### AnchorAttributes +Type: Object + +| Field | Type | +|---------------|----------------------------------------------| +| qualifiedName | `string` | +| name | `string` | +| features | [`array`](#entityreference) | +| source | [`EntityReference`](#entityreference) | +| tags | `map` | + +### AnchorFeatureAttributes +Type: Object + +| Field | Type | +|----------------|--------------------------------| +| qualifiedName | `string` | +| name | `string` | +| type | [`FeatureType`](#featuretype) | +| transformation | [`ExpressionTransformation`](#expressiontransformation)
`or` [`WindowAggregationTransformation`](#windowaggregationtransformation)
`or` [`UdfTransformation`](#udftransformation) | +| key | [`array`](#typedkey) | +| tags | `map` | + +### DerivedFeatureAttributes +Type: Object + +| Field | Type | +|------------------------|--------------------------------| +| qualifiedName | `string` | +| name | `string` | +| type | [`FeatureType`](#featuretype) | +| transformation | [`ExpressionTransformation`](#expressiontransformation)
`or` [`WindowAggregationTransformation`](#windowaggregationtransformation)
`or` [`UdfTransformation`](#udftransformation) | +| key | [`array`](#typedkey) | +| input_anchor_features | [`array`](#entityreference) | +| input_derived_features | [`array`](#entityreference) | +| tags | `map` | + +### EntityStatus +Type: Enum + +| Value | +|----------| +| `ACTIVE` | + +### Entity +Type: Object + +| Field | Type | +|----------------|---------------------------------| +| guid | `Guid` | +| lastModifiedTS | `string` | +| status | [`EntityStatus`](#entitystatus) | +| displayText | `string` | +| typeName | [`EntityType`](#entitytype) | +| attributes | [`ProjectAttributes`](#projectattributes)
`or` [`SourceAttributes`](#sourceattributes)
`or` [`AnchorAttributes`](#anchorattributes)
`or` [`AnchorFeatureAttributes`](#anchorfeatureattributes)
`or` [`DerivedFeatureAttributes`](#derivedfeatureattributes) | + +### RelationshipType +Type: Enum + +| Value | +|-------------| +| `BelongsTo` | +| `Contains` | +| `Produces` | +| `Consumes` | + +### Relationship +Type: Object + +| Field | Type | +|------------------|-----------------------------------------| +| relationshipId | `Guid` | +| relationshipType | [`RelationshipType`](#relationshiptype) | +| fromEntityId | `Guid` | +| toEntityId | `Guid` | + +### ProjectDefinition +Type: Object + +| Field | Type | +|----------------------|-----------------------| +| qualifiedName | `string` | +| tags | `map` | + + +### SourceDefinition +Type: Object + +| Field | Type | +|----------------------|-----------------------| +| qualifiedName | `string` | +| name | `string` | +| path | `string` | +| preprocessing | `string`, optional | +| eventTimestampColumn | `string`, optional | +| timestampFormat | `string`, optional | +| type | `string` | +| tags | `map` | + +### AnchorDefinition +Type: Object + +| Field | Type | +|----------------------|-----------------------| +| qualifiedName | `string` | +| name | `string` | +| source_id | `Guid` | +| tags | `map` | + +### AnchorFeatureDefinition +Type: Object + +| Field | Type | +|----------------|--------------------------------| +| qualifiedName | `string` | +| name | `string` | +| featureType | [`FeatureType`](#featuretype) | +| transformation | [`ExpressionTransformation`](#expressiontransformation)
`or` [`WindowAggregationTransformation`](#windowaggregationtransformation)
`or` [`UdfTransformation`](#udftransformation) | +| key | [`array`](#typedkey) | +| tags | `map` | + +### DerivedFeatureDefinition +Type: Object + +| Field | Type | +|------------------------|--------------------------------| +| qualifiedName | `string` | +| name | `string` | +| featureType | [`FeatureType`](#featuretype) | +| transformation | [`ExpressionTransformation`](#expressiontransformation)
`or` [`WindowAggregationTransformation`](#windowaggregationtransformation)
`or` [`UdfTransformation`](#udftransformation) | +| key | [`array`](#typedkey) | +| input_anchor_features | `array` | +| input_derived_features | `array` | +| tags | `map` | + + +### EntitiesAndRelationships +Type: Object + +| Field | Type | +|---------------|----------------------------------------| +| guidEntityMap | [`map`](#entity) | +| relations | [`array`](#relationship) | + + +## Feathr Registry API + +### `GET /projects` +List **names** of all projects. + +Response Type: `array` + +### `GET /projects/{project}` +Get everything defined in the project + +Response Type: [`EntitiesAndRelationships`](#entitiesandrelationships) + +### `GET /projects/{project}/datasources` +Get all sources defined in the project. + +Response Type: [`array`](#entity) + +### `GET /projects/{project}/features` +Get all anchor features and derived features in the project, or only the features that meet the search criteria in the project. + +Query Parameters: + +| Field | Type | +|---------|--------| +| keyword | string | +| size | number | +| offset | number | + + +Response Type: Object + +| Field | Type | +|----------|----------------------------| +| features | [`array`](#entity) | + +### `GET /features/:feature` +Get feature details. 
+ +Response Type: Object + +| Field | Type | Comments | +|-----------------|-----------------------|-----------------------------| +| entity | [`Entity`](#entity) | | +| referredEntities| `map` | For compatibility, not used | + +### `POST /projects` +Create new project + ++ Request Type: [`ProjectDefinition`](#projectdefinition) ++ Response Type: Object + +| Field | Type | +|-------|------| +| guid | Guid | + +### `POST /projects/{project}/datasources` +Create new source in the project + ++ Request Type: [`SourceDefinition`](#sourcedefinition) ++ Response Type: Object + +| Field | Type | +|-------|------| +| guid | Guid | + +### `POST /projects/{project}/anchors` +Create new anchor in the project + ++ Request Type: [`AnchorDefinition`](#anchordefinition) ++ Response Type: Object + +| Field | Type | +|-------|------| +| guid | Guid | + +### `POST /projects/{project}/anchors/{anchor}/features` +Create new anchor feature in the project under specified anchor + ++ Request Type: [`AnchorFeatureDefinition`](#anchorfeaturedefinition) ++ Response Type: Object + +| Field | Type | +|-------|------| +| guid | Guid | + +### `POST /projects/{project}/derivedfeatures` +Create new derived feature in the project + ++ Request Type: [`DerivedFeatureDefinition`](#derivedfeaturedefinition) ++ Response Type: Object + +| Field | Type | +|-------|------| +| guid | Guid | diff --git a/registry/purview-registry/main.py b/registry/purview-registry/main.py new file mode 100644 index 000000000..4d34bdb1c --- /dev/null +++ b/registry/purview-registry/main.py @@ -0,0 +1,116 @@ +import os +from typing import Optional +from uuid import UUID +from fastapi import APIRouter, FastAPI, HTTPException +from starlette.middleware.cors import CORSMiddleware +from registry import * +from registry.purview_registry import PurviewRegistry +from registry.models import AnchorDef, AnchorFeatureDef, DerivedFeatureDef, EntityType, ProjectDef, SourceDef, to_snake + +rp = "/" +try: + rp = os.environ["API_BASE"] + if 
rp[0] != '/': + rp = '/' + rp +except: + pass +print("Using API BASE: ", rp) + +registry = PurviewRegistry() +app = FastAPI() +router = APIRouter() + +# Enables CORS +app.add_middleware(CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + +@router.get("/projects") +def get_projects() -> list[str]: + return registry.get_projects() + + +@router.get("/projects/{project}") +def get_projects(project: str) -> dict: + return registry.get_project(project).to_dict() + + +@router.get("/projects/{project}/datasources") +def get_project_datasources(project: str) -> list: + p = registry.get_entity(project) + source_ids = [s.id for s in p.attributes.sources] + sources = registry.get_entities(source_ids) + return list([e.to_dict() for e in sources]) + + +@router.get("/projects/{project}/features") +def get_project_features(project: str, keyword: Optional[str] = None) -> list: + if keyword is None: + p = registry.get_entity(project) + feature_ids = [s.id for s in p.attributes.anchor_features] + \ + [s.id for s in p.attributes.derived_features] + features = registry.get_entities(feature_ids) + return list([e.to_dict() for e in features]) + else: + efs = registry.search_entity( + keyword, [EntityType.AnchorFeature, EntityType.DerivedFeature]) + feature_ids = [ef.id for ef in efs] + features = registry.get_entities(feature_ids) + return list([e.to_dict() for e in features]) + + +@router.get("/features/{feature}") +def get_feature(feature: str) -> dict: + e = registry.get_entity(feature) + if e.entity_type not in [EntityType.DerivedFeature, EntityType.AnchorFeature]: + raise HTTPException( + status_code=404, detail=f"Feature {feature} not found") + return e + + +@router.get("/features/{feature}/lineage") +def get_feature_lineage(feature: str) -> dict: + lineage = registry.get_lineage(feature) + return lineage.to_dict() + + +@router.post("/projects") +def new_project(definition: dict) -> UUID: + id = 
registry.create_project(ProjectDef(**to_snake(definition))) + return {"guid": str(id)} + + +@router.post("/projects/{project}/datasources") +def new_project_datasource(project: str, definition: dict) -> UUID: + project_id = registry.get_entity_id(project) + id = registry.create_project_datasource(project_id, SourceDef(**to_snake(definition))) + return {"guid": str(id)} + + +@router.post("/projects/{project}/anchors") +def new_project_anchor(project: str, definition: dict) -> UUID: + project_id = registry.get_entity_id(project) + id = registry.create_project_anchor(project_id, AnchorDef(**to_snake(definition))) + return {"guid": str(id)} + + +@router.post("/projects/{project}/anchors/{anchor}/features") +def new_project_anchor_feature(project: str, anchor: str, definition: dict) -> UUID: + project_id = registry.get_entity_id(project) + anchor_id = registry.get_entity_id(anchor) + id = registry.create_project_anchor_feature(project_id, anchor_id, AnchorFeatureDef(**to_snake(definition))) + return {"guid": str(id)} + + +@router.post("/projects/{project}/derivedfeatures") +def new_project_derived_feature(project: str, definition: dict) -> UUID: + project_id = registry.get_entity_id(project) + id = registry.create_project_derived_feature(project_id, DerivedFeatureDef(**to_snake(definition))) + return {"guid": str(id)} + + +app.include_router(prefix=rp, router=router) diff --git a/registry/purview-registry/registry/__init__.py b/registry/purview-registry/registry/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/registry/purview-registry/registry/interface.py b/registry/purview-registry/registry/interface.py new file mode 100644 index 000000000..78e79cb88 --- /dev/null +++ b/registry/purview-registry/registry/interface.py @@ -0,0 +1,87 @@ +from abc import ABC, abstractclassmethod, abstractmethod +from typing import Union +from uuid import UUID +from registry.models import * + + +class Registry(ABC): + @abstractmethod + def get_projects(self) -> 
list[str]: + """ + Returns the names of all projects + """ + pass + + @abstractmethod + def get_entity(self, id_or_name: Union[str, UUID],recursive = False) -> Entity: + """ + Get one entity by its id or qualified name + """ + pass + + @abstractmethod + def get_entities(self, ids: list[UUID]) -> list[Entity]: + """ + Get list of entities by their ids + """ + pass + + @abstractmethod + def get_entity_id(self, id_or_name: Union[str, UUID]) -> UUID: + """ + Get entity id by its name + """ + pass + + @abstractmethod + def get_neighbors(self, id_or_name: Union[str, UUID], relationship: RelationshipType) -> list[Edge]: + """ + Get list of edges with specified type that connect to this entity. + The edge contains fromId and toId so we can follow to the entity it connects to + """ + pass + + @abstractmethod + def get_lineage(self, id_or_name: Union[str, UUID]) -> EntitiesAndRelations: + """ + Get all the upstream and downstream entities of an entity, along with all edges connect them. + Only meaningful to features and data sources. 
+ """ + pass + + @abstractmethod + def get_project(self, id_or_name: Union[str, UUID]) -> EntitiesAndRelations: + """ + Get a project and everything inside of it, both entities and edges + """ + pass + + @abstractmethod + def search_entity(self, + keyword: str, + type: list[EntityType], + project: Optional[Union[str, UUID]] = None) -> list[EntityRef]: + """ + Search entities with specified type that also match the keyword in a project + """ + pass + + @abstractmethod + def create_project(self, definition: ProjectDef) -> UUID: + pass + + @abstractmethod + def create_project_datasource(self, project_id: UUID, definition: SourceDef) -> UUID: + pass + + @abstractmethod + def create_project_anchor(self, project_id: UUID, definition: AnchorDef) -> UUID: + pass + + @abstractmethod + def create_project_anchor_feature(self, project_id: UUID, anchor_id: UUID, definition: AnchorFeatureDef) -> UUID: + pass + + @abstractmethod + def create_project_derived_feature(self, project_id: UUID, definition: DerivedFeatureDef) -> UUID: + pass diff --git a/registry/purview-registry/registry/models.py b/registry/purview-registry/registry/models.py new file mode 100644 index 000000000..2f956a186 --- /dev/null +++ b/registry/purview-registry/registry/models.py @@ -0,0 +1,752 @@ +from abc import ABC, abstractmethod +from enum import Enum +from typing import Optional, Union +from uuid import UUID +import json +import re + + +def to_snake(d, level: int = 0): + """ + Convert `string`, `list[string]`, or all keys in a `dict` into snake case + The maximum length of input string or list is 100, or it will be truncated before being processed, for dict, the exception will be thrown if it has more than 100 keys. 
+ the maximum nested level is 10, otherwise the exception will be thrown + """ + if level >= 10: + raise ValueError("Too many nested levels") + if isinstance(d, str): + d = d[:100] + return re.sub(r'([A-Z]\w+$)', r'_\1', d).lower() + if isinstance(d, list): + d = d[:100] + return [to_snake(i, level + 1) if isinstance(i, (dict, list)) else i for i in d] + if len(d) > 100: + raise ValueError("Dict has too many keys") + return {to_snake(a, level + 1): to_snake(b, level + 1) if isinstance(b, (dict, list)) else b for a, b in d.items()} + + +def _to_type(value, type): + """ + Convert `value` into `type`, + or `list[type]` if `value` is a list + NOTE: This is **not** a generic implementation, only for objects in this module + """ + if isinstance(value, type): + return value + if isinstance(value, list): + return list([_to_type(v, type) for v in value]) + if isinstance(value, dict): + if hasattr(type, "new"): + try: + # The convention is to use `new` method to create the object from a dict + return type.new(**to_snake(value)) + except TypeError: + pass + return type(**to_snake(value)) + if issubclass(type, Enum): + try: + n = int(value) + return type(n) + except ValueError: + pass + if hasattr(type, "new"): + try: + # As well as Enum types, some of them have alias that cannot be handled by default Enum constructor + return type.new(value) + except KeyError: + pass + return type[value] + return type(value) + + +def _to_uuid(value): + return _to_type(value, UUID) + + +class ValueType(Enum): + UNSPECIFIED = 0 + BOOLEAN = 1 + INT = 2 + LONG = 3 + FLOAT = 4 + DOUBLE = 5 + STRING = 6 + BYTES = 7 + + +class VectorType(Enum): + TENSOR = 0 + + +class TensorCategory(Enum): + DENSE = 0 + SPARSE = 1 + + +class EntityType(Enum): + Project = 1 + Source = 2 + Anchor = 3 + AnchorFeature = 4 + DerivedFeature = 5 + + @staticmethod + def new(v): + return { + "feathr_workspace_v1": EntityType.Project, + "feathr_source_v1": EntityType.Source, + "feathr_anchor_v1": EntityType.Anchor, + 
"feathr_anchor_feature_v1": EntityType.AnchorFeature, + "feathr_derived_feature_v1": EntityType.DerivedFeature, + }[v] + + def __str__(self): + return { + EntityType.Project: "feathr_workspace_v1", + EntityType.Source: "feathr_source_v1", + EntityType.Anchor: "feathr_anchor_v1", + EntityType.AnchorFeature: "feathr_anchor_feature_v1", + EntityType.DerivedFeature: "feathr_derived_feature_v1", + }[self] + + +class RelationshipType(Enum): + Contains = 1 + BelongsTo = 2 + Consumes = 3 + Produces = 4 + + +class ToDict(ABC): + """ + This ABC is used to convert object to dict, then JSON. + """ + @abstractmethod + def to_dict(self) -> dict: + pass + + def to_json(self, indent=None) -> str: + return json.dumps(self.to_dict(), indent=indent) + + +class FeatureType(ToDict): + def __init__(self, + type: Union[str, VectorType], + tensor_category: Union[str, TensorCategory], + dimension_type: list[Union[str, ValueType]], + val_type: Union[str, ValueType]): + self.type = _to_type(type, VectorType) + self.tensor_category = _to_type(tensor_category, TensorCategory) + self.dimension_type = _to_type(dimension_type, ValueType) + self.val_type = _to_type(val_type, ValueType) + + def to_dict(self) -> dict: + return { + "type": self.type.name, + "tensorCategory": self.tensor_category.name, + "dimensionType": [t.name for t in self.dimension_type], + "valType": self.val_type.name, + } + + +class TypedKey(ToDict): + def __init__(self, + key_column: str, + key_column_type: ValueType, + full_name: Optional[str] = None, + description: Optional[str] = None, + key_column_alias: Optional[str] = None): + self.key_column = key_column + self.key_column_type = _to_type(key_column_type, ValueType) + self.full_name = full_name + self.description = description + self.key_column_alias = key_column_alias + + def to_dict(self) -> dict: + ret = { + "key_column": self.key_column, + "key_column_type": self.key_column_type.name, + } + if self.full_name is not None: + ret["full_name"] = self.full_name + if 
self.description is not None: + ret["description"] = self.full_name + if self.key_column_alias is not None: + ret["key_column_alias"] = self.key_column_alias + return ret + + +class Transformation(ToDict): + @staticmethod + def new(**kwargs): + if "transform_expr" in kwargs: + return ExpressionTransformation(**kwargs) + elif "def_expr" in kwargs: + return WindowAggregationTransformation(**kwargs) + elif "name" in kwargs: + return UdfTransformation(**kwargs) + else: + raise ValueError(kwargs) + + +class ExpressionTransformation(Transformation): + def __init__(self, transform_expr: str): + self.transform_expr = transform_expr + + def to_dict(self) -> dict: + return { + "transform_expr": self.transform_expr + } + + +class WindowAggregationTransformation(Transformation): + def __init__(self, + def_expr: str, + agg_func: Optional[str] = None, + window: Optional[str] = None, + group_by: Optional[str] = None, + filter: Optional[str] = None, + limit: Optional[int] = None): + self.def_expr = def_expr + self.agg_func = agg_func + self.window = window + self.group_by = group_by + self.filter = filter + self.limit = limit + + def to_dict(self) -> dict: + ret = { + "def_expr": self.def_expr, + } + if self.agg_func is not None: + ret["agg_func"] = self.agg_func + if self.window is not None: + ret["window"] = self.window + if self.group_by is not None: + ret["group_by"] = self.group_by + if self.filter is not None: + ret["filter"] = self.filter + if self.limit is not None: + ret["limit"] = self.limit + return ret + + +class UdfTransformation(Transformation): + def __init__(self, name: str): + self.name = name + + def to_dict(self) -> dict: + return { + "name": self.name + } + + +class EntityRef(ToDict): + def __init__(self, + id: UUID, + type: Union[str, EntityType], + qualified_name: Optional[str] = None, + uniq_attr: dict = {}): + self.id = id + self.type = _to_type(type, EntityType) + if qualified_name is not None: + self.uniq_attr = {"qualifiedName": qualified_name} + else: + 
self.uniq_attr = uniq_attr + + @property + def entity_type(self) -> EntityType: + return self.type + + @property + def qualified_name(self) -> EntityType: + return self.uniq_attr['qualifiedName'] + + def get_ref(self): + return self + + def to_dict(self) -> dict: + return { + "guid": str(self.id), + "typeName": str(self.type), + "uniqueAttributes": self.uniq_attr, + } + + +class Attributes(ToDict): + @staticmethod + def new(entity_type: Union[str, EntityType], **kwargs): + print("YYY ", entity_type, kwargs) + return { + EntityType.Project: ProjectAttributes, + EntityType.Source: SourceAttributes, + EntityType.Anchor: AnchorAttributes, + EntityType.AnchorFeature: AnchorFeatureAttributes, + EntityType.DerivedFeature: DerivedFeatureAttributes, + }[_to_type(entity_type, EntityType)](**kwargs) + + +class Entity(ToDict): + def __init__(self, + entity_id: Union[str, UUID], + qualified_name: str, + entity_type: Union[str, EntityType], + attributes: Union[dict, Attributes], + **kwargs): + self.id = _to_uuid(entity_id) + self.qualified_name = qualified_name + self.entity_type = _to_type(entity_type, EntityType) + if isinstance(attributes, Attributes): + self.attributes = attributes + else: + self.attributes = Attributes.new( + entity_type, **to_snake(attributes)) + + def get_ref(self) -> EntityRef: + return EntityRef(self.id, + self.attributes.entity_type, + self.qualified_name) + + def to_dict(self) -> dict: + return { + "guid": str(self.id), + "lastModifiedTS": "1", + "status": "ACTIVE", + "displayText": self.attributes.name, + "typeName": str(self.attributes.entity_type), + "attributes": self.attributes.to_dict(), + } + + def to_min_repr(self) -> dict: + return { + 'qualifiedName':self.qualified_name, + 'guid':str(self.id), + 'typeName':str(self.attributes.entity_type), + } + + +class ProjectAttributes(Attributes): + def __init__(self, + name: str, + children: list[Union[dict, Entity]] = [], + tags: dict = {}, + **kwargs): + self.name = name + self.tags = tags + 
self._children = [] + if len(children) > 0: + self.children = children + + @property + def entity_type(self) -> EntityType: + return EntityType.Project + + @property + def children(self): + return self._children + + @children.setter + def children(self, v: list[Union[dict, Entity]]): + for f in v: + if isinstance(f, Entity): + self._children.append(f) + elif isinstance(f, dict): + self._children.append(_to_type(f, Entity)) + else: + raise TypeError(f) + + @property + def sources(self): + return [ + e for e in self.children if e.entity_type == EntityType.Source] + + @property + def anchors(self): + return [ + e for e in self.children if e.entity_type == EntityType.Anchor] + + @property + def anchor_features(self): + return [ + e for e in self.children if e.entity_type == EntityType.AnchorFeature] + + @property + def derived_features(self): + return [ + e for e in self.children if e.entity_type == EntityType.DerivedFeature] + + def to_dict(self) -> dict: + return { + "qualifiedName": self.name, + "name": self.name, + "sources": list([e.get_ref().to_dict() for e in self.sources]), + "anchors": list([e.get_ref().to_dict() for e in self.anchors]), + "anchor_features": list([e.get_ref().to_dict() for e in self.anchor_features]), + "derived_features": list([e.get_ref().to_dict() for e in self.derived_features]), + "tags": self.tags, + } + + +class SourceAttributes(Attributes): + def __init__(self, + qualified_name: str, + name: str, + type: str, + path: str, + preprocessing: Optional[str] = None, + event_timestamp_column: Optional[str] = None, + timestamp_format: Optional[str] = None, + tags: dict = {}): + self.qualified_name = qualified_name + self.name = name + self.type = type + self.path = path + self.preprocessing = preprocessing + self.event_timestamp_column = event_timestamp_column + self.timestamp_format = timestamp_format + self.tags = tags + + @property + def entity_type(self) -> EntityType: + return EntityType.Source + + def to_dict(self) -> dict: + ret = { + 
"qualifiedName": self.qualified_name, + "name": self.name, + "type": self.type, + "path": self.path, + "tags": self.tags, + } + if self.preprocessing is not None: + ret["preprocessing"] = self.preprocessing + if self.event_timestamp_column is not None: + ret["eventTimestampColumn"] = self.event_timestamp_column + if self.timestamp_format is not None: + ret["timestampFormat"] = self.timestamp_format + return ret + + +class AnchorAttributes(Attributes): + def __init__(self, + qualified_name: str, + name: str, + # source: Optional[Union[dict, EntityRef, Entity]] = None, + # features: list[Union[dict, EntityRef, Entity]] = [], + tags: dict = {}, + **kwargs): + self.qualified_name = qualified_name + self.name = name + self._source = None + self._features = [] + # if source is not None: + # self._source = _to_type(source, Entity).get_ref() + # if features: + # self.features = features + self.tags = tags + if 'source' in kwargs: + self._source = kwargs['source'] + + @property + def entity_type(self) -> EntityType: + return EntityType.Anchor + + @property + def source(self) -> EntityRef: + return self._source + + @source.setter + def source(self, s): + if isinstance(s, Entity): + self._source = s.get_ref() + elif isinstance(s, EntityRef): + self._source = s + elif isinstance(s, dict): + self._source = _to_type(s, Entity).get_ref() + else: + raise TypeError(s) + + @property + def features(self): + return self._features + + @features.setter + def features(self, features): + self._features = [] + for f in features: + if isinstance(f, Entity): + self._features.append(f.get_ref()) + elif isinstance(f, EntityRef): + self._features.append(f) + elif isinstance(f, dict): + self._features.append(_to_type(f, Entity).get_ref()) + else: + raise TypeError(f) + + def to_dict(self) -> dict: + ret = { + "qualifiedName": self.qualified_name, + "name": self.name, + "features": list([e.get_ref().to_dict() for e in self.features]), + "tags": self.tags, + } + if self.source is not None: + 
ret["source"] = self.source.get_ref().to_dict() + return ret + + +class AnchorFeatureAttributes(Attributes): + def __init__(self, + qualified_name: str, + name: str, + type: Union[dict, FeatureType], + transformation: Union[dict, Transformation], + key: list[Union[dict, TypedKey]], + tags: dict = {}): + self.qualified_name = qualified_name + self.name = name + self.type = _to_type(type, FeatureType) + self.transformation = _to_type(transformation, Transformation) + self.key = _to_type(key, TypedKey) + self.tags = tags + + @property + def entity_type(self) -> EntityType: + return EntityType.AnchorFeature + + def to_dict(self) -> dict: + return { + "qualifiedName": self.qualified_name, + "name": self.name, + "type": self.type.to_dict(), + "transformation": self.transformation.to_dict(), + "key": list([k.to_dict() for k in self.key]), + "tags": self.tags, + } + + +class DerivedFeatureAttributes(Attributes): + def __init__(self, + qualified_name: str, + name: str, + type: Union[dict, FeatureType], + transformation: Union[dict, Transformation], + key: list[Union[dict, TypedKey]], + # input_anchor_features: list[Union[dict, EntityRef, Entity]] = [], + # input_derived_features: list[Union[dict, EntityRef, Entity]] = [], + tags: dict = {}, + **kwargs): + self.qualified_name = qualified_name + self.name = name + self.type = _to_type(type, FeatureType) + self.transformation = _to_type(transformation, Transformation) + self.key = _to_type(key, TypedKey) + self._input_anchor_features = [] + self._input_derived_features = [] + self.tags = tags + # self._set_input_anchor_features(input_anchor_features) + # self._set_input_derived_features(input_derived_features) + + @property + def entity_type(self) -> EntityType: + return EntityType.DerivedFeature + + @property + def input_features(self): + return self._input_anchor_features + self._input_derived_features + + @input_features.setter + def input_features(self, v: Union[dict, Entity, EntityRef]): + self._input_anchor_features = [] 
+ self._input_derived_features = [] + for f in v: + e = None + if isinstance(f, EntityRef): + e = f + elif isinstance(f, Entity): + e = f.get_ref() + elif isinstance(f, dict): + try: + e = _to_type(f, Entity).get_ref() + except: + e = _to_type(f, EntityRef) + else: + raise TypeError(f) + + if e.entity_type == EntityType.AnchorFeature: + self._input_anchor_features.append(e) + elif e.entity_type == EntityType.DerivedFeature: + self._input_derived_features.append(e) + else: + pass + + @property + def input_anchor_features(self): + return self._input_anchor_features + + @property + def input_derived_features(self): + return self._input_derived_features + + def to_dict(self) -> dict: + return { + "qualifiedName": self.qualified_name, + "name": self.name, + "type": self.type.to_dict(), + "transformation": self.transformation.to_dict(), + "key": list([k.to_dict() for k in self.key]), + "input_anchor_features": [e.to_dict() for e in self.input_anchor_features], + "input_derived_features": [e.to_dict() for e in self.input_derived_features], + "tags": self.tags, + } + + +class Edge(ToDict): + def __init__(self, + edge_id: Union[str, UUID], + from_id: Union[str, UUID], + to_id: Union[str, UUID], + conn_type: Union[str, RelationshipType]): + self.id = _to_uuid(edge_id) + self.from_id = _to_uuid(from_id) + self.to_id = _to_uuid(to_id) + self.conn_type = _to_type(conn_type, RelationshipType) + + def __eq__(self, o: object) -> bool: + # Edge ID is kinda useless + return self.from_id == o.from_id and self.to_id == o.to_id and self.conn_type == o.conn_type + + def __hash__(self) -> int: + return hash((self.from_id, self.to_id, self.conn_type)) + + def to_dict(self) -> dict: + return { + "relationshipId": str(self.id), + "fromEntityId": str(self.from_id), + "toEntityId": str(self.to_id), + "relationshipType": self.conn_type.name, + } + + +class EntitiesAndRelations(ToDict): + def __init__(self, entities: list[Entity], edges: list[Edge]): + self.entities = dict([(e.id, e) for e in 
entities]) + self.edges = set(edges) + + def to_dict(self) -> dict: + return { + "guidEntityMap": dict([(str(id), self.entities[id].to_dict()) for id in self.entities]), + "relations": list([e.to_dict() for e in self.edges]), + } + + +class ProjectDef: + def __init__(self, name: str, qualified_name: str = "", tags: dict = {}): + self.name = name + self.qualified_name = qualified_name + self.tags = tags + + def to_attr(self) -> ProjectAttributes: + return ProjectAttributes(name=self.name, tags=self.tags) + + +class SourceDef: + def __init__(self, + qualified_name: str, + name: str, + path: str, + type: str, + preprocessing: Optional[str] = None, + event_timestamp_column: Optional[str] = None, + timestamp_format: Optional[str] = None, + tags: dict = {}): + self.qualified_name = qualified_name + self.name = name + self.path = path + self.type = type + self.preprocessing = preprocessing + self.event_timestamp_column = event_timestamp_column + self.timestamp_format = timestamp_format + self.tags = tags + + def to_attr(self) -> SourceAttributes: + return SourceAttributes(qualified_name=self.qualified_name, + name=self.name, + type=self.type, + path=self.path, + preprocessing=self.preprocessing, + event_timestamp_column=self.event_timestamp_column, + timestamp_format=self.timestamp_format, + tags=self.tags) + +class AnchorDef: + def __init__(self, + qualified_name: str, + name: str, + source_id: Union[str, UUID], + tags: dict = {}): + self.qualified_name = qualified_name + self.name = name + self.source_id = _to_uuid(source_id) + self.tags = tags + + def to_attr(self, source: EntityRef) -> AnchorAttributes: + attr = AnchorAttributes(qualified_name=self.qualified_name, + name=self.name, + tags=self.tags) + attr.source = source + return attr + +class AnchorFeatureDef: + def __init__(self, + qualified_name: str, + name: str, + feature_type: Union[dict, FeatureType], + transformation: Union[dict, Transformation], + key: list[Union[dict, TypedKey]], + tags: dict = {}): + 
self.qualified_name = qualified_name + self.name = name + self.feature_type = _to_type(feature_type, FeatureType) + self.transformation = _to_type(transformation, Transformation) + self.key = _to_type(key, TypedKey) + self.tags = tags + + def to_attr(self) -> AnchorFeatureAttributes: + return AnchorFeatureAttributes(qualified_name=self.qualified_name, + name=self.name, + type=self.feature_type, + transformation=self.transformation, + key=self.key, + tags=self.tags) + + +class DerivedFeatureDef: + def __init__(self, + qualified_name: str, + name: str, + feature_type: Union[dict, FeatureType], + transformation: Union[dict, Transformation], + key: list[Union[dict, TypedKey]], + input_anchor_features: list[Union[str, UUID]], + input_derived_features: list[Union[str, UUID]], + tags: dict = {}): + self.qualified_name = qualified_name + self.name = name + self.feature_type = _to_type(feature_type, FeatureType) + self.transformation = _to_type(transformation, Transformation) + self.key = _to_type(key, TypedKey) + self.input_anchor_features = _to_uuid(input_anchor_features) + self.input_derived_features = _to_uuid(input_derived_features) + self.tags = tags + + def to_attr(self, input_features: list[EntityRef]) -> DerivedFeatureAttributes: + attr = DerivedFeatureAttributes(qualified_name=self.qualified_name, + name=self.name, + type=self.feature_type, + transformation=self.transformation, + key=self.key, + tags=self.tags) + attr.input_features = input_features + return attr + diff --git a/registry/purview-registry/registry/purview_registry.py b/registry/purview-registry/registry/purview_registry.py new file mode 100644 index 000000000..07839c83b --- /dev/null +++ b/registry/purview-registry/registry/purview_registry.py @@ -0,0 +1,451 @@ + +import itertools +import re +from typing import Optional, Tuple, Union +from uuid import UUID + +from azure.identity import DefaultAzureCredential +from loguru import logger +from numpy import allclose +from pyapacheatlas.auth.azcredential 
import AzCredentialWrapper +from pyapacheatlas.core import (AtlasEntity, AtlasProcess, + PurviewClient) +from pyapacheatlas.core.typedef import (AtlasAttributeDef,Cardinality,EntityTypeDef) +from pyapacheatlas.core.util import GuidTracker +from pyhocon import ConfigFactory + +from registry.interface import Registry +from registry.models import AnchorDef, AnchorFeatureDef, DerivedFeatureDef, Edge, EntitiesAndRelations, Entity, EntityRef, EntityType, ProjectDef, RelationshipType, SourceDef, _to_uuid +Label_Contains = "CONTAINS" +Label_BelongsTo = "BELONGSTO" +Label_Consumes = "CONSUMES" +Label_Produces = "PRODUCES" +class PurviewRegistry(Registry): + def __init__(self,azure_purview_name: str, registry_delimiter: str = "__", credential=None,register_types = False): + self.registry_delimiter = registry_delimiter + self.azure_purview_name = azure_purview_name + + self.credential = DefaultAzureCredential( + exclude_interactive_browser_credential=False) if credential is None else credential + self.oauth = AzCredentialWrapper(credential=self.credential) + self.purview_client = PurviewClient( + account_name=self.azure_purview_name, + authentication=self.oauth + ) + self.guid = GuidTracker(starting=-1000) + if register_types: + self._register_feathr_feature_types() + + def get_projects(self) -> list[str]: + """ + Returns the names of all projects + """ + searchTerm = {"entityType": str(EntityType.Project)} + result = self.purview_client.discovery.query(filter=searchTerm) + result_entities = result['value'] + return [x['qualifiedName'] for x in result_entities] + + def get_entity(self, id_or_name: Union[str, UUID],recursive = False) -> Entity: + id = self.get_entity_id(id_or_name) + if not id: + return None + purview_entity = self.purview_client.get_entity(id)['entities'][0] + entity_type = EntityType.new(purview_entity['typeName']) + if entity_type in [EntityType.AnchorFeature,EntityType.DerivedFeature]: + if "type" in purview_entity['attributes']: + conf = 
ConfigFactory.parse_string(purview_entity['attributes']['type']) + purview_entity['attributes']['type'] = dict(conf) + base_entity = Entity( + purview_entity["guid"], + purview_entity['attributes']["qualifiedName"], + entity_type, + attributes={x:y for x, y in purview_entity['attributes'].items() if y}) + if recursive: + if base_entity.entity_type == EntityType.Project: + edges = self.get_neighbors(base_entity.id, RelationshipType.Contains) + ids = list([e.to_id for e in edges]) + children = self.get_entities(ids) + base_entity.attributes.children = children + return base_entity + if base_entity.entity_type == EntityType.Anchor: + conn = self.get_neighbors(base_entity.id, RelationshipType.Contains) + feature_ids = [e.to_id for e in conn] + features = self.get_entities(feature_ids) + base_entity.attributes.features = features + source_id = self.get_neighbors( + base_entity.id, RelationshipType.Consumes)[0].to_id + source = self.get_entity(source_id) + base_entity.attributes.source = source + return base_entity + if base_entity.entity_type == EntityType.DerivedFeature: + conn = self.get_neighbors(base_entity.id, RelationshipType.Consumes) + feature_ids = [e.to_id for e in conn] + features = self.get_entities(feature_ids) + base_entity.attributes.input_features = features + return base_entity + return base_entity + + def get_entities(self, ids: list[UUID],recursive=False) -> list[Entity]: + """ + Get list of entities by their ids + """ + return [self.get_entity(x,recursive) for x in ids] + + def get_entity_id(self, id_or_name: Union[str, UUID]) -> UUID: + print(id_or_name) + try: + id = _to_uuid(id_or_name) + return id + except ValueError: + pass + # It is a name + return self._get_id_by_qualfiedName(id_or_name) + + def get_neighbors(self, id_or_name: Union[str, UUID], relationship: RelationshipType) -> list[Edge]: + """ + Get list of edges with specified type that connect to this entity. 
+ The edge contains fromId and toId so we can follow to the entity it connects to + """ + entity = self.get_entity(id_or_name) + + related_entities = self.purview_client.get_entity_lineage(str(entity.id),direction="BOTH")['guidEntityMap'] + process_entities = [v for _,v in related_entities.items() if v['typeName']=="Process"] + + project_contain_process =\ + [x for x in process_entities \ + if x['attributes']['qualifiedName'].startswith(\ + str(relationship.name).upper()+self.registry_delimiter+str(entity.id))] + + edge_end_object = [related_entities[\ + x['displayText'].split(' to ')[1]] \ + for x in project_contain_process \ + if x['displayText'].split(' to ')[1] in related_entities] + + result_edges = [Edge(x['guid'],str(entity.id),x['guid'],relationship) for x in edge_end_object] + return result_edges + + def get_lineage(self, id_or_name: Union[str, UUID]) -> EntitiesAndRelations: + """ + Get all the upstream and downstream entities of an entity, along with all edges connect them. + Only meaningful to features and data sources. 
+ """ + id = self.get_entity_id(id_or_name) + upstream_entities, upstream_edges = self._bfs( + id, RelationshipType.Consumes) + downstream_entities, downstream_edges = self._bfs( + id, RelationshipType.Produces) + return EntitiesAndRelations( + upstream_entities + downstream_entities, + upstream_edges + downstream_edges) + + def get_project(self, id_or_name: Union[str, UUID]) -> EntitiesAndRelations: + """ + Get a project and everything inside of it, both entities and edges + """ + return self.get_entity(id_or_name,True) + + def search_entity(self, + keyword: str, + type: list[EntityType], + project: Optional[Union[str, UUID]] = None) -> list[EntityRef]: + """ + Search entities with specified type that also match the keyword in a project + """ + pass + + def create_project(self, definition: ProjectDef) -> UUID: + attrs = definition.to_attr().to_dict() + feathr_project_entity = AtlasEntity( + name=attrs['name'], + qualified_name=attrs['qualifiedName'], + attributes=attrs['tags'], + typeName=str(EntityType.Project), + guid=self.guid.get_guid()) + + self._upload_entity_batch([feathr_project_entity]) + return UUID(feathr_project_entity.guid) + + def create_project_datasource(self, project_id: UUID, definition: SourceDef) -> UUID: + attrs = definition.to_attr().to_dict() + source_entity = AtlasEntity( + name=attrs['name'], + qualified_name=attrs['qualifiedName'], + attributes= {k:v for k,v in attrs.items() if k not in ['name','qualifiedName']}, + typeName=str(EntityType.Source), + guid=self.guid.get_guid(), + ) + self._upload_entity_batch( + [source_entity]) + + # change from AtlasEntity to Entity + project_entity = self.get_entity(project_id) + source_entity = self.get_entity(source_entity.guid) + + project_contains_source_relation = self._generate_relation_pairs( + project_entity, source_entity, Label_Contains) + self._upload_entity_batch(project_contains_source_relation) + + return source_entity.id + + def create_project_anchor(self, project_id: UUID, definition: 
AnchorDef) -> UUID: + source_entity = self.get_entity(definition.source_id) + attrs = definition.to_attr(source_entity).to_dict() + anchor_entity = AtlasEntity( + name=definition.name, + qualified_name=definition.qualified_name, + attributes= {k:v for k,v in attrs.items() if k not in ['name','qualifiedName']}, + typeName=str(EntityType.Anchor), + guid=self.guid.get_guid(), + ) + + self._upload_entity_batch( + [anchor_entity]) + + # change from AtlasEntity to Entity + project_entity = self.get_entity(project_id) + anchor_entity = self.get_entity(anchor_entity.guid) + + project_contains_anchor_relation = self._generate_relation_pairs( + project_entity, anchor_entity, Label_Contains) + anchor_consumes_source_relation = self._generate_relation_pairs( + anchor_entity,source_entity, Label_Consumes) + self._upload_entity_batch( + project_contains_anchor_relation + + anchor_consumes_source_relation) + return anchor_entity.id + + def create_project_anchor_feature(self, project_id: UUID, anchor_id: UUID, definition: AnchorFeatureDef) -> UUID: + attrs = definition.to_attr().to_dict() + anchor_feature_entity = AtlasEntity( + name=definition.name, + qualified_name=definition.qualified_name, + attributes= {k:v for k,v in attrs.items() if k not in ['name','qualifiedName']}, + typeName=str(EntityType.AnchorFeature), + guid=self.guid.get_guid()) + self._upload_entity_batch( + [anchor_feature_entity]) + + # change from AtlasEntity to Entity + project_entity = self.get_entity(project_id) + anchor_entity = self.get_entity(anchor_id) + anchor_feature_entity = self.get_entity(anchor_feature_entity.guid) + source_entity = self.get_entity(anchor_entity.attributes.source['guid']) + + project_contains_feature_relation = self._generate_relation_pairs( + project_entity, anchor_feature_entity, Label_Contains) + anchor_contains_feature_relation = self._generate_relation_pairs( + anchor_entity, anchor_feature_entity, Label_Contains) + feature_consumes_source_relation = 
self._generate_relation_pairs( + anchor_feature_entity, source_entity, Label_Consumes) + + self._upload_entity_batch( + project_contains_feature_relation + + anchor_contains_feature_relation + + feature_consumes_source_relation) + + return anchor_feature_entity.id + + + def create_project_derived_feature(self, project_id: UUID, definition: DerivedFeatureDef) -> UUID: + input_features = self.get_entities(definition.input_anchor_features+definition.input_derived_features) + attrs = definition.to_attr(input_features).to_dict() + derived_feature_entity = AtlasEntity( + name=definition.name, + qualified_name=definition.qualified_name, + attributes={k:v for k,v in attrs.items() if k not in ['name','qualifiedName']}, + typeName=str(EntityType.DerivedFeature), + guid=self.guid.get_guid()) + self._upload_entity_batch( + [derived_feature_entity]) + + # change from AtlasEntity to Entity + project_entity = self.get_entity(project_id) + derived_feature_entity = self.get_entity(derived_feature_entity.guid) + + feature_project_contain_belong_pairs = self._generate_relation_pairs( + project_entity, derived_feature_entity, Label_Contains) + + consume_produce_pairs = [] + for input_feature in input_features: + consume_produce_pairs += self._generate_relation_pairs( + derived_feature_entity, input_feature,Label_Consumes) + + self._upload_entity_batch( + feature_project_contain_belong_pairs + + consume_produce_pairs) + + return derived_feature_entity.id + def _bfs(self, id: UUID, conn_type: RelationshipType) -> Tuple[list[Entity], list[Edge]]: + """ + Breadth first traversal + Starts from `id`, follow edges with `conn_type` only. + + WARN: There is no depth limit. 
+ """ + id_to_process = [id] + entity_ids = [id] + edges = [] + + while len(id_to_process)!=0: + outbound_edges = self._bfs_step(id_to_process,conn_type) + edges += outbound_edges + next_step_ids = list(set([x.to_id for x in outbound_edges])) + entity_ids.extend(next_step_ids) + entity_ids = list(set(entity_ids)) + id_to_process = next_step_ids + + entities = self.get_entities(entity_ids,True) + return (entities,edges) + + + + def _bfs_step(self, ids: list[UUID], conn_type: RelationshipType) -> list[Edge]: + """ + One step of the BFS process + Returns all edges that connect to node ids the next step + """ + return list(itertools.chain(*[self.get_neighbors(id,conn_type) for id in ids])) + + + + def _register_feathr_feature_types(self): + """ + Register the feathr types if we haven't done so. Note that this only needs to be called once per provisioning + a system. Basically this function registers all the feature type definition in a Atlas compatible system. + """ + + # Each feature is registered under a certain Feathr project. 
The project should what we refer to, however for backward compatibility, the type name would be `feathr_workspace` + type_feathr_project = EntityTypeDef( + name=str(EntityType.Project), + attributeDefs=[ + # "anchor_features" and "derived_features" are removed, since we are moving to use process entity + AtlasAttributeDef(name="tags", typeName="map", + cardinality=Cardinality.SINGLE), + ], + superTypes=["DataSet"], + + ) + type_feathr_sources = EntityTypeDef( + name=str(EntityType.Source), + attributeDefs=[ + + AtlasAttributeDef( + name="path", typeName="string", cardinality=Cardinality.SINGLE), + AtlasAttributeDef(name="event_timestamp_column", + typeName="string", cardinality=Cardinality.SINGLE), + AtlasAttributeDef(name="timestamp_format", + typeName="string", cardinality=Cardinality.SINGLE), + AtlasAttributeDef(name="type", typeName="string", + cardinality=Cardinality.SINGLE), + AtlasAttributeDef(name="preprocessing", typeName="string", + cardinality=Cardinality.SINGLE), + AtlasAttributeDef(name="tags", typeName="map", + cardinality=Cardinality.SINGLE), + ], + superTypes=["DataSet"], + ) + + type_feathr_anchor_features = EntityTypeDef( + name=str(EntityType.AnchorFeature), + attributeDefs=[ + AtlasAttributeDef(name="type", typeName="string", + cardinality=Cardinality.SINGLE), + AtlasAttributeDef(name="key", typeName="array>", + cardinality=Cardinality.SET), + AtlasAttributeDef(name="transformation", typeName="map", + cardinality=Cardinality.SINGLE), + AtlasAttributeDef(name="tags", typeName="map", + cardinality=Cardinality.SINGLE), + ], + superTypes=["DataSet"], + ) + + type_feathr_derived_features = EntityTypeDef( + name=str(EntityType.DerivedFeature), + attributeDefs=[ + AtlasAttributeDef(name="type", typeName="string", + cardinality=Cardinality.SINGLE), + # "input_anchor_features" and "input_derived_features" are deleted, use process entity instead + AtlasAttributeDef(name="key", typeName="array>", + cardinality=Cardinality.SET), + 
AtlasAttributeDef(name="transformation", typeName="map", + cardinality=Cardinality.SINGLE), + AtlasAttributeDef(name="tags", typeName="map", + cardinality=Cardinality.SINGLE), + ], + superTypes=["DataSet"], + ) + + type_feathr_anchors = EntityTypeDef( + name=str(EntityType.Anchor), + attributeDefs=[ + # "source" will be removed, use process entity instead + # "features" will be removed, use process entity instead + AtlasAttributeDef(name="tags", typeName="map", + cardinality=Cardinality.SINGLE), + ], + superTypes=["DataSet"], + ) + + def_result = self.purview_client.upload_typedefs( + entityDefs=[type_feathr_anchor_features, type_feathr_anchors, + type_feathr_derived_features, type_feathr_sources, type_feathr_project], + force_update=True) + logger.info("Feathr Feature Type System Initialized.") + + def _upload_entity_batch(self, entity_batch): + for entity in entity_batch: + logger.info(f"Creating {entity.qualifiedName} \t ({entity.typeName})") + if self.purview_client.get_entity(qualifiedName=entity.qualifiedName, typeName=entity.typeName): + #raise RuntimeError(f"entity with qualified name '{entity.qualifiedName}' and type '{entity.typeName}' already exist.") + pass + results = self.purview_client.upload_entities( + batch=entity_batch) + if results: + dict = {x.guid: x for x in entity_batch} + for k, v in results['guidAssignments'].items(): + dict[k].guid = v + else: + raise RuntimeError("Feature registration failed.", results) + + def _generate_fully_qualified_name(self, segments): + return self.registry_delimiter.join(segments) + + def _generate_relation_pairs(self, from_entity:Entity, to_entity:Entity, relation_type): + type_lookup = {Label_Contains: Label_BelongsTo, Label_Consumes: Label_Produces} + + forward_relation = AtlasProcess( + name=str(from_entity.id) + " to " + str(to_entity.id), + typeName="Process", + qualified_name=self._generate_fully_qualified_name( + [relation_type,str(from_entity.id), str(to_entity.id)]), + 
inputs=[from_entity.to_min_repr()], + outputs=[to_entity.to_min_repr()], + guid=self.guid.get_guid()) + + backward_relation = AtlasProcess( + name=str(to_entity.id) + " to " + str(from_entity.id), + typeName="Process", + qualified_name=self._generate_fully_qualified_name( + [type_lookup[relation_type], str(to_entity.id), str(from_entity.id)]), + inputs=[to_entity.to_min_repr()], + outputs=[from_entity.to_min_repr()], + guid=self.guid.get_guid()) + return [forward_relation,backward_relation] + + def _get_id_by_qualfiedName(self, qualifiedName): + """ + Get guid of a feature given its qualifiedName + """ + query_filter = { + "attributeName": "qualifiedName", + "operator": "eq", + "attributeValue": qualifiedName + } + result = self.purview_client.discovery.query(keywords = None, filter=query_filter) + entities = result['value'] + # There should be exactly one result, but we don't enforce the check here + for entity in entities: + if entity.get('qualifiedName') == qualifiedName: + return entity.get('id') + \ No newline at end of file diff --git a/registry/purview-registry/requirements.txt b/registry/purview-registry/requirements.txt new file mode 100644 index 000000000..f0615cfd0 --- /dev/null +++ b/registry/purview-registry/requirements.txt @@ -0,0 +1,2 @@ +fastapi +uvicorn \ No newline at end of file diff --git a/registry/purview-registry/test/test_creation.py b/registry/purview-registry/test/test_creation.py new file mode 100644 index 000000000..d99364cfc --- /dev/null +++ b/registry/purview-registry/test/test_creation.py @@ -0,0 +1,23 @@ +from unicodedata import name +from registry.models import AnchorDef, AnchorFeatureDef, DerivedFeatureDef, ExpressionTransformation, FeatureType, ProjectDef, SourceDef, TensorCategory, TypedKey, ValueType, VectorType +from registry.purview_registry import PurviewRegistry + +registry = PurviewRegistry("feathrazuretest3-purview1") + +proj_id = 
registry.create_project(ProjectDef("yihui_test_registry","yihui_test_registry",{"obsolete":"False"})) + +source_id = registry.create_project_datasource(proj_id,SourceDef(name="source1", qualified_name="yihui_test_registry__source1", path="hdfs://somewhere", type="hdfs")) + +anchor1_id = registry.create_project_anchor(proj_id, AnchorDef( + qualified_name="yihui_test_registry__anchor1", name="anchor1", source_id=source_id)) +ft1 = FeatureType(type=VectorType.TENSOR, tensor_category=TensorCategory.DENSE, + dimension_type=[], val_type=ValueType.INT) +t1 = ExpressionTransformation("af1") +k = TypedKey(key_column="c1", key_column_type=ValueType.INT) + +feature1 = registry.create_project_anchor_feature(proj_id, anchor1_id, AnchorFeatureDef( + qualified_name="yihui_test_registry__anchor1__af1", name="af1", feature_type=ft1, transformation=t1, key=[k])) +derived = registry.create_project_derived_feature(proj_id, DerivedFeatureDef(qualified_name="yihui_test_registry__df1", + name="df1", feature_type=ft1, transformation=t1, key=[k], input_anchor_features=[feature1], input_derived_features=[])) + +print(proj_id,source_id,anchor1_id,feature1,derived) diff --git a/registry/purview-registry/test/test_get.py b/registry/purview-registry/test/test_get.py new file mode 100644 index 000000000..1175a2dd2 --- /dev/null +++ b/registry/purview-registry/test/test_get.py @@ -0,0 +1,54 @@ +from registry.models import ProjectDef, RelationshipType +from registry.purview_registry import PurviewRegistry + +registry = PurviewRegistry("feathrazuretest3-purview1") +projects = registry.get_projects() + +entity_id_by_name = registry.get_entity_id("yihui_test_registry") +entity_id_by_id = registry.get_entity_id(entity_id_by_name) + +entity_object = registry.get_entity("yihui_test_registry") +entity_object_full = registry.get_entity("yihui_test_registry",True) +assert len(entity_object.attributes.anchor_features)==0 +assert len(entity_object_full.attributes.anchor_features)==1 +assert 
len(entity_object_full.attributes.derived_features)==1 + +anchor_object_full = registry.get_entity("yihui_test_registry__anchor1",True) +assert len(anchor_object_full.attributes.features)==1 + +derived_object_full = registry.get_entity("yihui_test_registry__df1",True) +assert len(derived_object_full.attributes.input_features)==1 +assert len(derived_object_full.attributes.input_anchor_features)==1 + +entity_list=registry.get_entities([registry.get_entity_id(x) for x in [ + 'yihui_test_registry', + 'yihui_test_registry__source1', + 'yihui_test_registry__anchor1', + 'yihui_test_registry__anchor1__af1', + 'yihui_test_registry__df1']],True) + +print(entity_list) +assert len(entity_list)==5 + +# project contains anchor group, anchor feature, derived feature and data source +neighbors = registry.get_neighbors("yihui_test_registry",RelationshipType.Contains) +assert len(neighbors)==4 + +# anchor group contains anchor feature +neighbors = registry.get_neighbors("yihui_test_registry__anchor1",RelationshipType.Contains) +assert len(neighbors)==1 + +# source produces anchor feature and anchor group +neighbors = registry.get_neighbors("yihui_test_registry__source1",RelationshipType.Produces) +assert len(neighbors)==2 + +df_lineage = registry.get_lineage('yihui_test_registry__df1') +# df1 Consumes af1 , af1 consumes source +assert len(df_lineage.entities)==3 +assert len(df_lineage.edges)==2 + +anchor_lineage = registry.get_lineage('yihui_test_registry__anchor1') +# anchor CONTAINS feature (which is not captured in lineage) +# anchor consumes source +assert len(anchor_lineage.entities)==2 +assert len(anchor_lineage.edges)==1 \ No newline at end of file From 1102669db321cb0648ca851a62c57aafe5f706c5 Mon Sep 17 00:00:00 2001 From: Yihui Guo Date: Tue, 28 Jun 2022 20:35:48 +0800 Subject: [PATCH 2/6] align with latest model.py --- registry/purview-registry/main.py | 10 +-- registry/purview-registry/registry/models.py | 83 +++++++++++++------ .../registry/purview_registry.py | 3 +- 3 
files changed, 62 insertions(+), 34 deletions(-) diff --git a/registry/purview-registry/main.py b/registry/purview-registry/main.py index 4d34bdb1c..18efa6db3 100644 --- a/registry/purview-registry/main.py +++ b/registry/purview-registry/main.py @@ -7,7 +7,7 @@ from registry.purview_registry import PurviewRegistry from registry.models import AnchorDef, AnchorFeatureDef, DerivedFeatureDef, EntityType, ProjectDef, SourceDef, to_snake -rp = "/" +rp = "/v1" try: rp = os.environ["API_BASE"] if rp[0] != '/': @@ -16,7 +16,7 @@ pass print("Using API BASE: ", rp) -registry = PurviewRegistry() +registry = PurviewRegistry("feathrazuretest3-purview1") app = FastAPI() router = APIRouter() @@ -50,10 +50,10 @@ def get_project_datasources(project: str) -> list: @router.get("/projects/{project}/features") def get_project_features(project: str, keyword: Optional[str] = None) -> list: if keyword is None: - p = registry.get_entity(project) + p = registry.get_entity(project,True) feature_ids = [s.id for s in p.attributes.anchor_features] + \ [s.id for s in p.attributes.derived_features] - features = registry.get_entities(feature_ids) + features = registry.get_entities(feature_ids,True) return list([e.to_dict() for e in features]) else: efs = registry.search_entity( @@ -65,7 +65,7 @@ def get_project_features(project: str, keyword: Optional[str] = None) -> list: @router.get("/features/{feature}") def get_feature(feature: str) -> dict: - e = registry.get_entity(feature) + e = registry.get_entity(feature,True) if e.entity_type not in [EntityType.DerivedFeature, EntityType.AnchorFeature]: raise HTTPException( status_code=404, detail=f"Feature {feature} not found") diff --git a/registry/purview-registry/registry/models.py b/registry/purview-registry/registry/models.py index 2f956a186..d1e174e0d 100644 --- a/registry/purview-registry/registry/models.py +++ b/registry/purview-registry/registry/models.py @@ -16,7 +16,7 @@ def to_snake(d, level: int = 0): raise ValueError("Too many nested 
levels") if isinstance(d, str): d = d[:100] - return re.sub(r'([A-Z]\w+$)', r'_\1', d).lower() + return re.sub(r'(? bool: + return self.type == o.type \ + and self.tensor_category == o.tensor_category \ + and self.dimension_type == o.dimension_type \ + and self.val_type == o.val_type + def to_dict(self) -> dict: return { "type": self.type.name, @@ -162,6 +168,13 @@ def __init__(self, self.description = description self.key_column_alias = key_column_alias + def __eq__(self, o: object) -> bool: + if not isinstance(o, TypedKey): + return False + return self.key_column == o.key_column \ + and self.key_column_type == o.key_column_type \ + and self.key_column_alias == o.key_column_alias + def to_dict(self) -> dict: ret = { "key_column": self.key_column, @@ -193,6 +206,11 @@ class ExpressionTransformation(Transformation): def __init__(self, transform_expr: str): self.transform_expr = transform_expr + def __eq__(self, o: object) -> bool: + if not isinstance(o, ExpressionTransformation): + return False + return self.transform_expr == o.transform_expr + def to_dict(self) -> dict: return { "transform_expr": self.transform_expr @@ -214,6 +232,16 @@ def __init__(self, self.filter = filter self.limit = limit + def __eq__(self, o: object) -> bool: + if not isinstance(o, WindowAggregationTransformation): + return False + return self.def_expr == o.def_expr \ + and self.agg_func == o.agg_func \ + and self.window == o.window \ + and self.group_by == o.group_by \ + and self.filter == o.filter \ + and self.limit == o.limit + def to_dict(self) -> dict: ret = { "def_expr": self.def_expr, @@ -235,6 +263,11 @@ class UdfTransformation(Transformation): def __init__(self, name: str): self.name = name + def __eq__(self, o: object) -> bool: + if not isinstance(o, UdfTransformation): + return False + return self.name == o.name + def to_dict(self) -> dict: return { "name": self.name @@ -276,7 +309,6 @@ def to_dict(self) -> dict: class Attributes(ToDict): @staticmethod def new(entity_type: 
Union[str, EntityType], **kwargs): - print("YYY ", entity_type, kwargs) return { EntityType.Project: ProjectAttributes, EntityType.Source: SourceAttributes, @@ -316,7 +348,7 @@ def to_dict(self) -> dict: "typeName": str(self.attributes.entity_type), "attributes": self.attributes.to_dict(), } - + def to_min_repr(self) -> dict: return { 'qualifiedName':self.qualified_name, @@ -532,8 +564,8 @@ def __init__(self, type: Union[dict, FeatureType], transformation: Union[dict, Transformation], key: list[Union[dict, TypedKey]], - # input_anchor_features: list[Union[dict, EntityRef, Entity]] = [], - # input_derived_features: list[Union[dict, EntityRef, Entity]] = [], + input_anchor_features: list[Union[dict, EntityRef, Entity]] = [], + input_derived_features: list[Union[dict, EntityRef, Entity]] = [], tags: dict = {}, **kwargs): self.qualified_name = qualified_name @@ -544,8 +576,6 @@ def __init__(self, self._input_anchor_features = [] self._input_derived_features = [] self.tags = tags - # self._set_input_anchor_features(input_anchor_features) - # self._set_input_derived_features(input_derived_features) @property def entity_type(self) -> EntityType: @@ -556,27 +586,27 @@ def input_features(self): return self._input_anchor_features + self._input_derived_features @input_features.setter - def input_features(self, v: Union[dict, Entity, EntityRef]): + def input_features(self, input_features_list: Union[dict, Entity, EntityRef]): self._input_anchor_features = [] self._input_derived_features = [] - for f in v: - e = None - if isinstance(f, EntityRef): - e = f - elif isinstance(f, Entity): - e = f.get_ref() - elif isinstance(f, dict): + for feature in input_features_list: + entity = None + if isinstance(feature, EntityRef): + entity = feature + elif isinstance(feature, Entity): + entity = feature.get_ref() + elif isinstance(feature, dict): try: - e = _to_type(f, Entity).get_ref() + entity = _to_type(feature, Entity).get_ref() except: - e = _to_type(f, EntityRef) + entity = 
_to_type(feature, EntityRef) else: - raise TypeError(f) + raise TypeError(feature) - if e.entity_type == EntityType.AnchorFeature: - self._input_anchor_features.append(e) - elif e.entity_type == EntityType.DerivedFeature: - self._input_derived_features.append(e) + if entity.entity_type == EntityType.AnchorFeature: + self._input_anchor_features.append(entity) + elif entity.entity_type == EntityType.DerivedFeature: + self._input_derived_features.append(entity) else: pass @@ -652,10 +682,10 @@ def to_attr(self) -> ProjectAttributes: class SourceDef: def __init__(self, - qualified_name: str, name: str, path: str, type: str, + qualified_name: str = "", preprocessing: Optional[str] = None, event_timestamp_column: Optional[str] = None, timestamp_format: Optional[str] = None, @@ -681,9 +711,9 @@ def to_attr(self) -> SourceAttributes: class AnchorDef: def __init__(self, - qualified_name: str, name: str, source_id: Union[str, UUID], + qualified_name: str = "", tags: dict = {}): self.qualified_name = qualified_name self.name = name @@ -699,11 +729,11 @@ def to_attr(self, source: EntityRef) -> AnchorAttributes: class AnchorFeatureDef: def __init__(self, - qualified_name: str, name: str, feature_type: Union[dict, FeatureType], transformation: Union[dict, Transformation], key: list[Union[dict, TypedKey]], + qualified_name: str = "", tags: dict = {}): self.qualified_name = qualified_name self.name = name @@ -723,13 +753,13 @@ def to_attr(self) -> AnchorFeatureAttributes: class DerivedFeatureDef: def __init__(self, - qualified_name: str, name: str, feature_type: Union[dict, FeatureType], transformation: Union[dict, Transformation], key: list[Union[dict, TypedKey]], input_anchor_features: list[Union[str, UUID]], input_derived_features: list[Union[str, UUID]], + qualified_name: str = "", tags: dict = {}): self.qualified_name = qualified_name self.name = name @@ -749,4 +779,3 @@ def to_attr(self, input_features: list[EntityRef]) -> DerivedFeatureAttributes: tags=self.tags) 
attr.input_features = input_features return attr - diff --git a/registry/purview-registry/registry/purview_registry.py b/registry/purview-registry/registry/purview_registry.py index 07839c83b..200f98434 100644 --- a/registry/purview-registry/registry/purview_registry.py +++ b/registry/purview-registry/registry/purview_registry.py @@ -92,7 +92,6 @@ def get_entities(self, ids: list[UUID],recursive=False) -> list[Entity]: return [self.get_entity(x,recursive) for x in ids] def get_entity_id(self, id_or_name: Union[str, UUID]) -> UUID: - print(id_or_name) try: id = _to_uuid(id_or_name) return id @@ -229,7 +228,7 @@ def create_project_anchor_feature(self, project_id: UUID, anchor_id: UUID, defin project_entity = self.get_entity(project_id) anchor_entity = self.get_entity(anchor_id) anchor_feature_entity = self.get_entity(anchor_feature_entity.guid) - source_entity = self.get_entity(anchor_entity.attributes.source['guid']) + source_entity = self.get_entity(anchor_entity.id) project_contains_feature_relation = self._generate_relation_pairs( project_entity, anchor_feature_entity, Label_Contains) From a93686f21308f0aca8e2f2f02fa073592479bfb1 Mon Sep 17 00:00:00 2001 From: Yihui Guo Date: Thu, 30 Jun 2022 22:15:29 +0800 Subject: [PATCH 3/6] Merge qualifiedName on server side --- .../registry/purview_registry.py | 69 ++++++++++++------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/registry/purview-registry/registry/purview_registry.py b/registry/purview-registry/registry/purview_registry.py index 200f98434..f3fac535f 100644 --- a/registry/purview-registry/registry/purview_registry.py +++ b/registry/purview-registry/registry/purview_registry.py @@ -1,12 +1,14 @@ +from http.client import CONFLICT, HTTPException import itertools import re -from typing import Optional, Tuple, Union +from typing import Any, Optional, Tuple, Union +from urllib.error import HTTPError from uuid import UUID from azure.identity import DefaultAzureCredential from loguru import logger 
-from numpy import allclose +from numpy import allclose, typename from pyapacheatlas.auth.azcredential import AzCredentialWrapper from pyapacheatlas.core import (AtlasEntity, AtlasProcess, PurviewClient) @@ -15,7 +17,7 @@ from pyhocon import ConfigFactory from registry.interface import Registry -from registry.models import AnchorDef, AnchorFeatureDef, DerivedFeatureDef, Edge, EntitiesAndRelations, Entity, EntityRef, EntityType, ProjectDef, RelationshipType, SourceDef, _to_uuid +from registry.models import AnchorDef, AnchorFeatureDef, Attributes, DerivedFeatureDef, Edge, EntitiesAndRelations, Entity, EntityRef, EntityType, ProjectDef, RelationshipType, SourceDef, _to_uuid Label_Contains = "CONTAINS" Label_BelongsTo = "BELONGSTO" Label_Consumes = "CONSUMES" @@ -50,16 +52,7 @@ def get_entity(self, id_or_name: Union[str, UUID],recursive = False) -> Entity: if not id: return None purview_entity = self.purview_client.get_entity(id)['entities'][0] - entity_type = EntityType.new(purview_entity['typeName']) - if entity_type in [EntityType.AnchorFeature,EntityType.DerivedFeature]: - if "type" in purview_entity['attributes']: - conf = ConfigFactory.parse_string(purview_entity['attributes']['type']) - purview_entity['attributes']['type'] = dict(conf) - base_entity = Entity( - purview_entity["guid"], - purview_entity['attributes']["qualifiedName"], - entity_type, - attributes={x:y for x, y in purview_entity['attributes'].items() if y}) + base_entity = self._atlasEntity_to_entity(purview_entity) if recursive: if base_entity.entity_type == EntityType.Project: edges = self.get_neighbors(base_entity.id, RelationshipType.Contains) @@ -84,6 +77,20 @@ def get_entity(self, id_or_name: Union[str, UUID],recursive = False) -> Entity: base_entity.attributes.input_features = features return base_entity return base_entity + + def _atlasEntity_to_entity(self, purview_entity): + entity_type = EntityType.new(purview_entity['typeName']) + if entity_type in 
[EntityType.AnchorFeature,EntityType.DerivedFeature]: + if "type" in purview_entity['attributes']: + conf = ConfigFactory.parse_string(purview_entity['attributes']['type']) + purview_entity['attributes']['type'] = dict(conf) + base_entity = Entity( + purview_entity["guid"], + purview_entity['attributes']["qualifiedName"], + entity_type, + attributes={x:y for x, y in purview_entity['attributes'].items() if y}) + + return base_entity def get_entities(self, ids: list[UUID],recursive=False) -> list[Entity]: """ @@ -156,7 +163,7 @@ def create_project(self, definition: ProjectDef) -> UUID: attrs = definition.to_attr().to_dict() feathr_project_entity = AtlasEntity( name=attrs['name'], - qualified_name=attrs['qualifiedName'], + qualified_name=attrs['name'], attributes=attrs['tags'], typeName=str(EntityType.Project), guid=self.guid.get_guid()) @@ -165,10 +172,12 @@ def create_project(self, definition: ProjectDef) -> UUID: return UUID(feathr_project_entity.guid) def create_project_datasource(self, project_id: UUID, definition: SourceDef) -> UUID: + project_entity = self.get_entity(project_id) attrs = definition.to_attr().to_dict() + qualified_name = self.registry_delimiter.join([project_entity.qualified_name,attrs['name']]) source_entity = AtlasEntity( name=attrs['name'], - qualified_name=attrs['qualifiedName'], + qualified_name=qualified_name, attributes= {k:v for k,v in attrs.items() if k not in ['name','qualifiedName']}, typeName=str(EntityType.Source), guid=self.guid.get_guid(), @@ -189,9 +198,11 @@ def create_project_datasource(self, project_id: UUID, definition: SourceDef) -> def create_project_anchor(self, project_id: UUID, definition: AnchorDef) -> UUID: source_entity = self.get_entity(definition.source_id) attrs = definition.to_attr(source_entity).to_dict() + project_entity = self.get_entity(project_id) + qualified_name = self.registry_delimiter.join([project_entity.qualified_name,attrs['name']]) anchor_entity = AtlasEntity( name=definition.name, - 
qualified_name=definition.qualified_name, + qualified_name=qualified_name, attributes= {k:v for k,v in attrs.items() if k not in ['name','qualifiedName']}, typeName=str(EntityType.Anchor), guid=self.guid.get_guid(), @@ -201,7 +212,6 @@ def create_project_anchor(self, project_id: UUID, definition: AnchorDef) -> UUID [anchor_entity]) # change from AtlasEntity to Entity - project_entity = self.get_entity(project_id) anchor_entity = self.get_entity(anchor_entity.guid) project_contains_anchor_relation = self._generate_relation_pairs( @@ -215,9 +225,15 @@ def create_project_anchor(self, project_id: UUID, definition: AnchorDef) -> UUID def create_project_anchor_feature(self, project_id: UUID, anchor_id: UUID, definition: AnchorFeatureDef) -> UUID: attrs = definition.to_attr().to_dict() + project_entity = self.get_entity(project_id) + anchor_entity = self.get_entity(anchor_id) + qualified_name = self.registry_delimiter.join([project_entity.qualified_name, + anchor_entity.attributes.name, + attrs['name']]) + anchor_feature_entity = AtlasEntity( name=definition.name, - qualified_name=definition.qualified_name, + qualified_name=qualified_name, attributes= {k:v for k,v in attrs.items() if k not in ['name','qualifiedName']}, typeName=str(EntityType.AnchorFeature), guid=self.guid.get_guid()) @@ -225,8 +241,6 @@ def create_project_anchor_feature(self, project_id: UUID, anchor_id: UUID, defin [anchor_feature_entity]) # change from AtlasEntity to Entity - project_entity = self.get_entity(project_id) - anchor_entity = self.get_entity(anchor_id) anchor_feature_entity = self.get_entity(anchor_feature_entity.guid) source_entity = self.get_entity(anchor_entity.id) @@ -248,9 +262,11 @@ def create_project_anchor_feature(self, project_id: UUID, anchor_id: UUID, defin def create_project_derived_feature(self, project_id: UUID, definition: DerivedFeatureDef) -> UUID: input_features = self.get_entities(definition.input_anchor_features+definition.input_derived_features) attrs = 
definition.to_attr(input_features).to_dict() + project_entity = self.get_entity(project_id) + qualified_name = self.registry_delimiter.join([project_entity.qualified_name,attrs['name']]) derived_feature_entity = AtlasEntity( name=definition.name, - qualified_name=definition.qualified_name, + qualified_name=qualified_name, attributes={k:v for k,v in attrs.items() if k not in ['name','qualifiedName']}, typeName=str(EntityType.DerivedFeature), guid=self.guid.get_guid()) @@ -395,9 +411,10 @@ def _register_feathr_feature_types(self): def _upload_entity_batch(self, entity_batch): for entity in entity_batch: logger.info(f"Creating {entity.qualifiedName} \t ({entity.typeName})") - if self.purview_client.get_entity(qualifiedName=entity.qualifiedName, typeName=entity.typeName): - #raise RuntimeError(f"entity with qualified name '{entity.qualifiedName}' and type '{entity.typeName}' already exist.") - pass + existing_entity = self.purview_client.get_entity(qualifiedName=entity.qualifiedName, typeName=entity.typeName) + if existing_entity: + # perform attribute check, return id if same, conflict when different. 
+ raise HTTPException(CONFLICT,"Entity with same qualified name and type exists") results = self.purview_client.upload_entities( batch=entity_batch) if results: @@ -405,8 +422,8 @@ def _upload_entity_batch(self, entity_batch): for k, v in results['guidAssignments'].items(): dict[k].guid = v else: - raise RuntimeError("Feature registration failed.", results) - + raise RuntimeError("Feature registration failed.", results) + def _generate_fully_qualified_name(self, segments): return self.registry_delimiter.join(segments) From 67b6db3afd66419147f02d496ac39429f9ff836e Mon Sep 17 00:00:00 2001 From: Yihui Guo Date: Sun, 3 Jul 2022 19:02:29 +0800 Subject: [PATCH 4/6] Update entity def --- .../registry/purview_registry.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/registry/purview-registry/registry/purview_registry.py b/registry/purview-registry/registry/purview_registry.py index f3fac535f..827a062c4 100644 --- a/registry/purview-registry/registry/purview_registry.py +++ b/registry/purview-registry/registry/purview_registry.py @@ -22,6 +22,9 @@ Label_BelongsTo = "BELONGSTO" Label_Consumes = "CONSUMES" Label_Produces = "PRODUCES" +TYPEDEF_ARRAY_ANCHOR=f"array" +TYPEDEF_ARRAY_DERIVED_FEATURE=f"array" +TYPEDEF_ARRAY_ANCHOR_FEATURE=f"array" class PurviewRegistry(Registry): def __init__(self,azure_purview_name: str, registry_delimiter: str = "__", credential=None,register_types = False): self.registry_delimiter = registry_delimiter @@ -325,15 +328,21 @@ def _bfs_step(self, ids: list[UUID], conn_type: RelationshipType) -> list[Edge]: def _register_feathr_feature_types(self): """ - Register the feathr types if we haven't done so. Note that this only needs to be called once per provisioning + Register the feathr types if we haven't done so. Note that this only needs to be called once per provisioning a system. Basically this function registers all the feature type definition in a Atlas compatible system. 
""" - + # Since old version of entity type definitions already exist, this method will not be called by default. + # Current schema is backward-compatible with existing. calling this method again will leads to "fail to delete def" error. + # In the future, if moving to V2, call this method in registry initialization. # Each feature is registered under a certain Feathr project. The project should what we refer to, however for backward compatibility, the type name would be `feathr_workspace` type_feathr_project = EntityTypeDef( name=str(EntityType.Project), attributeDefs=[ # "anchor_features" and "derived_features" are removed, since we are moving to use process entity + AtlasAttributeDef( + name="anchor_features", typeName=TYPEDEF_ARRAY_ANCHOR, cardinality=Cardinality.SET), + AtlasAttributeDef( + name="derived_features", typeName=TYPEDEF_ARRAY_DERIVED_FEATURE, cardinality=Cardinality.SET), AtlasAttributeDef(name="tags", typeName="map", cardinality=Cardinality.SINGLE), ], @@ -381,6 +390,10 @@ def _register_feathr_feature_types(self): AtlasAttributeDef(name="type", typeName="string", cardinality=Cardinality.SINGLE), # "input_anchor_features" and "input_derived_features" are deleted, use process entity instead + AtlasAttributeDef(name="input_anchor_features", typeName=TYPEDEF_ARRAY_ANCHOR_FEATURE, + cardinality=Cardinality.SET), + AtlasAttributeDef(name="input_derived_features", typeName=TYPEDEF_ARRAY_DERIVED_FEATURE, + cardinality=Cardinality.SET), AtlasAttributeDef(name="key", typeName="array>", cardinality=Cardinality.SET), AtlasAttributeDef(name="transformation", typeName="map", @@ -396,6 +409,10 @@ def _register_feathr_feature_types(self): attributeDefs=[ # "source" will be removed, use process entity instead # "features" will be removed, use process entity instead + AtlasAttributeDef( + name="source", typeName=str(EntityType.Source), cardinality=Cardinality.SINGLE), + AtlasAttributeDef( + name="features", typeName=TYPEDEF_ARRAY_ANCHOR_FEATURE, 
cardinality=Cardinality.SET), AtlasAttributeDef(name="tags", typeName="map", cardinality=Cardinality.SINGLE), ], From 5c33fb67ed068db4e2af0f7ac18378e501f8910f Mon Sep 17 00:00:00 2001 From: Yihui Guo Date: Sun, 3 Jul 2022 19:06:33 +0800 Subject: [PATCH 5/6] Read purview name from os.env --- registry/purview-registry/main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/registry/purview-registry/main.py b/registry/purview-registry/main.py index 18efa6db3..aecbb0d7a 100644 --- a/registry/purview-registry/main.py +++ b/registry/purview-registry/main.py @@ -16,7 +16,9 @@ pass print("Using API BASE: ", rp) -registry = PurviewRegistry("feathrazuretest3-purview1") +# os.environ['PURVIEW_NAME'] = "feathrazuretest3-purview1" +purview_name = os.environ["PURVIEW_NAME"] +registry = PurviewRegistry(purview_name) app = FastAPI() router = APIRouter() From c2f7a8de3b2927a52683639c0385696c7ad5d7ac Mon Sep 17 00:00:00 2001 From: Yihui Guo Date: Sun, 3 Jul 2022 21:05:23 +0800 Subject: [PATCH 6/6] Fix docker file --- registry/purview-registry/Dockerfile | 1 + registry/purview-registry/registry/purview_registry.py | 4 +--- registry/purview-registry/requirements.txt | 9 ++++++++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/registry/purview-registry/Dockerfile b/registry/purview-registry/Dockerfile index d2647021d..125d88304 100644 --- a/registry/purview-registry/Dockerfile +++ b/registry/purview-registry/Dockerfile @@ -4,6 +4,7 @@ COPY ./ /usr/src WORKDIR /usr/src RUN pip install -r requirements.txt +EXPOSE 80 # Start web server CMD [ "uvicorn","main:app","--host", "0.0.0.0", "--port", "80" ] diff --git a/registry/purview-registry/registry/purview_registry.py b/registry/purview-registry/registry/purview_registry.py index 827a062c4..bb9c77b98 100644 --- a/registry/purview-registry/registry/purview_registry.py +++ b/registry/purview-registry/registry/purview_registry.py @@ -1,14 +1,12 @@ from http.client import CONFLICT, HTTPException import 
itertools -import re from typing import Any, Optional, Tuple, Union from urllib.error import HTTPError from uuid import UUID from azure.identity import DefaultAzureCredential from loguru import logger -from numpy import allclose, typename from pyapacheatlas.auth.azcredential import AzCredentialWrapper from pyapacheatlas.core import (AtlasEntity, AtlasProcess, PurviewClient) @@ -17,7 +15,7 @@ from pyhocon import ConfigFactory from registry.interface import Registry -from registry.models import AnchorDef, AnchorFeatureDef, Attributes, DerivedFeatureDef, Edge, EntitiesAndRelations, Entity, EntityRef, EntityType, ProjectDef, RelationshipType, SourceDef, _to_uuid +from registry.models import AnchorDef, AnchorFeatureDef, DerivedFeatureDef, Edge, EntitiesAndRelations, Entity, EntityRef, EntityType, ProjectDef, RelationshipType, SourceDef, _to_uuid Label_Contains = "CONTAINS" Label_BelongsTo = "BELONGSTO" Label_Consumes = "CONSUMES" diff --git a/registry/purview-registry/requirements.txt b/registry/purview-registry/requirements.txt index f0615cfd0..3c7294c73 100644 --- a/registry/purview-registry/requirements.txt +++ b/registry/purview-registry/requirements.txt @@ -1,2 +1,9 @@ +azure-core +azure-purview-catalog==1.0.0b2 fastapi -uvicorn \ No newline at end of file +opencensus-ext-azure +pyapacheatlas +pydantic +uvicorn +loguru +pyhocon