From e629a2e3b252139b31daee8a3bc550419c0b9245 Mon Sep 17 00:00:00 2001 From: Chen Xu Date: Thu, 23 Jun 2022 20:02:18 +0800 Subject: [PATCH 1/3] Create entity --- registry/sql-registry/registry/database.py | 52 +++- registry/sql-registry/registry/db_registry.py | 277 ++++++++++++++++-- registry/sql-registry/registry/interface.py | 24 +- registry/sql-registry/registry/models.py | 87 +++--- .../sql-registry/scripts/ARM_template.json | 107 +++++++ registry/sql-registry/test/test_create.py | 44 +++ 6 files changed, 517 insertions(+), 74 deletions(-) create mode 100644 registry/sql-registry/scripts/ARM_template.json create mode 100644 registry/sql-registry/test/test_create.py diff --git a/registry/sql-registry/registry/database.py b/registry/sql-registry/registry/database.py index d82568972..fae2cb366 100644 --- a/registry/sql-registry/registry/database.py +++ b/registry/sql-registry/registry/database.py @@ -1,4 +1,6 @@ from abc import ABC, abstractmethod +from contextlib import contextmanager +import logging import threading from distutils.log import debug, warn import os @@ -9,7 +11,7 @@ class DbConnection(ABC): @abstractmethod - def execute(self, sql: str, *args, **kwargs) -> list[dict]: + def query(self, sql: str, *args, **kwargs) -> list[dict]: pass def quote(id): @@ -38,12 +40,15 @@ def parse_conn_str(s: str) -> dict: class MssqlConnection(DbConnection): @staticmethod - def connect(*args, **kwargs): + def connect(autocommit = True): conn_str = os.environ["CONNECTION_STR"] if "Server=" not in conn_str: debug("`CONNECTION_STR` is not in ADO connection string format") return None - return MssqlConnection(parse_conn_str(conn_str)) + params = parse_conn_str(conn_str) + if not autocommit: + params["autocommit"] = False + return MssqlConnection(params) def __init__(self, params): self.params = params @@ -53,8 +58,11 @@ def __init__(self, params): def make_connection(self): self.conn = pymssql.connect(**self.params) - def execute(self, sql: str, *args, **kwargs) -> list[dict]: - debug(f"SQL: `{sql}`") + def query(self, sql: str, *args, **kwargs) -> list[dict]: + """ + Make SQL query and return result + """ + warn(f"SQL: `{sql}`") # NOTE: Only one cursor is allowed at the same time retry = 0 while True: @@ -73,13 +81,43 @@ def execute(self, sql: str, *args, **kwargs) -> list[dict]: raise pass + @contextmanager + def transaction(self): + """ + Do NOT use self.query inside this block as they may reconnect + The minimal implementation could look like this if the provider doesn't support transaction + ``` + @contextmanager + def transaction(self): + try: + c = self.create_or_get_connection(...) + yield c + finally: + c.close(...) + ``` + """ + conn = None + cursor = None + try: + conn = MssqlConnection.connect(autocommit=False).conn + cursor = conn.cursor(as_dict=True) + yield cursor + except Exception as e: + logging.warning(f"Exception: {e}") + if conn: + conn.rollback() + raise e + finally: + if conn: + conn.commit() + providers.append(MssqlConnection) -def connect(): +def connect(*args, **kargs): for p in providers: - ret = p.connect() + ret = p.connect(*args, **kargs) if ret is not None: return ret raise RuntimeError("Cannot connect to database") \ No newline at end of file diff --git a/registry/sql-registry/registry/db_registry.py b/registry/sql-registry/registry/db_registry.py index f5456c5e5..72c4f2888 100644 --- a/registry/sql-registry/registry/db_registry.py +++ b/registry/sql-registry/registry/db_registry.py @@ -1,16 +1,20 @@ from typing import Optional, Tuple, Union -from uuid import UUID +from uuid import UUID, uuid4 + +from pydantic import UUID4 from registry import Registry from registry import connect -from registry.models import Edge, EntitiesAndRelations, Entity, EntityRef, EntityType, RelationshipType, _to_type, _to_uuid +from registry.models import AnchorAttributes, AnchorDef, AnchorFeatureAttributes, AnchorFeatureDef, DerivedFeatureAttributes, DerivedFeatureDef, Edge, EntitiesAndRelations, Entity, EntityRef, EntityType, ProjectAttributes, ProjectDef, RelationshipType, SourceAttributes, SourceDef, _to_type, _to_uuid import json def quote(id): if isinstance(id, str): return f"'{id}'" + if isinstance(id, UUID): + return f"'{str(id)}'" else: - return ",".join([f"'{i}'" for i in id]) + return ",".join([quote(i) for i in id]) class DbRegistry(Registry): @@ -18,7 +22,7 @@ def __init__(self): self.conn = connect() def get_projects(self) -> list[str]: - ret = self.conn.execute( + ret = self.conn.query( f"select qualified_name from entities where entity_type='{EntityType.Project}'") return list([r["qualified_name"] for r in ret]) @@ -35,12 +39,12 @@ def get_entity_id(self, id_or_name: Union[str, UUID]) -> UUID: except ValueError: pass # It is a name - ret = self.conn.execute( + ret = self.conn.query( f"select entity_id from entities where qualified_name='{id_or_name}'") return ret[0]["entity_id"] def get_neighbors(self, id_or_name: Union[str, UUID], relationship: RelationshipType) -> list[Edge]: - rows = self.conn.execute(fr''' + rows = self.conn.query(fr''' select edge_id, from_id, to_id, conn_type from edges where from_id = '{self.get_entity_id(id_or_name)}' @@ -78,7 +82,8 @@ def get_project(self, id_or_name: Union[str, UUID]) -> EntitiesAndRelations: edges = edges.union(conn) features = list([child_map[id] for id in feature_ids]) anchor.attributes.features = features - source_id = self.get_neighbors(anchor.id, RelationshipType.Consumes)[0].to_id + source_id = self.get_neighbors( + anchor.id, RelationshipType.Consumes)[0].to_id anchor.attributes.source = child_map[source_id] for df in project.attributes.derived_features: conn = self.get_neighbors(anchor.id, RelationshipType.Consumes) @@ -88,7 +93,229 @@ def get_project(self, id_or_name: Union[str, UUID]) -> EntitiesAndRelations: df.attributes.input_features = features all_edges = self._get_edges(ids) return EntitiesAndRelations([project] + children, list(edges.union(all_edges))) - + + def search_entity(self, + keyword: str, + type: list[EntityType]) -> list[EntityRef]: + """ + WARN: This search function is implemented via `like` operator, which could be extremely slow. + """ + types = ",".join([quote(str(t)) for t in type]) + sql = fr'''select entity_id as id, qualified_name, entity_type as type from entities where qualified_name like %s and entity_type in ({types})''' + rows = self.conn.query(sql, ('%' + keyword + '%', )) + return list([EntityRef(**row) for row in rows]) + + def create_project(self, definition: ProjectDef) -> UUID: + with self.conn.transaction() as c: + c.execute(f'''select entity_id, entity_type, attributes from entities where qualified_name = %s''', + definition.qualified_name) + r = c.fetchall() + if r: + if len(r) > 1: + assert False, "Data inconsistency detected, %d entities have same qualified_name %s" % ( + len(r), definition.qualified_name) + # The entity with same name already exists but with different type + if _to_type(r[0]["entity_type"], EntityType) != EntityType.Project: + raise ValueError("Entity %s already exists" % + definition.qualified_name) + # Just return the existing project id + return r[0]["entity_id"] + id = uuid4() + c.execute(f"insert into entities (entity_id, entity_type, qualified_name, attributes) values (%s, 'feathr_workspace_v1', %s, %s)", + (str(id), + definition.qualified_name, + definition.to_attr().to_json())) + return id + + def create_project_datasource(self, project_id: UUID, definition: SourceDef) -> UUID: + with self.conn.transaction() as c: + c.execute(f'''select entity_id, entity_type, attributes from entities where qualified_name = %s''', + definition.qualified_name) + r = c.fetchall() + if r: + if len(r) > 1: + assert False, "Data inconsistency detected, %d entities have same qualified_name %s" % ( + len(r), definition.qualified_name) + # The entity with same name already exists but with different type + if _to_type(r[0]["entity_type"], EntityType) != EntityType.Source: + raise ValueError("Entity %s already exists" % + definition.qualified_name) + attr: SourceAttributes = _to_type( + r[0]["attributes"], SourceAttributes) + if attr.name == definition.name \ + and attr.type == definition.type \ + and attr.path == definition.path \ + and attr.preprocessing == definition.preprocessing \ + and attr.event_timestamp_column == definition.event_timestamp_column \ + and attr.timestamp_format == definition.timestamp_format: + # Creating exactly same entity + # Just return the existing id + return r[0]["entity_id"] + raise ValueError("Entity %s already exists" % + definition.qualified_name) + id = uuid4() + c.execute(f"insert into entities (entity_id, entity_type, qualified_name, attributes) values (%s, 'feathr_source_v1', %s, %s)", + (str(id), + definition.qualified_name, + definition.to_attr().to_json())) + self._create_edge(c, project_id, id, RelationshipType.Contains) + self._create_edge(c, id, project_id, RelationshipType.BelongsTo) + return id + + def create_project_anchor(self, project_id: UUID, definition: AnchorDef) -> UUID: + with self.conn.transaction() as c: + c.execute(f'''select entity_id, entity_type, attributes from entities where qualified_name = %s''', + definition.qualified_name) + r = c.fetchall() + if r: + if len(r) > 1: + assert False, "Data inconsistency detected, %d entities have same qualified_name %s" % ( + len(r), definition.qualified_name) + # The entity with same name already exists but with different type + if _to_type(r[0]["entity_type"], EntityType) != EntityType.Anchor: + raise ValueError("Entity %s already exists" % + definition.qualified_name) + attr: AnchorAttributes = _to_type( + r[0]["attributes"], AnchorAttributes) + if attr.name == definition.name \ + and attr.source.id == definition.source_id: + # Creating exactly same entity + # Just return the existing id + return r[0]["entity_id"] + raise ValueError("Entity %s already exists" % + definition.qualified_name) + c.execute("select entity_id, qualified_name from entities where entity_id = %s and entity_type = 'feathr_source_v1'", str( + definition.source_id)) + r = c.fetchall() + if not r: + raise ValueError("Source %s does not exist" % + definition.source_id) + ref = EntityRef(r[0]["entity_id"], + EntityType.Source, r[0]["qualified_name"]) + id = uuid4() + c.execute(f"insert into entities (entity_id, entity_type, qualified_name, attributes) values (%s, 'feathr_anchor_v1', %s, %s)", + (str(id), + definition.qualified_name, + definition.to_attr(ref).to_json())) + self._create_edge(c, project_id, id, RelationshipType.Contains) + self._create_edge(c, id, project_id, RelationshipType.BelongsTo) + self._create_edge(c, id, definition.source_id, + RelationshipType.Consumes) + self._create_edge(c, definition.source_id, id, + RelationshipType.Produces) + return id + + def create_project_anchor_feature(self, project_id: UUID, anchor_id: UUID, definition: AnchorFeatureDef) -> UUID: + with self.conn.transaction() as c: + c.execute(f'''select entity_id, entity_type, attributes from entities where qualified_name = %s''', + definition.qualified_name) + r = c.fetchall() + if r: + if len(r) > 1: + assert False, "Data inconsistency detected, %d entities have same qualified_name %s" % ( + len(r), definition.qualified_name) + # The entity with same name already exists but with different type + if _to_type(r[0]["entity_type"], EntityType) != EntityType.AnchorFeature: + raise ValueError("Entity %s already exists" % + definition.qualified_name) + attr: AnchorFeatureAttributes = _to_type( + r[0]["attributes"], AnchorFeatureAttributes) + if attr.name == definition.name \ + and attr.type == definition.feature_type \ + and attr.transformation == definition.transformation \ + and attr.key == definition.key: + # Creating exactly same entity + # Just return the existing id + return r[0]["entity_id"] + raise ValueError("Entity %s already exists" % + definition.qualified_name) + anchor: AnchorAttributes = self.get_entity(anchor_id).attributes + source_id = anchor.source.id + id = uuid4() + c.execute(f"insert into entities (entity_id, entity_type, qualified_name, attributes) values (%s, 'feathr_anchor_feature_v1', %s, %s)", + (str(id), + definition.qualified_name, + definition.to_attr().to_json())) + self._create_edge(c, project_id, id, RelationshipType.Contains) + self._create_edge(c, id, project_id, RelationshipType.BelongsTo) + self._create_edge(c, anchor_id, id, RelationshipType.Contains) + self._create_edge(c, id, anchor_id, RelationshipType.BelongsTo) + self._create_edge(c, id, source_id, RelationshipType.Consumes) + self._create_edge(c, source_id, id, RelationshipType.Produces) + return id + + def create_project_derived_feature(self, project_id: UUID, definition: DerivedFeatureDef) -> UUID: + with self.conn.transaction() as c: + c.execute(f'''select entity_id, entity_type, attributes from entities where qualified_name = %s''', + definition.qualified_name) + r = c.fetchall() + if r: + if len(r) > 1: + assert False, "Data inconsistency detected, %d entities have same qualified_name %s" % ( + len(r), definition.qualified_name) + # The entity with same name already exists but with different type + if _to_type(r[0]["entity_type"], EntityType) != EntityType.DerivedFeature: + raise ValueError("Entity %s already exists" % + definition.qualified_name) + attr: DerivedFeatureAttributes = _to_type( + r[0]["attributes"], DerivedFeatureAttributes) + if attr.name == definition.name \ + and attr.type == definition.feature_type \ + and attr.transformation == definition.transformation \ + and attr.key == definition.key: + # Creating exactly same entity + # Just return the existing id + return r[0]["entity_id"] + raise ValueError("Entity %s already exists" % + definition.qualified_name) + r1 = [] + if definition.input_anchor_features: + c.execute( + fr'''select entity_id, entity_type, qualified_name from entities where entity_id in ({quote(definition.input_anchor_features)}) and entity_type = 'feathr_anchor_feature_v1' ''') + r1 = c.fetchall() + if len(r1) != len(definition.input_anchor_features): + # TODO: More detailed error + raise(ValueError("Missing input anchor features")) + r2 = [] + if definition.input_derived_features: + c.execute( + fr'''select entity_id, entity_type, qualified_name from entities where entity_id in ({quote(definition.input_derived_features)}) and entity_type = 'feathr_derived_feature_v1' ''') + r2 = c.fetchall() + if len(r2) != len(definition.input_derived_features): + # TODO: More detailed error + raise(ValueError("Missing input derived features")) + refs = list([EntityRef(r["entity_id"], r["entity_type"], r["qualified_name"]) for r in r1+r2]) + id = uuid4() + c.execute(f"insert into entities (entity_id, entity_type, qualified_name, attributes) values (%s, 'feathr_anchor_feature_v1', %s, %s)", + (str(id), + definition.qualified_name, + definition.to_attr(refs).to_json())) + self._create_edge(c, project_id, id, RelationshipType.Contains) + self._create_edge(c, id, project_id, RelationshipType.BelongsTo) + for r in r1+r2: + input_feature_id = r["entity_id"] + self._create_edge(c, id, input_feature_id, + RelationshipType.Consumes) + self._create_edge(c, input_feature_id, id, + RelationshipType.Produces) + return id + + def _create_edge(self, cursor, from_id: UUID, to_id: UUID, type: RelationshipType): + sql = r''' + IF NOT EXISTS (SELECT 1 FROM edges WHERE from_id=%(from_id)s and to_id=%(to_id)s and conn_type=%(type)s) + BEGIN + INSERT INTO edges + (edge_id, from_id, to_id, conn_type) + values + (%(edge_id)s, %(from_id)s, %(to_id)s, %(type)s) + END''' + cursor.execute(sql, { + "edge_id": str(uuid4()), + "from_id": str(from_id), + "to_id": str(to_id), + "type": type.name + }) + def _fill_entity(self, e: Entity) -> Entity: """ Entities in the DB contains only attributes belong to itself, but the returned @@ -105,7 +332,8 @@ def _fill_entity(self, e: Entity) -> Entity: feature_ids = [e.to_id for e in conn] features = self._get_entities(feature_ids) e.attributes.features = features - source_id = self.get_neighbors(e.id, RelationshipType.Consumes)[0].to_id + source_id = self.get_neighbors( + e.id, RelationshipType.Consumes)[0].to_id source = self.get_entity(source_id) e.attributes.source = source return e @@ -116,21 +344,21 @@ def _fill_entity(self, e: Entity) -> Entity: e.attributes.input_features = features return e return e - + def _get_edges(self, ids: list[UUID], types: list[RelationshipType] = []) -> list[Edge]: sql = fr"""select edge_id, from_id, to_id, conn_type from edges where from_id in ({quote(ids)}) and to_id in ({quote(ids)})""" - if len(types)>0: + if len(types) > 0: sql = fr"""select edge_id, from_id, to_id, conn_type from edges where conn_type in ({quote(types)}) and from_id in ({quote(ids)}) and to_id in ({quote(ids)})""" - rows = self.conn.execute(sql) + rows = self.conn.query(sql) return list([_to_type(row, Edge) for row in rows]) - + def _get_entity(self, id_or_name: Union[str, UUID]) -> Entity: - row = self.conn.execute(fr''' + row = self.conn.query(fr''' select entity_id, qualified_name, entity_type, attributes from entities where entity_id = '{self.get_entity_id(id_or_name)}' @@ -139,14 +367,16 @@ def _get_entity(self, id_or_name: Union[str, UUID]) -> Entity: return _to_type(row, Entity) def _get_entities(self, ids: list[UUID]) -> list[Entity]: - rows = self.conn.execute(fr''' - select entity_id, qualified_name, entity_type, attributes + if not ids: + return [] + rows = self.conn.query(fr'''select entity_id, qualified_name, entity_type, attributes from entities where entity_id in ({quote(ids)}) ''') ret = [] for row in rows: row["attributes"] = json.loads(row["attributes"]) + print("XXX", row) ret.append(Entity(**row)) return ret @@ -154,7 +384,7 @@ def _bfs(self, id: UUID, conn_type: RelationshipType) -> Tuple[list[Entity], lis """ Breadth first traversal Starts from `id`, follow edges with `conn_type` only. - + WARN: There is no depth limit. """ connections = [] @@ -180,15 +410,4 @@ def _bfs_step(self, ids: list[UUID], conn_type: RelationshipType) -> set[dict]: """ ids = list([id["to_id"] for id in ids]) sql = fr"""select edge_id, from_id, to_id, conn_type from edges where conn_type = '{conn_type.name}' and from_id in ({quote(ids)})""" - return self.conn.execute(sql) - - def search_entity(self, - keyword: str, - type: list[EntityType]) -> list[EntityRef]: - """ - WARN: This search function is implemented via `like` operator, which could be extremely slow. - """ - types = ",".join([quote(str(t)) for t in type]) - sql = fr'''select entity_id as id, qualified_name, entity_type as type from entities where qualified_name like %s and entity_type in ({types})''' - rows = self.conn.execute(sql, ('%' + keyword + '%', )) - return list([EntityRef(**row) for row in rows]) + return self.conn.query(sql) diff --git a/registry/sql-registry/registry/interface.py b/registry/sql-registry/registry/interface.py index 406c52ace..e007dfde0 100644 --- a/registry/sql-registry/registry/interface.py +++ b/registry/sql-registry/registry/interface.py @@ -1,10 +1,11 @@ -from abc import ABC, abstractmethod +from abc import ABC, abstractclassmethod, abstractmethod from typing import Union from uuid import UUID from registry.database import DbConnection from registry.models import * + class Registry(ABC): @abstractmethod def get_projects(self) -> list[str]: @@ -33,7 +34,7 @@ def get_entity_id(self, id_or_name: Union[str, UUID]) -> UUID: Get entity id by its name """ pass - + @abstractmethod def get_neighbors(self, id_or_name: Union[str, UUID], relationship: RelationshipType) -> list[Edge]: """ @@ -67,3 +68,22 @@ def search_entity(self, """ pass + @abstractmethod + def create_project(self, definition: ProjectDef) -> UUID: + pass + + @abstractmethod + def create_project_datasource(self, project_id: UUID, definition: SourceDef) -> UUID: + pass + + @abstractmethod + def create_project_anchor(self, project_id: UUID, definition: AnchorDef) -> UUID: + pass + + @abstractmethod + def create_project_anchor_feature(self, project_id: UUID, anchor_id: UUID, definition: AnchorFeatureDef) -> UUID: + pass + + @abstractmethod + def create_project_derived_feature(self, project_id: UUID, definition: DerivedFeatureDef) -> UUID: + pass diff --git a/registry/sql-registry/registry/models.py b/registry/sql-registry/registry/models.py index 3c08d2692..74bb297c9 100644 --- a/registry/sql-registry/registry/models.py +++ b/registry/sql-registry/registry/models.py @@ -276,6 +276,7 @@ def to_dict(self) -> dict: class Attributes(ToDict): @staticmethod def new(entity_type: Union[str, EntityType], **kwargs): + print("YYY ", entity_type, kwargs) return { EntityType.Project: ProjectAttributes, EntityType.Source: SourceAttributes, @@ -432,9 +433,9 @@ def __init__(self, self._source = None self._features = [] # if source is not None: - # self._source = source.get_ref() - # if len(features)>0: - # self._set_feature(features) + # self._source = _to_type(source, Entity).get_ref() + # if features: + # self.features = features self.tags = tags @property @@ -546,15 +547,20 @@ def input_features(self): return self._input_anchor_features + self._input_derived_features @input_features.setter - def input_features(self, v: Union[dict, Entity]): + def input_features(self, v: Union[dict, Entity, EntityRef]): self._input_anchor_features = [] self._input_derived_features = [] for f in v: e = None - if isinstance(f, Entity): + if isinstance(f, EntityRef): e = f + elif isinstance(f, Entity): + e = f.get_ref() elif isinstance(f, dict): - e = _to_type(f, Entity) + try: + e = _to_type(f, Entity).get_ref() + except: + e = _to_type(f, EntityRef) else: raise TypeError(f) @@ -569,38 +575,10 @@ def input_features(self, v: Union[dict, Entity]): def input_anchor_features(self): return self._input_anchor_features - # @input_anchor_features.setter - # def input_anchor_features(self, v): - # self._input_anchor_features = [] - # for f in v: - # if isinstance(f, Entity): - # self._input_anchor_features.append(f.get_ref()) - # elif isinstance(f, EntityRef): - # self._input_anchor_features.append(f) - # elif isinstance(f, dict): - # self._input_anchor_features.append( - # to_type(f, Entity).get_ref()) - # else: - # raise TypeError(f) - @property def input_derived_features(self): return self._input_derived_features - # @input_derived_features.setter - # def input_derived_features(self, v): - # self._input_derived_features = [] - # for f in v: - # if isinstance(f, Entity): - # self._input_derived_features.append(f.get_ref()) - # elif isinstance(f, EntityRef): - # self._input_derived_features.append(f) - # elif isinstance(f, dict): - # self._input_derived_features.append( - # to_type(f, Entity).get_ref()) - # else: - # raise TypeError(f) - def to_dict(self) -> dict: return { "qualifiedName": self.qualified_name, @@ -608,8 +586,8 @@ def to_dict(self) -> dict: "type": self.type.to_dict(), "transformation": self.transformation.to_dict(), "key": list([k.to_dict() for k in self.key]), - "input_anchor_features": [e.get_ref().to_dict() for e in self.input_anchor_features], - "input_derived_features": [e.get_ref().to_dict() for e in self.input_derived_features], + "input_anchor_features": [e.to_dict() for e in self.input_anchor_features], + "input_derived_features": [e.to_dict() for e in self.input_derived_features], "tags": self.tags, } @@ -658,6 +636,9 @@ def __init__(self, qualified_name: str, tags: dict = {}): self.qualified_name = qualified_name self.name = qualified_name self.tags = tags + + def to_attr(self) -> ProjectAttributes: + return ProjectAttributes(name=self.name, tags=self.tags) class SourceDef: @@ -679,6 +660,15 @@ def __init__(self, self.timestamp_format = timestamp_format self.tags = tags + def to_attr(self) -> SourceAttributes: + return SourceAttributes(qualified_name=self.qualified_name, + name=self.name, + type=self.type, + path=self.path, + preprocessing=self.preprocessing, + event_timestamp_column=self.event_timestamp_column, + timestamp_format=self.timestamp_format, + tags=self.tags) class AnchorDef: def __init__(self, @@ -691,6 +681,12 @@ def __init__(self, self.source_id = _to_uuid(source_id) self.tags = tags + def to_attr(self, source: EntityRef) -> AnchorAttributes: + attr = AnchorAttributes(qualified_name=self.qualified_name, + name=self.name, + tags=self.tags) + attr.source = source + return attr class AnchorFeatureDef: def __init__(self, @@ -707,6 +703,14 @@ def __init__(self, self.key = _to_type(key, TypedKey) self.tags = tags + def to_attr(self) -> AnchorFeatureAttributes: + return AnchorFeatureAttributes(qualified_name=self.qualified_name, + name=self.name, + type=self.feature_type, + transformation=self.transformation, + key=self.key, + tags=self.tags) + class DerivedFeatureDef: def __init__(self, @@ -726,3 +730,14 @@ def __init__(self, self.input_anchor_features = _to_uuid(input_anchor_features) self.input_derived_features = _to_uuid(input_derived_features) self.tags = tags + + def to_attr(self, input_features: list[EntityRef]) -> DerivedFeatureAttributes: + attr = DerivedFeatureAttributes(qualified_name=self.qualified_name, + name=self.name, + type=self.feature_type, + transformation=self.transformation, + key=self.key, + tags=self.tags) + attr.input_features = input_features + return attr + diff --git a/registry/sql-registry/scripts/ARM_template.json b/registry/sql-registry/scripts/ARM_template.json new file mode 100644 index 000000000..221c3169d --- /dev/null +++ b/registry/sql-registry/scripts/ARM_template.json @@ -0,0 +1,107 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "databaseServerName": { + "type": "string", + "defaultValue": "[concat('server-', uniqueString(resourceGroup().id, deployment().name))]", + "metadata": { + "description": "Specifies the name for the SQL server" + } + }, + "databaseName": { + "type": "string", + "defaultValue": "[concat('db-', uniqueString(resourceGroup().id, deployment().name), '-1')]", + "metadata": { + "description": "Specifies the name for the SQL database under the SQL server" + } + }, + "location": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "Specifies the location for server and database" + } + }, + "adminUser": { + "type": "string", + "metadata": { + "description": "Specifies the username for admin" + } + }, + "adminPassword": { + "type": "securestring", + "metadata": { + "description": "Specifies the password for admin" + } + }, + "storageAccountKey": { + "type": "string", + "metadata": { + "description": "Specifies the key of the storage account where the BACPAC file is stored." + } + }, + "bacpacUrl": { + "type": "string", + "defaultValue": "https://xchfeathrtest4sto.blob.core.windows.net/public/feathr-2022-6-16-18-16.bacpac", + "metadata": { + "description": "Specifies the URL of the BACPAC file." + } + } + }, + "resources": [ + { + "type": "Microsoft.Sql/servers", + "apiVersion": "2021-02-01-preview", + "name": "[parameters('databaseServerName')]", + "location": "[parameters('location')]", + "properties": { + "administratorLogin": "[parameters('adminUser')]", + "administratorLoginPassword": "[parameters('adminPassword')]", + "version": "12.0" + }, + "resources": [ + { + "type": "firewallrules", + "apiVersion": "2021-02-01-preview", + "name": "AllowAllAzureIps", + "location": "[parameters('location')]", + "dependsOn": [ + "[parameters('databaseServerName')]" + ], + "properties": { + "startIpAddress": "0.0.0.0", + "endIpAddress": "0.0.0.0" + } + } + ] + }, + { + "type": "Microsoft.Sql/servers/databases", + "apiVersion": "2021-02-01-preview", + "name": "[concat(string(parameters('databaseServerName')), '/', string(parameters('databaseName')))]", + "location": "[parameters('location')]", + "dependsOn": [ + "[concat('Microsoft.Sql/servers/', parameters('databaseServerName'))]" + ], + "resources": [ + { + "type": "extensions", + "apiVersion": "2014-04-01", + "name": "Import", + "dependsOn": [ + "[resourceId('Microsoft.Sql/servers/databases', parameters('databaseServerName'), parameters('databaseName'))]" + ], + "properties": { + "storageKeyType": "StorageAccessKey", + "storageKey": "[parameters('storageAccountKey')]", + "storageUri": "[parameters('bacpacUrl')]", + "administratorLogin": "[parameters('adminUser')]", + "administratorLoginPassword": "[parameters('adminPassword')]", + "operationMode": "Import" + } + } + ] + } + ] + } \ No newline at end of file diff --git a/registry/sql-registry/test/test_create.py b/registry/sql-registry/test/test_create.py new file mode 100644 index 000000000..848cc8a86 --- /dev/null +++ b/registry/sql-registry/test/test_create.py @@ -0,0 +1,44 @@ +import registry +from registry.db_registry import quote +from registry.models import AnchorDef, AnchorFeatureDef, DerivedFeatureDef, ExpressionTransformation, FeatureType, ProjectDef, SourceDef, TensorCategory, Transformation, TypedKey, ValueType, VectorType + +r = registry.DbRegistry() + + +def cleanup(): + with r.conn.transaction() as c: + ids = quote([project1_id, source1_id, anchor1_id, af1_id, df1_id]) + c.execute( + f"delete from edges where from_id in ({ids}) or to_id in ({ids})") + c.execute( + f"delete from entities where entity_id in ({ids})") + + +project1_id = r.create_project(ProjectDef("unit_test_project_1")) +print("project1 id ", project1_id) +project1 = r.get_entity(project1_id) +assert project1.qualified_name == "unit_test_project_1" + +# Re-create project, should return the same id +id = r.create_project(ProjectDef("unit_test_project_1")) +assert project1_id == id + +source1_id = r.create_project_datasource(project1_id, SourceDef( + qualified_name="unit_test_project_1__source1", name="source1", path="hdfs://somewhere", type="hdfs")) +print("source1 id ", source1_id) +anchor1_id = r.create_project_anchor(project1_id, AnchorDef( + qualified_name="unit_test_project_1__anchor1", name="anchor1", source_id=source1_id)) +print("anchor1 id ", anchor1_id) +ft1 = FeatureType(type=VectorType.TENSOR, tensor_category=TensorCategory.DENSE, + dimension_type=[], val_type=ValueType.INT) +t1 = ExpressionTransformation("af1") +k = TypedKey(key_column="c1", key_column_type=ValueType.INT) +af1_id = r.create_project_anchor_feature(project1_id, anchor1_id, AnchorFeatureDef( + qualified_name="unit_test_project_1__anchor1__af1", name="af1", feature_type=ft1, transformation=t1, key=[k])) +print("af1 id ", af1_id) + +df1_id = r.create_project_derived_feature(project1_id, DerivedFeatureDef(qualified_name="unit_test_project_1__df1", + name="df1", feature_type=ft1, transformation=t1, key=[k], input_anchor_features=[af1_id], input_derived_features=[])) +print("df1 id ", df1_id) + +cleanup() From 35d1e40634ef6ec8567de33c19ba3686112706cb Mon Sep 17 00:00:00 2001 From: Chen Xu Date: Thu, 23 Jun 2022 21:02:17 +0800 Subject: [PATCH 2/3] API impl --- registry/sql-registry/main.py | 52 +++++++++++++++---- registry/sql-registry/registry/db_registry.py | 22 +++++--- registry/sql-registry/registry/models.py | 16 +++--- 3 files changed, 66 insertions(+), 24 deletions(-) diff --git a/registry/sql-registry/main.py b/registry/sql-registry/main.py index a40fae89c..939719acd 100644 --- a/registry/sql-registry/main.py +++ b/registry/sql-registry/main.py @@ -1,10 +1,11 @@ import os from typing import Optional +from uuid import UUID from fastapi import APIRouter, FastAPI, HTTPException from starlette.middleware.cors import CORSMiddleware from registry import * from registry.db_registry import DbRegistry -from registry.models import EntityType +from registry.models import AnchorDef, AnchorFeatureDef, DerivedFeatureDef, EntityType, ProjectDef, SourceDef, to_snake rp = "/" try: @@ -21,11 +22,12 @@ # Enables CORS app.add_middleware(CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + @router.get("/projects") def get_projects() -> list[str]: @@ -54,7 +56,8 @@ def get_project_features(project: str, keyword: Optional[str] = None) -> list: features = registry.get_entities(feature_ids) return list([e.to_dict() for e in features]) else: - efs = registry.search_entity(keyword, [EntityType.AnchorFeature, EntityType.DerivedFeature]) + efs = registry.search_entity( + keyword, [EntityType.AnchorFeature, EntityType.DerivedFeature]) feature_ids = [ef.id for ef in efs] features = registry.get_entities(feature_ids) return list([e.to_dict() for e in features]) @@ -64,7 +67,8 @@ def get_project_features(project: str, keyword: Optional[str] = None) -> list: def get_feature(feature: str) -> dict: e = registry.get_entity(feature) if e.entity_type not in [EntityType.DerivedFeature, EntityType.AnchorFeature]: - raise HTTPException(status_code=404, detail=f"Feature {feature} not found") + raise HTTPException( + status_code=404, detail=f"Feature {feature} not found") return e @@ -74,4 +78,34 @@ def get_feature_lineage(feature: str) -> dict: return lineage.to_dict() -app.include_router(prefix = rp, router=router) +@router.post("/projects") +def new_project(definition: dict) -> UUID: + return registry.create_project(ProjectDef(**to_snake(definition))) + + +@router.post("/projects/{project}/datasources") +def new_project_datasource(project: str, definition: dict) -> UUID: + project_id = registry.get_entity_id(project) + return registry.create_project_datasource(project_id, SourceDef(**to_snake(definition))) + + +@router.post("/projects/{project}/anchors") +def new_project_anchor(project: str, definition: dict) -> UUID: + project_id = registry.get_entity_id(project) + return registry.create_project_anchor(project_id, AnchorDef(**to_snake(definition))) + + +@router.post("/projects/{project}/anchors/{anchor}/features") +def new_project_anchor_feature(project: str, anchor: str, definition: dict) -> UUID: + project_id = registry.get_entity_id(project) + anchor_id = registry.get_entity_id(anchor) + return registry.create_project_anchor_feature(project_id, anchor_id, AnchorFeatureDef(**to_snake(definition))) + + +@router.post("/projects/{project}/derivedfeatures") +def new_project_derived_feature(project: str, definition: dict) -> UUID: + project_id = registry.get_entity_id(project) + return registry.create_project_derived_feature(project_id, DerivedFeatureDef(**to_snake(definition))) + + +app.include_router(prefix=rp, router=router) diff --git a/registry/sql-registry/registry/db_registry.py b/registry/sql-registry/registry/db_registry.py index 72c4f2888..006aab7b8 100644 --- a/registry/sql-registry/registry/db_registry.py +++ b/registry/sql-registry/registry/db_registry.py @@ -106,6 +106,7 @@ def search_entity(self, return list([EntityRef(**row) for row in rows]) def create_project(self, definition: ProjectDef) -> UUID: + definition.qualified_name = definition.name with self.conn.transaction() as c: c.execute(f'''select entity_id, entity_type, attributes from entities where qualified_name = %s''', definition.qualified_name) @@ -119,7 +120,7 @@ def create_project(self, definition: ProjectDef) -> UUID: raise ValueError("Entity %s already exists" % definition.qualified_name) # Just return the existing project id - return r[0]["entity_id"] + return _to_uuid(r[0]["entity_id"]) id = uuid4() c.execute(f"insert into entities (entity_id, entity_type, qualified_name, attributes) values (%s, 'feathr_workspace_v1', %s, %s)", (str(id), @@ -128,6 +129,8 @@ def create_project(self, definition: ProjectDef) -> UUID: return id def create_project_datasource(self, project_id: UUID, definition: SourceDef) -> UUID: + project = self.get_entity(project_id) + definition.qualified_name = f"{project.qualified_name}__{definition.name}" with self.conn.transaction() as c: c.execute(f'''select entity_id, entity_type, attributes from entities where qualified_name = %s''', definition.qualified_name) @@ -150,7 +153,7 @@ def create_project_datasource(self, project_id: UUID, definition: SourceDef) -> and attr.timestamp_format == definition.timestamp_format: # Creating exactly same entity # Just return the existing id - return r[0]["entity_id"] + return _to_uuid(r[0]["entity_id"]) raise ValueError("Entity %s already exists" % definition.qualified_name) id = uuid4() @@ -163,6 +166,8 @@ def create_project_datasource(self, project_id: UUID, definition: SourceDef) -> return id def create_project_anchor(self, project_id: UUID, definition: AnchorDef) -> UUID: + project = self.get_entity(project_id) + definition.qualified_name = f"{project.qualified_name}__{definition.name}" with self.conn.transaction() as c: c.execute(f'''select entity_id, entity_type, attributes from entities where qualified_name = %s''', definition.qualified_name) @@ -181,7 +186,7 @@ def create_project_anchor(self, project_id: UUID, definition: AnchorDef) -> UUID and attr.source.id == definition.source_id: # Creating exactly same entity # Just return the existing id - return r[0]["entity_id"] + return _to_uuid(r[0]["entity_id"]) raise ValueError("Entity %s already exists" % definition.qualified_name) c.execute("select entity_id, qualified_name from entities where entity_id = %s and entity_type = 'feathr_source_v1'", str( @@ -206,6 +211,8 @@ def create_project_anchor(self, project_id: UUID, definition: AnchorDef) -> UUID return id def create_project_anchor_feature(self, project_id: UUID, anchor_id: UUID, definition: AnchorFeatureDef) -> UUID: + anchor = self.get_entity(anchor_id) + definition.qualified_name = f"{anchor.qualified_name}__{definition.name}" with self.conn.transaction() as c: c.execute(f'''select entity_id, entity_type, attributes from entities where qualified_name = %s''', definition.qualified_name) @@ -226,11 +233,10 @@ def create_project_anchor_feature(self, project_id: UUID, anchor_id: UUID, defin and attr.key == definition.key: # Creating exactly same entity # Just return the existing id - return r[0]["entity_id"] + return _to_uuid(r[0]["entity_id"]) raise ValueError("Entity %s already exists" % definition.qualified_name) - anchor: AnchorAttributes = self.get_entity(anchor_id).attributes - source_id = anchor.source.id + source_id = anchor.attributes.source.id id = uuid4() c.execute(f"insert into entities (entity_id, entity_type, qualified_name, attributes) values (%s, 'feathr_anchor_feature_v1', %s, %s)", (str(id), @@ -245,6 +251,8 @@ def create_project_anchor_feature(self, project_id: UUID, anchor_id: UUID, defin return id def create_project_derived_feature(self, project_id: UUID, definition: DerivedFeatureDef) -> UUID: + project = self.get_entity(project_id) + definition.qualified_name = f"{project.qualified_name}__{definition.name}" with self.conn.transaction() as c: c.execute(f'''select entity_id, entity_type, attributes from entities where qualified_name = %s''', definition.qualified_name) @@ -265,7 +273,7 @@ def create_project_derived_feature(self, project_id: UUID, definition: DerivedFe and attr.key == definition.key: # Creating exactly same entity # Just return the existing id - return r[0]["entity_id"] + return _to_uuid(r[0]["entity_id"]) raise ValueError("Entity %s already exists" % definition.qualified_name) r1 = [] diff --git a/registry/sql-registry/registry/models.py b/registry/sql-registry/registry/models.py index 74bb297c9..16534b82d 100644 --- a/registry/sql-registry/registry/models.py +++ b/registry/sql-registry/registry/models.py @@ -6,7 +6,7 @@ import re -def _to_snake(d, level: int = 0): +def to_snake(d, level: int = 0): """ Convert `string`, `list[string]`, or all keys in a `dict` into snake case The maximum length of input string or list is 100, or it will be truncated before being processed, for dict, the exception will be thrown if it has more than 100 keys. @@ -19,10 +19,10 @@ def _to_snake(d, level: int = 0): return re.sub(r'([A-Z]\w+$)', r'_\1', d).lower() if isinstance(d, list): d = d[:100] - return [_to_snake(i, level + 1) if isinstance(i, (dict, list)) else i for i in d] + return [to_snake(i, level + 1) if isinstance(i, (dict, list)) else i for i in d] if len(d) > 100: raise ValueError("Dict has too many keys") - return {_to_snake(a, level + 1): _to_snake(b, level + 1) if isinstance(b, (dict, list)) else b for a, b in d.items()} + return {to_snake(a, level + 1): to_snake(b, level + 1) if isinstance(b, (dict, list)) else b for a, b in d.items()} def _to_type(value, type): @@ -39,10 +39,10 @@ def _to_type(value, type): if hasattr(type, "new"): try: # The convention is to use `new` method to create the object from a dict - return type.new(**_to_snake(value)) + return type.new(**to_snake(value)) except TypeError: pass - return type(**_to_snake(value)) + return type(**to_snake(value)) if issubclass(type, Enum): try: n = int(value) @@ -300,7 +300,7 @@ def __init__(self, self.attributes = attributes else: self.attributes = Attributes.new( - entity_type, **_to_snake(attributes)) + entity_type, **to_snake(attributes)) def get_ref(self) -> EntityRef: return EntityRef(self.id, @@ -632,9 +632,9 @@ def to_dict(self) -> dict: class ProjectDef: - def __init__(self, qualified_name: str, tags: dict = {}): + def __init__(self, name: str, qualified_name: str = "", tags: dict = {}): + self.name = name self.qualified_name = qualified_name - self.name = qualified_name self.tags = tags def to_attr(self) -> ProjectAttributes: From 0ce2ff7f639b8811cb2298cb2820a72febce3d4a Mon Sep 17 00:00:00 2001 From: Chen Xu Date: Thu, 23 Jun 2022 21:04:00 +0800 Subject: [PATCH 3/3] Response type --- registry/sql-registry/main.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/registry/sql-registry/main.py b/registry/sql-registry/main.py index 939719acd..06cd1b5f8 100644 --- a/registry/sql-registry/main.py +++ b/registry/sql-registry/main.py @@ -80,32 +80,37 @@ def get_feature_lineage(feature: str) -> dict: @router.post("/projects") def new_project(definition: dict) -> UUID: - return registry.create_project(ProjectDef(**to_snake(definition))) + id = registry.create_project(ProjectDef(**to_snake(definition))) + return {"guid": str(id)} @router.post("/projects/{project}/datasources") def new_project_datasource(project: str, definition: dict) -> UUID: project_id = registry.get_entity_id(project) - return registry.create_project_datasource(project_id, SourceDef(**to_snake(definition))) + id = registry.create_project_datasource(project_id, SourceDef(**to_snake(definition))) + return {"guid": str(id)} @router.post("/projects/{project}/anchors") def new_project_anchor(project: str, definition: dict) -> UUID: project_id = registry.get_entity_id(project) - return registry.create_project_anchor(project_id, AnchorDef(**to_snake(definition))) + id = registry.create_project_anchor(project_id, AnchorDef(**to_snake(definition))) + return {"guid": str(id)} @router.post("/projects/{project}/anchors/{anchor}/features") def new_project_anchor_feature(project: str, anchor: str, definition: dict) -> UUID: project_id = registry.get_entity_id(project) anchor_id = registry.get_entity_id(anchor) - return registry.create_project_anchor_feature(project_id, anchor_id, AnchorFeatureDef(**to_snake(definition))) + id = registry.create_project_anchor_feature(project_id, anchor_id, AnchorFeatureDef(**to_snake(definition))) + return {"guid": str(id)} @router.post("/projects/{project}/derivedfeatures") def new_project_derived_feature(project: str, definition: dict) -> UUID: project_id = registry.get_entity_id(project) - return registry.create_project_derived_feature(project_id, DerivedFeatureDef(**to_snake(definition))) + id = registry.create_project_derived_feature(project_id, DerivedFeatureDef(**to_snake(definition))) + return {"guid": str(id)} app.include_router(prefix=rp, router=router)