Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
- Dependencies: Migrated from `zyp` to `tikray`. It's effectively the
same, but provided using a dedicated package now
- CI: Added support for Python 3.13
- DMS: Fixed handling of primary keys

## 2024/10/28 v0.0.22
- DynamoDB/Testing: Use CrateDB nightly again
Expand Down
19 changes: 19 additions & 0 deletions src/commons_codec/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
from enum import auto
from functools import cached_property

import toolz
from attr import Factory
from attrs import define
from sqlalchemy_cratedb.support import quote_relation_name

from commons_codec.util.data import TaggableList

if sys.version_info >= (3, 11):
from enum import StrEnum
else:
Expand Down Expand Up @@ -143,3 +146,19 @@ class UniversalRecord:

def to_dict(self):
return {"pk": self.pk, "typed": self.typed, "untyped": self.untyped}

@classmethod
def from_record(
    cls, record: t.Dict[str, t.Any], primary_keys: t.Union[t.List[str], None] = None
) -> "UniversalRecord":
    """
    Build an instance by partitioning a flat record into `pk`, `typed`, and `untyped`.

    :param record: Mapping of field name to value. It is not modified; the
                   remaining fields are collected into new dictionaries.
    :param primary_keys: Names of the fields to route into `pk`. `None` or an
                         empty list means no field is treated as a key.
    :return: Instance whose `pk` holds the key fields, whose `untyped` holds
             every `TaggableList` value carrying a truthy "varied" tag, and
             whose `typed` holds all fields that are in neither of the two.
    """
    key_names = set(primary_keys or [])
    key_fields = {}
    varied_fields = {}
    for name, value in record.items():
        if name in key_names:
            key_fields[name] = value
        # The two checks are independent: a key field that holds a varied list
        # is recorded in both buckets.
        is_varied = isinstance(value, TaggableList) and value.get_tag("varied", False)
        if is_varied:
            varied_fields[name] = value
    remainder = toolz.dissoc(record, *key_fields)
    remainder = toolz.dissoc(remainder, *varied_fields)
    return cls(pk=key_fields, typed=remainder, untyped=varied_fields)
95 changes: 83 additions & 12 deletions src/commons_codec/transform/aws_dms.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2024, Crate.io Inc.
# Copyright (c) 2021-2025, Crate.io Inc.
# Distributed under the terms of the LGPLv3 license, see LICENSE.

import logging
Expand All @@ -15,6 +15,7 @@
SQLParameterizedSetClause,
SQLParameterizedWhereClause,
TableAddress,
UniversalRecord,
)

logger = logging.getLogger(__name__)
Expand All @@ -25,8 +26,18 @@ class DMSTranslatorCrateDBRecord:
Translate DMS full-load and cdc events into CrateDB SQL statements.
"""

# Define name of the column where CDC's record data will get materialized into.
DATA_COLUMN = "data"
# Name of the column into which primary key information gets materialized.
# This column uses the `OBJECT(STRICT)` data type.
PK_COLUMN = "pk"

# Name of the column into which the CDC record data gets materialized.
# This column uses the `OBJECT(DYNAMIC)` data type.
TYPED_COLUMN = "data"

# Name of the column into which untyped fields get materialized.
# This column uses the `OBJECT(IGNORED)` data type.
# TODO: Currently not used with DMS.
UNTYPED_COLUMN = "aux"

def __init__(
self,
Expand Down Expand Up @@ -72,18 +83,35 @@ def __init__(
self.primary_keys: t.List[str] = self.container.primary_keys[self.address]
self.column_types: t.Dict[str, ColumnType] = self.container.column_types[self.address]

pks = self.control.get("table-def", {}).get("primary-key", [])
for pk in pks:
if pk not in self.primary_keys:
self.primary_keys.append(pk)

def to_sql(self) -> SQLOperation:
if self.operation == "create-table":
pks = self.control.get("table-def", {}).get("primary-key")
if pks:
self.primary_keys += pks
# TODO: What about dropping tables first?
return SQLOperation(f"CREATE TABLE IF NOT EXISTS {self.address.fqn} ({self.DATA_COLUMN} OBJECT(DYNAMIC));")
return SQLOperation(
f"CREATE TABLE IF NOT EXISTS {self.address.fqn} ("
f"{self.pk_clause()}"
f"{self.TYPED_COLUMN} OBJECT(DYNAMIC), "
f"{self.UNTYPED_COLUMN} OBJECT(IGNORED));"
)

elif self.operation in ["load", "insert"]:
self.decode_data()
sql = f"INSERT INTO {self.address.fqn} ({self.DATA_COLUMN}) VALUES (:record);"
parameters = {"record": self.data}
record = self.decode_record(self.data)
sql = (
f"INSERT INTO {self.address.fqn} ("
f"{self.PK_COLUMN}, "
f"{self.TYPED_COLUMN}, "
f"{self.UNTYPED_COLUMN}"
f") VALUES ("
f":pk, "
f":typed, "
f":untyped) "
f"ON CONFLICT DO NOTHING;"
)
parameters = record.to_dict()

elif self.operation == "update":
self.decode_data()
Expand All @@ -105,6 +133,43 @@ def to_sql(self) -> SQLOperation:

return SQLOperation(sql, parameters)

def pk_clause(self) -> str:
    """
    Render the column definition for the primary key container column.

    For each name in `self.primary_keys`, the source type is looked up in the
    `table-def` / `columns` section of `self.control` and translated through
    `self.resolve_type`. A key without column metadata, or without a `type`
    entry, is resolved from the label "TEXT".

    :return: SQL fragment of the form `<PK_COLUMN> OBJECT(STRICT) AS (...), `,
             including the trailing separator, or an empty string when there
             are no primary keys.
    """
    if not self.primary_keys:
        return ""

    column_defs = self.control.get("table-def", {}).get("columns", {})

    def declare(name: str) -> str:
        # `or {}` also covers a column entry that is present but empty/None.
        meta = column_defs.get(name) or {}
        return f'"{name}" {self.resolve_type(meta.get("type", "TEXT"))} PRIMARY KEY'

    parts = [declare(name) for name in self.primary_keys]
    if not parts:
        return ""
    return f"{self.PK_COLUMN} OBJECT(STRICT) AS ({', '.join(parts)}), "

@staticmethod
def resolve_type(ltype: str) -> str:
"""
Map DMS/Kinesis data type to CrateDB data type.

TODO: Right now only the INT* family is mapped. Unrecognised value types are mapped
to `TEXT`, acting as a sane default. Consider adding an enriched set of type
mappings when applicable.

- https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Target.S3.html#CHAP_Target.S3.DataTypes
- https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Source.PostgreSQL.html#CHAP_Source-PostgreSQL-DataTypes
- https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Target.Kinesis.html
- https://repost.aws/questions/QUkEPhdTIpRoCC7jcQ21xGyQ/amazon-dms-table-mapping-tranformation
"""
type_map = {
"INT8": "INT1",
"INT16": "INT2",
"INT32": "INT4",
"INT64": "INT8",
}
return type_map.get(ltype, "TEXT")

def update_clause(self) -> SQLParameterizedSetClause:
"""
Serializes an image to a comma-separated list of column/values pairs
Expand All @@ -122,7 +187,7 @@ def update_clause(self) -> SQLParameterizedSetClause:
# Skip primary key columns, they cannot be updated.
if column in self.primary_keys:
continue
clause.add(lval=f"{self.DATA_COLUMN}['{column}']", value=value, name=column)
clause.add(lval=f"{self.TYPED_COLUMN}['{column}']", value=value, name=column)
return clause

def decode_data(self):
Expand All @@ -143,6 +208,12 @@ def decode_data(self):
value = json.loads(value)
self.data[column_name] = value

def decode_record(self, item: t.Dict[str, t.Any], key_names: t.Union[t.List[str], None] = None) -> UniversalRecord:
    """
    Convert a DMS event record into a `UniversalRecord`.

    :param item: Record data as a mapping of column name to value.
    :param key_names: Primary key column names. When omitted or empty,
                      `self.primary_keys` is used instead.
    :return: Result of `UniversalRecord.from_record` for the record and the
             effective key names.
    """
    effective_keys = key_names if key_names else self.primary_keys
    return UniversalRecord.from_record(item, effective_keys)

def keys_to_where(self) -> SQLParameterizedWhereClause:
"""
Produce an SQL WHERE clause based on primary key definition and current record's data.
Expand All @@ -152,7 +223,7 @@ def keys_to_where(self) -> SQLParameterizedWhereClause:
clause = SQLParameterizedWhereClause()
for key_name in self.primary_keys:
key_value = self.data.get(key_name)
clause.add(lval=f"{self.DATA_COLUMN}['{key_name}']", value=key_value, name=key_name)
clause.add(lval=f"{self.TYPED_COLUMN}['{key_name}']", value=key_value, name=key_name)
return clause


Expand Down
17 changes: 3 additions & 14 deletions src/commons_codec/transform/dynamodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,20 +133,9 @@ def decode_record(self, item: t.Dict[str, t.Any], key_names: t.Union[t.List[str]
-- https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.NamingRulesDataTypes.html#HowItWorks.DataTypeDescriptors
"""
record = toolz.valmap(self.deserializer.deserialize, item)

pk = {}
untyped = {}
pk_names = key_names or []
if not pk_names and self.primary_key_schema is not None:
pk_names = self.primary_key_schema.keys()
for key, value in record.items():
if key in pk_names:
pk[key] = value
if isinstance(value, TaggableList) and value.get_tag("varied", False):
untyped[key] = value
record = toolz.dissoc(record, *pk.keys())
record = toolz.dissoc(record, *untyped.keys())
return UniversalRecord(pk=pk, typed=record, untyped=untyped)
return UniversalRecord.from_record(
record, key_names or (self.primary_key_schema and self.primary_key_schema.keys() or None)
)


class DynamoDBFullLoadTranslator(DynamoTranslatorBase):
Expand Down
34 changes: 29 additions & 5 deletions tests/transform/test_aws_dms.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# ruff: noqa: S608 FIXME: Possible SQL injection vector through string-based query construction
import base64
import json
from copy import deepcopy

import pytest

Expand Down Expand Up @@ -213,27 +214,50 @@ def test_decode_cdc_unknown_event(cdc):

def test_decode_cdc_sql_ddl_regular(cdc):
assert cdc.to_sql(MSG_CONTROL_CREATE_TABLE) == SQLOperation(
statement="CREATE TABLE IF NOT EXISTS public.foo (data OBJECT(DYNAMIC));", parameters=None
statement="CREATE TABLE IF NOT EXISTS public.foo "
'(pk OBJECT(STRICT) AS ("id" INT4 PRIMARY KEY), data OBJECT(DYNAMIC), aux OBJECT(IGNORED));',
parameters=None,
)


def test_decode_cdc_sql_ddl_awsdms(cdc):
assert cdc.to_sql(MSG_CONTROL_AWSDMS) == SQLOperation(
statement="CREATE TABLE IF NOT EXISTS dms.awsdms_apply_exceptions (data OBJECT(DYNAMIC));", parameters=None
statement="CREATE TABLE IF NOT EXISTS dms.awsdms_apply_exceptions (data OBJECT(DYNAMIC), aux OBJECT(IGNORED));",
parameters=None,
)


def test_decode_cdc_insert(cdc):
def test_decode_cdc_insert_without_pk(cdc):
"""
Emulate INSERT operation without primary keys.
"""
assert cdc.to_sql(MSG_DATA_INSERT) == SQLOperation(
statement="INSERT INTO public.foo (data) VALUES (:record);", parameters={"record": RECORD_INSERT}
statement="INSERT INTO public.foo (pk, data, aux) VALUES (:pk, :typed, :untyped) ON CONFLICT DO NOTHING;",
parameters={"pk": {}, "typed": RECORD_INSERT, "untyped": {}},
)


def test_decode_cdc_insert_with_pk(cdc):
    """
    An INSERT event that follows a table schema announcement carries its key in `pk`.
    """
    # Prime the translator with the control message that describes the table schema.
    cdc.to_sql(MSG_CONTROL_CREATE_TABLE)

    # The `id` field is expected in `pk`, so it must be absent from the typed part.
    typed_fields = deepcopy(RECORD_INSERT)
    del typed_fields["id"]

    operation = cdc.to_sql(MSG_DATA_INSERT)
    assert operation == SQLOperation(
        statement="INSERT INTO public.foo (pk, data, aux) VALUES (:pk, :typed, :untyped) ON CONFLICT DO NOTHING;",
        parameters={"pk": {"id": 46}, "typed": typed_fields, "untyped": {}},
    )


def test_decode_cdc_update_success(cdc):
"""
Update statements need schema knowledge about primary keys.
"""
# Seed translator with control message, describing the table schema.
# Seed translator with a control message, describing the table schema.
cdc.to_sql(MSG_CONTROL_CREATE_TABLE)

# Emulate an UPDATE operation.
Expand Down