diff --git a/src/api/v1/__init__.py b/src/api/v1/__init__.py
index 3a7ca37ea..2ba4c2b9e 100644
--- a/src/api/v1/__init__.py
+++ b/src/api/v1/__init__.py
@@ -22,6 +22,8 @@
     resample,
     interpolate,
     interpolation_at_time,
+    circular_average,
+    circular_standard_deviation,
     time_weighted_average,
     circular_average,
     circular_standard_deviation,
diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py
index 1a8621b7f..0ee90f0e8 100644
--- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py
@@ -12,26 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from pyspark.sql import DataFrame
+from pyspark.sql import DataFrame, Window, SparkSession
+from pyspark.sql.types import StringType
 from pyspark.sql.functions import (
     to_json,
     col,
     struct,
     lit,
     array,
-    monotonically_increasing_id,
     floor,
     row_number,
     collect_list,
     expr,
+    udf,
+    sha2,
+    when,
 )
-from pyspark.sql import Window
 from datetime import datetime
 import pytz
+import gzip
+import base64
 from ..interfaces import TransformerInterface
 from ..._pipeline_utils.models import Libraries, SystemType
-from ..._pipeline_utils.spark import EDGEX_SCHEMA
 
 
 class PCDMToHoneywellAPMTransformer(TransformerInterface):
@@ -41,22 +44,25 @@ class PCDMToHoneywellAPMTransformer(TransformerInterface):
         data (Dataframe): Spark Dataframe in PCDM format
         quality (str): Value for quality inside HistorySamples
         history_samples_per_message (int): The number of HistorySamples for each row in the DataFrame (Batch Only)
-
+        compress_payload (bool): If True compresses CloudPlatformEvent with gzip compression
     """
 
     data: DataFrame
     quality: str
     history_samples_per_message: int
+    compress_payload: bool
 
     def __init__(
         self,
         data: DataFrame,
         quality: str = "Good",
         history_samples_per_message: int = 1,
+        compress_payload: bool = True,
     ) -> None:
         self.data = data
         self.quality = quality
         self.history_samples_per_message = history_samples_per_message
+        self.compress_payload = compress_payload
 
     @staticmethod
     def system_type():
@@ -86,11 +92,17 @@ def transform(self) -> DataFrame:
         Returns:
             DataFrame: A dataframe with with rows in Honeywell APM format
         """
+
+        @udf("string")
+        def _compress_payload(data):
+            compressed_data = gzip.compress(data.encode("utf-8"))
+            encoded_data = base64.b64encode(compressed_data).decode("utf-8")
+            return encoded_data
+
         if self.data.isStreaming == False and self.history_samples_per_message > 1:
-            pcdm_df = self.data.withColumn("counter", monotonically_increasing_id())
-            w = Window.orderBy("counter")
+            w = Window.partitionBy("TagName").orderBy("TagName")
             cleaned_pcdm_df = (
-                pcdm_df.withColumn(
+                self.data.withColumn(
                     "index",
                     floor(
                         (row_number().over(w) - 0.01) / self.history_samples_per_message
@@ -105,9 +117,9 @@
                         col("Value").alias("Value"),
                     ).alias("HistorySamples"),
                 )
-                .groupBy("index")
+                .groupBy("TagName", "index")
                 .agg(collect_list("HistorySamples").alias("HistorySamples"))
-                .withColumn("guid", expr("uuid()"))
+                .withColumn("guid", sha2(col("TagName"), 256).cast("string"))
                 .withColumn(
                     "value",
                     struct(
@@ -116,7 +128,9 @@
                 )
             )
         else:
-            cleaned_pcdm_df = self.data.withColumn("guid", expr("uuid()")).withColumn(
+            cleaned_pcdm_df = self.data.withColumn(
+                "guid", sha2(col("TagName"), 256).cast("string")
+            ).withColumn(
                 "value",
                 struct(
                     col("guid").alias("SystemGuid"),
@@ -131,32 +145,48 @@
                 ),
             )
 
-        df = cleaned_pcdm_df.withColumn(
-            "CloudPlatformEvent",
-            struct(
-                lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"),
-                lit(expr("uuid()")).alias("Id"),
-                col("guid").alias("CreatorId"),
-                lit("CloudPlatformSystem").alias("CreatorType"),
-                lit(None).alias("GeneratorId"),
-                lit("CloudPlatformTenant").alias("GeneratorType"),
-                col("guid").alias("TargetId"),
-                lit("CloudPlatformTenant").alias("TargetType"),
-                lit(None).alias("TargetContext"),
+        df = (
+            cleaned_pcdm_df.withColumn(
+                "CloudPlatformEvent",
                 struct(
-                    lit("TextualBody").alias("type"),
-                    to_json(col("value")).alias("value"),
-                    lit("application/json").alias("format"),
-                ).alias("Body"),
-                array(
+                    lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"),
+                    lit(expr("uuid()")).alias("Id"),
+                    col("guid").alias("CreatorId"),
+                    lit("CloudPlatformSystem").alias("CreatorType"),
+                    lit(None).alias("GeneratorId"),
+                    lit("CloudPlatformTenant").alias("GeneratorType"),
+                    col("guid").alias("TargetId"),
+                    lit("CloudPlatformTenant").alias("TargetType"),
+                    lit(None).alias("TargetContext"),
                     struct(
-                        lit("SystemType").alias("Key"),
-                        lit("apm-system").alias("Value"),
-                    ),
-                    struct(lit("SystemGuid").alias("Key"), col("guid").alias("Value")),
-                ).alias("BodyProperties"),
-                lit("DataChange.Update").alias("EventType"),
-            ),
-        ).withColumn("AnnotationStreamIds", lit(","))
-
-        return df.select("CloudPlatformEvent", "AnnotationStreamIds")
+                        lit("TextualBody").alias("type"),
+                        to_json(col("value")).alias("value"),
+                        lit("application/json").alias("format"),
+                    ).alias("Body"),
+                    array(
+                        struct(
+                            lit("SystemType").alias("Key"),
+                            lit("apm-system").alias("Value"),
+                        ),
+                        struct(
+                            lit("SystemGuid").alias("Key"), col("guid").alias("Value")
+                        ),
+                    ).alias("BodyProperties"),
+                    lit("DataChange.Update").alias("EventType"),
+                ),
+            )
+            .withColumn("AnnotationStreamIds", lit(","))
+            .withColumn("partitionKey", col("guid"))
+        )
+        if self.compress_payload:
+            return df.select(
+                _compress_payload(to_json("CloudPlatformEvent")).alias(
+                    "CloudPlatformEvent"
+                ),
+                "AnnotationStreamIds",
+                "partitionKey",
+            )
+        else:
+            return df.select(
+                "CloudPlatformEvent", "AnnotationStreamIds", "partitionKey"
+            )
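Note for reviewers: with compress_payload=True the _compress_payload UDF gzips the serialized CloudPlatformEvent and base64-encodes the bytes, so a consumer has to reverse both steps. A minimal round-trip sketch in plain Python (no Spark; the sample event below is made up for illustration):

    import base64
    import gzip
    import json

    def decode_cloud_platform_event(encoded: str) -> dict:
        # Reverse of the _compress_payload UDF: base64-decode, gunzip, parse the JSON string.
        return json.loads(gzip.decompress(base64.b64decode(encoded)).decode("utf-8"))

    # Round trip against an illustrative payload.
    event = {"CreatorId": "abc123", "EventType": "DataChange.Update"}
    encoded = base64.b64encode(
        gzip.compress(json.dumps(event).encode("utf-8"))
    ).decode("utf-8")
    assert decode_cloud_platform_event(encoded) == event
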
diff --git a/src/sdk/python/rtdip_sdk/pipelines/utilities/pipeline_components.py b/src/sdk/python/rtdip_sdk/pipelines/utilities/pipeline_components.py
index f6d2a655b..5cd602673 100644
--- a/src/sdk/python/rtdip_sdk/pipelines/utilities/pipeline_components.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/utilities/pipeline_components.py
@@ -29,15 +29,17 @@ class PipelineComponentsGetUtility(UtilitiesInterface):
     Args:
         module (optional str): Provide the module to use for imports of rtdip-sdk components.
             If not populated, it will use the calling module to check for imports
+        spark_config (optional dict): Additional spark configuration to be applied to the spark session
     """
 
-    def __init__(self, module: str = None) -> None:
+    def __init__(self, module: str = None, spark_config: dict = None) -> None:
         if module == None:
             frm = inspect.stack()[1]
             mod = inspect.getmodule(frm[0])
             self.module = mod.__name__
         else:
             self.module = module
+        self.spark_config = {} if spark_config is None else spark_config
 
     @staticmethod
     def system_type():
@@ -100,7 +102,7 @@ def execute(self) -> Tuple[Libraries, dict]:
             task_libraries = Libraries()
             task_libraries.get_libraries_from_components(component_list)
 
-            spark_configuration = {}
+            spark_configuration = self.spark_config
             for component in component_list:
                 spark_configuration = {**spark_configuration, **component.settings()}
             return (task_libraries, spark_configuration)
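The seeded spark_config merges with each component's settings() via dict unpacking, so on a key collision the component's setting wins while untouched user keys pass through. A standalone sketch of that precedence (the keys are illustrative, not required settings):

    # Base dict seeded from the user, as in spark_configuration = self.spark_config.
    user_config = {"spark.sql.shuffle.partitions": "8", "spark.app.name": "my-app"}
    # A stand-in for one component's settings() return value.
    component_settings = {"spark.sql.shuffle.partitions": "200"}

    merged = {**user_config, **component_settings}
    assert merged["spark.sql.shuffle.partitions"] == "200"  # component setting overrides
    assert merged["spark.app.name"] == "my-app"  # non-colliding user key survives
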
diff --git a/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py b/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py
index f15cd2963..8becc24cd 100644
--- a/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py
@@ -31,7 +31,7 @@ class SparkSessionUtility(UtilitiesInterface):
     Call this component after all imports of the RTDIP components to ensure that the spark session is configured correctly.
 
     Args:
-        config (dict): Dictionary of spark configuration to be applied to the spark session
+        config (optional dict): Dictionary of spark configuration to be applied to the spark session
         module (optional str): Provide the module to use for imports of rtdip-sdk components.
             If not populated, it will use the calling module to check for imports
         remote (optional str): Specify the remote parameters if intending to use Spark Connect
     """
@@ -40,7 +40,9 @@ class SparkSessionUtility(UtilitiesInterface):
     config: dict
     module: str
 
-    def __init__(self, config: dict, module: str = None, remote: str = None) -> None:
+    def __init__(
+        self, config: dict = None, module: str = None, remote: str = None
+    ) -> None:
         self.config = config
         if module == None:
             frm = inspect.stack()[1]
@@ -70,7 +72,7 @@ def settings() -> dict:
     def execute(self) -> SparkSession:
         try:
             (task_libraries, spark_configuration) = PipelineComponentsGetUtility(
                self.module
-                self.module
+                self.module, self.config
             ).execute()
             self.spark = SparkClient(
                 spark_configuration=spark_configuration,
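With config now optional and forwarded into PipelineComponentsGetUtility, both call styles should work; a hedged usage sketch (the module path mirrors the file path above, and the config key is only an example):

    from rtdip_sdk.pipelines.utilities.spark.session import SparkSessionUtility

    # Seed the session with extra settings; each component's settings() merges on top.
    spark = SparkSessionUtility(config={"spark.sql.session.timeZone": "UTC"}).execute()

    # Or omit config entirely now that it defaults to None.
    spark = SparkSessionUtility().execute()
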
StructField("TagName", StringType(), True), - StructField("EventTime", TimestampType(), True), - StructField("Status", StringType(), False), - StructField("Value", StringType(), True), - StructField("ValueType", StringType(), False), - StructField("ChangeType", StringType(), False), - ] - ) +pcdm_data = [ + { + "TagName": "test.item1", + "EventTime": datetime.fromisoformat("2023-07-31T06:53:00+00:00"), + "Status": "Good", + "Value": 5.0, + "ValueType": "float", + "ChangeType": "insert", + }, + { + "TagName": "Test_item2", + "EventTime": datetime.fromisoformat("2023-07-31T06:54:00+00:00"), + "Status": "Good", + "Value": 1, + "ValueType": "float", + "ChangeType": "insert", + }, +] - pcdm_data = [ - { - "TagName": "test.item1", - "EventTime": datetime.fromisoformat("2023-07-31T06:53:00+00:00"), - "Status": "Good", - "Value": 5.0, - "ValueType": "float", - "ChangeType": "insert", - }, - { - "TagName": "Test_item2", - "EventTime": datetime.fromisoformat("2023-07-31T06:54:00+00:00"), - "Status": "Good", - "Value": 1, - "ValueType": "float", - "ChangeType": "insert", - }, - ] + +def test_pcdm_to_honeywell_apm(spark_session: SparkSession): pcdm_df: DataFrame = spark_session.createDataFrame( schema=pcdm_schema, data=pcdm_data ) - PCDM_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer( - data=pcdm_df, history_samples_per_message=3 + pcdm_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer( + data=pcdm_df, history_samples_per_message=3, compress_payload=False ) - actual_df = PCDM_to_honeywell_eventhub_json_transformer.transform() + actual_df = pcdm_to_honeywell_eventhub_json_transformer.transform() df_row = actual_df.collect()[0] - assert isinstance(uuid.UUID(df_row["CloudPlatformEvent"]["CreatorId"]), uuid.UUID) assert ( - PCDM_to_honeywell_eventhub_json_transformer.system_type() == SystemType.PYSPARK + df_row["CloudPlatformEvent"]["CreatorId"] + == "51bc4f9dda971d1b5417161bb98e5d8f77bea2587d9de783b54be25e22b56496" + ) + assert ( + pcdm_to_honeywell_eventhub_json_transformer.system_type() == SystemType.PYSPARK ) assert isinstance( - PCDM_to_honeywell_eventhub_json_transformer.libraries(), Libraries + pcdm_to_honeywell_eventhub_json_transformer.libraries(), Libraries ) - assert len(df_row) == 2 + assert len(df_row) == 3 assert len(df_row["CloudPlatformEvent"]) == 12 assert len(df_row["CloudPlatformEvent"]["Body"]) == 3 assert len(df_row["CloudPlatformEvent"]["BodyProperties"]) == 2 assert len(df_row["CloudPlatformEvent"]["BodyProperties"][0]) == 2 assert len(df_row["CloudPlatformEvent"]["BodyProperties"][1]) == 2 + + +def test_pcdm_to_honeywell_apm_gzip_compressed(spark_session: SparkSession): + pcdm_df: DataFrame = spark_session.createDataFrame( + schema=pcdm_schema, data=pcdm_data + ) + pcdm_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer( + data=pcdm_df, history_samples_per_message=3 + ) + actual_df = pcdm_to_honeywell_eventhub_json_transformer.transform() + df_row = actual_df.collect()[0] + assert isinstance(df_row["CloudPlatformEvent"], str)