Merged
Changes from all commits
Commits
33 commits
d9bb204
typo change
JamesKnBr Apr 12, 2023
a773ac7
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 3, 2023
32ea5d4
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 7, 2023
f76aea4
Add Honeywell transformers
JamesKnBr Aug 18, 2023
5085d58
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 18, 2023
7563b8a
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 21, 2023
5b57d0c
Merge remote-tracking branch 'upstream/develop' into hotfix/00455
JamesKnBr Aug 23, 2023
1e2aec5
update eventhub destination and pcdm-honeywell
JamesKnBr Aug 23, 2023
3562afd
add eventhub destination and pcdm-honeywell
JamesKnBr Aug 23, 2023
bc8e6f5
Merge branch 'hotfix/00455' into develop
JamesKnBr Aug 23, 2023
d28338e
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 30, 2023
1e3445f
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 6, 2023
18d1d01
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 8, 2023
6133317
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 11, 2023
14544c4
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 25, 2023
caa026d
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 26, 2023
0739398
pcdm-hw_apm partition column & compression
JamesKnBr Sep 26, 2023
7ac0305
Merge remote-tracking branch 'upstream/develop' into hotfix/00509
JamesKnBr Sep 26, 2023
044c770
add deterministic guid hw transformer
JamesKnBr Sep 27, 2023
6a54cb1
change conditional logic
JamesKnBr Sep 27, 2023
1aed56c
change default compress_payload to true
JamesKnBr Sep 27, 2023
9ad7d79
refactor transformer
JamesKnBr Oct 3, 2023
a1c976c
Merge remote-tracking branch 'upstream/develop' into hotfix/00518
JamesKnBr Oct 3, 2023
63cc4ab
update pcdm-apm transformer
JamesKnBr Oct 3, 2023
cbf8447
Merge remote-tracking branch 'upstream/develop' into hotfix/00518
JamesKnBr Oct 5, 2023
fd58995
Circular average & std dev APIs
JamesKnBr Oct 5, 2023
cff86cc
updates to pcdm-apm
JamesKnBr Oct 5, 2023
a4ed1ad
fix udf error
JamesKnBr Oct 6, 2023
4d4df00
Merge remote-tracking branch 'upstream/develop' into hotfix/00518
JamesKnBr Oct 6, 2023
cfb848f
fix code smell
JamesKnBr Oct 6, 2023
3c6ad95
SparkSessionUtility bug fix
JamesKnBr Oct 6, 2023
4ea5b8b
updated SparkSessionUtility
JamesKnBr Oct 6, 2023
ebadfef
change docs typo in pcdm-hw
JamesKnBr Oct 6, 2023
2 changes: 2 additions & 0 deletions src/api/v1/__init__.py
@@ -22,6 +22,8 @@
resample,
interpolate,
interpolation_at_time,
circular_average,
circular_standard_deviation,
time_weighted_average,
circular_average,
circular_standard_deviation,
@@ -12,26 +12,29 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from pyspark.sql import DataFrame
from pyspark.sql import DataFrame, Window, SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.functions import (
to_json,
col,
struct,
lit,
array,
monotonically_increasing_id,
floor,
row_number,
collect_list,
expr,
udf,
sha2,
when,
)
from pyspark.sql import Window
from datetime import datetime
import pytz
import gzip
import base64

from ..interfaces import TransformerInterface
from ..._pipeline_utils.models import Libraries, SystemType
from ..._pipeline_utils.spark import EDGEX_SCHEMA


class PCDMToHoneywellAPMTransformer(TransformerInterface):
@@ -41,22 +44,25 @@ class PCDMToHoneywellAPMTransformer(TransformerInterface):
data (Dataframe): Spark Dataframe in PCDM format
quality (str): Value for quality inside HistorySamples
history_samples_per_message (int): The number of HistorySamples for each row in the DataFrame (Batch Only)

compress_payload (bool): If True compresses CloudPlatformEvent with gzip compression
"""

data: DataFrame
quality: str
history_samples_per_message: int
compress_payload: bool

def __init__(
self,
data: DataFrame,
quality: str = "Good",
history_samples_per_message: int = 1,
compress_payload: bool = True,
) -> None:
self.data = data
self.quality = quality
self.history_samples_per_message = history_samples_per_message
self.compress_payload = compress_payload

@staticmethod
def system_type():
@@ -86,11 +92,17 @@ def transform(self) -> DataFrame:
Returns:
DataFrame: A dataframe with rows in Honeywell APM format
"""

@udf("string")
def _compress_payload(data):
compressed_data = gzip.compress(data.encode("utf-8"))
encoded_data = base64.b64encode(compressed_data).decode("utf-8")
return encoded_data

if self.data.isStreaming == False and self.history_samples_per_message > 1:
pcdm_df = self.data.withColumn("counter", monotonically_increasing_id())
w = Window.orderBy("counter")
w = Window.partitionBy("TagName").orderBy("TagName")
cleaned_pcdm_df = (
pcdm_df.withColumn(
self.data.withColumn(
"index",
floor(
(row_number().over(w) - 0.01) / self.history_samples_per_message
@@ -105,9 +117,9 @@
col("Value").alias("Value"),
).alias("HistorySamples"),
)
.groupBy("index")
.groupBy("TagName", "index")
.agg(collect_list("HistorySamples").alias("HistorySamples"))
.withColumn("guid", expr("uuid()"))
.withColumn("guid", sha2(col("TagName"), 256).cast("string"))
.withColumn(
"value",
struct(
@@ -116,7 +128,9 @@
)
)
else:
cleaned_pcdm_df = self.data.withColumn("guid", expr("uuid()")).withColumn(
cleaned_pcdm_df = self.data.withColumn(
"guid", sha2(col("TagName"), 256).cast("string")
).withColumn(
"value",
struct(
col("guid").alias("SystemGuid"),
@@ -131,32 +145,48 @@
),
)

df = cleaned_pcdm_df.withColumn(
"CloudPlatformEvent",
struct(
lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"),
lit(expr("uuid()")).alias("Id"),
col("guid").alias("CreatorId"),
lit("CloudPlatformSystem").alias("CreatorType"),
lit(None).alias("GeneratorId"),
lit("CloudPlatformTenant").alias("GeneratorType"),
col("guid").alias("TargetId"),
lit("CloudPlatformTenant").alias("TargetType"),
lit(None).alias("TargetContext"),
df = (
cleaned_pcdm_df.withColumn(
"CloudPlatformEvent",
struct(
lit("TextualBody").alias("type"),
to_json(col("value")).alias("value"),
lit("application/json").alias("format"),
).alias("Body"),
array(
lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"),
lit(expr("uuid()")).alias("Id"),
col("guid").alias("CreatorId"),
lit("CloudPlatformSystem").alias("CreatorType"),
lit(None).alias("GeneratorId"),
lit("CloudPlatformTenant").alias("GeneratorType"),
col("guid").alias("TargetId"),
lit("CloudPlatformTenant").alias("TargetType"),
lit(None).alias("TargetContext"),
struct(
lit("SystemType").alias("Key"),
lit("apm-system").alias("Value"),
),
struct(lit("SystemGuid").alias("Key"), col("guid").alias("Value")),
).alias("BodyProperties"),
lit("DataChange.Update").alias("EventType"),
),
).withColumn("AnnotationStreamIds", lit(","))

return df.select("CloudPlatformEvent", "AnnotationStreamIds")
lit("TextualBody").alias("type"),
to_json(col("value")).alias("value"),
lit("application/json").alias("format"),
).alias("Body"),
array(
struct(
lit("SystemType").alias("Key"),
lit("apm-system").alias("Value"),
),
struct(
lit("SystemGuid").alias("Key"), col("guid").alias("Value")
),
).alias("BodyProperties"),
lit("DataChange.Update").alias("EventType"),
),
)
.withColumn("AnnotationStreamIds", lit(","))
.withColumn("partitionKey", col("guid"))
)
if self.compress_payload:
return df.select(
_compress_payload(to_json("CloudPlatformEvent")).alias(
"CloudPlatformEvent"
),
"AnnotationStreamIds",
"partitionKey",
)
else:
return df.select(
"CloudPlatformEvent", "AnnotationStreamIds", "partitionKey"
)
@@ -29,15 +29,17 @@ class PipelineComponentsGetUtility(UtilitiesInterface):

Args:
module (optional str): Provide the module to use for imports of rtdip-sdk components. If not populated, it will use the calling module to check for imports
spark_config (optional dict): Additional spark configuration to be applied to the spark session
"""

def __init__(self, module: str = None) -> None:
def __init__(self, module: str = None, spark_config: dict = None) -> None:
if module == None:
frm = inspect.stack()[1]
mod = inspect.getmodule(frm[0])
self.module = mod.__name__
else:
self.module = module
self.spark_config = {} if spark_config is None else spark_config

@staticmethod
def system_type():
@@ -100,7 +102,7 @@ def execute(self) -> Tuple[Libraries, dict]:

task_libraries = Libraries()
task_libraries.get_libraries_from_components(component_list)
spark_configuration = {}
spark_configuration = self.spark_config
for component in component_list:
spark_configuration = {**spark_configuration, **component.settings()}
return (task_libraries, spark_configuration)
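A short sketch of the new spark_config argument, assuming the utility's import path; the configuration key is illustrative. The supplied dictionary is used as the base configuration, and each component's settings() is then merged on top of it.

# Sketch only: import path assumed; configuration key is illustrative.
from rtdip_sdk.pipelines.utilities.pipeline_components import (
    PipelineComponentsGetUtility,
)

task_libraries, spark_configuration = PipelineComponentsGetUtility(
    module=None,  # defaults to the calling module when not provided
    spark_config={"spark.sql.shuffle.partitions": "8"},  # base spark configuration
).execute()
# spark_configuration starts from spark_config, then merges each component's settings().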
8 changes: 5 additions & 3 deletions src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py
@@ -31,7 +31,7 @@ class SparkSessionUtility(UtilitiesInterface):
Call this component after all imports of the RTDIP components to ensure that the spark session is configured correctly.

Args:
config (dict): Dictionary of spark configuration to be applied to the spark session
config (optional dict): Dictionary of spark configuration to be applied to the spark session
module (optional str): Provide the module to use for imports of rtdip-sdk components. If not populated, it will use the calling module to check for imports
remote (optional str): Specify the remote parameters if intending to use Spark Connect
"""
@@ -40,7 +40,9 @@ class SparkSessionUtility(UtilitiesInterface):
config: dict
module: str

def __init__(self, config: dict, module: str = None, remote: str = None) -> None:
def __init__(
self, config: dict = None, module: str = None, remote: str = None
) -> None:
self.config = config
if module == None:
frm = inspect.stack()[1]
@@ -70,7 +72,7 @@ def settings() -> dict:
def execute(self) -> SparkSession:
try:
(task_libraries, spark_configuration) = PipelineComponentsGetUtility(
self.module
self.module, self.config
).execute()
self.spark = SparkClient(
spark_configuration=spark_configuration,
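With config now optional, the session utility can be constructed without any configuration; any dictionary supplied is passed through to PipelineComponentsGetUtility as its base spark_config. A brief sketch follows (the module path matches the file changed above; the configuration key is illustrative).

# Sketch only: configuration key is illustrative and may be omitted entirely.
from rtdip_sdk.pipelines.utilities.spark.session import SparkSessionUtility

spark = SparkSessionUtility(
    config={"spark.sql.session.timeZone": "UTC"}  # optional base spark configuration
).execute()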
@@ -26,63 +26,76 @@
Libraries,
SystemType,
)

from pyspark.sql import SparkSession, DataFrame
from pytest_mock import MockerFixture
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from datetime import datetime
import uuid

pcdm_schema = StructType(
[
StructField("TagName", StringType(), True),
StructField("EventTime", TimestampType(), True),
StructField("Status", StringType(), False),
StructField("Value", StringType(), True),
StructField("ValueType", StringType(), False),
StructField("ChangeType", StringType(), False),
]
)

def test_pcdm_to_honeywell_apm(spark_session: SparkSession, mocker: MockerFixture):
pcdm_schema = StructType(
[
StructField("TagName", StringType(), True),
StructField("EventTime", TimestampType(), True),
StructField("Status", StringType(), False),
StructField("Value", StringType(), True),
StructField("ValueType", StringType(), False),
StructField("ChangeType", StringType(), False),
]
)
pcdm_data = [
{
"TagName": "test.item1",
"EventTime": datetime.fromisoformat("2023-07-31T06:53:00+00:00"),
"Status": "Good",
"Value": 5.0,
"ValueType": "float",
"ChangeType": "insert",
},
{
"TagName": "Test_item2",
"EventTime": datetime.fromisoformat("2023-07-31T06:54:00+00:00"),
"Status": "Good",
"Value": 1,
"ValueType": "float",
"ChangeType": "insert",
},
]

pcdm_data = [
{
"TagName": "test.item1",
"EventTime": datetime.fromisoformat("2023-07-31T06:53:00+00:00"),
"Status": "Good",
"Value": 5.0,
"ValueType": "float",
"ChangeType": "insert",
},
{
"TagName": "Test_item2",
"EventTime": datetime.fromisoformat("2023-07-31T06:54:00+00:00"),
"Status": "Good",
"Value": 1,
"ValueType": "float",
"ChangeType": "insert",
},
]

def test_pcdm_to_honeywell_apm(spark_session: SparkSession):
pcdm_df: DataFrame = spark_session.createDataFrame(
schema=pcdm_schema, data=pcdm_data
)
PCDM_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer(
data=pcdm_df, history_samples_per_message=3
pcdm_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer(
data=pcdm_df, history_samples_per_message=3, compress_payload=False
)

actual_df = PCDM_to_honeywell_eventhub_json_transformer.transform()
actual_df = pcdm_to_honeywell_eventhub_json_transformer.transform()
df_row = actual_df.collect()[0]
assert isinstance(uuid.UUID(df_row["CloudPlatformEvent"]["CreatorId"]), uuid.UUID)
assert (
PCDM_to_honeywell_eventhub_json_transformer.system_type() == SystemType.PYSPARK
df_row["CloudPlatformEvent"]["CreatorId"]
== "51bc4f9dda971d1b5417161bb98e5d8f77bea2587d9de783b54be25e22b56496"
)
assert (
pcdm_to_honeywell_eventhub_json_transformer.system_type() == SystemType.PYSPARK
)
assert isinstance(
PCDM_to_honeywell_eventhub_json_transformer.libraries(), Libraries
pcdm_to_honeywell_eventhub_json_transformer.libraries(), Libraries
)
assert len(df_row) == 2
assert len(df_row) == 3
assert len(df_row["CloudPlatformEvent"]) == 12
assert len(df_row["CloudPlatformEvent"]["Body"]) == 3
assert len(df_row["CloudPlatformEvent"]["BodyProperties"]) == 2
assert len(df_row["CloudPlatformEvent"]["BodyProperties"][0]) == 2
assert len(df_row["CloudPlatformEvent"]["BodyProperties"][1]) == 2


def test_pcdm_to_honeywell_apm_gzip_compressed(spark_session: SparkSession):
pcdm_df: DataFrame = spark_session.createDataFrame(
schema=pcdm_schema, data=pcdm_data
)
pcdm_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer(
data=pcdm_df, history_samples_per_message=3
)
actual_df = pcdm_to_honeywell_eventhub_json_transformer.transform()
df_row = actual_df.collect()[0]
assert isinstance(df_row["CloudPlatformEvent"], str)
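To make the payload handling concrete, the sketch below reverses the _compress_payload UDF (base64 decode, then gzip decompress) and recomputes the deterministic guid the same way as sha2(col("TagName"), 256), i.e. a lowercase hex SHA-256 of the tag name; the tag name is taken from the test data above.

# Sketch only: consumer-side decoding of a compressed CloudPlatformEvent row
# and recomputation of the deterministic guid used as CreatorId/partitionKey.
import base64
import gzip
import hashlib
import json


def decode_cloud_platform_event(payload: str) -> dict:
    # Reverse of _compress_payload: base64 decode, gzip decompress, parse JSON.
    return json.loads(gzip.decompress(base64.b64decode(payload)).decode("utf-8"))


# Spark's sha2(col("TagName"), 256) is a lowercase hex SHA-256 digest, so this
# should reproduce the CreatorId asserted in the uncompressed test above.
guid = hashlib.sha256("test.item1".encode("utf-8")).hexdigest()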