From d9bb204345de112d3a48eb56e273d4792f071567 Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Wed, 12 Apr 2023 15:38:52 +0100 Subject: [PATCH 01/16] typo change Signed-off-by: JamesKnBr --- src/sdk/python/rtdip_sdk/pipelines/sources/spark/eventhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/sources/spark/eventhub.py b/src/sdk/python/rtdip_sdk/pipelines/sources/spark/eventhub.py index d53afa96e..03cc44b45 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/sources/spark/eventhub.py +++ b/src/sdk/python/rtdip_sdk/pipelines/sources/spark/eventhub.py @@ -23,7 +23,7 @@ class SparkEventhubSource(SourceInterface): ''' This Spark source class is used to read batch or streaming data from Eventhubs. Eventhub configurations need to be specified as options in a dictionary. - Additionally, there are more optional configuration which can be found [here.](https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md#event-hubs-configuration){ target="_blank" } + Additionally, there are more optional configurations which can be found [here.](https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md#event-hubs-configuration){ target="_blank" } If using startingPosition or endingPosition make sure to check out **Event Position** section for more details and examples. Args: spark: Spark Session From f76aea45d6808d0ce26e1e6d925928fb85dc4639 Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Fri, 18 Aug 2023 11:51:48 +0100 Subject: [PATCH 02/16] Add Honeywell transformers Signed-off-by: JamesKnBr --- .../spark/honeywell_apm_to_pcdm.md | 2 + .../spark/pcdm_to_honeywell_apm.md | 2 + mkdocs.yml | 2 + .../pipelines/_pipeline_utils/spark.py | 20 +++- .../pipelines/transformers/__init__.py | 2 + .../spark/honeywell_apm_to_pcdm.py | 79 ++++++++++++++ .../spark/pcdm_to_honeywell_apm.py | 101 ++++++++++++++++++ .../spark/test_honeywell_apm_to_pcdm.py | 54 ++++++++++ .../spark/test_pcdm_to_honeywell_apm.py | 66 ++++++++++++ 9 files changed, 327 insertions(+), 1 deletion(-) create mode 100644 docs/sdk/code-reference/pipelines/transformers/spark/honeywell_apm_to_pcdm.md create mode 100644 docs/sdk/code-reference/pipelines/transformers/spark/pcdm_to_honeywell_apm.md create mode 100644 src/sdk/python/rtdip_sdk/pipelines/transformers/spark/honeywell_apm_to_pcdm.py create mode 100644 src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py create mode 100644 tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_honeywell_apm_to_pcdm.py create mode 100644 tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py diff --git a/docs/sdk/code-reference/pipelines/transformers/spark/honeywell_apm_to_pcdm.md b/docs/sdk/code-reference/pipelines/transformers/spark/honeywell_apm_to_pcdm.md new file mode 100644 index 000000000..912246275 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/transformers/spark/honeywell_apm_to_pcdm.md @@ -0,0 +1,2 @@ +# Convert Honeywell APM Json to Process Control Data Model +::: src.sdk.python.rtdip_sdk.pipelines.transformers.spark.honeywell_apm_to_pcdm \ No newline at end of file diff --git a/docs/sdk/code-reference/pipelines/transformers/spark/pcdm_to_honeywell_apm.md b/docs/sdk/code-reference/pipelines/transformers/spark/pcdm_to_honeywell_apm.md new file mode 100644 index 000000000..f82e46698 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/transformers/spark/pcdm_to_honeywell_apm.md @@ -0,0 +1,2 @@ 
+# Convert Process Control Data Model to Honeywell APM Json +::: src.sdk.python.rtdip_sdk.pipelines.transformers.spark.pcdm_to_honeywell_apm \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index e046b0df9..c0c1f2233 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -174,6 +174,8 @@ nav: - Pandas to PySpark DataFrame Conversion: sdk/code-reference/pipelines/transformers/spark/pandas_to_pyspark.md - PySpark to Pandas DataFrame Conversion: sdk/code-reference/pipelines/transformers/spark/pyspark_to_pandas.md - Base Raw To Meters Data Model: sdk/code-reference/pipelines/transformers/spark/base_raw_to_mdm.md + - Honeywell APM To Process Control Data Model: sdk/code-reference/pipelines/transformers/spark/honeywell_apm_to_pcdm.md + - Process Control Data Model To Honeywell APM: sdk/code-reference/pipelines/transformers/spark/pcdm_to_honeywell_apm.md - ISO: - MISO To Meters Data Model: sdk/code-reference/pipelines/transformers/spark/iso/miso_to_mdm.md - PJM To Meters Data Model: sdk/code-reference/pipelines/transformers/spark/iso/pjm_to_mdm.md diff --git a/src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py b/src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py index 618171cb0..266cd0235 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py +++ b/src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py @@ -14,7 +14,7 @@ import logging from pyspark.sql import SparkSession -from pyspark.sql.types import StructType, StructField, TimestampType, StringType, BinaryType, LongType, MapType, IntegerType, ArrayType +from pyspark.sql.types import StructType, StructField, TimestampType, StringType, BinaryType, LongType, MapType, IntegerType, ArrayType, DoubleType from .models import Libraries from ..._sdk_utils.compare_versions import _package_version_meets_minimum @@ -181,4 +181,22 @@ def get_dbutils( StructField('valueType', StringType(), True), StructField('value', StringType(), True)])) , True) +]) + +APM_SCHEMA = StructType([ + StructField('Id', StringType(), True), + StructField('TenantId', StringType(), True), + StructField('IdType', StringType(), True), + StructField('Samples', ArrayType( + StructType([ + StructField('ItemName', StringType(), True), + StructField('Time', StringType(), True), + StructField('Value', StringType(), True), + StructField('Unit', StringType(), True), + StructField('NormalizedQuality', StringType(), True), + StructField('HighValue', DoubleType(), True), + StructField('LowValue', DoubleType(), True), + StructField('TargetValue', DoubleType(), True), + ]) + ), True), ]) \ No newline at end of file diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/__init__.py index 4a0df9138..149042611 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/__init__.py @@ -19,3 +19,5 @@ from .spark.ssip_pi_binary_json_to_pcdm import * from .spark.iso import * from .spark.edgex_opcua_json_to_pcdm import * +from .spark.pcdm_to_honeywell_apm import * +from .spark.honeywell_apm_to_pcdm import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/honeywell_apm_to_pcdm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/honeywell_apm_to_pcdm.py new file mode 100644 index 000000000..bd40b5012 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/honeywell_apm_to_pcdm.py @@ -0,0 +1,79 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pyspark.sql import DataFrame +from pyspark.sql.functions import from_json, col, explode, when, lit , regexp_replace + +from ..interfaces import TransformerInterface +from ..._pipeline_utils.models import Libraries, SystemType +from ..._pipeline_utils.spark import APM_SCHEMA + +class HoneywellAPMJsonToPCDMTransformer(TransformerInterface): + ''' + Converts a Spark Dataframe column containing a json string created by Honeywell APM to the Process Control Data Model + + Args: + data (DataFrame): Dataframe containing the column with EdgeX data + source_column_name (str): Spark Dataframe column containing the OPC Publisher Json OPC UA data + status_null_value (optional str): If populated, will replace 'Good' in the Status column with the specified value. + change_type_value (optional str): If populated, will replace 'insert' in the ChangeType column with the specified value. + ''' + data: DataFrame + source_column_name: str + status_null_value: str + change_type_value: str + + def __init__(self, data: DataFrame, source_column_name: str, status_null_value: str = "Good", change_type_value: str = "insert") -> None: + self.data = data + self.source_column_name = source_column_name + self.status_null_value = status_null_value + self.change_type_value = change_type_value + + @staticmethod + def system_type(): + ''' + Attributes: + SystemType (Environment): Requires PYSPARK + ''' + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def pre_transform_validation(self): + return True + + def post_transform_validation(self): + return True + + def transform(self) -> DataFrame: + ''' + Returns: + DataFrame: A dataframe with the specified column converted to PCDM + ''' + df = (self.data.withColumn("body",from_json(self.source_column_name, APM_SCHEMA)).select(explode("body.Samples")) + .selectExpr("*", "to_timestamp(col.Time) as EventTime") + .withColumn("TagName", col("col.Itemname")) + .withColumn("Status", lit(self.status_null_value)) + .withColumn("Value", col("col.Value")) + .withColumn("ValueType", when(col("value").cast("float").isNotNull(), "float").when(col("value").cast("float").isNull(), "string")) + .withColumn("ChangeType", lit(self.change_type_value))) + + return df.select("TagName", "EventTime", "Status", "Value", "ValueType", "ChangeType") \ No newline at end of file diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py new file mode 100644 index 000000000..6f02514f0 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py @@ -0,0 +1,101 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pyspark.sql import DataFrame +from pyspark.sql.functions import to_json, col, struct, create_map, lit, array, monotonically_increasing_id, floor, row_number, collect_list, expr +from pyspark.sql import Window +from datetime import datetime +import pytz +import uuid + +from ..interfaces import TransformerInterface +from ..._pipeline_utils.models import Libraries, SystemType +from ..._pipeline_utils.spark import EDGEX_SCHEMA + +class PCDMToHoneywellAPMTransformer(TransformerInterface): + ''' + Converts a Spark Dataframe in PCDM format to Honeywell APM format. + Args: + data (Dataframe): Spark Dataframe in PCDM format + quality (str): Value for quality inside HistorySamples + history_samples_per_message (int): The number of HistorySamples for each row in the DataFrame + + ''' + data: DataFrame + quality: str + history_samples_per_message: int + + def __init__(self, data: DataFrame, quality: str = "Good", history_samples_per_message: int = 1) -> None: + self.data = data + self.quality = quality + self.history_samples_per_message = history_samples_per_message + + + @staticmethod + def system_type(): + ''' + Attributes: + SystemType (Environment): Requires PYSPARK + ''' + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def pre_transform_validation(self): + return True + + def post_transform_validation(self): + return True + + def transform(self) -> DataFrame: + ''' + Returns: + DataFrame: A dataframe with with rows in Honeywell APM format + ''' + pcdm_df = self.data.withColumn("counter", monotonically_increasing_id()) + w = Window.orderBy("counter") + indexed_pcdm_df = (pcdm_df.withColumn("index", floor((row_number().over(w)-0.01)/self.history_samples_per_message)).withColumn("HistorySamples", struct( + col("TagName").alias("ItemName"), + lit(self.quality).alias("Quality"), + col("EventTime").alias("Time"), + col("Value").alias("Value")).alias("HistorySamples")).groupBy("index").agg(collect_list("HistorySamples").alias("HistorySamples")) + .withColumn("guid",expr("uuid()")) + .withColumn("value", struct(col("guid").alias("SystemGuid"), col("HistorySamples")).alias("value"))) + + df = indexed_pcdm_df.withColumn("CloudPlatformEvent", + create_map( + lit("CloudPlatformEvent"), + struct( + lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"), + lit(expr("uuid()")).alias("Id"), + col("guid").alias("CreatorId"), + lit("CloudPlatformSystem").alias("CreatorType"), + lit(None).alias("GeneratorId"), + lit("CloudPlatformTenant").alias("GeneratorType"), + col("guid").alias("TargetId"), + lit("CloudPlatformTenant").alias("TargetType"), + lit(None).alias("TargetContext"), + struct(lit("TextualBody").alias("type"), to_json(col("value")).alias("value"), lit("application/json").alias("format")).alias("Body"), + array(struct(lit("SystemType").alias("Key"),lit("apm-system").alias("Value")), + struct(lit("SystemGuid").alias("Key"),col("guid").alias("Value"))).alias("BodyProperties"), + lit("DataChange.Update").alias("EventType")))).withColumn("AnnotationStreamIds", 
lit("self.AnnotationStreamIds")) + + return df.select("CloudPlatformEvent", "AnnotationStreamIds") diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_honeywell_apm_to_pcdm.py b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_honeywell_apm_to_pcdm.py new file mode 100644 index 000000000..5bb4b453c --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_honeywell_apm_to_pcdm.py @@ -0,0 +1,54 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +sys.path.insert(0, '.') +from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.honeywell_apm_to_pcdm import HoneywellAPMJsonToPCDMTransformer +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType + +from pyspark.sql import SparkSession, DataFrame +from pyspark.sql.types import StructType, StructField, StringType, TimestampType +from datetime import datetime, timezone + +def test_honeywell_apm_to_pcdm(spark_session: SparkSession): + honeywell_json_data = '{"Id": "testId","TenantId": "testTenantId","IdType": "calculatedpoint","Samples": [{"ItemName": "test.item1", "Time": "2023-07-31T06:53:00+00:00","Value": "5.0","Unit": null,"NormalizedQuality": "good", "HighValue": null,"LowValue": null,"TargetValue": null},{"ItemName": "test_item2","Time": "2023-07-31T06:53:00+00:00","Value": 0.0,"Unit": null,"NormalizedQuality": "good","HighValue": null,"LowValue": null,"TargetValue": null},{"ItemName": "testItem3","Time": "2023-07-31T06:53:00.205+00:00","Value": "test_string","Unit": null,"NormalizedQuality": "good","HighValue": null,"LowValue": null,"TargetValue": null}]}' + honeywell_df: DataFrame = spark_session.createDataFrame([{"body": honeywell_json_data}]) + + expected_schema = StructType([ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), False), + StructField("Value", StringType(), True), + StructField("ValueType", StringType(), True), + StructField("ChangeType", StringType(), False), + ]) + + expected_data = [ + {"TagName":"test.item1", "EventTime": datetime.fromisoformat("2023-07-31T06:53:00+00:00"), "Status":"Good", "Value": 5.0, "ValueType":"float", "ChangeType": "insert"}, + {"TagName":"test_item2", "EventTime": datetime.fromisoformat("2023-07-31T06:53:00+00:00"), "Status":"Good", "Value": 0.0, "ValueType":"float", "ChangeType": "insert"}, + {"TagName":"testItem3", "EventTime": datetime.fromisoformat("2023-07-31T06:53:00.205+00:00"), "Status":"Good", "Value": "test_string", "ValueType":"string", "ChangeType": "insert"}, + ] + + expected_df: DataFrame = spark_session.createDataFrame( + schema=expected_schema, + data=expected_data + ) + + honeywell_eventhub_json_to_PCDM_transformer = HoneywellAPMJsonToPCDMTransformer(data=honeywell_df, source_column_name="body") + actual_df = honeywell_eventhub_json_to_PCDM_transformer.transform() + + assert honeywell_eventhub_json_to_PCDM_transformer.system_type() == SystemType.PYSPARK + assert 
isinstance(honeywell_eventhub_json_to_PCDM_transformer.libraries(), Libraries) + assert expected_schema == actual_df.schema + assert expected_df.collect() == actual_df.collect() \ No newline at end of file diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py new file mode 100644 index 000000000..6a6ce8a9d --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py @@ -0,0 +1,66 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +sys.path.insert(0, '.') +import os +os.environ['PYSPARK_PYTHON'] = sys.executable +os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable + +from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.pcdm_to_honeywell_apm import PCDMToHoneywellAPMTransformer +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType + +from pyspark.sql import SparkSession, DataFrame +from pytest_mock import MockerFixture +from pyspark.sql.types import StructType, StructField, StringType, TimestampType +from pyspark.sql.functions import regexp_replace +from datetime import datetime +import json + +def test_pcdm_to_honeywell_apm(spark_session: SparkSession, mocker: MockerFixture): + + pcdm_schema = StructType([ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), False), + StructField("Value", StringType(), True), + StructField("ValueType", StringType(), False), + StructField("ChangeType", StringType(), False), + ]) + + pcdm_data = [ + {"TagName":"test.item1", "EventTime": datetime.fromisoformat("2023-07-31T06:53:00+00:00"), "Status":"Good", "Value": 5.0, "ValueType":"float", "ChangeType": "insert"}, + {"TagName":"Test_item2", "EventTime": datetime.fromisoformat("2023-07-31T06:54:00+00:00"), "Status":"Good", "Value": 1, "ValueType":"float", "ChangeType": "insert"}, + ] + + pcdm_df: DataFrame = spark_session.createDataFrame( + schema=pcdm_schema, + data=pcdm_data + ) + honeywell_json_data = {"CloudPlatformEvent": {"CreatedTime": "2023-08-10T06:53:00+00:00","Id": "2b2a64f6-bfee-49f5-9d1b-04df844e80be","CreatorId": "065a7343-a3b5-4ecd-9bac-19cdff5cf048","CreatorType": "CloudPlatformSystem","GeneratorId": None,"GeneratorType": "CloudPlatformTenant","TargetId": "065a7343-a3b5-4ecd-9bac-19cdff5cf048","TargetType": "CloudPlatformTenant","TargetContext": None,"Body": {"type": "TextualBody","value": "{\"SystemGuid\":\"065a7343-a3b5-4ecd-9bac-19cdff5cf048\",\"HistorySamples\":[{\"ItemName\":\"test.item1\",\"Quality\":\"Good\",\"Time\":\"2023-07-31T06:53:00+00:00\",\"Value\":5},{\"ItemName\":\"Test_item2\",\"Quality\":\"Good\",\"Time\":\"2023-07-31T06:54:00+00:00\",\"Value\":1}]}","format": "application/json"},"BodyProperties":[{"Key": "SystemType","Value": "apm-system"},{"Key": "SystemGuid","Value": "065a7343-a3b5-4ecd-9bac-19cdff5cf048"}],"EventType": 
"DataChange.Update"},"AnnotationStreamIds": ","} + expected_df = spark_session.createDataFrame([honeywell_json_data]) + PCDM_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer(data=pcdm_df, history_samples_per_message=3) + + actual_df = PCDM_to_honeywell_eventhub_json_transformer.transform() + dict = actual_df.collect()[0]['CloudPlatformEvent'] + + + assert len(dict) == 1 + assert PCDM_to_honeywell_eventhub_json_transformer.system_type() == SystemType.PYSPARK + assert isinstance(PCDM_to_honeywell_eventhub_json_transformer.libraries(), Libraries) + assert len(dict) == 1 + assert len(dict["CloudPlatformEvent"]) == 12 + assert len(dict["CloudPlatformEvent"]["Body"]) == 3 + assert len(dict["CloudPlatformEvent"]["BodyProperties"]) == 2 + assert len(dict["CloudPlatformEvent"]["BodyProperties"][0]) == 2 + assert len(dict["CloudPlatformEvent"]["BodyProperties"][1]) == 2 From 3562afd11feadb4a3cdcae785829659a544abf7c Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Wed, 23 Aug 2023 12:20:50 +0100 Subject: [PATCH 03/16] add eventhub destination and pcdm-honeywell Signed-off-by: JamesKnBr --- .../pipelines/destinations/spark/eventhub.py | 71 ++++++++++- .../spark/pcdm_to_honeywell_apm.py | 112 ++++++++++-------- .../destinations/spark/test_eventhub.py | 52 +++++++- .../spark/test_pcdm_to_honeywell_apm.py | 36 +----- 4 files changed, 185 insertions(+), 86 deletions(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/eventhub.py b/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/eventhub.py index fbaf338c9..ec89c93b1 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/eventhub.py +++ b/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/eventhub.py @@ -16,6 +16,8 @@ import time from pyspark.sql import DataFrame from py4j.protocol import Py4JJavaError +from pyspark.sql.functions import col, struct, to_json +from pyspark.sql.types import StringType, BinaryType from ..interfaces import DestinationInterface from ..._pipeline_utils.models import Libraries, SystemType @@ -79,12 +81,60 @@ def pre_write_validation(self): def post_write_validation(self): return True + def prepare_columns(self): + if "body" in self.data.columns: + if self.data.schema["body"].dataType not in [StringType(), BinaryType()]: + try: + self.data.withColumn("body", col("body").cast(StringType())) + except: + raise ValueError("'body' column must be of string or binary type") + else: + self.data = self.data.withColumn( + "body", + to_json( + struct( + [ + col(column).alias(column) + for column in self.data.columns + if column not in ["partitionId", "partitionKey"] + ] + ) + ), + ) + for column in self.data.schema: + if ( + column.name in ["partitionId", "partitionKey"] + and column.dataType != StringType() + ): + try: + self.data = self.data.withColumn( + column.name, col(column.name).cast(StringType()) + ) + except: + raise ValueError(f"Column {column.name} must be of string type") + return self.data.select( + [ + column + for column in self.data.columns + if column in ["partitionId", "partitionKey", "body"] + ] + ) + def write_batch(self): """ Writes batch data to Eventhubs. 
""" + eventhub_connection_string = "eventhubs.connectionString" try: - return self.data.write.format("eventhubs").options(**self.options).save() + if eventhub_connection_string in self.options: + sc = self.spark.sparkContext + self.options[ + eventhub_connection_string + ] = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt( + self.options[eventhub_connection_string] + ) + df = self.prepare_columns() + return df.write.format("eventhubs").options(**self.options).save() except Py4JJavaError as e: logging.exception(e.errmsg) @@ -97,14 +147,31 @@ def write_stream(self): """ Writes steaming data to Eventhubs. """ + eventhub_connection_string = "eventhubs.connectionString" try: TRIGGER_OPTION = ( {"availableNow": True} if self.trigger == "availableNow" else {"processingTime": self.trigger} ) + if eventhub_connection_string in self.options: + sc = self.spark.sparkContext + self.options[ + eventhub_connection_string + ] = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt( + self.options[eventhub_connection_string] + ) + df = self.prepare_columns() + df = self.data.select( + [ + column + for column in self.data.columns + if column in ["partitionId", "partitionKey", "body"] + ] + ) + query = ( - self.data.writeStream.trigger(**TRIGGER_OPTION) + df.writeStream.trigger(**TRIGGER_OPTION) .format("eventhubs") .options(**self.options) .queryName(self.query_name) diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py index e88e2baac..2a797977a 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py @@ -29,7 +29,6 @@ from pyspark.sql import Window from datetime import datetime import pytz -import uuid from ..interfaces import TransformerInterface from ..._pipeline_utils.models import Libraries, SystemType @@ -42,7 +41,7 @@ class PCDMToHoneywellAPMTransformer(TransformerInterface): Args: data (Dataframe): Spark Dataframe in PCDM format quality (str): Value for quality inside HistorySamples - history_samples_per_message (int): The number of HistorySamples for each row in the DataFrame + history_samples_per_message (int): The number of HistorySamples for each row in the DataFrame (Batch Only) """ @@ -88,63 +87,74 @@ def transform(self) -> DataFrame: Returns: DataFrame: A dataframe with with rows in Honeywell APM format """ - pcdm_df = self.data.withColumn("counter", monotonically_increasing_id()) - w = Window.orderBy("counter") - indexed_pcdm_df = ( - pcdm_df.withColumn( - "index", - floor((row_number().over(w) - 0.01) / self.history_samples_per_message), - ) - .withColumn( - "HistorySamples", - struct( - col("TagName").alias("ItemName"), - lit(self.quality).alias("Quality"), - col("EventTime").alias("Time"), - col("Value").alias("Value"), - ).alias("HistorySamples"), + if self.data.isStreaming and self.history_samples_per_message > 1: + pcdm_df = self.data.withColumn("counter", monotonically_increasing_id()) + w = Window.orderBy("counter") + cleaned_pcdm_df = ( + pcdm_df.withColumn( + "index", + floor( + (row_number().over(w) - 0.01) / self.history_samples_per_message + ), + ) + .withColumn( + "HistorySamples", + struct( + col("TagName").alias("ItemName"), + lit(self.quality).alias("Quality"), + col("EventTime").alias("Time"), + col("Value").alias("Value"), + ).alias("HistorySamples"), + ) + .groupBy("index") + 
.agg(collect_list("HistorySamples").alias("HistorySamples")) + .withColumn("guid", expr("uuid()")) + .withColumn( + "value", + struct( + col("guid").alias("SystemGuid"), col("HistorySamples") + ).alias("value"), + ) ) - .groupBy("index") - .agg(collect_list("HistorySamples").alias("HistorySamples")) - .withColumn("guid", expr("uuid()")) - .withColumn( + else: + cleaned_pcdm_df = self.data.withColumn("guid", expr("uuid()")).withColumn( "value", - struct(col("guid").alias("SystemGuid"), col("HistorySamples")).alias( - "value" + struct( + col("guid").alias("SystemGuid"), + struct( + col("TagName").alias("ItemName"), + lit(self.quality).alias("Quality"), + col("EventTime").alias("Time"), + col("Value").alias("Value"), + ).alias("HistorySamples"), ), ) - ) - df = indexed_pcdm_df.withColumn( + df = cleaned_pcdm_df.withColumn( "CloudPlatformEvent", - create_map( - lit("CloudPlatformEvent"), + struct( + lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"), + lit(expr("uuid()")).alias("Id"), + col("guid").alias("CreatorId"), + lit("CloudPlatformSystem").alias("CreatorType"), + lit(None).alias("GeneratorId"), + lit("CloudPlatformTenant").alias("GeneratorType"), + col("guid").alias("TargetId"), + lit("CloudPlatformTenant").alias("TargetType"), + lit(None).alias("TargetContext"), struct( - lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"), - lit(expr("uuid()")).alias("Id"), - col("guid").alias("CreatorId"), - lit("CloudPlatformSystem").alias("CreatorType"), - lit(None).alias("GeneratorId"), - lit("CloudPlatformTenant").alias("GeneratorType"), - col("guid").alias("TargetId"), - lit("CloudPlatformTenant").alias("TargetType"), - lit(None).alias("TargetContext"), + lit("TextualBody").alias("type"), + to_json(col("value")).alias("value"), + lit("application/json").alias("format"), + ).alias("Body"), + array( struct( - lit("TextualBody").alias("type"), - to_json(col("value")).alias("value"), - lit("application/json").alias("format"), - ).alias("Body"), - array( - struct( - lit("SystemType").alias("Key"), - lit("apm-system").alias("Value"), - ), - struct( - lit("SystemGuid").alias("Key"), col("guid").alias("Value") - ), - ).alias("BodyProperties"), - lit("DataChange.Update").alias("EventType"), - ), + lit("SystemType").alias("Key"), + lit("apm-system").alias("Value"), + ), + struct(lit("SystemGuid").alias("Key"), col("guid").alias("Value")), + ).alias("BodyProperties"), + lit("DataChange.Update").alias("EventType"), ), ).withColumn("AnnotationStreamIds", lit("self.AnnotationStreamIds")) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/destinations/spark/test_eventhub.py b/tests/sdk/python/rtdip_sdk/pipelines/destinations/spark/test_eventhub.py index 563fc8e4f..8718af372 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/destinations/spark/test_eventhub.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/destinations/spark/test_eventhub.py @@ -25,7 +25,14 @@ MavenLibrary, ) from pyspark.sql import SparkSession, DataFrame -from pyspark.sql.streaming import StreamingQuery +from pyspark.sql.types import ( + StructType, + StructField, + StringType, + TimestampType, + LongType, +) +from datetime import datetime class TestStreamingQueryClass: @@ -51,6 +58,49 @@ def test_spark_eventhub_write_setup(): assert eventhub_destination.post_write_validation() +def test_prepare_columns(spark_session: SparkSession): + pcdm_schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), False), + StructField("Value", StringType(), True), 
+ StructField("ValueType", StringType(), False), + StructField("ChangeType", StringType(), False), + StructField("partitionId", LongType(), False), + ] + ) + + pcdm_data = [ + { + "TagName": "test.item1", + "EventTime": datetime.fromisoformat("2023-07-31T06:53:00+00:00"), + "Status": "Good", + "Value": 5.0, + "ValueType": "float", + "ChangeType": "insert", + "partitionId": 134343345, + }, + { + "TagName": "Test_item2", + "EventTime": datetime.fromisoformat("2023-07-31T06:54:00+00:00"), + "Status": "Good", + "Value": 1, + "ValueType": "float", + "ChangeType": "insert", + "partitionId": 134343345, + }, + ] + pcdm_df: DataFrame = spark_session.createDataFrame( + schema=pcdm_schema, data=pcdm_data + ) + eventhub_destination = SparkEventhubDestination(pcdm_df, {}) + prepared_df = eventhub_destination.prepare_columns() + assert len(prepared_df.schema) == 2 + assert prepared_df.schema["body"].dataType == StringType() + assert prepared_df.schema["partitionId"].dataType == StringType() + + def test_spark_eventhub_write_batch(spark_session: SparkSession, mocker: MockerFixture): mocker.patch( "pyspark.sql.DataFrame.write", diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py index 675c6c62c..79b8b6e5e 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py @@ -30,9 +30,8 @@ from pyspark.sql import SparkSession, DataFrame from pytest_mock import MockerFixture from pyspark.sql.types import StructType, StructField, StringType, TimestampType -from pyspark.sql.functions import regexp_replace from datetime import datetime -import json +import uuid def test_pcdm_to_honeywell_apm(spark_session: SparkSession, mocker: MockerFixture): @@ -65,50 +64,23 @@ def test_pcdm_to_honeywell_apm(spark_session: SparkSession, mocker: MockerFixtur "ChangeType": "insert", }, ] - pcdm_df: DataFrame = spark_session.createDataFrame( schema=pcdm_schema, data=pcdm_data ) - honeywell_json_data = { - "CloudPlatformEvent": { - "CreatedTime": "2023-08-10T06:53:00+00:00", - "Id": "2b2a64f6-bfee-49f5-9d1b-04df844e80be", - "CreatorId": "065a7343-a3b5-4ecd-9bac-19cdff5cf048", - "CreatorType": "CloudPlatformSystem", - "GeneratorId": None, - "GeneratorType": "CloudPlatformTenant", - "TargetId": "065a7343-a3b5-4ecd-9bac-19cdff5cf048", - "TargetType": "CloudPlatformTenant", - "TargetContext": None, - "Body": { - "type": "TextualBody", - "value": '{"SystemGuid":"065a7343-a3b5-4ecd-9bac-19cdff5cf048","HistorySamples":[{"ItemName":"test.item1","Quality":"Good","Time":"2023-07-31T06:53:00+00:00","Value":5},{"ItemName":"Test_item2","Quality":"Good","Time":"2023-07-31T06:54:00+00:00","Value":1}]}', - "format": "application/json", - }, - "BodyProperties": [ - {"Key": "SystemType", "Value": "apm-system"}, - {"Key": "SystemGuid", "Value": "065a7343-a3b5-4ecd-9bac-19cdff5cf048"}, - ], - "EventType": "DataChange.Update", - }, - "AnnotationStreamIds": ",", - } - expected_df = spark_session.createDataFrame([honeywell_json_data]) PCDM_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer( data=pcdm_df, history_samples_per_message=3 ) actual_df = PCDM_to_honeywell_eventhub_json_transformer.transform() - dict = actual_df.collect()[0]["CloudPlatformEvent"] - - assert len(dict) == 1 + dict = actual_df.collect()[0] + assert 
isinstance(uuid.UUID(dict["CloudPlatformEvent"]["CreatorId"]), uuid.UUID) assert ( PCDM_to_honeywell_eventhub_json_transformer.system_type() == SystemType.PYSPARK ) assert isinstance( PCDM_to_honeywell_eventhub_json_transformer.libraries(), Libraries ) - assert len(dict) == 1 + assert len(dict) == 2 assert len(dict["CloudPlatformEvent"]) == 12 assert len(dict["CloudPlatformEvent"]["Body"]) == 3 assert len(dict["CloudPlatformEvent"]["BodyProperties"]) == 2 From 07393988e9926f5629af919789995c2d8f4f3808 Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Tue, 26 Sep 2023 16:17:57 +0100 Subject: [PATCH 04/16] pcdm-hw_apm partition column & compression Signed-off-by: JamesKnBr --- .../spark/pcdm_to_honeywell_apm.py | 85 ++++++++++++------- .../spark/test_pcdm_to_honeywell_apm.py | 8 +- 2 files changed, 58 insertions(+), 35 deletions(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py index 1a8621b7f..50173bbf2 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pyspark.sql import DataFrame +from pyspark.sql import DataFrame, Window +from pyspark.sql.types import StringType from pyspark.sql.functions import ( to_json, col, @@ -24,14 +25,16 @@ row_number, collect_list, expr, + udf, + sha2, + when, ) -from pyspark.sql import Window from datetime import datetime import pytz +import gzip from ..interfaces import TransformerInterface from ..._pipeline_utils.models import Libraries, SystemType -from ..._pipeline_utils.spark import EDGEX_SCHEMA class PCDMToHoneywellAPMTransformer(TransformerInterface): @@ -41,7 +44,7 @@ class PCDMToHoneywellAPMTransformer(TransformerInterface): data (Dataframe): Spark Dataframe in PCDM format quality (str): Value for quality inside HistorySamples history_samples_per_message (int): The number of HistorySamples for each row in the DataFrame (Batch Only) - + compress_payload (bool): If True compresses body.value with gzip compression """ data: DataFrame @@ -53,10 +56,12 @@ def __init__( data: DataFrame, quality: str = "Good", history_samples_per_message: int = 1, + compress_payload: bool = False, ) -> None: self.data = data self.quality = quality self.history_samples_per_message = history_samples_per_message + self.compress_payload = compress_payload @staticmethod def system_type(): @@ -81,11 +86,16 @@ def pre_transform_validation(self): def post_transform_validation(self): return True + def _compress_body(data): + compressed_data = gzip.compress(bytes(data, "utf-8")) + return compressed_data + def transform(self) -> DataFrame: """ Returns: DataFrame: A dataframe with with rows in Honeywell APM format """ + compress_udf = udf(self._compress_body, StringType()) if self.data.isStreaming == False and self.history_samples_per_message > 1: pcdm_df = self.data.withColumn("counter", monotonically_increasing_id()) w = Window.orderBy("counter") @@ -105,9 +115,9 @@ def transform(self) -> DataFrame: col("Value").alias("Value"), ).alias("HistorySamples"), ) - .groupBy("index") + .groupBy("TagName", "index") .agg(collect_list("HistorySamples").alias("HistorySamples")) - .withColumn("guid", expr("uuid()")) + .withColumn("guid", sha2(col("TagName"), 256).cast("string")) .withColumn( "value", struct( @@ -131,32 +141,43 @@ 
def transform(self) -> DataFrame: ), ) - df = cleaned_pcdm_df.withColumn( - "CloudPlatformEvent", - struct( - lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"), - lit(expr("uuid()")).alias("Id"), - col("guid").alias("CreatorId"), - lit("CloudPlatformSystem").alias("CreatorType"), - lit(None).alias("GeneratorId"), - lit("CloudPlatformTenant").alias("GeneratorType"), - col("guid").alias("TargetId"), - lit("CloudPlatformTenant").alias("TargetType"), - lit(None).alias("TargetContext"), + df = ( + cleaned_pcdm_df.withColumn( + "CloudPlatformEvent", struct( - lit("TextualBody").alias("type"), - to_json(col("value")).alias("value"), - lit("application/json").alias("format"), - ).alias("Body"), - array( + lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"), + lit(expr("uuid()")).alias("Id"), + col("guid").alias("CreatorId"), + lit("CloudPlatformSystem").alias("CreatorType"), + lit(None).alias("GeneratorId"), + lit("CloudPlatformTenant").alias("GeneratorType"), + col("guid").alias("TargetId"), + lit("CloudPlatformTenant").alias("TargetType"), + lit(None).alias("TargetContext"), struct( - lit("SystemType").alias("Key"), - lit("apm-system").alias("Value"), - ), - struct(lit("SystemGuid").alias("Key"), col("guid").alias("Value")), - ).alias("BodyProperties"), - lit("DataChange.Update").alias("EventType"), - ), - ).withColumn("AnnotationStreamIds", lit(",")) + lit("TextualBody").alias("type"), + when( + self.compress_payload == True, + compress_udf(to_json(col("value"))), + ) + .otherwise(to_json(col("value"))) + .alias("value"), + lit("application/json").alias("format"), + ).alias("Body"), + array( + struct( + lit("SystemType").alias("Key"), + lit("apm-system").alias("Value"), + ), + struct( + lit("SystemGuid").alias("Key"), col("guid").alias("Value") + ), + ).alias("BodyProperties"), + lit("DataChange.Update").alias("EventType"), + ), + ) + .withColumn("AnnotationStreamIds", lit(",")) + .withColumn("partitionKey", col("guid")) + ) - return df.select("CloudPlatformEvent", "AnnotationStreamIds") + return df.select("CloudPlatformEvent", "AnnotationStreamIds", "partitionKey") diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py index a1eb71b8e..0341e6bbf 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py @@ -31,7 +31,6 @@ from pytest_mock import MockerFixture from pyspark.sql.types import StructType, StructField, StringType, TimestampType from datetime import datetime -import uuid def test_pcdm_to_honeywell_apm(spark_session: SparkSession, mocker: MockerFixture): @@ -73,14 +72,17 @@ def test_pcdm_to_honeywell_apm(spark_session: SparkSession, mocker: MockerFixtur actual_df = PCDM_to_honeywell_eventhub_json_transformer.transform() df_row = actual_df.collect()[0] - assert isinstance(uuid.UUID(df_row["CloudPlatformEvent"]["CreatorId"]), uuid.UUID) + assert ( + df_row["CloudPlatformEvent"]["CreatorId"] + == "a567edda0e37a9c98b0e73536234ad1b951dc6fa3b4bee4644ce54fc0df7cadd" + ) assert ( PCDM_to_honeywell_eventhub_json_transformer.system_type() == SystemType.PYSPARK ) assert isinstance( PCDM_to_honeywell_eventhub_json_transformer.libraries(), Libraries ) - assert len(df_row) == 2 + assert len(df_row) == 3 assert len(df_row["CloudPlatformEvent"]) == 12 assert len(df_row["CloudPlatformEvent"]["Body"]) == 3 assert 
len(df_row["CloudPlatformEvent"]["BodyProperties"]) == 2 From 044c770e1128d63825345c2fae09ba5ed7bc377d Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Wed, 27 Sep 2023 08:58:56 +0100 Subject: [PATCH 05/16] add deterministic guid hw transformer Signed-off-by: JamesKnBr --- .../pipelines/transformers/spark/pcdm_to_honeywell_apm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py index 50173bbf2..d66ce649b 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py @@ -126,7 +126,9 @@ def transform(self) -> DataFrame: ) ) else: - cleaned_pcdm_df = self.data.withColumn("guid", expr("uuid()")).withColumn( + cleaned_pcdm_df = self.data.withColumn( + "guid", sha2(col("TagName"), 256).cast("string") + ).withColumn( "value", struct( col("guid").alias("SystemGuid"), From 6a54cb17c4ea9e1e1624ad52704635f46630ff94 Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Wed, 27 Sep 2023 11:11:25 +0100 Subject: [PATCH 06/16] change conditional logic Signed-off-by: JamesKnBr --- .../pipelines/transformers/spark/pcdm_to_honeywell_apm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py index d66ce649b..424970f9a 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py @@ -50,6 +50,7 @@ class PCDMToHoneywellAPMTransformer(TransformerInterface): data: DataFrame quality: str history_samples_per_message: int + compress_payload: bool def __init__( self, @@ -144,7 +145,8 @@ def transform(self) -> DataFrame: ) df = ( - cleaned_pcdm_df.withColumn( + cleaned_pcdm_df.withColumn("compress_payload", lit(self.compress_payload)) + .withColumn( "CloudPlatformEvent", struct( lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"), @@ -159,7 +161,7 @@ def transform(self) -> DataFrame: struct( lit("TextualBody").alias("type"), when( - self.compress_payload == True, + col("compress_payload") == True, compress_udf(to_json(col("value"))), ) .otherwise(to_json(col("value"))) From 1aed56cefaec6349999014c294f97728747bde80 Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Wed, 27 Sep 2023 11:21:16 +0100 Subject: [PATCH 07/16] change default compress_payload to true Signed-off-by: JamesKnBr --- .../pipelines/transformers/spark/pcdm_to_honeywell_apm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py index 424970f9a..16128311c 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py @@ -57,7 +57,7 @@ def __init__( data: DataFrame, quality: str = "Good", history_samples_per_message: int = 1, - compress_payload: bool = False, + compress_payload: bool = True, ) -> None: self.data = data self.quality = quality From 9ad7d79659bc6283632c7cba48c63127ad1fa8fc Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Tue, 3 Oct 2023 13:59:00 +0100 Subject: [PATCH 08/16] refactor transformer 
Signed-off-by: JamesKnBr --- .../destinations/spark/kafka_eventhub.py | 3 ++- .../transformers/spark/pcdm_to_honeywell_apm.py | 16 +++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kafka_eventhub.py b/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kafka_eventhub.py index 3174f5c25..fc651a647 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kafka_eventhub.py +++ b/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kafka_eventhub.py @@ -50,8 +50,9 @@ class SparkKafkaEventhubDestination(DestinationInterface): data (DataFrame): Any columns not listed in the required schema [here](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#writing-data-to-kafka){ target="_blank" } will be merged into a single column named "value", or ignored if "value" is an existing column connection_string (str): Eventhubs connection string is required to connect to the Eventhubs service. This must include the Eventhub name as the `EntityPath` parameter. Example `"Endpoint=sb://test.servicebus.windows.net/;SharedAccessKeyName=test;SharedAccessKey=test_key;EntityPath=test_eventhub"` options (dict): A dictionary of Kafka configurations (See Attributes tables below) + consumer_group (str): The Eventhub consumer group to use for the connection trigger (optional str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes". Set to "0 seconds" if you do not want to use a trigger. (stream) Default is 10 seconds - query_name (str): Unique name for the query in associated SparkSession + query_name (optional str): Unique name for the query in associated SparkSession query_wait_interval (optional int): If set, waits for the streaming query to complete before returning. (stream) Default is None The following are commonly used parameters that may be included in the options dict. kafka.bootstrap.servers is the only required config. A full list of configs can be found [here](https://kafka.apache.org/documentation/#producerconfigs){ target="_blank" } diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py index 16128311c..87256f833 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from pyspark.sql import DataFrame, Window +from pyspark.sql import DataFrame, Window, SparkSession from pyspark.sql.types import StringType from pyspark.sql.functions import ( to_json, @@ -32,6 +32,8 @@ from datetime import datetime import pytz import gzip +import base64 +from pyspark import SparkContext from ..interfaces import TransformerInterface from ..._pipeline_utils.models import Libraries, SystemType @@ -54,11 +56,13 @@ class PCDMToHoneywellAPMTransformer(TransformerInterface): def __init__( self, + spark: SparkSession, data: DataFrame, quality: str = "Good", history_samples_per_message: int = 1, compress_payload: bool = True, ) -> None: + self.spark = spark self.data = data self.quality = quality self.history_samples_per_message = history_samples_per_message @@ -89,19 +93,20 @@ def post_transform_validation(self): def _compress_body(data): compressed_data = gzip.compress(bytes(data, "utf-8")) - return compressed_data + encoded_data = base64.b64encode(compressed_data).decode("utf-8") + return encoded_data def transform(self) -> DataFrame: """ Returns: DataFrame: A dataframe with with rows in Honeywell APM format """ + # self.data = self.spark.sparkContext.broadcast(self.data).value compress_udf = udf(self._compress_body, StringType()) if self.data.isStreaming == False and self.history_samples_per_message > 1: - pcdm_df = self.data.withColumn("counter", monotonically_increasing_id()) - w = Window.orderBy("counter") + w = Window.partitionBy("TagName").orderBy("TagName") cleaned_pcdm_df = ( - pcdm_df.withColumn( + self.data.withColumn( "index", floor( (row_number().over(w) - 0.01) / self.history_samples_per_message @@ -163,6 +168,7 @@ def transform(self) -> DataFrame: when( col("compress_payload") == True, compress_udf(to_json(col("value"))), + # to_json(col("value")), ) .otherwise(to_json(col("value"))) .alias("value"), From 63cc4ab4ce7f3fc473edd03cb84d925ebb25f33d Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Tue, 3 Oct 2023 16:00:47 +0100 Subject: [PATCH 09/16] update pcdm-apm transformer Signed-off-by: JamesKnBr --- .../transformers/spark/pcdm_to_honeywell_apm.py | 14 ++++++-------- .../spark/test_pcdm_to_honeywell_apm.py | 6 ++---- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py index 87256f833..6109fec3e 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py @@ -20,7 +20,6 @@ struct, lit, array, - monotonically_increasing_id, floor, row_number, collect_list, @@ -33,7 +32,6 @@ import pytz import gzip import base64 -from pyspark import SparkContext from ..interfaces import TransformerInterface from ..._pipeline_utils.models import Libraries, SystemType @@ -62,8 +60,8 @@ def __init__( history_samples_per_message: int = 1, compress_payload: bool = True, ) -> None: - self.spark = spark self.data = data + self.spark = spark self.quality = quality self.history_samples_per_message = history_samples_per_message self.compress_payload = compress_payload @@ -92,7 +90,7 @@ def post_transform_validation(self): return True def _compress_body(data): - compressed_data = gzip.compress(bytes(data, "utf-8")) + compressed_data = gzip.compress(data.encode("utf-8")) encoded_data = base64.b64encode(compressed_data).decode("utf-8") return encoded_data @@ -101,8 +99,8 @@ def transform(self) -> DataFrame: 
Returns: DataFrame: A dataframe with with rows in Honeywell APM format """ - # self.data = self.spark.sparkContext.broadcast(self.data).value - compress_udf = udf(self._compress_body, StringType()) + self.spark.udf.register("compress_udf", self._compress_body, StringType()) + # compress_udf = udf(self._compress_body, StringType()) if self.data.isStreaming == False and self.history_samples_per_message > 1: w = Window.partitionBy("TagName").orderBy("TagName") cleaned_pcdm_df = ( @@ -167,8 +165,8 @@ def transform(self) -> DataFrame: lit("TextualBody").alias("type"), when( col("compress_payload") == True, - compress_udf(to_json(col("value"))), - # to_json(col("value")), + # compress_udf(to_json(col("value"))), + expr("compress_udf(to_json(value))"), ) .otherwise(to_json(col("value"))) .alias("value"), diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py index 0341e6bbf..3aa7fac8b 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py @@ -26,14 +26,12 @@ Libraries, SystemType, ) - from pyspark.sql import SparkSession, DataFrame -from pytest_mock import MockerFixture from pyspark.sql.types import StructType, StructField, StringType, TimestampType from datetime import datetime -def test_pcdm_to_honeywell_apm(spark_session: SparkSession, mocker: MockerFixture): +def test_pcdm_to_honeywell_apm(spark_session: SparkSession): pcdm_schema = StructType( [ StructField("TagName", StringType(), True), @@ -67,7 +65,7 @@ def test_pcdm_to_honeywell_apm(spark_session: SparkSession, mocker: MockerFixtur schema=pcdm_schema, data=pcdm_data ) PCDM_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer( - data=pcdm_df, history_samples_per_message=3 + data=pcdm_df, history_samples_per_message=3, spark=spark_session ) actual_df = PCDM_to_honeywell_eventhub_json_transformer.transform() From fd58995569a6a12271966bb0d42e97d98d0b3ed6 Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Thu, 5 Oct 2023 16:08:16 +0100 Subject: [PATCH 10/16] Circular average & std dev APIs Signed-off-by: JamesKnBr --- src/api/v1/__init__.py | 2 + src/api/v1/circular_average.py | 162 +++++++++++++++++ src/api/v1/circular_standard_deviation.py | 162 +++++++++++++++++ src/api/v1/common.py | 10 ++ src/api/v1/models.py | 24 +++ tests/api/v1/api_test_objects.py | 22 +++ tests/api/v1/test_api_circular_average.py | 161 +++++++++++++++++ .../test_api_circular_standard_deviation.py | 167 ++++++++++++++++++ 8 files changed, 710 insertions(+) create mode 100644 src/api/v1/circular_average.py create mode 100644 src/api/v1/circular_standard_deviation.py create mode 100644 tests/api/v1/test_api_circular_average.py create mode 100644 tests/api/v1/test_api_circular_standard_deviation.py diff --git a/src/api/v1/__init__.py b/src/api/v1/__init__.py index de7eb8af2..4f482d590 100644 --- a/src/api/v1/__init__.py +++ b/src/api/v1/__init__.py @@ -22,6 +22,8 @@ resample, interpolate, interpolation_at_time, + circular_average, + circular_standard_deviation, time_weighted_average, graphql, ) diff --git a/src/api/v1/circular_average.py b/src/api/v1/circular_average.py new file mode 100644 index 000000000..351998483 --- /dev/null +++ b/src/api/v1/circular_average.py @@ -0,0 +1,162 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not 
use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +from typing import Union +import numpy as np +from requests import request +from src.api.FastAPIApp import api_v1_router +from fastapi import HTTPException, Depends, Body +import nest_asyncio +from pandas.io.json import build_table_schema +from src.sdk.python.rtdip_sdk.queries import circular_average +from src.api.v1.models import ( + BaseQueryParams, + BaseHeaders, + ResampleInterpolateResponse, + PivotResponse, + HTTPError, + RawQueryParams, + TagsQueryParams, + TagsBodyParams, + CircularAverageQueryParams, + PivotQueryParams, + LimitOffsetQueryParams, +) +import src.api.v1.common + +nest_asyncio.apply() + + +def circular_average_events_get( + base_query_parameters, + raw_query_parameters, + tag_query_parameters, + circular_average_query_parameters, + pivot_parameters, + limit_offset_parameters, + base_headers, +): + try: + (connection, parameters) = src.api.v1.common.common_api_setup_tasks( + base_query_parameters, + raw_query_parameters=raw_query_parameters, + tag_query_parameters=tag_query_parameters, + circular_average_query_parameters=circular_average_query_parameters, + pivot_query_parameters=pivot_parameters, + limit_offset_query_parameters=limit_offset_parameters, + base_headers=base_headers, + ) + + data = circular_average.get(connection, parameters) + if parameters.get("pivot") == True: + return PivotResponse( + schema=build_table_schema(data, index=False, primary_key=False), + data=data.replace({np.nan: None}).to_dict(orient="records"), + ) + else: + return ResampleInterpolateResponse( + schema=build_table_schema(data, index=False, primary_key=False), + data=data.replace({np.nan: None}).to_dict(orient="records"), + ) + except Exception as e: + logging.error(str(e)) + raise HTTPException(status_code=400, detail=str(e)) + + +get_description = """ +## Circular Average + +Circular Average of timeseries data. 
+""" + + +@api_v1_router.get( + path="/events/circularaverage", + name="Circular Average GET", + description=get_description, + tags=["Events"], + responses={ + 200: {"model": Union[ResampleInterpolateResponse, PivotResponse]}, + 400: {"model": HTTPError}, + }, + openapi_extra={ + "externalDocs": { + "description": "RTDIP Circular Average Query Documentation", + "url": "https://www.rtdip.io/sdk/code-reference/query/circular-average/", + } + }, +) +async def circular_average_get( + base_query_parameters: BaseQueryParams = Depends(), + raw_query_parameters: RawQueryParams = Depends(), + tag_query_parameters: TagsQueryParams = Depends(), + circular_average_parameters: CircularAverageQueryParams = Depends(), + pivot_parameters: PivotQueryParams = Depends(), + limit_offset_parameters: LimitOffsetQueryParams = Depends(), + base_headers: BaseHeaders = Depends(), +): + return circular_average_events_get( + base_query_parameters, + raw_query_parameters, + tag_query_parameters, + circular_average_parameters, + pivot_parameters, + limit_offset_parameters, + base_headers, + ) + + +post_description = """ +## Circular Average + +Circular Average of timeseries data via a POST method to enable providing a list of tag names that can exceed url length restrictions via GET Query Parameters. +""" + + +@api_v1_router.post( + path="/events/circularaverage", + name="Circular Average POST", + description=post_description, + tags=["Events"], + responses={ + 200: {"model": Union[ResampleInterpolateResponse, PivotResponse]}, + 400: {"model": HTTPError}, + }, + openapi_extra={ + "externalDocs": { + "description": "RTDIP Circular Average Query Documentation", + "url": "https://www.rtdip.io/sdk/code-reference/query/circular-average/", + } + }, +) +async def resample_post( + base_query_parameters: BaseQueryParams = Depends(), + raw_query_parameters: RawQueryParams = Depends(), + tag_query_parameters: TagsBodyParams = Body(default=...), + circular_average_parameters: CircularAverageQueryParams = Depends(), + pivot_parameters: PivotQueryParams = Depends(), + limit_offset_parameters: LimitOffsetQueryParams = Depends(), + base_headers: BaseHeaders = Depends(), +): + return circular_average_events_get( + base_query_parameters, + raw_query_parameters, + tag_query_parameters, + circular_average_parameters, + pivot_parameters, + limit_offset_parameters, + base_headers, + ) diff --git a/src/api/v1/circular_standard_deviation.py b/src/api/v1/circular_standard_deviation.py new file mode 100644 index 000000000..5705eed0e --- /dev/null +++ b/src/api/v1/circular_standard_deviation.py @@ -0,0 +1,162 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import logging +from typing import Union +import numpy as np +from requests import request +from src.api.FastAPIApp import api_v1_router +from fastapi import HTTPException, Depends, Body +import nest_asyncio +from pandas.io.json import build_table_schema +from src.sdk.python.rtdip_sdk.queries import circular_standard_deviation +from src.api.v1.models import ( + BaseQueryParams, + BaseHeaders, + ResampleInterpolateResponse, + PivotResponse, + HTTPError, + RawQueryParams, + TagsQueryParams, + TagsBodyParams, + PivotQueryParams, + LimitOffsetQueryParams, + CircularAverageQueryParams, +) +import src.api.v1.common + +nest_asyncio.apply() + + +def circular_standard_deviation_events_get( + base_query_parameters, + raw_query_parameters, + tag_query_parameters, + circular_standard_deviation_query_parameters, + pivot_parameters, + limit_offset_parameters, + base_headers, +): + try: + (connection, parameters) = src.api.v1.common.common_api_setup_tasks( + base_query_parameters, + raw_query_parameters=raw_query_parameters, + tag_query_parameters=tag_query_parameters, + circular_standard_deviation_query_parameters=circular_standard_deviation_query_parameters, + pivot_query_parameters=pivot_parameters, + limit_offset_query_parameters=limit_offset_parameters, + base_headers=base_headers, + ) + + data = circular_standard_deviation.get(connection, parameters) + if parameters.get("pivot") == True: + return PivotResponse( + schema=build_table_schema(data, index=False, primary_key=False), + data=data.replace({np.nan: None}).to_dict(orient="records"), + ) + else: + return ResampleInterpolateResponse( + schema=build_table_schema(data, index=False, primary_key=False), + data=data.replace({np.nan: None}).to_dict(orient="records"), + ) + except Exception as e: + logging.error(str(e)) + raise HTTPException(status_code=400, detail=str(e)) + + +get_description = """ +## Circular Standard Deviation + +Circular Standard Deviation of timeseries data. +""" + + +@api_v1_router.get( + path="/events/circularstandarddeviation", + name="Circular Standard Deviation GET", + description=get_description, + tags=["Events"], + responses={ + 200: {"model": Union[ResampleInterpolateResponse, PivotResponse]}, + 400: {"model": HTTPError}, + }, + openapi_extra={ + "externalDocs": { + "description": "RTDIP Circular Standard Deviation Query Documentation", + "url": "https://www.rtdip.io/sdk/code-reference/query/circular-standard-deviation/", + } + }, +) +async def circular_standard_deviation_get( + base_query_parameters: BaseQueryParams = Depends(), + raw_query_parameters: RawQueryParams = Depends(), + tag_query_parameters: TagsQueryParams = Depends(), + circular_standard_deviation_parameters: CircularAverageQueryParams = Depends(), + pivot_parameters: PivotQueryParams = Depends(), + limit_offset_parameters: LimitOffsetQueryParams = Depends(), + base_headers: BaseHeaders = Depends(), +): + return circular_standard_deviation_events_get( + base_query_parameters, + raw_query_parameters, + tag_query_parameters, + circular_standard_deviation_parameters, + pivot_parameters, + limit_offset_parameters, + base_headers, + ) + + +post_description = """ +## Circular Standard Deviation + +Circular Standard Deviation of timeseries data via a POST method to enable providing a list of tag names that can exceed url length restrictions via GET Query Parameters. 
+""" + + +@api_v1_router.post( + path="/events/circularstandarddeviation", + name="Circular Standard Deviation POST", + description=post_description, + tags=["Events"], + responses={ + 200: {"model": Union[ResampleInterpolateResponse, PivotResponse]}, + 400: {"model": HTTPError}, + }, + openapi_extra={ + "externalDocs": { + "description": "RTDIP Circular Standard Deviation Query Documentation", + "url": "https://www.rtdip.io/sdk/code-reference/query/circular-standard-deviation/", + } + }, +) +async def resample_post( + base_query_parameters: BaseQueryParams = Depends(), + raw_query_parameters: RawQueryParams = Depends(), + tag_query_parameters: TagsBodyParams = Body(default=...), + circular_standard_deviation_parameters: CircularAverageQueryParams = Depends(), + pivot_parameters: PivotQueryParams = Depends(), + limit_offset_parameters: LimitOffsetQueryParams = Depends(), + base_headers: BaseHeaders = Depends(), +): + return circular_standard_deviation_events_get( + base_query_parameters, + raw_query_parameters, + tag_query_parameters, + circular_standard_deviation_parameters, + pivot_parameters, + limit_offset_parameters, + base_headers, + ) diff --git a/src/api/v1/common.py b/src/api/v1/common.py index 0ca2e7922..c1becbade 100644 --- a/src/api/v1/common.py +++ b/src/api/v1/common.py @@ -32,6 +32,8 @@ def common_api_setup_tasks( interpolate_query_parameters=None, interpolation_at_time_query_parameters=None, time_weighted_average_query_parameters=None, + circular_average_query_parameters=None, + circular_standard_deviation_query_parameters=None, pivot_query_parameters=None, limit_offset_query_parameters=None, ): @@ -96,6 +98,14 @@ def common_api_setup_tasks( if time_weighted_average_query_parameters != None: parameters = dict(parameters, **time_weighted_average_query_parameters.__dict__) + if circular_average_query_parameters != None: + parameters = dict(parameters, **circular_average_query_parameters.__dict__) + + if circular_standard_deviation_query_parameters != None: + parameters = dict( + parameters, **circular_standard_deviation_query_parameters.__dict__ + ) + if pivot_query_parameters != None: parameters = dict(parameters, **pivot_query_parameters.__dict__) diff --git a/src/api/v1/models.py b/src/api/v1/models.py index a42ee816c..6a8ff2563 100644 --- a/src/api/v1/models.py +++ b/src/api/v1/models.py @@ -352,3 +352,27 @@ def __init__( self.time_interval_unit = time_interval_unit self.window_length = window_length self.step = step + + +class CircularAverageQueryParams: + def __init__( + self, + time_interval_rate: str = Query( + ..., description="Time Interval Rate as a numeric input", examples=[5] + ), + time_interval_unit: str = Query( + ..., + description="Time Interval Unit can be one of the options: [second, minute, day, hour]", + examples=["second", "minute", "hour", "day"], + ), + lower_bound: int = Query( + ..., description="Lower boundary for the sample range", examples=[5] + ), + upper_bound: int = Query( + ..., description="Upper boundary for the sample range", examples=[20] + ), + ): + self.time_interval_rate = time_interval_rate + self.time_interval_unit = time_interval_unit + self.lower_bound = lower_bound + self.upper_bound = upper_bound diff --git a/tests/api/v1/api_test_objects.py b/tests/api/v1/api_test_objects.py index 89c561cd9..cfad84606 100644 --- a/tests/api/v1/api_test_objects.py +++ b/tests/api/v1/api_test_objects.py @@ -145,6 +145,28 @@ "MOCKED-TAGNAME1", "MOCKED-TAGNAME2", ] +CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT = RAW_MOCKED_PARAMETER_DICT.copy() 
+CIRCULAR_AVERAGE_MOCKED_PARAMETER_ERROR_DICT = RAW_MOCKED_PARAMETER_ERROR_DICT.copy() + +CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT["time_interval_rate"] = "15" +CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT["time_interval_unit"] = "minute" +CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT["lower_bound"] = 5 +CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT["upper_bound"] = 20 +CIRCULAR_AVERAGE_MOCKED_PARAMETER_ERROR_DICT["time_interval_rate"] = "15" +CIRCULAR_AVERAGE_MOCKED_PARAMETER_ERROR_DICT["time_interval_unit"] = "minute" +CIRCULAR_AVERAGE_MOCKED_PARAMETER_ERROR_DICT["lower_bound"] = 5 +CIRCULAR_AVERAGE_MOCKED_PARAMETER_ERROR_DICT["upper_bound"] = 20 + +CIRCULAR_AVERAGE_POST_MOCKED_PARAMETER_DICT = ( + CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT.copy() +) +CIRCULAR_AVERAGE_POST_MOCKED_PARAMETER_DICT.pop("tag_name") + +CIRCULAR_AVERAGE_POST_BODY_MOCKED_PARAMETER_DICT = {} +CIRCULAR_AVERAGE_POST_BODY_MOCKED_PARAMETER_DICT["tag_name"] = [ + "MOCKED-TAGNAME1", + "MOCKED-TAGNAME2", +] TEST_HEADERS = { "Authorization": "Bearer Test Token", diff --git a/tests/api/v1/test_api_circular_average.py b/tests/api/v1/test_api_circular_average.py new file mode 100644 index 000000000..fd200a6d1 --- /dev/null +++ b/tests/api/v1/test_api_circular_average.py @@ -0,0 +1,161 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from pytest_mock import MockerFixture +import pandas as pd +from datetime import datetime +from tests.sdk.python.rtdip_sdk.connectors.odbc.test_db_sql_connector import ( + MockedDBConnection, +) +from tests.sdk.python.rtdip_sdk.queries.test_raw import DATABRICKS_SQL_CONNECT +from tests.api.v1.api_test_objects import ( + CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT, + CIRCULAR_AVERAGE_MOCKED_PARAMETER_ERROR_DICT, + CIRCULAR_AVERAGE_POST_MOCKED_PARAMETER_DICT, + CIRCULAR_AVERAGE_POST_BODY_MOCKED_PARAMETER_DICT, + mocker_setup, + TEST_HEADERS, + BASE_URL, +) +from httpx import AsyncClient +from src.api.v1 import app + +MOCK_METHOD = "src.sdk.python.rtdip_sdk.queries.time_series.circular_average.get" +MOCK_API_NAME = "/api/v1/events/circularaverage" + +pytestmark = pytest.mark.anyio + + +async def test_api_circular_average_get_success(mocker: MockerFixture): + test_data = pd.DataFrame( + {"EventTime": [datetime.utcnow()], "TagName": ["TestTag"], "Value": [1.01]} + ) + mocker = mocker_setup(mocker, MOCK_METHOD, test_data) + + async with AsyncClient(app=app, base_url=BASE_URL) as ac: + response = await ac.get( + MOCK_API_NAME, + headers=TEST_HEADERS, + params=CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT, + ) + actual = response.text + expected = test_data.to_json(orient="table", index=False, date_unit="us") + + assert response.status_code == 200 + assert actual == expected + + +async def test_api_circular_average_get_validation_error(mocker: MockerFixture): + test_data = pd.DataFrame( + {"EventTime": [datetime.utcnow()], "TagName": ["TestTag"], "Value": [1.01]} + ) + mocker = mocker_setup(mocker, MOCK_METHOD, test_data) + + async with AsyncClient(app=app, base_url=BASE_URL) as ac: + response = await ac.get( + MOCK_API_NAME, + headers=TEST_HEADERS, + params=CIRCULAR_AVERAGE_MOCKED_PARAMETER_ERROR_DICT, + ) + actual = response.text + + assert response.status_code == 422 + assert ( + actual + == '{"detail":[{"loc":["query","start_date"],"msg":"field required","type":"value_error.missing"}]}' + ) + + +async def test_api_circular_average_get_error(mocker: MockerFixture): + test_data = pd.DataFrame( + {"EventTime": [datetime.utcnow()], "TagName": ["TestTag"], "Value": [1.01]} + ) + mocker = mocker_setup( + mocker, MOCK_METHOD, test_data, Exception("Error Connecting to Database") + ) + + async with AsyncClient(app=app, base_url=BASE_URL) as ac: + response = await ac.get( + MOCK_API_NAME, + headers=TEST_HEADERS, + params=CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT, + ) + actual = response.text + + assert response.status_code == 400 + assert actual == '{"detail":"Error Connecting to Database"}' + + +async def test_api_circular_average_post_success(mocker: MockerFixture): + test_data = pd.DataFrame( + {"EventTime": [datetime.utcnow()], "TagName": ["TestTag"], "Value": [1.01]} + ) + mocker = mocker_setup(mocker, MOCK_METHOD, test_data) + + async with AsyncClient(app=app, base_url=BASE_URL) as ac: + response = await ac.post( + MOCK_API_NAME, + headers=TEST_HEADERS, + params=CIRCULAR_AVERAGE_POST_MOCKED_PARAMETER_DICT, + json=CIRCULAR_AVERAGE_POST_BODY_MOCKED_PARAMETER_DICT, + ) + actual = response.text + expected = test_data.to_json(orient="table", index=False, date_unit="us") + + assert response.status_code == 200 + assert actual == expected + + +async def test_api_circular_average_post_validation_error(mocker: MockerFixture): + test_data = pd.DataFrame( + {"EventTime": [datetime.utcnow()], "TagName": ["TestTag"], "Value": [1.01]} + ) + mocker = mocker_setup(mocker, MOCK_METHOD, test_data) + + async with 
AsyncClient(app=app, base_url=BASE_URL) as ac: + response = await ac.post( + MOCK_API_NAME, + headers=TEST_HEADERS, + params=CIRCULAR_AVERAGE_MOCKED_PARAMETER_ERROR_DICT, + json=CIRCULAR_AVERAGE_POST_BODY_MOCKED_PARAMETER_DICT, + ) + actual = response.text + + assert response.status_code == 422 + assert ( + actual + == '{"detail":[{"loc":["query","start_date"],"msg":"field required","type":"value_error.missing"}]}' + ) + + +async def test_api_circular_average_post_error(mocker: MockerFixture): + test_data = pd.DataFrame( + {"EventTime": [datetime.utcnow()], "TagName": ["TestTag"], "Value": [1.01]} + ) + mocker = mocker_setup( + mocker, MOCK_METHOD, test_data, Exception("Error Connecting to Database") + ) + + async with AsyncClient(app=app, base_url=BASE_URL) as ac: + response = await ac.post( + MOCK_API_NAME, + headers=TEST_HEADERS, + params=CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT, + json=CIRCULAR_AVERAGE_POST_BODY_MOCKED_PARAMETER_DICT, + ) + actual = response.text + + assert response.status_code == 400 + assert actual == '{"detail":"Error Connecting to Database"}' diff --git a/tests/api/v1/test_api_circular_standard_deviation.py b/tests/api/v1/test_api_circular_standard_deviation.py new file mode 100644 index 000000000..41a2cca9c --- /dev/null +++ b/tests/api/v1/test_api_circular_standard_deviation.py @@ -0,0 +1,167 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from pytest_mock import MockerFixture +import pandas as pd +from datetime import datetime +from tests.sdk.python.rtdip_sdk.connectors.odbc.test_db_sql_connector import ( + MockedDBConnection, +) +from tests.sdk.python.rtdip_sdk.queries.test_raw import DATABRICKS_SQL_CONNECT +from tests.api.v1.api_test_objects import ( + CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT, + CIRCULAR_AVERAGE_MOCKED_PARAMETER_ERROR_DICT, + CIRCULAR_AVERAGE_POST_MOCKED_PARAMETER_DICT, + CIRCULAR_AVERAGE_POST_BODY_MOCKED_PARAMETER_DICT, + mocker_setup, + TEST_HEADERS, + BASE_URL, +) +from httpx import AsyncClient +from src.api.v1 import app + +MOCK_METHOD = ( + "src.sdk.python.rtdip_sdk.queries.time_series.circular_standard_deviation.get" +) +MOCK_API_NAME = "/api/v1/events/circularstandarddeviation" + +pytestmark = pytest.mark.anyio + + +async def test_api_circular_standard_deviation_get_success(mocker: MockerFixture): + test_data = pd.DataFrame( + {"EventTime": [datetime.utcnow()], "TagName": ["TestTag"], "Value": [1.01]} + ) + mocker = mocker_setup(mocker, MOCK_METHOD, test_data) + + async with AsyncClient(app=app, base_url=BASE_URL) as ac: + response = await ac.get( + MOCK_API_NAME, + headers=TEST_HEADERS, + params=CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT, + ) + actual = response.text + expected = test_data.to_json(orient="table", index=False, date_unit="us") + + assert response.status_code == 200 + assert actual == expected + + +async def test_api_circular_standard_deviation_get_validation_error( + mocker: MockerFixture, +): + test_data = pd.DataFrame( + {"EventTime": [datetime.utcnow()], "TagName": ["TestTag"], "Value": [1.01]} + ) + mocker = mocker_setup(mocker, MOCK_METHOD, test_data) + + async with AsyncClient(app=app, base_url=BASE_URL) as ac: + response = await ac.get( + MOCK_API_NAME, + headers=TEST_HEADERS, + params=CIRCULAR_AVERAGE_MOCKED_PARAMETER_ERROR_DICT, + ) + actual = response.text + + assert response.status_code == 422 + assert ( + actual + == '{"detail":[{"loc":["query","start_date"],"msg":"field required","type":"value_error.missing"}]}' + ) + + +async def test_api_circular_standard_deviation_get_error(mocker: MockerFixture): + test_data = pd.DataFrame( + {"EventTime": [datetime.utcnow()], "TagName": ["TestTag"], "Value": [1.01]} + ) + mocker = mocker_setup( + mocker, MOCK_METHOD, test_data, Exception("Error Connecting to Database") + ) + + async with AsyncClient(app=app, base_url=BASE_URL) as ac: + response = await ac.get( + MOCK_API_NAME, + headers=TEST_HEADERS, + params=CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT, + ) + actual = response.text + + assert response.status_code == 400 + assert actual == '{"detail":"Error Connecting to Database"}' + + +async def test_api_circular_standard_deviation_post_success(mocker: MockerFixture): + test_data = pd.DataFrame( + {"EventTime": [datetime.utcnow()], "TagName": ["TestTag"], "Value": [1.01]} + ) + mocker = mocker_setup(mocker, MOCK_METHOD, test_data) + + async with AsyncClient(app=app, base_url=BASE_URL) as ac: + response = await ac.post( + MOCK_API_NAME, + headers=TEST_HEADERS, + params=CIRCULAR_AVERAGE_POST_MOCKED_PARAMETER_DICT, + json=CIRCULAR_AVERAGE_POST_BODY_MOCKED_PARAMETER_DICT, + ) + actual = response.text + expected = test_data.to_json(orient="table", index=False, date_unit="us") + + assert response.status_code == 200 + assert actual == expected + + +async def test_api_circular_standard_deviation_post_validation_error( + mocker: MockerFixture, +): + test_data = pd.DataFrame( + {"EventTime": [datetime.utcnow()], "TagName": 
["TestTag"], "Value": [1.01]} + ) + mocker = mocker_setup(mocker, MOCK_METHOD, test_data) + + async with AsyncClient(app=app, base_url=BASE_URL) as ac: + response = await ac.post( + MOCK_API_NAME, + headers=TEST_HEADERS, + params=CIRCULAR_AVERAGE_MOCKED_PARAMETER_ERROR_DICT, + json=CIRCULAR_AVERAGE_POST_BODY_MOCKED_PARAMETER_DICT, + ) + actual = response.text + + assert response.status_code == 422 + assert ( + actual + == '{"detail":[{"loc":["query","start_date"],"msg":"field required","type":"value_error.missing"}]}' + ) + + +async def test_api_circular_standard_deviation_post_error(mocker: MockerFixture): + test_data = pd.DataFrame( + {"EventTime": [datetime.utcnow()], "TagName": ["TestTag"], "Value": [1.01]} + ) + mocker = mocker_setup( + mocker, MOCK_METHOD, test_data, Exception("Error Connecting to Database") + ) + + async with AsyncClient(app=app, base_url=BASE_URL) as ac: + response = await ac.post( + MOCK_API_NAME, + headers=TEST_HEADERS, + params=CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT, + json=CIRCULAR_AVERAGE_POST_BODY_MOCKED_PARAMETER_DICT, + ) + actual = response.text + + assert response.status_code == 400 + assert actual == '{"detail":"Error Connecting to Database"}' From cff86ccf1bfd8920f03f0e1b916544da89e0eca5 Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Thu, 5 Oct 2023 16:15:18 +0100 Subject: [PATCH 11/16] updates to pcdm-apm Signed-off-by: JamesKnBr --- .../spark/pcdm_to_honeywell_apm.py | 37 ++++++++++--------- .../spark/test_pcdm_to_honeywell_apm.py | 2 +- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py index 6109fec3e..8de9542c0 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py @@ -37,6 +37,12 @@ from ..._pipeline_utils.models import Libraries, SystemType +def _compress_payload(data): + compressed_data = gzip.compress(data.encode("utf-8")) + encoded_data = base64.b64encode(compressed_data).decode("utf-8") + return encoded_data + + class PCDMToHoneywellAPMTransformer(TransformerInterface): """ Converts a Spark Dataframe in PCDM format to Honeywell APM format. 
@@ -89,18 +95,11 @@ def pre_transform_validation(self): def post_transform_validation(self): return True - def _compress_body(data): - compressed_data = gzip.compress(data.encode("utf-8")) - encoded_data = base64.b64encode(compressed_data).decode("utf-8") - return encoded_data - def transform(self) -> DataFrame: """ Returns: DataFrame: A dataframe with with rows in Honeywell APM format """ - self.spark.udf.register("compress_udf", self._compress_body, StringType()) - # compress_udf = udf(self._compress_body, StringType()) if self.data.isStreaming == False and self.history_samples_per_message > 1: w = Window.partitionBy("TagName").orderBy("TagName") cleaned_pcdm_df = ( @@ -148,8 +147,7 @@ def transform(self) -> DataFrame: ) df = ( - cleaned_pcdm_df.withColumn("compress_payload", lit(self.compress_payload)) - .withColumn( + cleaned_pcdm_df.withColumn( "CloudPlatformEvent", struct( lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"), @@ -163,13 +161,7 @@ def transform(self) -> DataFrame: lit(None).alias("TargetContext"), struct( lit("TextualBody").alias("type"), - when( - col("compress_payload") == True, - # compress_udf(to_json(col("value"))), - expr("compress_udf(to_json(value))"), - ) - .otherwise(to_json(col("value"))) - .alias("value"), + to_json(col("value")).alias("value"), lit("application/json").alias("format"), ).alias("Body"), array( @@ -187,5 +179,14 @@ def transform(self) -> DataFrame: .withColumn("AnnotationStreamIds", lit(",")) .withColumn("partitionKey", col("guid")) ) - - return df.select("CloudPlatformEvent", "AnnotationStreamIds", "partitionKey") + if self.compress_payload: + compress_udf = udf(_compress_payload, StringType()) + return df.select( + compress_udf(to_json("CloudPlatformEvent")).alias("CloudPlatformEvent"), + "AnnotationStreamIds", + "partitionKey", + ) + else: + return df.select( + "CloudPlatformEvent", "AnnotationStreamIds", "partitionKey" + ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py index 3aa7fac8b..07cd29b5b 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py @@ -65,7 +65,7 @@ def test_pcdm_to_honeywell_apm(spark_session: SparkSession): schema=pcdm_schema, data=pcdm_data ) PCDM_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer( - data=pcdm_df, history_samples_per_message=3, spark=spark_session + data=pcdm_df, history_samples_per_message=3 ) actual_df = PCDM_to_honeywell_eventhub_json_transformer.transform() From a4ed1ad9fca2d2cf76003a7640253729ad14653f Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Fri, 6 Oct 2023 15:58:00 +0100 Subject: [PATCH 12/16] fix udf error Signed-off-by: JamesKnBr --- .../spark/pcdm_to_honeywell_apm.py | 20 ++--- .../spark/test_pcdm_to_honeywell_apm.py | 75 +++++++++++-------- 2 files changed, 54 insertions(+), 41 deletions(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py index 8de9542c0..6a46e4c5d 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py @@ -37,12 +37,6 @@ from ..._pipeline_utils.models import Libraries, SystemType -def _compress_payload(data): - compressed_data = 
gzip.compress(data.encode("utf-8")) - encoded_data = base64.b64encode(compressed_data).decode("utf-8") - return encoded_data - - class PCDMToHoneywellAPMTransformer(TransformerInterface): """ Converts a Spark Dataframe in PCDM format to Honeywell APM format. @@ -60,14 +54,12 @@ class PCDMToHoneywellAPMTransformer(TransformerInterface): def __init__( self, - spark: SparkSession, data: DataFrame, quality: str = "Good", history_samples_per_message: int = 1, compress_payload: bool = True, ) -> None: self.data = data - self.spark = spark self.quality = quality self.history_samples_per_message = history_samples_per_message self.compress_payload = compress_payload @@ -100,6 +92,13 @@ def transform(self) -> DataFrame: Returns: DataFrame: A dataframe with with rows in Honeywell APM format """ + + @udf("string") + def _compress_payload(data): + compressed_data = gzip.compress(data.encode("utf-8")) + encoded_data = base64.b64encode(compressed_data).decode("utf-8") + return encoded_data + if self.data.isStreaming == False and self.history_samples_per_message > 1: w = Window.partitionBy("TagName").orderBy("TagName") cleaned_pcdm_df = ( @@ -180,9 +179,10 @@ def transform(self) -> DataFrame: .withColumn("partitionKey", col("guid")) ) if self.compress_payload: - compress_udf = udf(_compress_payload, StringType()) return df.select( - compress_udf(to_json("CloudPlatformEvent")).alias("CloudPlatformEvent"), + _compress_payload(to_json("CloudPlatformEvent")).alias( + "CloudPlatformEvent" + ), "AnnotationStreamIds", "partitionKey", ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py index 07cd29b5b..cdb837d2d 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py @@ -30,49 +30,50 @@ from pyspark.sql.types import StructType, StructField, StringType, TimestampType from datetime import datetime +pcdm_schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), False), + StructField("Value", StringType(), True), + StructField("ValueType", StringType(), False), + StructField("ChangeType", StringType(), False), + ] +) -def test_pcdm_to_honeywell_apm(spark_session: SparkSession): - pcdm_schema = StructType( - [ - StructField("TagName", StringType(), True), - StructField("EventTime", TimestampType(), True), - StructField("Status", StringType(), False), - StructField("Value", StringType(), True), - StructField("ValueType", StringType(), False), - StructField("ChangeType", StringType(), False), - ] - ) +pcdm_data = [ + { + "TagName": "test.item1", + "EventTime": datetime.fromisoformat("2023-07-31T06:53:00+00:00"), + "Status": "Good", + "Value": 5.0, + "ValueType": "float", + "ChangeType": "insert", + }, + { + "TagName": "Test_item2", + "EventTime": datetime.fromisoformat("2023-07-31T06:54:00+00:00"), + "Status": "Good", + "Value": 1, + "ValueType": "float", + "ChangeType": "insert", + }, +] - pcdm_data = [ - { - "TagName": "test.item1", - "EventTime": datetime.fromisoformat("2023-07-31T06:53:00+00:00"), - "Status": "Good", - "Value": 5.0, - "ValueType": "float", - "ChangeType": "insert", - }, - { - "TagName": "Test_item2", - "EventTime": datetime.fromisoformat("2023-07-31T06:54:00+00:00"), - "Status": "Good", - "Value": 1, - "ValueType": "float", - 
"ChangeType": "insert", - }, - ] + +def test_pcdm_to_honeywell_apm(spark_session: SparkSession): pcdm_df: DataFrame = spark_session.createDataFrame( schema=pcdm_schema, data=pcdm_data ) PCDM_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer( - data=pcdm_df, history_samples_per_message=3 + data=pcdm_df, history_samples_per_message=3, compress_payload=False ) actual_df = PCDM_to_honeywell_eventhub_json_transformer.transform() df_row = actual_df.collect()[0] assert ( df_row["CloudPlatformEvent"]["CreatorId"] - == "a567edda0e37a9c98b0e73536234ad1b951dc6fa3b4bee4644ce54fc0df7cadd" + == "51bc4f9dda971d1b5417161bb98e5d8f77bea2587d9de783b54be25e22b56496" ) assert ( PCDM_to_honeywell_eventhub_json_transformer.system_type() == SystemType.PYSPARK @@ -86,3 +87,15 @@ def test_pcdm_to_honeywell_apm(spark_session: SparkSession): assert len(df_row["CloudPlatformEvent"]["BodyProperties"]) == 2 assert len(df_row["CloudPlatformEvent"]["BodyProperties"][0]) == 2 assert len(df_row["CloudPlatformEvent"]["BodyProperties"][1]) == 2 + + +def test_pcdm_to_honeywell_apm_gzip_compressed(spark_session: SparkSession): + pcdm_df: DataFrame = spark_session.createDataFrame( + schema=pcdm_schema, data=pcdm_data + ) + PCDM_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer( + data=pcdm_df, history_samples_per_message=3 + ) + actual_df = PCDM_to_honeywell_eventhub_json_transformer.transform() + df_row = actual_df.collect()[0] + assert isinstance(df_row["CloudPlatformEvent"], str) From cfb848f60870027ff9117e541da3ab886304bba2 Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Fri, 6 Oct 2023 16:21:05 +0100 Subject: [PATCH 13/16] fix code smell Signed-off-by: JamesKnBr --- .../transformers/spark/test_pcdm_to_honeywell_apm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py index cdb837d2d..8cdb0538a 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/test_pcdm_to_honeywell_apm.py @@ -65,21 +65,21 @@ def test_pcdm_to_honeywell_apm(spark_session: SparkSession): pcdm_df: DataFrame = spark_session.createDataFrame( schema=pcdm_schema, data=pcdm_data ) - PCDM_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer( + pcdm_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer( data=pcdm_df, history_samples_per_message=3, compress_payload=False ) - actual_df = PCDM_to_honeywell_eventhub_json_transformer.transform() + actual_df = pcdm_to_honeywell_eventhub_json_transformer.transform() df_row = actual_df.collect()[0] assert ( df_row["CloudPlatformEvent"]["CreatorId"] == "51bc4f9dda971d1b5417161bb98e5d8f77bea2587d9de783b54be25e22b56496" ) assert ( - PCDM_to_honeywell_eventhub_json_transformer.system_type() == SystemType.PYSPARK + pcdm_to_honeywell_eventhub_json_transformer.system_type() == SystemType.PYSPARK ) assert isinstance( - PCDM_to_honeywell_eventhub_json_transformer.libraries(), Libraries + pcdm_to_honeywell_eventhub_json_transformer.libraries(), Libraries ) assert len(df_row) == 3 assert len(df_row["CloudPlatformEvent"]) == 12 @@ -93,9 +93,9 @@ def test_pcdm_to_honeywell_apm_gzip_compressed(spark_session: SparkSession): pcdm_df: DataFrame = spark_session.createDataFrame( schema=pcdm_schema, data=pcdm_data ) - PCDM_to_honeywell_eventhub_json_transformer = 
PCDMToHoneywellAPMTransformer( + pcdm_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer( data=pcdm_df, history_samples_per_message=3 ) - actual_df = PCDM_to_honeywell_eventhub_json_transformer.transform() + actual_df = pcdm_to_honeywell_eventhub_json_transformer.transform() df_row = actual_df.collect()[0] assert isinstance(df_row["CloudPlatformEvent"], str) From 3c6ad95d0381a240b0e7796a20ebf8813327164a Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Fri, 6 Oct 2023 16:51:38 +0100 Subject: [PATCH 14/16] SparkSessionUtility bug fix Signed-off-by: JamesKnBr --- .../pipelines/utilities/pipeline_components.py | 6 ++++-- .../rtdip_sdk/pipelines/utilities/spark/session.py | 10 ++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/utilities/pipeline_components.py b/src/sdk/python/rtdip_sdk/pipelines/utilities/pipeline_components.py index f6d2a655b..5cd602673 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/utilities/pipeline_components.py +++ b/src/sdk/python/rtdip_sdk/pipelines/utilities/pipeline_components.py @@ -29,15 +29,17 @@ class PipelineComponentsGetUtility(UtilitiesInterface): Args: module (optional str): Provide the module to use for imports of rtdip-sdk components. If not populated, it will use the calling module to check for imports + spark_config (optional dict): Additional spark configuration to be applied to the spark session """ - def __init__(self, module: str = None) -> None: + def __init__(self, module: str = None, spark_config: dict = None) -> None: if module == None: frm = inspect.stack()[1] mod = inspect.getmodule(frm[0]) self.module = mod.__name__ else: self.module = module + self.spark_config = {} if spark_config is None else spark_config @staticmethod def system_type(): @@ -100,7 +102,7 @@ def execute(self) -> Tuple[Libraries, dict]: task_libraries = Libraries() task_libraries.get_libraries_from_components(component_list) - spark_configuration = {} + spark_configuration = self.spark_config for component in component_list: spark_configuration = {**spark_configuration, **component.settings()} return (task_libraries, spark_configuration) diff --git a/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py b/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py index f15cd2963..b8aaab569 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py +++ b/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py @@ -31,7 +31,7 @@ class SparkSessionUtility(UtilitiesInterface): Call this component after all imports of the RTDIP components to ensure that the spark session is configured correctly. Args: - config (dict): Dictionary of spark configuration to be applied to the spark session + config (optional dict): Dictionary of spark configuration to be applied to the spark session module (optional str): Provide the module to use for imports of rtdip-sdk components. 
If not populated, it will use the calling module to check for imports remote (optional str): Specify the remote parameters if intending to use Spark Connect """ @@ -40,8 +40,10 @@ class SparkSessionUtility(UtilitiesInterface): config: dict module: str - def __init__(self, config: dict, module: str = None, remote: str = None) -> None: - self.config = config + def __init__( + self, config: dict = None, module: str = None, remote: str = None + ) -> None: + self.config = {} if config is None else config if module == None: frm = inspect.stack()[1] mod = inspect.getmodule(frm[0]) @@ -70,7 +72,7 @@ def settings() -> dict: def execute(self) -> SparkSession: try: (task_libraries, spark_configuration) = PipelineComponentsGetUtility( - self.module + self.module, self.config ).execute() self.spark = SparkClient( spark_configuration=spark_configuration, From 4ea5b8bd8a31d47baf452b9f559c08f2c6d22c5f Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Fri, 6 Oct 2023 16:56:30 +0100 Subject: [PATCH 15/16] updated SparkSessionUtility Signed-off-by: JamesKnBr --- src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py b/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py index b8aaab569..8becc24cd 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py +++ b/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py @@ -43,7 +43,7 @@ class SparkSessionUtility(UtilitiesInterface): def __init__( self, config: dict = None, module: str = None, remote: str = None ) -> None: - self.config = {} if config is None else config + self.config = config if module == None: frm = inspect.stack()[1] mod = inspect.getmodule(frm[0]) From ebadfefc4397490f9fc1b82b7da5b7918e476f6b Mon Sep 17 00:00:00 2001 From: JamesKnBr Date: Fri, 6 Oct 2023 17:07:21 +0100 Subject: [PATCH 16/16] change docs typo in pcdm-hw Signed-off-by: JamesKnBr --- .../pipelines/transformers/spark/pcdm_to_honeywell_apm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py index 6a46e4c5d..0ee90f0e8 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/pcdm_to_honeywell_apm.py @@ -44,7 +44,7 @@ class PCDMToHoneywellAPMTransformer(TransformerInterface): data (Dataframe): Spark Dataframe in PCDM format quality (str): Value for quality inside HistorySamples history_samples_per_message (int): The number of HistorySamples for each row in the DataFrame (Batch Only) - compress_payload (bool): If True compresses body.value with gzip compression + compress_payload (bool): If True compresses CloudPlatformEvent with gzip compression """ data: DataFrame
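
For reference, two short usage sketches follow. They are illustrative only and are not part of the patches above; anything not taken directly from the diffs (hosts, tokens, dates, the remaining base query parameters) is an assumption.

A minimal sketch of calling the Circular Average endpoint added in the "Circular average & std dev APIs" patch, using the parameters defined by CircularAverageQueryParams. The host, token and date format are placeholders, and only start_date (referenced by the validation tests) plus the circular-average-specific parameters are taken from the diffs; the other required base/raw query parameters are omitted here.

    import httpx

    params = {
        "tag_name": "MOCKED-TAGNAME1",   # single tag; the POST variant accepts a JSON list of tag names
        "start_date": "2023-01-01",      # required query parameter per the validation tests; format assumed
        "end_date": "2023-01-02",
        "time_interval_rate": "15",      # CircularAverageQueryParams
        "time_interval_unit": "minute",
        "lower_bound": 5,
        "upper_bound": 20,
    }
    response = httpx.get(
        "https://<api-host>/api/v1/events/circularaverage",   # placeholder host
        params=params,
        headers={"Authorization": "Bearer <token>"},           # placeholder token
    )
    response.raise_for_status()
    print(response.json())

A minimal sketch of PCDMToHoneywellAPMTransformer as it stands after the "fix udf error" patch, which removed the spark argument and moved gzip compression into transform() via a UDF. The input DataFrame below reuses the PCDM columns and sample values from the tests; a local SparkSession is assumed.

    from datetime import datetime
    from pyspark.sql import SparkSession
    from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.pcdm_to_honeywell_apm import (
        PCDMToHoneywellAPMTransformer,
    )

    spark = SparkSession.builder.getOrCreate()
    pcdm_df = spark.createDataFrame(
        [("test.item1", datetime.fromisoformat("2023-07-31T06:53:00+00:00"), "Good", "5.0", "float", "insert")],
        ["TagName", "EventTime", "Status", "Value", "ValueType", "ChangeType"],
    )

    transformer = PCDMToHoneywellAPMTransformer(
        data=pcdm_df,                    # Spark DataFrame in PCDM format
        quality="Good",                  # value written into HistorySamples
        history_samples_per_message=3,   # batch only: group up to 3 samples per message
        compress_payload=True,           # CloudPlatformEvent returned as a gzip + base64 encoded string
    )
    honeywell_df = transformer.transform()
    # Returned columns: CloudPlatformEvent, AnnotationStreamIds, partitionKey

With compress_payload=False the CloudPlatformEvent column remains a struct rather than a compressed string, which is what the uncompressed test asserts against.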