Merged
Changes from all commits
Commits
33 commits
d9bb204
typo change
JamesKnBr Apr 12, 2023
a773ac7
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 3, 2023
32ea5d4
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 7, 2023
f76aea4
Add Honeywell transformers
JamesKnBr Aug 18, 2023
5085d58
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 18, 2023
7563b8a
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 21, 2023
5b57d0c
Merge remote-tracking branch 'upstream/develop' into hotfix/00455
JamesKnBr Aug 23, 2023
1e2aec5
update eventhub destination and pcdm-honeywell
JamesKnBr Aug 23, 2023
3562afd
add eventhub destination and pcdm-honeywell
JamesKnBr Aug 23, 2023
bc8e6f5
Merge branch 'hotfix/00455' into develop
JamesKnBr Aug 23, 2023
d28338e
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 30, 2023
1e3445f
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 6, 2023
18d1d01
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 8, 2023
6133317
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 11, 2023
14544c4
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 25, 2023
caa026d
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 26, 2023
0739398
pcdm-hw_apm partition column & compression
JamesKnBr Sep 26, 2023
7ac0305
Merge remote-tracking branch 'upstream/develop' into hotfix/00509
JamesKnBr Sep 26, 2023
044c770
add deterministic guid hw transformer
JamesKnBr Sep 27, 2023
6a54cb1
change conditional logic
JamesKnBr Sep 27, 2023
1aed56c
change default compress_payload to true
JamesKnBr Sep 27, 2023
9ad7d79
refactor transformer
JamesKnBr Oct 3, 2023
a1c976c
Merge remote-tracking branch 'upstream/develop' into hotfix/00518
JamesKnBr Oct 3, 2023
63cc4ab
update pcdm-apm transformer
JamesKnBr Oct 3, 2023
cbf8447
Merge remote-tracking branch 'upstream/develop' into hotfix/00518
JamesKnBr Oct 5, 2023
fd58995
Circular average & std dev APIs
JamesKnBr Oct 5, 2023
cff86cc
updates to pcdm-apm
JamesKnBr Oct 5, 2023
a4ed1ad
fix udf error
JamesKnBr Oct 6, 2023
4d4df00
Merge remote-tracking branch 'upstream/develop' into hotfix/00518
JamesKnBr Oct 6, 2023
cfb848f
fix code smell
JamesKnBr Oct 6, 2023
3c6ad95
SparkSessionUtility bug fix
JamesKnBr Oct 6, 2023
4ea5b8b
updated SparkSessionUtility
JamesKnBr Oct 6, 2023
ebadfef
change docs typo in pcdm-hw
JamesKnBr Oct 6, 2023
2 changes: 2 additions & 0 deletions src/api/v1/__init__.py
@@ -22,6 +22,8 @@
resample,
interpolate,
interpolation_at_time,
circular_average,
circular_standard_deviation,
time_weighted_average,
circular_average,
circular_standard_deviation,
@@ -12,26 +12,29 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from pyspark.sql import DataFrame
from pyspark.sql import DataFrame, Window, SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.functions import (
to_json,
col,
struct,
lit,
array,
monotonically_increasing_id,
floor,
row_number,
collect_list,
expr,
udf,
sha2,
when,
)
from pyspark.sql import Window
from datetime import datetime
import pytz
import gzip
import base64

from ..interfaces import TransformerInterface
from ..._pipeline_utils.models import Libraries, SystemType
from ..._pipeline_utils.spark import EDGEX_SCHEMA


class PCDMToHoneywellAPMTransformer(TransformerInterface):
@@ -41,22 +44,25 @@ class PCDMToHoneywellAPMTransformer(TransformerInterface):
data (Dataframe): Spark Dataframe in PCDM format
quality (str): Value for quality inside HistorySamples
history_samples_per_message (int): The number of HistorySamples for each row in the DataFrame (Batch Only)

compress_payload (bool): If True compresses CloudPlatformEvent with gzip compression
"""

data: DataFrame
quality: str
history_samples_per_message: int
compress_payload: bool

def __init__(
self,
data: DataFrame,
quality: str = "Good",
history_samples_per_message: int = 1,
compress_payload: bool = True,
) -> None:
self.data = data
self.quality = quality
self.history_samples_per_message = history_samples_per_message
self.compress_payload = compress_payload

@staticmethod
def system_type():
@@ -86,11 +92,17 @@ def transform(self) -> DataFrame:
Returns:
DataFrame: A dataframe with rows in Honeywell APM format
"""

@udf("string")
def _compress_payload(data):
compressed_data = gzip.compress(data.encode("utf-8"))
encoded_data = base64.b64encode(compressed_data).decode("utf-8")
return encoded_data

if self.data.isStreaming == False and self.history_samples_per_message > 1:
pcdm_df = self.data.withColumn("counter", monotonically_increasing_id())
w = Window.orderBy("counter")
w = Window.partitionBy("TagName").orderBy("TagName")
cleaned_pcdm_df = (
pcdm_df.withColumn(
self.data.withColumn(
"index",
floor(
(row_number().over(w) - 0.01) / self.history_samples_per_message
@@ -105,9 +117,9 @@
col("Value").alias("Value"),
).alias("HistorySamples"),
)
.groupBy("index")
.groupBy("TagName", "index")
.agg(collect_list("HistorySamples").alias("HistorySamples"))
.withColumn("guid", expr("uuid()"))
.withColumn("guid", sha2(col("TagName"), 256).cast("string"))
.withColumn(
"value",
struct(
@@ -116,7 +128,9 @@
)
)
else:
cleaned_pcdm_df = self.data.withColumn("guid", expr("uuid()")).withColumn(
cleaned_pcdm_df = self.data.withColumn(
"guid", sha2(col("TagName"), 256).cast("string")
).withColumn(
"value",
struct(
col("guid").alias("SystemGuid"),
@@ -131,32 +145,48 @@
),
)

df = cleaned_pcdm_df.withColumn(
"CloudPlatformEvent",
struct(
lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"),
lit(expr("uuid()")).alias("Id"),
col("guid").alias("CreatorId"),
lit("CloudPlatformSystem").alias("CreatorType"),
lit(None).alias("GeneratorId"),
lit("CloudPlatformTenant").alias("GeneratorType"),
col("guid").alias("TargetId"),
lit("CloudPlatformTenant").alias("TargetType"),
lit(None).alias("TargetContext"),
df = (
cleaned_pcdm_df.withColumn(
"CloudPlatformEvent",
struct(
lit("TextualBody").alias("type"),
to_json(col("value")).alias("value"),
lit("application/json").alias("format"),
).alias("Body"),
array(
lit(datetime.now(tz=pytz.UTC)).alias("CreatedTime"),
lit(expr("uuid()")).alias("Id"),
col("guid").alias("CreatorId"),
lit("CloudPlatformSystem").alias("CreatorType"),
lit(None).alias("GeneratorId"),
lit("CloudPlatformTenant").alias("GeneratorType"),
col("guid").alias("TargetId"),
lit("CloudPlatformTenant").alias("TargetType"),
lit(None).alias("TargetContext"),
struct(
lit("SystemType").alias("Key"),
lit("apm-system").alias("Value"),
),
struct(lit("SystemGuid").alias("Key"), col("guid").alias("Value")),
).alias("BodyProperties"),
lit("DataChange.Update").alias("EventType"),
),
).withColumn("AnnotationStreamIds", lit(","))

return df.select("CloudPlatformEvent", "AnnotationStreamIds")
lit("TextualBody").alias("type"),
to_json(col("value")).alias("value"),
lit("application/json").alias("format"),
).alias("Body"),
array(
struct(
lit("SystemType").alias("Key"),
lit("apm-system").alias("Value"),
),
struct(
lit("SystemGuid").alias("Key"), col("guid").alias("Value")
),
).alias("BodyProperties"),
lit("DataChange.Update").alias("EventType"),
),
)
.withColumn("AnnotationStreamIds", lit(","))
.withColumn("partitionKey", col("guid"))
)
if self.compress_payload:
return df.select(
_compress_payload(to_json("CloudPlatformEvent")).alias(
"CloudPlatformEvent"
),
"AnnotationStreamIds",
"partitionKey",
)
else:
return df.select(
"CloudPlatformEvent", "AnnotationStreamIds", "partitionKey"
)
@@ -29,15 +29,17 @@ class PipelineComponentsGetUtility(UtilitiesInterface):

Args:
module (optional str): Provide the module to use for imports of rtdip-sdk components. If not populated, it will use the calling module to check for imports
spark_config (optional dict): Additional spark configuration to be applied to the spark session
"""

def __init__(self, module: str = None) -> None:
def __init__(self, module: str = None, spark_config: dict = None) -> None:
if module == None:
frm = inspect.stack()[1]
mod = inspect.getmodule(frm[0])
self.module = mod.__name__
else:
self.module = module
self.spark_config = {} if spark_config is None else spark_config

@staticmethod
def system_type():
@@ -100,7 +102,7 @@ def execute(self) -> Tuple[Libraries, dict]:

task_libraries = Libraries()
task_libraries.get_libraries_from_components(component_list)
spark_configuration = {}
spark_configuration = self.spark_config
for component in component_list:
spark_configuration = {**spark_configuration, **component.settings()}
return (task_libraries, spark_configuration)
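A short sketch of the new spark_config argument, assuming the utility's import path; the configuration key is illustrative. The supplied dictionary is used as the base configuration, and each component's settings() is then merged on top of it.

# Sketch only: import path assumed; configuration key is illustrative.
from rtdip_sdk.pipelines.utilities.pipeline_components import (
    PipelineComponentsGetUtility,
)

task_libraries, spark_configuration = PipelineComponentsGetUtility(
    module=None,  # defaults to the calling module when not provided
    spark_config={"spark.sql.shuffle.partitions": "8"},  # base spark configuration
).execute()
# spark_configuration starts from spark_config, then merges each component's settings().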
8 changes: 5 additions & 3 deletions src/sdk/python/rtdip_sdk/pipelines/utilities/spark/session.py
@@ -31,7 +31,7 @@ class SparkSessionUtility(UtilitiesInterface):
Call this component after all imports of the RTDIP components to ensure that the spark session is configured correctly.

Args:
config (dict): Dictionary of spark configuration to be applied to the spark session
config (optional dict): Dictionary of spark configuration to be applied to the spark session
module (optional str): Provide the module to use for imports of rtdip-sdk components. If not populated, it will use the calling module to check for imports
remote (optional str): Specify the remote parameters if intending to use Spark Connect
"""
@@ -40,7 +40,9 @@ class SparkSessionUtility(UtilitiesInterface):
config: dict
module: str

def __init__(self, config: dict, module: str = None, remote: str = None) -> None:
def __init__(
self, config: dict = None, module: str = None, remote: str = None
) -> None:
self.config = config
if module == None:
frm = inspect.stack()[1]
@@ -70,7 +72,7 @@ def settings() -> dict:
def execute(self) -> SparkSession:
try:
(task_libraries, spark_configuration) = PipelineComponentsGetUtility(
self.module
self.module, self.config
).execute()
self.spark = SparkClient(
spark_configuration=spark_configuration,
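With config now optional, the session utility can be constructed without any configuration; any dictionary supplied is passed through to PipelineComponentsGetUtility as its base spark_config. A brief sketch follows (the module path matches the file changed above; the configuration key is illustrative).

# Sketch only: configuration key is illustrative and may be omitted entirely.
from rtdip_sdk.pipelines.utilities.spark.session import SparkSessionUtility

spark = SparkSessionUtility(
    config={"spark.sql.session.timeZone": "UTC"}  # optional base spark configuration
).execute()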
@@ -26,63 +26,76 @@
Libraries,
SystemType,
)

from pyspark.sql import SparkSession, DataFrame
from pytest_mock import MockerFixture
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from datetime import datetime
import uuid

pcdm_schema = StructType(
[
StructField("TagName", StringType(), True),
StructField("EventTime", TimestampType(), True),
StructField("Status", StringType(), False),
StructField("Value", StringType(), True),
StructField("ValueType", StringType(), False),
StructField("ChangeType", StringType(), False),
]
)

def test_pcdm_to_honeywell_apm(spark_session: SparkSession, mocker: MockerFixture):
pcdm_schema = StructType(
[
StructField("TagName", StringType(), True),
StructField("EventTime", TimestampType(), True),
StructField("Status", StringType(), False),
StructField("Value", StringType(), True),
StructField("ValueType", StringType(), False),
StructField("ChangeType", StringType(), False),
]
)
pcdm_data = [
{
"TagName": "test.item1",
"EventTime": datetime.fromisoformat("2023-07-31T06:53:00+00:00"),
"Status": "Good",
"Value": 5.0,
"ValueType": "float",
"ChangeType": "insert",
},
{
"TagName": "Test_item2",
"EventTime": datetime.fromisoformat("2023-07-31T06:54:00+00:00"),
"Status": "Good",
"Value": 1,
"ValueType": "float",
"ChangeType": "insert",
},
]

pcdm_data = [
{
"TagName": "test.item1",
"EventTime": datetime.fromisoformat("2023-07-31T06:53:00+00:00"),
"Status": "Good",
"Value": 5.0,
"ValueType": "float",
"ChangeType": "insert",
},
{
"TagName": "Test_item2",
"EventTime": datetime.fromisoformat("2023-07-31T06:54:00+00:00"),
"Status": "Good",
"Value": 1,
"ValueType": "float",
"ChangeType": "insert",
},
]

def test_pcdm_to_honeywell_apm(spark_session: SparkSession):
pcdm_df: DataFrame = spark_session.createDataFrame(
schema=pcdm_schema, data=pcdm_data
)
PCDM_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer(
data=pcdm_df, history_samples_per_message=3
pcdm_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer(
data=pcdm_df, history_samples_per_message=3, compress_payload=False
)

actual_df = PCDM_to_honeywell_eventhub_json_transformer.transform()
actual_df = pcdm_to_honeywell_eventhub_json_transformer.transform()
df_row = actual_df.collect()[0]
assert isinstance(uuid.UUID(df_row["CloudPlatformEvent"]["CreatorId"]), uuid.UUID)
assert (
PCDM_to_honeywell_eventhub_json_transformer.system_type() == SystemType.PYSPARK
df_row["CloudPlatformEvent"]["CreatorId"]
== "51bc4f9dda971d1b5417161bb98e5d8f77bea2587d9de783b54be25e22b56496"
)
assert (
pcdm_to_honeywell_eventhub_json_transformer.system_type() == SystemType.PYSPARK
)
assert isinstance(
PCDM_to_honeywell_eventhub_json_transformer.libraries(), Libraries
pcdm_to_honeywell_eventhub_json_transformer.libraries(), Libraries
)
assert len(df_row) == 2
assert len(df_row) == 3
assert len(df_row["CloudPlatformEvent"]) == 12
assert len(df_row["CloudPlatformEvent"]["Body"]) == 3
assert len(df_row["CloudPlatformEvent"]["BodyProperties"]) == 2
assert len(df_row["CloudPlatformEvent"]["BodyProperties"][0]) == 2
assert len(df_row["CloudPlatformEvent"]["BodyProperties"][1]) == 2


def test_pcdm_to_honeywell_apm_gzip_compressed(spark_session: SparkSession):
pcdm_df: DataFrame = spark_session.createDataFrame(
schema=pcdm_schema, data=pcdm_data
)
pcdm_to_honeywell_eventhub_json_transformer = PCDMToHoneywellAPMTransformer(
data=pcdm_df, history_samples_per_message=3
)
actual_df = pcdm_to_honeywell_eventhub_json_transformer.transform()
df_row = actual_df.collect()[0]
assert isinstance(df_row["CloudPlatformEvent"], str)
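To make the payload handling concrete, the sketch below reverses the _compress_payload UDF (base64 decode, then gzip decompress) and recomputes the deterministic guid the same way as sha2(col("TagName"), 256), i.e. a lowercase hex SHA-256 of the tag name; the tag name is taken from the test data above.

# Sketch only: consumer-side decoding of a compressed CloudPlatformEvent row
# and recomputation of the deterministic guid used as CreatorId/partitionKey.
import base64
import gzip
import hashlib
import json


def decode_cloud_platform_event(payload: str) -> dict:
    # Reverse of _compress_payload: base64 decode, gzip decompress, parse JSON.
    return json.loads(gzip.decompress(base64.b64decode(payload)).decode("utf-8"))


# Spark's sha2(col("TagName"), 256) is a lowercase hex SHA-256 digest, so this
# should reproduce the CreatorId asserted in the uncompressed test above.
guid = hashlib.sha256("test.item1".encode("utf-8")).hexdigest()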