From 255b49f968a9cb2a7b8aa557156e44af78be70db Mon Sep 17 00:00:00 2001
From: GBBBAS <42962356+GBBBAS@users.noreply.github.com>
Date: Thu, 3 Aug 2023 13:38:10 +0100
Subject: [PATCH] Add option to remove Nanoseconds

Signed-off-by: GBBBAS <42962356+GBBBAS@users.noreply.github.com>
---
 .../pipelines/destinations/spark/pcdm_to_delta.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py b/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py
index f75246031..f282b78a2 100644
--- a/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py
@@ -15,7 +15,7 @@
 import logging
 import time
 from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql.functions import col, when, date_format
+from pyspark.sql.functions import col, when, date_format, floor
 from py4j.protocol import Py4JJavaError
 
 from ..interfaces import DestinationInterface
@@ -45,6 +45,7 @@ class SparkPCDMToDeltaDestination(DestinationInterface):
         query_name (str): Unique name for the query in associated SparkSession
         merge (bool): Use Delta Merge to perform inserts, updates and deletes
         try_broadcast_join (bool): Attempts to perform a broadcast join in the merge which can leverage data skipping using partition pruning and file pruning automatically. Can fail if dataframe being merged is large and therefore more suitable for streaming merges than batch merges
+        remove_nanoseconds (bool): Removes nanoseconds from the EventTime column and replaces with zeros
         remove_duplicates (bool: Removes duplicates before writing the data
 
     Attributes:
@@ -61,6 +62,7 @@ class SparkPCDMToDeltaDestination(DestinationInterface):
     query_name: str
     merge: bool
     try_broadcast_join: bool
+    remove_nanoseconds: bool
     remove_duplicates: bool
 
     def __init__(self,
@@ -75,6 +77,7 @@ def __init__(self,
                  query_name: str ="PCDMToDeltaMergeDestination",
                  merge: bool = True,
                  try_broadcast_join = False,
+                 remove_nanoseconds: bool = False,
                  remove_duplicates: bool = True) -> None:
         self.spark = spark
         self.data = data
@@ -87,6 +90,7 @@ def __init__(self,
         self.query_name = query_name
         self.merge = merge
         self.try_broadcast_join = try_broadcast_join
+        self.remove_nanoseconds = remove_nanoseconds
         self.remove_duplicates = remove_duplicates
 
     @staticmethod
@@ -189,6 +193,9 @@ def _write_data_by_type(self, df: DataFrame):
         if self.merge == True:
             df = df.withColumn("ChangeType", when(df["ChangeType"].isin("insert", "update"), "upsert").otherwise(df["ChangeType"]))
 
+        if self.remove_nanoseconds == True:
+            df = df.withColumn("EventTime", (floor(col("EventTime").cast("double")*1000)/1000).cast("timestamp"))
+
         if self.remove_duplicates == True:
             df = df.drop_duplicates(["TagName", "EventTime"])
 
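
--

Editor's note on the new remove_nanoseconds expression: casting the timestamp to
double yields fractional epoch seconds; multiplying by 1000 and applying floor
drops everything below the millisecond; dividing back and re-casting returns a
timestamp whose sub-millisecond digits are zero. Since Spark timestamps carry
microsecond precision, this zeroes both micro- and nanosecond components. Below
is a minimal standalone sketch of that expression, not part of the patch; the
local SparkSession and the sample two-column DataFrame (mirroring the PCDM
TagName/EventTime fields) are assumptions for illustration only.

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col, floor

    # Assumption: a local session purely for demonstration.
    spark = SparkSession.builder.master("local[1]").getOrCreate()

    # Hypothetical sample row with sub-millisecond precision in EventTime.
    df = spark.createDataFrame(
        [("Tag1", "2023-08-03 13:38:10.123456")],
        ["TagName", "EventTime"],
    ).withColumn("EventTime", col("EventTime").cast("timestamp"))

    # Same expression as the patch: truncate EventTime to milliseconds.
    truncated = df.withColumn(
        "EventTime",
        (floor(col("EventTime").cast("double") * 1000) / 1000).cast("timestamp"),
    )

    truncated.show(truncate=False)
    # EventTime -> 2023-08-03 13:38:10.123 (sub-millisecond digits zeroed)

Truncating before the drop_duplicates step means rows whose EventTime values
differ only below the millisecond collapse onto the same (TagName, EventTime)
key, so they are deduplicated together.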