From 255b49f968a9cb2a7b8aa557156e44af78be70db Mon Sep 17 00:00:00 2001
From: GBBBAS <42962356+GBBBAS@users.noreply.github.com>
Date: Thu, 3 Aug 2023 13:38:10 +0100
Subject: [PATCH] Add option to remove Nanoseconds

Signed-off-by: GBBBAS <42962356+GBBBAS@users.noreply.github.com>
---
 .../pipelines/destinations/spark/pcdm_to_delta.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py b/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py
index f75246031..f282b78a2 100644
--- a/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py
@@ -15,7 +15,7 @@
 import logging
 import time
 from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql.functions import col, when, date_format
+from pyspark.sql.functions import col, when, date_format, floor
 from py4j.protocol import Py4JJavaError
 
 from ..interfaces import DestinationInterface
@@ -45,6 +45,7 @@ class SparkPCDMToDeltaDestination(DestinationInterface):
         query_name (str): Unique name for the query in associated SparkSession
         merge (bool): Use Delta Merge to perform inserts, updates and deletes
         try_broadcast_join (bool): Attempts to perform a broadcast join in the merge which can leverage data skipping using partition pruning and file pruning automatically. Can fail if dataframe being merged is large and therefore more suitable for streaming merges than batch merges
+        remove_nanoseconds (bool): Removes nanoseconds from the EventTime column and replaces with zeros
         remove_duplicates (bool: Removes duplicates before writing the data
 
     Attributes:
@@ -61,6 +62,7 @@ class SparkPCDMToDeltaDestination(DestinationInterface):
     query_name: str
     merge: bool
     try_broadcast_join: bool
+    remove_nanoseconds: bool
     remove_duplicates: bool
 
     def __init__(self,
@@ -75,6 +77,7 @@ def __init__(self,
                  query_name: str ="PCDMToDeltaMergeDestination",
                  merge: bool = True,
                  try_broadcast_join = False,
+                 remove_nanoseconds: bool = False,
                  remove_duplicates: bool = True) -> None:
         self.spark = spark
         self.data = data
@@ -87,6 +90,7 @@ def __init__(self,
         self.query_name = query_name
         self.merge = merge
         self.try_broadcast_join = try_broadcast_join
+        self.remove_nanoseconds = remove_nanoseconds
         self.remove_duplicates = remove_duplicates
 
     @staticmethod
@@ -189,6 +193,9 @@ def _write_data_by_type(self, df: DataFrame):
         if self.merge == True:
             df = df.withColumn("ChangeType", when(df["ChangeType"].isin("insert", "update"), "upsert").otherwise(df["ChangeType"]))
 
+        if self.remove_nanoseconds == True:
+            df = df.withColumn("EventTime", (floor(col("EventTime").cast("double")*1000)/1000).cast("timestamp"))
+
         if self.remove_duplicates == True:
             df = df.drop_duplicates(["TagName", "EventTime"])
 
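
--

Editor's note on the new remove_nanoseconds expression: casting the timestamp to
double yields fractional epoch seconds; multiplying by 1000 and applying floor
drops everything below the millisecond; dividing back and re-casting returns a
timestamp whose sub-millisecond digits are zero. Since Spark timestamps carry
microsecond precision, this zeroes both micro- and nanosecond components. Below
is a minimal standalone sketch of that expression, not part of the patch; the
local SparkSession and the sample two-column DataFrame (mirroring the PCDM
TagName/EventTime fields) are assumptions for illustration only.

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col, floor

    # Assumption: a local session purely for demonstration.
    spark = SparkSession.builder.master("local[1]").getOrCreate()

    # Hypothetical sample row with sub-millisecond precision in EventTime.
    df = spark.createDataFrame(
        [("Tag1", "2023-08-03 13:38:10.123456")],
        ["TagName", "EventTime"],
    ).withColumn("EventTime", col("EventTime").cast("timestamp"))

    # Same expression as the patch: truncate EventTime to milliseconds.
    truncated = df.withColumn(
        "EventTime",
        (floor(col("EventTime").cast("double") * 1000) / 1000).cast("timestamp"),
    )

    truncated.show(truncate=False)
    # EventTime -> 2023-08-03 13:38:10.123 (sub-millisecond digits zeroed)

Truncating before the drop_duplicates step means rows whose EventTime values
differ only below the millisecond collapse onto the same (TagName, EventTime)
key, so they are deduplicated together.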