@@ -15,7 +15,7 @@
 import logging
 import time
 from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql.functions import col, when, date_format
+from pyspark.sql.functions import col, when, date_format, floor
 from py4j.protocol import Py4JJavaError

 from ..interfaces import DestinationInterface
@@ -45,6 +45,7 @@ class SparkPCDMToDeltaDestination(DestinationInterface):
         query_name (str): Unique name for the query in associated SparkSession
         merge (bool): Use Delta Merge to perform inserts, updates and deletes
         try_broadcast_join (bool): Attempts to perform a broadcast join in the merge, which can leverage data skipping via partition pruning and file pruning automatically. Can fail if the dataframe being merged is large, making it more suitable for streaming merges than batch merges
+        remove_nanoseconds (bool): Removes nanoseconds from the EventTime column and replaces them with zeros
         remove_duplicates (bool): Removes duplicates before writing the data

     Attributes:
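
For context on the try_broadcast_join flag documented above: broadcasting the incoming dataframe into a Delta merge is what lets partition pruning and file pruning kick in on the target table. A minimal sketch of that pattern, assuming delta-spark is installed; the table name, join keys, spark session, and updates_df are placeholders, not part of this class:

# Sketch of a broadcast-assisted Delta merge; `spark`, `updates_df`, and
# the table "example.events" are assumed placeholders, not part of this PR.
from delta.tables import DeltaTable
from pyspark.sql.functions import broadcast

target = DeltaTable.forName(spark, "example.events")
(
    target.alias("t")
    .merge(
        broadcast(updates_df).alias("s"),  # broadcast hint enables pruning on the target
        "t.TagName = s.TagName AND t.EventTime = s.EventTime",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)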
@@ -61,6 +62,7 @@ class SparkPCDMToDeltaDestination(DestinationInterface):
     query_name: str
     merge: bool
     try_broadcast_join: bool
+    remove_nanoseconds: bool
     remove_duplicates: bool

     def __init__(self,
@@ -75,6 +77,7 @@ def __init__(self,
                 query_name: str = "PCDMToDeltaMergeDestination",
                 merge: bool = True,
                 try_broadcast_join = False,
+                remove_nanoseconds: bool = False,
                 remove_duplicates: bool = True) -> None:
         self.spark = spark
         self.data = data
@@ -87,6 +90,7 @@ def __init__(self,
         self.query_name = query_name
         self.merge = merge
         self.try_broadcast_join = try_broadcast_join
+        self.remove_nanoseconds = remove_nanoseconds
         self.remove_duplicates = remove_duplicates

     @staticmethod
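
A hypothetical usage sketch for the new flag. Only the keyword arguments visible in this diff are shown; the remaining required constructor arguments are elided, and spark and pcdm_df are assumed to already exist:

# Hypothetical usage of the new flag; arguments not shown in this diff are
# elided, and `spark` / `pcdm_df` are assumed to exist.
destination = SparkPCDMToDeltaDestination(
    spark=spark,
    data=pcdm_df,
    # ...other required constructor arguments not shown in this diff...
    merge=True,
    try_broadcast_join=False,
    remove_nanoseconds=True,   # new in this PR: truncate EventTime to milliseconds
    remove_duplicates=True,
)
destination.write_batch()  # assuming the DestinationInterface exposes write_batch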
@@ -189,6 +193,9 @@ def _write_data_by_type(self, df: DataFrame):
         if self.merge == True:
             df = df.withColumn("ChangeType", when(df["ChangeType"].isin("insert", "update"), "upsert").otherwise(df["ChangeType"]))

+        if self.remove_nanoseconds == True:
+            df = df.withColumn("EventTime", (floor(col("EventTime").cast("double")*1000)/1000).cast("timestamp"))
+
         if self.remove_duplicates == True:
             df = df.drop_duplicates(["TagName", "EventTime"])

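The EventTime expression added above casts the timestamp to epoch seconds, floors at three decimal places, and casts back, zeroing everything below the millisecond. Spark timestamps carry at most microsecond precision, so in practice this is what "removes nanoseconds" amounts to. A standalone sketch of the same expression, assuming an existing SparkSession named spark; the sample row is illustrative:

# Standalone demonstration of the truncation expression added in this PR;
# `spark` is assumed to exist and the sample row is illustrative only.
from pyspark.sql.functions import col, floor

df = spark.createDataFrame(
    [("Tag1", "2024-01-01 12:00:00.123456")],
    ["TagName", "EventTime"],
).withColumn("EventTime", col("EventTime").cast("timestamp"))

# 12:00:00.123456 -> 12:00:00.123 (sub-millisecond digits are zeroed)
df = df.withColumn(
    "EventTime",
    (floor(col("EventTime").cast("double") * 1000) / 1000).cast("timestamp"),
)
df.show(truncate=False)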