@@ -15,7 +15,7 @@
 import logging
 import time
 from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql.functions import col, when, date_format
+from pyspark.sql.functions import col, when, date_format, floor
 from py4j.protocol import Py4JJavaError

 from ..interfaces import DestinationInterface
@@ -45,6 +45,7 @@ class SparkPCDMToDeltaDestination(DestinationInterface):
         query_name (str): Unique name for the query in associated SparkSession
         merge (bool): Use Delta Merge to perform inserts, updates and deletes
         try_broadcast_join (bool): Attempts to perform a broadcast join in the merge, which can leverage data skipping via partition pruning and file pruning automatically. Can fail if the dataframe being merged is large, making it more suitable for streaming merges than batch merges
+        remove_nanoseconds (bool): Removes nanoseconds from the EventTime column and replaces them with zeros
         remove_duplicates (bool): Removes duplicates before writing the data

     Attributes:
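
For context on the try_broadcast_join flag documented above: broadcasting the incoming dataframe into a Delta merge is what lets partition pruning and file pruning kick in on the target table. A minimal sketch of that pattern, assuming delta-spark is installed; the table name, join keys, spark session, and updates_df are placeholders, not part of this class:

# Sketch of a broadcast-assisted Delta merge; `spark`, `updates_df`, and
# the table "example.events" are assumed placeholders, not part of this PR.
from delta.tables import DeltaTable
from pyspark.sql.functions import broadcast

target = DeltaTable.forName(spark, "example.events")
(
    target.alias("t")
    .merge(
        broadcast(updates_df).alias("s"),  # broadcast hint enables pruning on the target
        "t.TagName = s.TagName AND t.EventTime = s.EventTime",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)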
@@ -61,6 +62,7 @@ class SparkPCDMToDeltaDestination(DestinationInterface):
     query_name: str
     merge: bool
     try_broadcast_join: bool
+    remove_nanoseconds: bool
     remove_duplicates: bool

     def __init__(self,
@@ -75,6 +77,7 @@ def __init__(self,
                 query_name: str = "PCDMToDeltaMergeDestination",
                 merge: bool = True,
                 try_broadcast_join = False,
+                remove_nanoseconds: bool = False,
                 remove_duplicates: bool = True) -> None:
         self.spark = spark
         self.data = data
@@ -87,6 +90,7 @@ def __init__(self,
         self.query_name = query_name
         self.merge = merge
         self.try_broadcast_join = try_broadcast_join
+        self.remove_nanoseconds = remove_nanoseconds
         self.remove_duplicates = remove_duplicates

     @staticmethod
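
A hypothetical usage sketch for the new flag. Only the keyword arguments visible in this diff are shown; the remaining required constructor arguments are elided, and spark and pcdm_df are assumed to already exist:

# Hypothetical usage of the new flag; arguments not shown in this diff are
# elided, and `spark` / `pcdm_df` are assumed to exist.
destination = SparkPCDMToDeltaDestination(
    spark=spark,
    data=pcdm_df,
    # ...other required constructor arguments not shown in this diff...
    merge=True,
    try_broadcast_join=False,
    remove_nanoseconds=True,   # new in this PR: truncate EventTime to milliseconds
    remove_duplicates=True,
)
destination.write_batch()  # assuming the DestinationInterface exposes write_batch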
@@ -189,6 +193,9 @@ def _write_data_by_type(self, df: DataFrame):
         if self.merge == True:
             df = df.withColumn("ChangeType", when(df["ChangeType"].isin("insert", "update"), "upsert").otherwise(df["ChangeType"]))

+        if self.remove_nanoseconds == True:
+            df = df.withColumn("EventTime", (floor(col("EventTime").cast("double")*1000)/1000).cast("timestamp"))
+
         if self.remove_duplicates == True:
             df = df.drop_duplicates(["TagName", "EventTime"])

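The EventTime expression added above casts the timestamp to epoch seconds, floors at three decimal places, and casts back, zeroing everything below the millisecond. Spark timestamps carry at most microsecond precision, so in practice this is what "removes nanoseconds" amounts to. A standalone sketch of the same expression, assuming an existing SparkSession named spark; the sample row is illustrative:

# Standalone demonstration of the truncation expression added in this PR;
# `spark` is assumed to exist and the sample row is illustrative only.
from pyspark.sql.functions import col, floor

df = spark.createDataFrame(
    [("Tag1", "2024-01-01 12:00:00.123456")],
    ["TagName", "EventTime"],
).withColumn("EventTime", col("EventTime").cast("timestamp"))

# 12:00:00.123456 -> 12:00:00.123 (sub-millisecond digits are zeroed)
df = df.withColumn(
    "EventTime",
    (floor(col("EventTime").cast("double") * 1000) / 1000).cast("timestamp"),
)
df.show(truncate=False)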