8 changes: 4 additions & 4 deletions .vscode/settings.json
@@ -19,11 +19,11 @@
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
// "python.testing.cwd": "${workspaceFolder}",
"python.testing.cwd": "${workspaceFolder}",
"python.analysis.extraPaths": ["${workspaceFolder}"],
"terminal.integrated.env.osx":{
"PYTHONPATH": "${workspaceFolder}:${env:PYTHONPATH}"
},
// "terminal.integrated.env.osx":{
// "PYTHONPATH": "${workspaceFolder}:${env:PYTHONPATH}"
// },
"terminal.integrated.env.linux":{
"PYTHONPATH": "${workspaceFolder}:${env:PYTHONPATH}"
},
@@ -47,6 +47,7 @@ class SparkDeltaMergeDestination(DestinationInterface):
when_not_matched_insert_list (list[DeltaMergeConditionValues]): Conditions (optional) and values to be used when inserting rows that do not match the `merge_condition`. Specify `*` for Values if all columns from the DataFrame should be inserted.
when_not_matched_by_source_update_list (list[DeltaMergeConditionValues]): Conditions (optional) and values to be used when updating rows that do not match the `merge_condition`.
when_not_matched_by_source_delete_list (list[DeltaMergeCondition]): Conditions (optional) to be used when deleting rows that do not match the `merge_condition`.
try_broadcast_join (bool): Attempts a broadcast join in the merge, which can automatically leverage data skipping through partition pruning and file pruning. Can fail if the DataFrame being merged is large, so it is better suited to streaming merges than batch merges.
trigger (str): Frequency of the write operation
query_name (str): Unique name for the query in associated SparkSession

@@ -78,7 +79,7 @@ def __init__(self,
when_not_matched_insert_list: List[DeltaMergeConditionValues] = None,
when_not_matched_by_source_update_list: List[DeltaMergeConditionValues] = None,
when_not_matched_by_source_delete_list: List[DeltaMergeCondition] = None,
try_broadcast_join: bool = True,
try_broadcast_join: bool = False,
trigger="10 seconds",
query_name: str ="DeltaMergeDestination") -> None:
self.spark = spark
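Editorial note on the hunk above: with the default for try_broadcast_join flipped from True to False, the broadcast-join attempt is now opt-in. A minimal usage sketch, not part of the diff, assuming an existing SparkSession `spark`, a source DataFrame `df` with TagName and EventTime columns, and an import path that should be checked against the repository:

# Sketch only: the import path below is assumed, not confirmed by this diff.
from rtdip_sdk.pipelines.destinations import SparkDeltaMergeDestination

destination = SparkDeltaMergeDestination(
    spark=spark,                      # existing SparkSession
    data=df,                          # source DataFrame to merge
    table_name="sensor_data",         # placeholder Delta table name
    options={},
    merge_condition="source.TagName = target.TagName AND source.EventTime = target.EventTime",
    try_broadcast_join=True,          # must now be requested explicitly; the default is False
)
destination.write_batch()             # write_batch is assumed from the shared DestinationInterface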
@@ -15,7 +15,7 @@
import logging
import time
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql.functions import col, when, date_format
from py4j.protocol import Py4JJavaError

from ..interfaces import DestinationInterface
@@ -38,6 +38,7 @@ class SparkPCDMToDeltaDestination(DestinationInterface):
trigger (str): Frequency of the write operation
query_name (str): Unique name for the query in associated SparkSession
merge (bool): Use Delta Merge to perform inserts, updates and deletes
try_broadcast_join (bool): Attempts a broadcast join in the merge, which can automatically leverage data skipping through partition pruning and file pruning. Can fail if the DataFrame being merged is large, so it is better suited to streaming merges than batch merges.
remove_duplicates (bool): Removes duplicates before writing the data

Attributes:
@@ -53,6 +54,7 @@ class SparkPCDMToDeltaDestination(DestinationInterface):
trigger: str
query_name: str
merge: bool
try_broadcast_join: bool
remove_duplicates: bool

def __init__(self,
@@ -66,6 +68,7 @@ def __init__(self,
trigger="10 seconds",
query_name: str ="PCDMToDeltaMergeDestination",
merge: bool = True,
try_broadcast_join: bool = False,
remove_duplicates: bool = True) -> None:
self.spark = spark
self.data = data
@@ -77,6 +80,7 @@ def __init__(self,
self.trigger = trigger
self.query_name = query_name
self.merge = merge
self.try_broadcast_join = try_broadcast_join
self.remove_duplicates = remove_duplicates

@staticmethod
@@ -103,6 +107,12 @@ def pre_write_validation(self):
def post_write_validation(self):
return True

def _get_eventdate_string(self, df: DataFrame) -> str:
dates_df = df.select("EventDate").distinct()
dates_df = dates_df.select(date_format("EventDate", "yyyy-MM-dd").alias("EventDate"))
dates_list = list(dates_df.toPandas()["EventDate"])
return str(dates_list).replace('[','').replace(']','')

def _write_delta_batch(self, df: DataFrame, table_name: str):

if self.merge == True:
@@ -130,15 +140,23 @@ def _write_delta_batch(self, df: DataFrame, table_name: str):
values="*"
)
]

merge_condition = "source.EventDate = target.EventDate AND source.TagName = target.TagName AND source.EventTime = target.EventTime"

if self.try_broadcast_join != True:
eventdate_string = self._get_eventdate_string(df)
merge_condition = "target.EventDate in ({}) AND ".format(eventdate_string) + merge_condition

delta = SparkDeltaMergeDestination(
spark=self.spark,
data=df,
table_name=table_name,
options=self.options,
merge_condition="source.EventDate = target.EventDate AND source.TagName = target.TagName AND source.EventTime = target.EventTime",
merge_condition=merge_condition,
when_matched_update_list=when_matched_update_list,
when_matched_delete_list=when_matched_delete_list,
when_not_matched_insert_list=when_not_matched_insert_list
when_not_matched_insert_list=when_not_matched_insert_list,
try_broadcast_join=self.try_broadcast_join
)
else:
df = df.select("TagName", "EventTime", "Status", "Value")
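Editorial note: when the broadcast-join attempt is off, the hunk above prepends an explicit target.EventDate IN (...) predicate to the merge condition so Delta can still prune partitions and files. A standalone plain-Python sketch of the strings being built, using hypothetical dates in place of _get_eventdate_string's output:

# Hypothetical distinct EventDate values from the incoming batch.
dates_list = ["2023-05-01", "2023-05-02"]
eventdate_string = str(dates_list).replace('[', '').replace(']', '')
# eventdate_string == "'2023-05-01', '2023-05-02'"

merge_condition = (
    "source.EventDate = target.EventDate AND "
    "source.TagName = target.TagName AND "
    "source.EventTime = target.EventTime"
)
merge_condition = "target.EventDate in ({}) AND ".format(eventdate_string) + merge_condition
# target.EventDate in ('2023-05-01', '2023-05-02') AND source.EventDate = target.EventDate AND ...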
@@ -155,7 +173,7 @@ def _write_data_by_type(self, df: DataFrame):
df = df.withColumn("ChangeType", when(df["ChangeType"].isin("insert", "update"), "upsert").otherwise(df["ChangeType"]))

if self.remove_duplicates == True:
df = df.drop_duplicates()
df = df.drop_duplicates(["TagName", "EventTime"])

float_df = (
df
@@ -185,8 +203,14 @@ def write_batch(self):
Writes Process Control Data Model data to Delta
'''
try:
if self.try_broadcast_join != True:
self.data.persist()

self._write_data_by_type(self.data)

if self.try_broadcast_join != True:
self.data.unpersist()

except Py4JJavaError as e:
logging.exception(e.errmsg)
raise e
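Editorial note: the persist/unpersist guard added to write_batch exists because, with try_broadcast_join off, the merge path scans the input twice, once in _get_eventdate_string to collect distinct EventDate values and once for the write itself. A generic sketch of the pattern, assuming an existing Spark environment and a placeholder write step:

from pyspark.sql import DataFrame

def write_data_by_type(df: DataFrame) -> None:
    # Placeholder for the real _write_data_by_type step, which reads df again.
    df.count()

def write_batch_with_optional_cache(df: DataFrame, try_broadcast_join: bool) -> None:
    if not try_broadcast_join:
        df.persist()      # the EventDate lookup plus the write mean df is scanned twice
    write_data_by_type(df)
    if not try_broadcast_join:
        df.unpersist()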