2 changes: 2 additions & 0 deletions src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
@@ -122,12 +122,14 @@ def deploy(self) -> Union[bool, ValueError]:
if task.notebook_task is not None:
module = self._load_module(task.task_key + "file_upload", task.notebook_task.notebook_path)
(task_libraries, spark_configuration) = PipelineComponentsGetUtility(module.__name__).execute()
workspace_client.workspace.mkdirs(path=self.workspace_directory)
path="{}/{}".format(self.workspace_directory, Path(task.notebook_task.notebook_path).name)
workspace_client.workspace.upload(path=path, overwrite=True, content=self._convert_file_to_binary(task.notebook_task.notebook_path))
task.notebook_task.notebook_path = path
else:
module = self._load_module(task.task_key + "file_upload", task.spark_python_task.python_file)
(task_libraries, spark_configuration) = PipelineComponentsGetUtility(module).execute()
workspace_client.workspace.mkdirs(path=self.workspace_directory)
path="{}/{}".format(self.workspace_directory, Path(task.spark_python_task.python_file).name)
workspace_client.workspace.upload(path=path, overwrite=True, content=self._convert_file_to_binary(task.spark_python_task.python_file))
task.spark_python_task.python_file = path
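For context, a hedged sketch of the upload flow that the two added `mkdirs` calls complete, assuming a configured `databricks-sdk` `WorkspaceClient`; the workspace directory and notebook path are illustrative only:

```python
from pathlib import Path
from databricks.sdk import WorkspaceClient

workspace_client = WorkspaceClient()               # auth picked up from the environment
workspace_directory = "/rtdip/pipeline_notebooks"  # illustrative workspace folder
local_notebook = "notebooks/pipeline_job.py"       # illustrative local file

# Create the target folder first (the newly added mkdirs call), then upload the
# file and point the task at the workspace copy.
workspace_client.workspace.mkdirs(path=workspace_directory)
workspace_path = "{}/{}".format(workspace_directory, Path(local_notebook).name)
with open(local_notebook, "rb") as f:
    workspace_client.workspace.upload(path=workspace_path, overwrite=True, content=f.read())
```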
@@ -30,7 +30,7 @@ class SparkDeltaDestination(DestinationInterface):
options (dict): Options that can be specified for a Delta Table write operation (See Attributes table below). Further information on the options is available for [batch](https://docs.delta.io/latest/delta-batch.html#write-to-a-table){ target="_blank" } and [streaming](https://docs.delta.io/latest/delta-streaming.html#delta-table-as-a-sink){ target="_blank" }.
destination (str): Either the name of the Hive Metastore or Unity Catalog Delta Table **or** the path to the Delta table
mode (str): Method of writing to Delta Table - append/overwrite (batch), append/complete (stream)
trigger (str): Frequency of the write operation
trigger (str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes"
query_name (str): Unique name for the query in associated SparkSession

Attributes:
@@ -119,12 +119,13 @@ def write_stream(self):
'''
Writes streaming data to Delta. Exactly-once processing is guaranteed
'''
TRIGGER_OPTION = {'availableNow': True} if self.trigger == "availableNow" else {'processingTime': self.trigger}
try:
if "/" in self.destination:
query = (
self.data
.writeStream
.trigger(processingTime=self.trigger)
.trigger(**TRIGGER_OPTION)
.format("delta")
.queryName(self.query_name)
.outputMode(self.mode)
@@ -135,7 +136,7 @@ def write_stream(self):
query = (
self.data
.writeStream
.trigger(processingTime=self.trigger)
.trigger(**TRIGGER_OPTION)
.format("delta")
.queryName(self.query_name)
.outputMode(self.mode)
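The `TRIGGER_OPTION` dictionary maps the single `trigger` string onto the two mutually exclusive keyword arguments accepted by `DataStreamWriter.trigger`. A minimal standalone sketch of the same pattern, assuming a streaming DataFrame `df`; the table, checkpoint path and interval are illustrative, and `availableNow` requires Spark 3.3 or later:

```python
trigger = "availableNow"  # or an interval such as "30 seconds"

# availableNow=True processes everything currently available and then stops;
# processingTime keeps the query running on the given interval.
trigger_option = {"availableNow": True} if trigger == "availableNow" else {"processingTime": trigger}

query = (df.writeStream
           .trigger(**trigger_option)
           .format("delta")
           .queryName("DeltaDestination")
           .outputMode("append")
           .option("checkpointLocation", "/tmp/checkpoints/delta_destination")
           .toTable("bronze.events"))
query.awaitTermination()
```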
@@ -48,7 +48,7 @@ class SparkDeltaMergeDestination(DestinationInterface):
when_not_matched_by_source_update_list (list[DeltaMergeConditionValues]): Conditions (optional) and values to be used when updating rows that do not match the `merge_condition`.
when_not_matched_by_source_delete_list (list[DeltaMergeCondition]): Conditions (optional) to be used when deleting rows that do not match the `merge_condition`.
try_broadcast_join (bool): Attempts to perform a broadcast join in the merge which can leverage data skipping using partition pruning and file pruning automatically. Can fail if dataframe being merged is large and therefore more suitable for streaming merges than batch merges
trigger (str): Frequency of the write operation
trigger (str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes"
query_name (str): Unique name for the query in associated SparkSession

Attributes:
@@ -226,11 +226,12 @@ def write_stream(self):
'''
Merges streaming data to Delta using foreachBatch
'''
TRIGGER_OPTION = {'availableNow': True} if self.trigger == "availableNow" else {'processingTime': self.trigger}
try:
query = (
self.data
.writeStream
.trigger(processingTime=self.trigger)
.trigger(**TRIGGER_OPTION)
.format("delta")
.foreachBatch(self._stream_merge_micro_batch)
.queryName(self.query_name)
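Here the same trigger selection feeds a `foreachBatch` merge. A rough illustration of what such a micro-batch upsert can look like, assuming an active `spark` session and a streaming DataFrame `df`; the function below is a hypothetical stand-in for the class's `_stream_merge_micro_batch`, and the table name and join key are placeholders:

```python
from delta.tables import DeltaTable

def merge_micro_batch(micro_batch_df, batch_id):
    # Upsert the micro-batch into the target Delta table on a key column.
    target = DeltaTable.forName(spark, "bronze.events")
    (target.alias("t")
           .merge(micro_batch_df.alias("s"), "t.id = s.id")
           .whenMatchedUpdateAll()
           .whenNotMatchedInsertAll()
           .execute())

trigger_option = {"availableNow": True}  # one-shot merge of the currently available data

(df.writeStream
   .trigger(**trigger_option)
   .format("delta")
   .foreachBatch(merge_micro_batch)
   .queryName("DeltaMergeDestination")
   .option("checkpointLocation", "/tmp/checkpoints/delta_merge")
   .start())
```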
@@ -29,7 +29,9 @@ class SparkEventhubDestination(DestinationInterface):
Args:
data (DataFrame): Dataframe to be written to Eventhub
options (dict): A dictionary of Eventhub configurations (See Attributes table below). All Configuration options for Eventhubs can be found [here.](https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md#event-hubs-configuration){ target="_blank" }

trigger (str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes"
query_name (str): Unique name for the query in associated SparkSession

Attributes:
checkpointLocation (str): Path to checkpoint files. (Streaming)
eventhubs.connectionString (str): Eventhubs connection string is required to connect to the Eventhubs service. (Streaming and Batch)
@@ -38,12 +40,12 @@ class SparkEventhubDestination(DestinationInterface):
eventhubs.endingPosition: (JSON str): The ending position of a batch query. This works the same as startingPosition. (Batch)
maxEventsPerTrigger (long): Rate limit on maximum number of events processed per trigger interval. The specified total number of events will be proportionally split across partitions of different volume. (Stream)
'''
data: DataFrame
options: dict

def __init__(self, data: DataFrame, options: dict) -> None:
def __init__(self, data: DataFrame, options: dict, trigger="10 seconds", query_name="EventhubDestination") -> None:
self.data = data
self.options = options
self.trigger = trigger
self.query_name = query_name

@staticmethod
def system_type():
@@ -94,11 +96,14 @@ def write_stream(self):
Writes streaming data to Eventhubs.
'''
try:
TRIGGER_OPTION = {'availableNow': True} if self.trigger == "availableNow" else {'processingTime': self.trigger}
query = (
self.data
.writeStream
.trigger(**TRIGGER_OPTION)
.format("eventhubs")
.options(**self.options)
.queryName(self.query_name)
.start()
)
while query.isActive:
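With `trigger` and `query_name` now exposed on the constructor, a caller can opt into a run-once write. A hedged usage sketch: the import path, connection string handling, checkpoint location and the upstream DataFrame `df` are assumptions, not values from this PR:

```python
from rtdip_sdk.pipelines.destinations import SparkEventhubDestination  # assumed import path

eventhub_options = {
    # The connector generally expects an encrypted connection string; a placeholder is shown.
    "eventhubs.connectionString": "{ENCRYPTED_EVENTHUB_CONNECTION_STRING}",
    "checkpointLocation": "/tmp/checkpoints/eventhub_destination",
}

destination = SparkEventhubDestination(
    data=df,                           # streaming DataFrame prepared upstream
    options=eventhub_options,
    trigger="availableNow",            # run once over the available data, or e.g. "10 seconds"
    query_name="EventhubDestination",  # name the query carries in the SparkSession
)
destination.write_stream()
```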
12 changes: 8 additions & 4 deletions src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kafka.py
@@ -33,6 +33,8 @@ class SparkKafkaDestination(DestinationInterface):
Args:
data (DataFrame): Dataframe to be written to Kafka
options (dict): A dictionary of Kafka configurations (See Attributes tables below). For more information on configuration options see [here](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html){ target="_blank" }
trigger (str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes"
query_name (str): Unique name for the query in associated SparkSession

The following options must be set for the Kafka destination for both batch and streaming queries.

@@ -46,12 +48,11 @@ class SparkKafkaDestination(DestinationInterface):
includeHeaders (bool): Whether to include the Kafka headers in the row. (Streaming and Batch)

'''
data: DataFrame
options: dict

def __init__(self, data: DataFrame, options: dict) -> None:
def __init__(self, data: DataFrame, options: dict, trigger="10 seconds", query_name="KafkaDestination") -> None:
self.data = data
self.options = options
self.trigger = trigger
self.query_name = query_name

@staticmethod
def system_type():
@@ -103,12 +104,15 @@ def write_stream(self):
Writes streaming data to Kafka.
'''
try:
TRIGGER_OPTION = {'availableNow': True} if self.trigger == "availableNow" else {'processingTime': self.trigger}
query = (
self.data
.select(to_json(struct("*")).alias("value"))
.writeStream
.trigger(**TRIGGER_OPTION)
.format("kafka")
.options(**self.options)
.queryName(self.query_name)
.start()
)
while query.isActive:
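The Kafka writer serialises every row into a JSON `value` column before handing it to the `kafka` sink. A standalone PySpark sketch of the same chain, assuming a streaming DataFrame `df`; the broker, topic and checkpoint values are placeholders:

```python
from pyspark.sql.functions import struct, to_json

kafka_options = {
    "kafka.bootstrap.servers": "broker-1:9092",    # placeholder
    "topic": "rtdip-events",                       # placeholder
    "checkpointLocation": "/tmp/checkpoints/kafka_destination",
}
trigger_option = {"processingTime": "30 seconds"}  # or {"availableNow": True}

query = (df.select(to_json(struct("*")).alias("value"))  # Kafka requires a value column
           .writeStream
           .trigger(**trigger_option)
           .format("kafka")
           .options(**kafka_options)
           .queryName("KafkaDestination")
           .start())
```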
14 changes: 7 additions & 7 deletions src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kinesis.py
@@ -26,23 +26,21 @@ class SparkKinesisDestination(DestinationInterface):
data (DataFrame): Dataframe to be written to Kinesis
options (dict): A dictionary of Kinesis configurations (See Attributes table below). All Configuration options for Kinesis can be found [here.](https://github.com/qubole/kinesis-sql#kinesis-sink-configuration){ target="_blank" }
mode (str): Method of writing to Kinesis - append, complete, update
trigger (str): Frequency of the write operation
trigger (str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes"
query_name (str): Unique name for the query in associated SparkSession

Attributes:
endpointUrl (str): Endpoint of the kinesis stream.
awsAccessKey (str): AWS access key.
awsSecretKey (str): AWS secret access key corresponding to the access key.
streamName (List[str]): Name of the streams in Kinesis to write to.
'''
options: dict
mode: str
trigger: str

def __init__(self, data: DataFrame, options: dict, mode:str = "update", trigger:str= "10 seconds") -> None:
def __init__(self, data: DataFrame, options: dict, mode:str = "update", trigger:str= "10 seconds", query_name="KinesisDestination") -> None:
self.data = data
self.options = options
self.mode = mode
self.trigger = trigger
self.query_name = query_name

@staticmethod
def system_type():
@@ -91,13 +89,15 @@ def write_stream(self):
Writes streaming data to Kinesis.
'''
try:
TRIGGER_OPTION = {'availableNow': True} if self.trigger == "availableNow" else {'processingTime': self.trigger}
query = (
self.data
.writeStream
.trigger(processingTime=self.trigger)
.trigger(**TRIGGER_OPTION)
.format("kinesis")
.outputMode(self.mode)
.options(**self.options)
.queryName(self.query_name)
.start()
)
while query.isActive:
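A hedged usage sketch for the updated Kinesis destination; the import path, endpoint, credentials, stream name and the DataFrame `df` are placeholders rather than values from this PR:

```python
from rtdip_sdk.pipelines.destinations import SparkKinesisDestination  # assumed import path

kinesis_options = {
    "endpointUrl": "https://kinesis.eu-west-1.amazonaws.com",  # placeholder
    "awsAccessKey": "{AWS_ACCESS_KEY}",
    "awsSecretKey": "{AWS_SECRET_KEY}",
    "streamName": "rtdip-stream",
}

destination = SparkKinesisDestination(
    data=df,
    options=kinesis_options,
    mode="update",
    trigger="availableNow",          # or an interval such as "10 seconds"
    query_name="KinesisDestination",
)
destination.write_stream()
```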
@@ -41,7 +41,7 @@ class SparkPCDMToDeltaDestination(DestinationInterface):
destination_string (str): Either the name of the Hive Metastore or Unity Catalog Delta Table **or** the path to the Delta table to store string values.
destination_integer (Optional str): Either the name of the Hive Metastore or Unity Catalog Delta Table **or** the path to the Delta table to store integer values
mode (str): Method of writing to Delta Table - append/overwrite (batch), append/complete (stream)
trigger (str): Frequency of the write operation
trigger (str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes"
query_name (str): Unique name for the query in associated SparkSession
merge (bool): Use Delta Merge to perform inserts, updates and deletes
try_broadcast_join (bool): Attempts to perform a broadcast join in the merge which can leverage data skipping using partition pruning and file pruning automatically. Can fail if dataframe being merged is large and therefore more suitable for streaming merges than batch merges
@@ -74,7 +74,7 @@ def __init__(self,
destination_integer: str = None,
mode: str = None,
trigger="10 seconds",
query_name: str ="PCDMToDeltaMergeDestination",
query_name: str ="PCDMToDeltaDestination",
merge: bool = True,
try_broadcast_join = False,
remove_nanoseconds: bool = False,
@@ -247,11 +247,12 @@ def write_stream(self):
Writes streaming Process Control Data Model data to Delta using foreachBatch
'''
try:
TRIGGER_OPTION = {'availableNow': True} if self.trigger == "availableNow" else {'processingTime': self.trigger}
if self.merge == True:
query = (
self.data
.writeStream
.trigger(processingTime=self.trigger)
.trigger(**TRIGGER_OPTION)
.format("delta")
.foreachBatch(self._write_stream_microbatches)
.queryName(self.query_name)
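The same `TRIGGER_OPTION` expression is now repeated verbatim in every destination's `write_stream`. As an observation rather than something this PR does, it could be factored into one shared helper:

```python
def get_trigger_option(trigger: str) -> dict:
    """Map the RTDIP trigger string onto DataStreamWriter.trigger keyword arguments."""
    return {"availableNow": True} if trigger == "availableNow" else {"processingTime": trigger}

# e.g. inside each write_stream:
#   .trigger(**get_trigger_option(self.trigger))
```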
@@ -44,7 +44,7 @@ class SparkRestAPIDestination(DestinationInterface):
batch_size (int): The number of DataFrame rows to be used in each Rest API call
method (str): The method to be used when calling the Rest API. Allowed values are POST, PATCH and PUT
parallelism (int): The number of concurrent calls to be made to the Rest API
trigger (str): Frequency of the write operation
trigger (str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes"
query_name (str): Unique name for the query in associated SparkSession

Attributes:
@@ -172,10 +172,11 @@ def write_stream(self):
Writes streaming data to a Rest API
'''
try:
TRIGGER_OPTION = {'availableNow': True} if self.trigger == "availableNow" else {'processingTime': self.trigger}
query = (
self.data
.writeStream
.trigger(processingTime=self.trigger)
.trigger(**TRIGGER_OPTION)
.foreachBatch(self._api_micro_batch)
.queryName(self.query_name)
.outputMode("update")
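For orientation, a rough sketch of a `foreachBatch` REST sender driven by the same trigger choice. This is a hypothetical illustration, not the class's actual `_api_micro_batch`; it assumes the `requests` package, a placeholder endpoint and a streaming DataFrame `df`:

```python
import requests

def post_micro_batch(micro_batch_df, batch_id):
    # Hypothetical sender: POST each row of the micro-batch as a JSON document.
    for row_json in micro_batch_df.toJSON().collect():
        requests.post("https://example.com/api/values", data=row_json,
                      headers={"Content-Type": "application/json"}, timeout=30)

(df.writeStream
   .trigger(availableNow=True)          # or processingTime="30 seconds"
   .foreachBatch(post_micro_batch)
   .queryName("RestAPIDestination")
   .outputMode("update")
   .option("checkpointLocation", "/tmp/checkpoints/rest_api_destination")
   .start())
```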
@@ -66,9 +66,9 @@ def post_read_validation(self, df: DataFrame):
def read_batch(self):
'''
Raises:
NotImplementedError: Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method of this component and specify the Trigger on the write_stream to be `availableNow=True` to perform batch-like reads of cloud storage files.
NotImplementedError: Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method of this component and specify the Trigger on the write_stream to be `availableNow` to perform batch-like reads of cloud storage files.
'''
raise NotImplementedError("Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method and specify Trigger on the write_stream as `availableNow=True`")
raise NotImplementedError("Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method and specify Trigger on the write_stream as `availableNow`")
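A hedged sketch of the batch-like pattern this error message points to: the Auto Loader read is still a stream, but an `availableNow` trigger on the write drains the files currently in storage and then stops. The `spark` session, paths and file format are placeholders:

```python
df = (spark.readStream
      .format("cloudFiles")                      # Databricks Auto Loader source
      .option("cloudFiles.format", "parquet")
      .load("/mnt/landing/sensor_data/"))

(df.writeStream
   .trigger(availableNow=True)                   # process available files, then stop
   .format("delta")
   .option("checkpointLocation", "/mnt/checkpoints/sensor_data/")
   .toTable("bronze.sensor_data"))
```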

def read_stream(self) -> DataFrame:
'''
@@ -69,6 +69,7 @@ def test_pipeline_job_deploy(mocker: MockerFixture):

mocker.patch("src.sdk.python.rtdip_sdk.pipelines.deploy.databricks.DatabricksSDKDeploy._load_module", return_value=DummyModule())
mocker.patch("src.sdk.python.rtdip_sdk.pipelines.deploy.databricks.DatabricksSDKDeploy._convert_file_to_binary", return_value=None)
mocker.patch("databricks.sdk.mixins.workspace.WorkspaceExt.mkdirs", return_value=None)
mocker.patch("databricks.sdk.mixins.workspace.WorkspaceExt.upload", return_value=None)
libraries = Libraries(pypi_libraries=[PyPiLibrary(name="rtdip-sdk", version=default_version)], maven_libraries=[MavenLibrary(group_id="rtdip", artifact_id="rtdip-sdk", version=default_version)], python_wheel_libraries=[PythonWheelLibrary(path="test_wheel.whl")])
mocker.patch("src.sdk.python.rtdip_sdk.pipelines.utilities.pipeline_components.PipelineComponentsGetUtility.execute", return_value=(libraries, {"config": "test_config"}))
@@ -114,6 +115,7 @@ def test_pipeline_job_deploy_fails(mocker: MockerFixture):

mocker.patch("src.sdk.python.rtdip_sdk.pipelines.deploy.databricks.DatabricksSDKDeploy._load_module", return_value=DummyModule())
mocker.patch("src.sdk.python.rtdip_sdk.pipelines.deploy.databricks.DatabricksSDKDeploy._convert_file_to_binary", return_value=None)
mocker.patch("databricks.sdk.mixins.workspace.WorkspaceExt.mkdirs", return_value=None)
mocker.patch("databricks.sdk.mixins.workspace.WorkspaceExt.upload", return_value=None)
libraries = Libraries(pypi_libraries=[PyPiLibrary(name="rtdip-sdk", version=default_version)], maven_libraries=[MavenLibrary(group_id="rtdip", artifact_id="rtdip-sdk", version=default_version)], python_wheel_libraries=[PythonWheelLibrary(path="test_wheel.whl")])
mocker.patch("src.sdk.python.rtdip_sdk.pipelines.utilities.pipeline_components.PipelineComponentsGetUtility.execute", return_value=(libraries, {"config": "test_config"}))
@@ -44,7 +44,7 @@ def test_spark_eventhub_write_batch(spark_session: SparkSession, mocker: MockerFixture):
assert actual is None

def test_spark_eventhub_write_stream(spark_session: SparkSession, mocker: MockerFixture):
mocker.patch("pyspark.sql.DataFrame.writeStream", new_callable=mocker.Mock(return_value=mocker.Mock(format=mocker.Mock(return_value=mocker.Mock(options=mocker.Mock(return_value=mocker.Mock(start=mocker.Mock(return_value=TestStreamingQueryClass()))))))))
mocker.patch("pyspark.sql.DataFrame.writeStream", new_callable=mocker.Mock(return_value=mocker.Mock(trigger=mocker.Mock(return_value=mocker.Mock(format=mocker.Mock(return_value=mocker.Mock(options=mocker.Mock(return_value=mocker.Mock(queryName=mocker.Mock(return_value=mocker.Mock(start=mocker.Mock(return_value=TestStreamingQueryClass()))))))))))))
expected_df = spark_session.createDataFrame([{"id": "1"}])
eventhub_destination = SparkEventhubDestination(expected_df, {})
actual = eventhub_destination.write_stream()
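Because `write_stream` now also calls `.trigger(...)` and `.queryName(...)`, the hand-built mock chain has to grow with every new builder step. A possible alternative, offered as an observation rather than part of this change, is to let `MagicMock` create the intermediate links and pin only the terminal `start()`:

```python
def test_spark_eventhub_write_stream_magicmock(spark_session: SparkSession, mocker: MockerFixture):
    # MagicMock auto-creates the trigger/format/options/queryName links in the chain.
    write_stream_mock = mocker.MagicMock()
    write_stream_mock.trigger.return_value.format.return_value.options.return_value \
        .queryName.return_value.start.return_value = TestStreamingQueryClass()
    mocker.patch("pyspark.sql.DataFrame.writeStream", write_stream_mock)
    expected_df = spark_session.createDataFrame([{"id": "1"}])
    eventhub_destination = SparkEventhubDestination(expected_df, {})
    assert eventhub_destination.write_stream() is None
```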
@@ -58,7 +58,7 @@ def test_spark_eventhub_write_batch_fails(spark_session: SparkSession, mocker: MockerFixture):
eventhub_destination.write_batch()

def test_spark_eventhub_write_stream_fails(spark_session: SparkSession, mocker: MockerFixture):
mocker.patch("pyspark.sql.DataFrame.writeStream", new_callable=mocker.Mock(return_value=mocker.Mock(format=mocker.Mock(return_value=mocker.Mock(options=mocker.Mock(return_value=mocker.Mock(start=mocker.Mock(side_effect=Exception))))))))
mocker.patch("pyspark.sql.DataFrame.writeStream", new_callable=mocker.Mock(return_value=mocker.Mock(trigger=mocker.Mock(return_value=mocker.Mock(format=mocker.Mock(return_value=mocker.Mock(options=mocker.Mock(return_value=mocker.Mock(queryName=mocker.Mock(return_value=mocker.Mock(start=mocker.Mock(side_effect=Exception))))))))))))
expected_df = spark_session.createDataFrame([{"id": "1"}])
eventhub_destination = SparkEventhubDestination(expected_df, {})
with pytest.raises(Exception):