
2 changes: 2 additions & 0 deletions docs/sdk/code-reference/pipelines/sources/spark/autoloader.md
@@ -0,0 +1,2 @@
# Read from Autoloader
::: src.sdk.python.rtdip_sdk.pipelines.sources.spark.autoloader
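
For orientation, a minimal usage sketch of the source this PR adds; the storage path, option values, and format below are illustrative placeholders, not values taken from the change:

```python
from pyspark.sql import SparkSession
from src.sdk.python.rtdip_sdk.pipelines.sources.spark.autoloader import DataBricksAutoLoaderSource

spark = SparkSession.builder.getOrCreate()

# Placeholder path and options -- substitute values for your environment
source = DataBricksAutoLoaderSource(
    spark=spark,
    options={"cloudFiles.useNotifications": "false"},  # any Auto Loader option can go here
    path="abfss://container@account.dfs.core.windows.net/raw/",
    format="parquet",
)

df = source.read_stream()  # returns a streaming DataFrame backed by cloudFiles
```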
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -128,6 +128,7 @@ nav:
          - Eventhub: sdk/code-reference/pipelines/sources/spark/eventhub.md
          - Delta: sdk/code-reference/pipelines/sources/spark/delta.md
          - Delta Sharing: sdk/code-reference/pipelines/sources/spark/delta_sharing.md
          - Autoloader: sdk/code-reference/pipelines/sources/spark/autoloader.md
      - Destinations:
        - Spark:
          - Eventhub: sdk/code-reference/pipelines/destinations/spark/eventhub.md
83 changes: 83 additions & 0 deletions src/sdk/python/rtdip_sdk/pipelines/sources/spark/autoloader.py
@@ -0,0 +1,83 @@
# Copyright 2022 RTDIP
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from pyspark.sql import DataFrame, SparkSession

from ..interfaces import SourceInterface
from ..._pipeline_utils.models import Libraries, MavenLibrary, SystemType
from ..._pipeline_utils.constants import DEFAULT_PACKAGES

class DataBricksAutoLoaderSource(SourceInterface):
    '''
    The Spark Auto Loader is used to read new data files as they arrive in cloud storage. Further information on Auto Loader is available [here](https://docs.databricks.com/ingestion/auto-loader/index.html).

    Args:
        spark: Spark Session required to read data from cloud storage
        options: Options that can be specified for configuring the Auto Loader. Further information on the available options is [here](https://docs.databricks.com/ingestion/auto-loader/options.html)
        path: The cloud storage path
        format: Specifies the file format to be read. Supported formats are listed [here](https://docs.databricks.com/ingestion/auto-loader/options.html#file-format-options)
    '''
    spark: SparkSession
    options: dict
    path: str

    def __init__(self, spark: SparkSession, options: dict, path: str, format: str) -> None:
        self.spark = spark
        self.options = options
        self.path = path
        # Auto Loader expects the file format via the cloudFiles.format option
        self.options["cloudFiles.format"] = format

    @staticmethod
    def system_type():
        return SystemType.PYSPARK_DATABRICKS

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_maven_library(DEFAULT_PACKAGES["spark_delta_core"])
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_read_validation(self):
        return True

    def post_read_validation(self, df: DataFrame):
        return True

    def read_batch(self):
        '''
        Raises:
            NotImplementedError: Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method of this component and specify the Trigger on the write_stream to be `availableNow=True` to perform batch-like reads of cloud storage files.
        '''
        raise NotImplementedError("Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method and specify Trigger on the write_stream as `availableNow=True`")

    def read_stream(self) -> DataFrame:
        '''
        Performs streaming reads of files in cloud storage.
        '''
        try:
            return (self.spark
                .readStream
                .format("cloudFiles")
                .options(**self.options)
                .load(self.path)
            )
        except Exception as e:
            # logging.exception already records the active traceback; passing
            # e.__traceback__ as a format argument would itself raise
            logging.exception("error with spark read stream auto loader function")
            raise e
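
The `read_batch` docstring above points at the batch-like pattern: stream the source and drain it with an `availableNow` trigger on the write side. A minimal sketch, assuming a `DataBricksAutoLoaderSource` instance named `source` as in the earlier example, placeholder checkpoint and output paths, and Spark 3.3+ (or a Databricks Runtime that supports `availableNow`):

```python
# Batch-like read: process all files currently in cloud storage, then stop.
df = source.read_stream()

(df.writeStream
    .format("delta")
    .option("checkpointLocation", "/tmp/checkpoints/autoloader")  # placeholder
    .trigger(availableNow=True)  # drain available files and terminate, batch-style
    .start("/tmp/output/autoloader")  # placeholder output path
    .awaitTermination())
```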
@@ -0,0 +1,37 @@
# Copyright 2022 RTDIP
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
sys.path.insert(0, '.')
import pytest
from pytest_mock import MockerFixture
from pyspark.sql import DataFrame, SparkSession
from src.sdk.python.rtdip_sdk.pipelines.sources.spark.autoloader import DataBricksAutoLoaderSource
from tests.sdk.python.rtdip_sdk.pipelines._pipeline_utils.spark_configuration_constants import spark_session

def test_databricks_autoloader_read_batch(spark_session: SparkSession):
    with pytest.raises(NotImplementedError) as excinfo:
        autoloader_source = DataBricksAutoLoaderSource(spark_session, {}, "/path", "parquet")
        autoloader_source.read_batch()
    assert str(excinfo.value) == 'Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method and specify Trigger on the write_stream as `availableNow=True`'

def test_databricks_autoloader_read_stream(spark_session: SparkSession, mocker: MockerFixture):
    autoloader_source = DataBricksAutoLoaderSource(spark_session, {}, "/path", "parquet")
    expected_df = spark_session.createDataFrame([{"a": "x"}])

    # Build the readStream.format(...).options(...).load(...) chain so that
    # load() returns the expected DataFrame.
    mock_load = mocker.Mock(load=mocker.Mock(return_value=expected_df))
    mock_options = mocker.Mock(options=mocker.Mock(return_value=mock_load))
    mock_read_stream = mocker.Mock(format=mocker.Mock(return_value=mock_options))
    mocker.patch.object(autoloader_source, "spark", mocker.Mock(readStream=mock_read_stream))

    assert autoloader_source.pre_read_validation()
    df = autoloader_source.read_stream()
    assert isinstance(df, DataFrame)
    assert autoloader_source.post_read_validation(df)
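
For comparison, the same `readStream.format(...).options(...).load(...)` chain can be stubbed more flatly using `MagicMock`'s auto-created child attributes; a hypothetical equivalent test, not part of this PR:

```python
def test_databricks_autoloader_read_stream_flat_mock(spark_session: SparkSession, mocker: MockerFixture):
    autoloader_source = DataBricksAutoLoaderSource(spark_session, {}, "/path", "parquet")
    expected_df = spark_session.createDataFrame([{"a": "x"}])

    # MagicMock creates child mocks on attribute access, so the whole chain
    # can be configured by assigning the terminal return value.
    mock_spark = mocker.MagicMock()
    mock_spark.readStream.format.return_value.options.return_value.load.return_value = expected_df
    autoloader_source.spark = mock_spark

    assert autoloader_source.read_stream() is expected_df
```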