
2 changes: 2 additions & 0 deletions docs/sdk/code-reference/pipelines/sources/spark/autoloader.md
@@ -0,0 +1,2 @@
# Read from Autoloader
::: src.sdk.python.rtdip_sdk.pipelines.sources.spark.autoloader
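
For orientation, a minimal usage sketch of the source this PR adds; the storage path, option values, and format below are illustrative placeholders, not values taken from the change:

```python
from pyspark.sql import SparkSession
from src.sdk.python.rtdip_sdk.pipelines.sources.spark.autoloader import DataBricksAutoLoaderSource

spark = SparkSession.builder.getOrCreate()

# Placeholder path and options -- substitute values for your environment
source = DataBricksAutoLoaderSource(
    spark=spark,
    options={"cloudFiles.useNotifications": "false"},  # any Auto Loader option can go here
    path="abfss://container@account.dfs.core.windows.net/raw/",
    format="parquet",
)

df = source.read_stream()  # returns a streaming DataFrame backed by cloudFiles
```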
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -128,6 +128,7 @@ nav:
          - Eventhub: sdk/code-reference/pipelines/sources/spark/eventhub.md
          - Delta: sdk/code-reference/pipelines/sources/spark/delta.md
          - Delta Sharing: sdk/code-reference/pipelines/sources/spark/delta_sharing.md
          - Autoloader: sdk/code-reference/pipelines/sources/spark/autoloader.md
      - Destinations:
        - Spark:
          - Eventhub: sdk/code-reference/pipelines/destinations/spark/eventhub.md
83 changes: 83 additions & 0 deletions src/sdk/python/rtdip_sdk/pipelines/sources/spark/autoloader.py
@@ -0,0 +1,83 @@
# Copyright 2022 RTDIP
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from pyspark.sql import DataFrame, SparkSession

from ..interfaces import SourceInterface
from ..._pipeline_utils.models import Libraries, MavenLibrary, SystemType
from ..._pipeline_utils.constants import DEFAULT_PACKAGES

class DataBricksAutoLoaderSource(SourceInterface):
    '''
    The Spark Auto Loader is used to read new data files as they arrive in cloud storage. Further information on Auto Loader is available [here](https://docs.databricks.com/ingestion/auto-loader/index.html).

    Args:
        spark: Spark Session required to read data from cloud storage
        options: Options that can be specified for configuring the Auto Loader. Further information on the available options is [here](https://docs.databricks.com/ingestion/auto-loader/options.html)
        path: The cloud storage path
        format: Specifies the file format to be read. Supported formats are listed [here](https://docs.databricks.com/ingestion/auto-loader/options.html#file-format-options)
    '''
    spark: SparkSession
    options: dict
    path: str

    def __init__(self, spark: SparkSession, options: dict, path: str, format: str) -> None:
        self.spark = spark
        self.options = options
        self.path = path
        # Auto Loader expects the file format via the cloudFiles.format option
        self.options["cloudFiles.format"] = format

    @staticmethod
    def system_type():
        return SystemType.PYSPARK_DATABRICKS

    @staticmethod
    def libraries():
        libraries = Libraries()
        libraries.add_maven_library(DEFAULT_PACKAGES["spark_delta_core"])
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_read_validation(self):
        return True

    def post_read_validation(self, df: DataFrame):
        return True

    def read_batch(self):
        '''
        Raises:
            NotImplementedError: Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method of this component and specify the Trigger on the write_stream to be `availableNow=True` to perform batch-like reads of cloud storage files.
        '''
        raise NotImplementedError("Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method and specify Trigger on the write_stream as `availableNow=True`")

    def read_stream(self) -> DataFrame:
        '''
        Performs streaming reads of files in cloud storage.
        '''
        try:
            return (self.spark
                .readStream
                .format("cloudFiles")
                .options(**self.options)
                .load(self.path)
            )
        except Exception as e:
            # logging.exception already records the active traceback; passing
            # e.__traceback__ as a format argument would itself raise
            logging.exception("error with spark read stream auto loader function")
            raise e
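
The `read_batch` docstring above points at the batch-like pattern: stream the source and drain it with an `availableNow` trigger on the write side. A minimal sketch, assuming a `DataBricksAutoLoaderSource` instance named `source` as in the earlier example, placeholder checkpoint and output paths, and Spark 3.3+ (or a Databricks Runtime that supports `availableNow`):

```python
# Batch-like read: process all files currently in cloud storage, then stop.
df = source.read_stream()

(df.writeStream
    .format("delta")
    .option("checkpointLocation", "/tmp/checkpoints/autoloader")  # placeholder
    .trigger(availableNow=True)  # drain available files and terminate, batch-style
    .start("/tmp/output/autoloader")  # placeholder output path
    .awaitTermination())
```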
@@ -0,0 +1,37 @@
# Copyright 2022 RTDIP
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
sys.path.insert(0, '.')
import pytest
from pytest_mock import MockerFixture
from pyspark.sql import DataFrame, SparkSession
from src.sdk.python.rtdip_sdk.pipelines.sources.spark.autoloader import DataBricksAutoLoaderSource
from tests.sdk.python.rtdip_sdk.pipelines._pipeline_utils.spark_configuration_constants import spark_session

def test_databricks_autoloader_read_batch(spark_session: SparkSession):
    with pytest.raises(NotImplementedError) as excinfo:
        autoloader_source = DataBricksAutoLoaderSource(spark_session, {}, "/path", "parquet")
        autoloader_source.read_batch()
    assert str(excinfo.value) == 'Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method and specify Trigger on the write_stream as `availableNow=True`'

def test_databricks_autoloader_read_stream(spark_session: SparkSession, mocker: MockerFixture):
    autoloader_source = DataBricksAutoLoaderSource(spark_session, {}, "/path", "parquet")
    expected_df = spark_session.createDataFrame([{"a": "x"}])

    # Build the readStream.format(...).options(...).load(...) chain so that
    # load() returns the expected DataFrame.
    mock_load = mocker.Mock(load=mocker.Mock(return_value=expected_df))
    mock_options = mocker.Mock(options=mocker.Mock(return_value=mock_load))
    mock_read_stream = mocker.Mock(format=mocker.Mock(return_value=mock_options))
    mocker.patch.object(autoloader_source, "spark", mocker.Mock(readStream=mock_read_stream))

    assert autoloader_source.pre_read_validation()
    df = autoloader_source.read_stream()
    assert isinstance(df, DataFrame)
    assert autoloader_source.post_read_validation(df)
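
For comparison, the same `readStream.format(...).options(...).load(...)` chain can be stubbed more flatly using `MagicMock`'s auto-created child attributes; a hypothetical equivalent test, not part of this PR:

```python
def test_databricks_autoloader_read_stream_flat_mock(spark_session: SparkSession, mocker: MockerFixture):
    autoloader_source = DataBricksAutoLoaderSource(spark_session, {}, "/path", "parquet")
    expected_df = spark_session.createDataFrame([{"a": "x"}])

    # MagicMock creates child mocks on attribute access, so the whole chain
    # can be configured by assigning the terminal return value.
    mock_spark = mocker.MagicMock()
    mock_spark.readStream.format.return_value.options.return_value.load.return_value = expected_df
    autoloader_source.spark = mock_spark

    assert autoloader_source.read_stream() is expected_df
```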