11 changes: 8 additions & 3 deletions README.md
@@ -4,9 +4,14 @@

<div align="center">

| Branch | Workflow Status | Code Coverage | Vulnerabilities | Bugs | OpenSSF |
|--------|-----------------|---------------|----------|------|------|
| main | [![Main](https://github.com/rtdip/core/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/rtdip/core/actions/workflows/main.yml) | [![Coverage](https://sonarcloud.io/api/project_badges/measure?project=rtdip_core&metric=coverage&branch=main)](https://sonarcloud.io/summary/new_code?id=rtdip_core) | [![Vulnerabilities](https://sonarcloud.io/api/project_badges/measure?project=rtdip_core&metric=vulnerabilities&branch=main)](https://sonarcloud.io/summary/new_code?id=rtdip_core) | [![Bugs](https://sonarcloud.io/api/project_badges/measure?project=rtdip_core&metric=bugs&branch=main)](https://sonarcloud.io/summary/new_code?id=rtdip_core) | [![OpenSSF Best Practices](https://bestpractices.coreinfrastructure.org/projects/7557/badge)](https://bestpractices.coreinfrastructure.org/projects/7557) |
[![PyPI version](https://img.shields.io/pypi/v/rtdip-sdk.svg?logo=pypi&logoColor=FFE873)](https://pypi.org/project/rtdip-sdk/)
[![Supported Python versions](https://img.shields.io/pypi/pyversions/rtdip-sdk.svg?logo=python&logoColor=FFE873)](https://pypi.org/project/rtdip-sdk/)
[![PyPI downloads](https://img.shields.io/pypi/dm/rtdip-sdk.svg)](https://pypistats.org/packages/rtdip-sdk)
[![OpenSSF Best Practices](https://bestpractices.coreinfrastructure.org/projects/7557/badge)](https://bestpractices.coreinfrastructure.org/projects/7557)

| Branch | Workflow Status | Code Coverage | Vulnerabilities | Bugs |
|--------|-----------------|---------------|----------|------|
| main | [![Main](https://github.com/rtdip/core/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/rtdip/core/actions/workflows/main.yml) | [![Coverage](https://sonarcloud.io/api/project_badges/measure?project=rtdip_core&metric=coverage&branch=main)](https://sonarcloud.io/summary/new_code?id=rtdip_core) | [![Vulnerabilities](https://sonarcloud.io/api/project_badges/measure?project=rtdip_core&metric=vulnerabilities&branch=main)](https://sonarcloud.io/summary/new_code?id=rtdip_core) | [![Bugs](https://sonarcloud.io/api/project_badges/measure?project=rtdip_core&metric=bugs&branch=main)](https://sonarcloud.io/summary/new_code?id=rtdip_core) |
| develop | [![Develop](https://github.com/rtdip/core/actions/workflows/develop.yml/badge.svg)](https://github.com/rtdip/core/actions/workflows/develop.yml) | [![Coverage](https://sonarcloud.io/api/project_badges/measure?project=rtdip_core&metric=coverage&branch=develop)](https://sonarcloud.io/summary/new_code?id=rtdip_core) | [![Vulnerabilities](https://sonarcloud.io/api/project_badges/measure?project=rtdip_core&metric=vulnerabilities&branch=develop)](https://sonarcloud.io/summary/new_code?id=rtdip_core) | [![Bugs](https://sonarcloud.io/api/project_badges/measure?project=rtdip_core&metric=bugs&branch=develop)](https://sonarcloud.io/summary/new_code?id=rtdip_core) |
| feature | [![.github/workflows/pr.yml](https://github.com/rtdip/core/actions/workflows/pr.yml/badge.svg)](https://github.com/rtdip/core/actions/workflows/pr.yml) |

6 changes: 3 additions & 3 deletions docs/getting-started/installation.md
@@ -112,17 +112,17 @@ Features of the SDK can be installed using different extras statements when inst
=== "Queries"
    When installing the package only for querying data, simply run the following with your preferred Python package installer:

rtdip-sdk
pip install rtdip-sdk

=== "Pipelines"
RTDIP SDK can be installed to include the packages required to build, execute and deploy pipelines. Specify the following extra **[pipelines]** when installing RTDIP SDK so that the required python packages are included during installation.

rtdip-sdk[pipelines]
pip install "rtdip-sdk[pipelines]"

=== "Pipelines + Pyspark"
RTDIP SDK can also execute pyspark functions as a part of the pipelines functionality. Specify the following extra **[pipelines,pyspark]** when installing RTDIP SDK so that the required pyspark python packages are included during installation.

rtdip-sdk[pipelines,pyspark]
pip install "rtdip-sdk[pipelines,pyspark]"

!!! note "Java"
Ensure that Java is installed prior to installing the rtdip-sdk with the **[pipelines,pyspark]**. See [here](#java) for more information.
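A quick post-install sanity check is to confirm the installed distribution resolves. This is a minimal sketch using only the standard library; it is not part of the documented installation steps.

```python
# Minimal post-install check (illustrative): resolve the installed rtdip-sdk version.
from importlib.metadata import version

print(version("rtdip-sdk"))
```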
2 changes: 1 addition & 1 deletion docs/sdk/code-reference/query/interpolate.md
@@ -23,7 +23,7 @@ parameters = {
"time_interval_rate": "15", #numeric input
"time_interval_unit": "minute", #options: ["second", "minute", "day", "hour"]
"agg_method": "first", #options: ["first", "last", "avg", "min", "max"]
"interpolation_method": "forward_fill", #options: ["forward_fill", "backward_fill"]
"interpolation_method": "forward_fill", #options: ["forward_fill", "backward_fill", "linear"]
"include_bad_data": True, #options: [True, False]
}
x = interpolate.get(connection, parameters)
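With the `linear` option added in this change, the same call can request linear interpolation by switching the method in the parameter dictionary. A short sketch reusing the connection and parameters above:

```python
# Sketch: request linear interpolation with otherwise identical parameters.
parameters["interpolation_method"] = "linear"
x = interpolate.get(connection, parameters)
```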
4 changes: 2 additions & 2 deletions docs/sdk/queries/databricks/databricks-sql.md
@@ -159,9 +159,9 @@ parameters = {
"start_date": "2022-03-08", #start_date can be a date in the format "YYYY-MM-DD" or a datetime in the format "YYYY-MM-DDTHH:MM:SS"
"end_date": "2022-03-10", #end_date can be a date in the format "YYYY-MM-DD" or a datetime in the format "YYYY-MM-DDTHH:MM:SS"
"time_interval_rate": "1", #numeric input
"time_interval_unit": "hour", #options are second, minute, day, hour
"time_interval_unit": "hour", #options are second, minute, day or hour
"agg_method": "first", #options are first, last, avg, min, max
"interpolation_method": "forward_fill", #options are forward_fill or backward_fill
"interpolation_method": "forward_fill", #options are forward_fill, backward_fill or linear
"include_bad_data": True #boolean options are True or False
}

4 changes: 2 additions & 2 deletions environment.yml
@@ -29,7 +29,7 @@ dependencies:
- pip==23.1.2
- turbodbc==4.5.10
- numpy>=1.23.4
- pandas==1.5.2
- pandas>=2.0.1,<3.0.0
- oauthlib>=3.2.2
- cryptography>=38.0.3
- azure-identity==1.12.0
@@ -69,7 +69,7 @@ dependencies:
- azure-functions==1.15.0
- nest_asyncio==1.5.6
- hvac==1.1.1
- langchain==0.0.247
- build==0.10.0
- langchain>=0.0.239,<=0.0.247
- deltalake==0.10.1

4 changes: 2 additions & 2 deletions setup.py
@@ -32,15 +32,15 @@
"databricks-sql-connector==2.8.0",
"azure-identity==1.12.0",
"pyodbc==4.0.39",
"pandas==1.5.2",
"pandas>=2.0.1,<3.0.0",
"jinja2==3.1.2",
"importlib_metadata>=1.0.0",
"semver==3.0.0",
"xlrd==2.0.1",
"grpcio>=1.48.1",
"grpcio-status>=1.48.1",
"googleapis-common-protos>=1.56.4",
"langchain==0.0.247",
"langchain>=0.0.239,<=0.0.247",
"openai==0.27.8"
]

4 changes: 2 additions & 2 deletions src/api/requirements.txt
@@ -9,7 +9,7 @@ importlib_metadata>=1.0.0
databricks-sql-connector==2.8.0
azure-identity==1.12.0
oauthlib>=3.2.2
pandas==1.5.2
pandas>=2.0.1,<3.0.0
numpy==1.23.5
jinja2==3.1.2
pytz==2022.6
@@ -19,5 +19,5 @@ packaging==23.1
grpcio>=1.48.1
grpcio-status>=1.48.1
googleapis-common-protos>=1.56.4
langchain==0.0.230
langchain>=0.0.239,<=0.0.247
openai==0.27.8
2 changes: 1 addition & 1 deletion src/api/v1/models.py
@@ -151,7 +151,7 @@ def __init__(
class InterpolateQueryParams:
def __init__(
self,
interpolation_method: str = Query(..., description="Interpolation Method can be forward_fill or backward_fill", examples=["forward_fill", "backward_fill"]),
        interpolation_method: str = Query(..., description="Interpolation Method can be one of the following [forward_fill, backward_fill, linear]", examples=["forward_fill", "backward_fill", "linear"]),
):
self.interpolation_method = interpolation_method
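Illustrative only: a parameter class like this is typically consumed by FastAPI as a class dependency. The route path and handler below are hypothetical and are not part of the repository's API.

```python
# Hypothetical route showing how the query parameter class would be wired in.
from fastapi import Depends, FastAPI

app = FastAPI()

@app.get("/events/interpolate")  # hypothetical path
def interpolate_endpoint(params: InterpolateQueryParams = Depends()):
    # params.interpolation_method is a required query parameter documented as
    # forward_fill, backward_fill or linear
    return {"interpolation_method": params.interpolation_method}
```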

25 changes: 25 additions & 0 deletions src/sdk/python/rtdip_sdk/_sdk_utils/pandas.py
@@ -0,0 +1,25 @@
# Copyright 2022 RTDIP
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pandas import DataFrame
from .compare_versions import _package_version_meets_minimum

def _prepare_pandas_to_convert_to_spark(df: DataFrame) -> DataFrame:
    # pandas 2.0 removed DataFrame.iteritems, which Spark < 3.4.0 still calls when
    # converting a pandas DataFrame to Spark, so alias it back to DataFrame.items
    try:
        _package_version_meets_minimum("pyspark", "3.4.0")
    except Exception:
        df.iteritems = df.items

return df
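A minimal sketch of how this helper would be applied before converting a pandas DataFrame to Spark; the SparkSession and sample data below are illustrative, not part of the SDK.

```python
# Illustrative usage: patch the DataFrame before Spark converts it, so Spark
# versions below 3.4.0 that still call iteritems() keep working with pandas 2.x.
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

pdf = pd.DataFrame({"TagName": ["Sensor1"], "Value": [1.0]})
pdf = _prepare_pandas_to_convert_to_spark(pdf)
sdf = spark.createDataFrame(pdf)
```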
2 changes: 2 additions & 0 deletions src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
@@ -122,12 +122,14 @@ def deploy(self) -> Union[bool, ValueError]:
if task.notebook_task is not None:
module = self._load_module(task.task_key + "file_upload", task.notebook_task.notebook_path)
(task_libraries, spark_configuration) = PipelineComponentsGetUtility(module.__name__).execute()
workspace_client.workspace.mkdirs(path=self.workspace_directory)
path="{}/{}".format(self.workspace_directory, Path(task.notebook_task.notebook_path).name)
workspace_client.workspace.upload(path=path, overwrite=True, content=self._convert_file_to_binary(task.notebook_task.notebook_path))
task.notebook_task.notebook_path = path
else:
module = self._load_module(task.task_key + "file_upload", task.spark_python_task.python_file)
(task_libraries, spark_configuration) = PipelineComponentsGetUtility(module).execute()
workspace_client.workspace.mkdirs(path=self.workspace_directory)
path="{}/{}".format(self.workspace_directory, Path(task.spark_python_task.python_file).name)
workspace_client.workspace.upload(path=path, overwrite=True, content=self._convert_file_to_binary(task.spark_python_task.python_file))
task.spark_python_task.python_file = path
@@ -30,7 +30,7 @@ class SparkDeltaDestination(DestinationInterface):
options (dict): Options that can be specified for a Delta Table write operation (See Attributes table below). Further information on the options is available for [batch](https://docs.delta.io/latest/delta-batch.html#write-to-a-table){ target="_blank" } and [streaming](https://docs.delta.io/latest/delta-streaming.html#delta-table-as-a-sink){ target="_blank" }.
destination (str): Either the name of the Hive Metastore or Unity Catalog Delta Table **or** the path to the Delta table
mode (str): Method of writing to Delta Table - append/overwrite (batch), append/complete (stream)
trigger (str): Frequency of the write operation
trigger (str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes"
query_name (str): Unique name for the query in associated SparkSession

Attributes:
@@ -119,12 +119,13 @@ def write_stream(self):
'''
Writes streaming data to Delta. Exactly-once processing is guaranteed
'''
TRIGGER_OPTION = {'availableNow': True} if self.trigger == "availableNow" else {'processingTime': self.trigger}
try:
if "/" in self.destination:
query = (
self.data
.writeStream
.trigger(processingTime=self.trigger)
.trigger(**TRIGGER_OPTION)
.format("delta")
.queryName(self.query_name)
.outputMode(self.mode)
@@ -135,7 +136,7 @@ def write_stream(self):
query = (
self.data
.writeStream
.trigger(processingTime=self.trigger)
.trigger(**TRIGGER_OPTION)
.format("delta")
.queryName(self.query_name)
.outputMode(self.mode)
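A short usage sketch of the new trigger handling; the DataFrame, checkpoint path and table name below are placeholders.

```python
# Sketch: process all available data once with an availableNow trigger, or pass
# a period such as "30 seconds" to keep the original processingTime behaviour.
delta_destination = SparkDeltaDestination(
    data=df,  # placeholder streaming DataFrame
    options={"checkpointLocation": "/tmp/checkpoints/delta_example"},
    destination="example_catalog.example_schema.example_table",
    mode="append",
    trigger="availableNow",
    query_name="DeltaExampleQuery",
)
delta_destination.write_stream()
```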
@@ -48,7 +48,7 @@ class SparkDeltaMergeDestination(DestinationInterface):
when_not_matched_by_source_update_list (list[DeltaMergeConditionValues]): Conditions(optional) and values to be used when updating rows that do not match the `merge_condition`.
when_not_matched_by_source_delete_list (list[DeltaMergeCondition]): Conditions(optional) to be used when deleting rows that do not match the `merge_condition`.
try_broadcast_join (bool): Attempts to perform a broadcast join in the merge which can leverage data skipping using partition pruning and file pruning automatically. Can fail if dataframe being merged is large and therefore more suitable for streaming merges than batch merges
trigger (str): Frequency of the write operation
trigger (str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes"
query_name (str): Unique name for the query in associated SparkSession

Attributes:
@@ -226,11 +226,12 @@ def write_stream(self):
'''
Merges streaming data to Delta using foreachBatch
'''
TRIGGER_OPTION = {'availableNow': True} if self.trigger == "availableNow" else {'processingTime': self.trigger}
try:
query = (
self.data
.writeStream
.trigger(processingTime=self.trigger)
.trigger(**TRIGGER_OPTION)
.format("delta")
.foreachBatch(self._stream_merge_micro_batch)
.queryName(self.query_name)
@@ -29,7 +29,9 @@ class SparkEventhubDestination(DestinationInterface):
Args:
data (DataFrame): Dataframe to be written to Eventhub
options (dict): A dictionary of Eventhub configurations (See Attributes table below). All Configuration options for Eventhubs can be found [here.](https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md#event-hubs-configuration){ target="_blank" }

trigger (str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes"
query_name (str): Unique name for the query in associated SparkSession

Attributes:
checkpointLocation (str): Path to checkpoint files. (Streaming)
eventhubs.connectionString (str): Eventhubs connection string is required to connect to the Eventhubs service. (Streaming and Batch)
@@ -38,12 +40,12 @@
eventhubs.endingPosition: (JSON str): The ending position of a batch query. This works the same as startingPosition. (Batch)
maxEventsPerTrigger (long): Rate limit on maximum number of events processed per trigger interval. The specified total number of events will be proportionally split across partitions of different volume. (Stream)
'''
data: DataFrame
options: dict

def __init__(self, data: DataFrame, options: dict) -> None:
def __init__(self, data: DataFrame, options: dict, trigger="10 seconds", query_name="EventhubDestination") -> None:
self.data = data
self.options = options
self.trigger = trigger
self.query_name = query_name

@staticmethod
def system_type():
@@ -94,11 +96,14 @@ def write_stream(self):
        Writes streaming data to Eventhubs.
'''
try:
TRIGGER_OPTION = {'availableNow': True} if self.trigger == "availableNow" else {'processingTime': self.trigger}
query = (
self.data
.writeStream
.trigger(**TRIGGER_OPTION)
.format("eventhubs")
.options(**self.options)
.queryName(self.query_name)
.start()
)
while query.isActive:
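A usage sketch of the destination with its new trigger and query_name parameters; the connection string, checkpoint path and DataFrame are placeholders.

```python
# Sketch: stream a DataFrame to Eventhubs and drain all available data once.
eventhub_destination = SparkEventhubDestination(
    data=df,  # placeholder streaming DataFrame
    options={
        "eventhubs.connectionString": connection_string,  # placeholder
        "checkpointLocation": "/tmp/checkpoints/eventhub_example",
    },
    trigger="availableNow",
    query_name="EventhubExampleQuery",
)
eventhub_destination.write_stream()
```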
12 changes: 8 additions & 4 deletions src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kafka.py
@@ -33,6 +33,8 @@ class SparkKafkaDestination(DestinationInterface):
Args:
data (DataFrame): Dataframe to be written to Kafka
options (dict): A dictionary of Kafka configurations (See Attributes tables below). For more information on configuration options see [here](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html){ target="_blank" }
trigger (str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes"
query_name (str): Unique name for the query in associated SparkSession

The following options must be set for the Kafka destination for both batch and streaming queries.

@@ -46,12 +48,11 @@ class SparkKafkaDestination(DestinationInterface):
includeHeaders (bool): Whether to include the Kafka headers in the row. (Streaming and Batch)

'''
data: DataFrame
options: dict

def __init__(self, data: DataFrame, options: dict) -> None:
def __init__(self, data: DataFrame, options: dict, trigger="10 seconds", query_name="KafkaDestination") -> None:
self.data = data
self.options = options
self.trigger = trigger
self.query_name = query_name

@staticmethod
def system_type():
@@ -103,12 +104,15 @@ def write_stream(self):
        Writes streaming data to Kafka.
'''
try:
TRIGGER_OPTION = {'availableNow': True} if self.trigger == "availableNow" else {'processingTime': self.trigger}
query = (
self.data
.select(to_json(struct("*")).alias("value"))
.writeStream
.trigger(**TRIGGER_OPTION)
.format("kafka")
.options(**self.options)
.queryName(self.query_name)
.start()
)
while query.isActive:
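A usage sketch of the Kafka destination with the new parameters; the bootstrap servers, topic and DataFrame are placeholders. Note that the destination serialises all columns to a JSON value column itself, so the input DataFrame keeps its original schema.

```python
# Sketch: stream a DataFrame to Kafka with a 30 second processing-time trigger.
kafka_destination = SparkKafkaDestination(
    data=df,  # placeholder streaming DataFrame
    options={
        "kafka.bootstrap.servers": "host1:9092,host2:9092",  # placeholder
        "topic": "example_topic",                            # placeholder
        "checkpointLocation": "/tmp/checkpoints/kafka_example",
    },
    trigger="30 seconds",
    query_name="KafkaExampleQuery",
)
kafka_destination.write_stream()
```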
14 changes: 7 additions & 7 deletions src/sdk/python/rtdip_sdk/pipelines/destinations/spark/kinesis.py
@@ -26,23 +26,21 @@ class SparkKinesisDestination(DestinationInterface):
data (DataFrame): Dataframe to be written to Delta
options (dict): A dictionary of Kinesis configurations (See Attributes table below). All Configuration options for Kinesis can be found [here.](https://github.com/qubole/kinesis-sql#kinesis-sink-configuration){ target="_blank" }
mode (str): Method of writing to Kinesis - append, complete, update
trigger (str): Frequency of the write operation
trigger (str): Frequency of the write operation. Specify "availableNow" to execute a trigger once, otherwise specify a time period such as "30 seconds", "5 minutes"
query_name (str): Unique name for the query in associated SparkSession

Attributes:
endpointUrl (str): Endpoint of the kinesis stream.
awsAccessKey (str): AWS access key.
awsSecretKey (str): AWS secret access key corresponding to the access key.
streamName (List[str]): Name of the streams in Kinesis to write to.
'''
options: dict
mode: str
trigger: str

def __init__(self, data: DataFrame, options: dict, mode:str = "update", trigger:str= "10 seconds") -> None:
def __init__(self, data: DataFrame, options: dict, mode:str = "update", trigger:str= "10 seconds", query_name="KinesisDestination") -> None:
self.data = data
self.options = options
self.mode = mode
self.trigger = trigger
self.query_name = query_name

@staticmethod
def system_type():
@@ -91,13 +89,15 @@ def write_stream(self):
        Writes streaming data to Kinesis.
'''
try:
TRIGGER_OPTION = {'availableNow': True} if self.trigger == "availableNow" else {'processingTime': self.trigger}
query = (
self.data
.writeStream
.trigger(processingTime=self.trigger)
.trigger(**TRIGGER_OPTION)
.format("kinesis")
.outputMode(self.mode)
.options(**self.options)
.queryName(self.query_name)
.start()
)
while query.isActive:
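A usage sketch of the Kinesis destination with the new query_name parameter; the endpoint, credentials and stream name are placeholders.

```python
# Sketch: write a streaming DataFrame to Kinesis, draining available data once.
kinesis_destination = SparkKinesisDestination(
    data=df,  # placeholder streaming DataFrame
    options={
        "endpointUrl": "https://kinesis.us-east-1.amazonaws.com",  # placeholder
        "awsAccessKey": access_key,    # placeholder
        "awsSecretKey": secret_key,    # placeholder
        "streamName": "example_stream",
    },
    mode="update",
    trigger="availableNow",
    query_name="KinesisExampleQuery",
)
kinesis_destination.write_stream()
```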