Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
d9bb204
typo change
JamesKnBr Apr 12, 2023
a773ac7
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 3, 2023
32ea5d4
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 7, 2023
f76aea4
Add Honeywell transformers
JamesKnBr Aug 18, 2023
5085d58
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 18, 2023
7563b8a
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 21, 2023
5b57d0c
Merge remote-tracking branch 'upstream/develop' into hotfix/00455
JamesKnBr Aug 23, 2023
1e2aec5
update eventhub destination and pcdm-honeywell
JamesKnBr Aug 23, 2023
3562afd
add eventhub destination and pcdm-honeywell
JamesKnBr Aug 23, 2023
bc8e6f5
Merge branch 'hotfix/00455' into develop
JamesKnBr Aug 23, 2023
d28338e
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 30, 2023
1e3445f
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 6, 2023
18d1d01
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 8, 2023
6133317
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 11, 2023
5f65524
Kafka eventhub destination transformation changes
JamesKnBr Sep 12, 2023
3058751
remove imports
JamesKnBr Sep 12, 2023
79e0b09
change test
JamesKnBr Sep 12, 2023
ff6a8b0
remove imports
JamesKnBr Sep 12, 2023
462dbbf
changes to kafka eventhub destination & hw transformer fix
JamesKnBr Sep 14, 2023
265164c
Merge remote-tracking branch 'upstream/develop' into hotfix/00485
JamesKnBr Sep 14, 2023
0125284
fix code smell
JamesKnBr Sep 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,16 @@
import logging
from py4j.protocol import Py4JJavaError
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, struct, to_json
from pyspark.sql.functions import col, struct, to_json, array
from urllib.parse import urlparse
from pyspark.sql.types import StringType, BinaryType
from pyspark.sql.types import (
StringType,
BinaryType,
ArrayType,
IntegerType,
StructType,
StructField,
)
import time

from ..interfaces import DestinationInterface
Expand Down Expand Up @@ -161,10 +168,8 @@ def _connection_string_builder(self, properties: dict) -> str:
return connection_string

def _configure_options(self, options: dict) -> dict:
if "subscribe" not in options:
options["subscribe"] = self.connection_string_properties.get(
"eventhub_name"
)
if "topic" not in options:
options["topic"] = self.connection_string_properties.get("eventhub_name")

if "kafka.bootstrap.servers" not in options:
options["kafka.bootstrap.servers"] = (
Expand Down Expand Up @@ -205,27 +210,26 @@ def _configure_options(self, options: dict) -> dict:
return options

def _transform_to_eventhub_schema(self, df: DataFrame) -> DataFrame:
if "value" in df.columns:
if df.schema["value"].dataType not in [StringType(), BinaryType()]:
try:
df.withColumn("value", col("value").cast(StringType()))
except Exception as e:
raise ValueError(
"Couldn't convert 'value' column to string or binary type", e
)
else:
column_list = ["key", "headers", "topic", "partition"]
if "value" not in df.columns:
df = df.withColumn(
"value",
to_json(
struct(
[
col(column).alias(column)
for column in df.columns
if column not in ["key", "headers", "topic", "partition"]
if column not in column_list
]
)
),
)
if "headers" in df.columns and (
df.schema["headers"].dataType.elementType["key"].nullable == True
or df.schema["headers"].dataType.elementType["value"].nullable == True
):
raise ValueError("key and value in the headers column cannot be nullable")

return df.select(
[
column
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,8 @@ def transform(self) -> DataFrame:
lit(self.quality).alias("Quality"),
col("EventTime").alias("Time"),
col("Value").alias("Value"),
).alias("HistorySamples"),
),
),
).alias("HistorySamples"),
),
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@
)
from pyspark.sql import SparkSession
from pytest_mock import MockerFixture
from pyspark.sql.types import (
StructField,
StructType,
StringType,
IntegerType,
BinaryType,
ArrayType,
)


kafka_configuration_dict = {"failOnDataLoss": "true", "startingOffsets": "earliest"}
Expand Down Expand Up @@ -84,7 +92,14 @@ def test_spark_kafka_write_batch(spark_session: SparkSession, mocker: MockerFixt
kafka_configuration = kafka_configuration_dict
kafka_destination = SparkKafkaEventhubDestination(
spark=spark_session,
data=spark_session.createDataFrame([{"value": 1}]),
data=spark_session.createDataFrame(
[
{"value": 1},
{"key": 2},
{"topic": 3},
{"partition": "1"},
]
),
options=kafka_configuration,
connection_string=eventhub_connection_string,
consumer_group="test_consumer_group",
Expand Down Expand Up @@ -167,6 +182,55 @@ def test_spark_kafka_write_batch_fails(
kafka_destination.write_batch()


def test_spark_kafka_fails_on_converting_column_type(
    spark_session: SparkSession,
):
    """A 'headers' column whose array elements have nullable key/value fields
    must make _transform_to_eventhub_schema raise a ValueError.

    NOTE(review): despite the test's name, the behavior exercised is the
    nullable-headers validation, not a column-type conversion failure.
    """
    # Element type for the headers array: 'key' is nullable on purpose,
    # which is the condition the destination rejects.
    header_entry = StructType(
        [
            StructField("key", StringType(), True),
            StructField("value", StringType(), False),
        ]
    )
    schema = StructType(
        [
            StructField("value", IntegerType(), True),
            StructField("key", IntegerType(), True),
            StructField("headers", ArrayType(header_entry, False), True),
            StructField("topic", IntegerType(), True),
            StructField("partition", StringType(), True),
        ]
    )
    sample_row = {
        "value": 1,
        "key": 2,
        "headers": [{"key": "testKey", "value": "strValue"}],
        "topic": 3,
        "partition": "nonInt",
    }
    df = spark_session.createDataFrame([sample_row], schema=schema)
    destination = SparkKafkaEventhubDestination(
        spark=spark_session,
        data=df,
        options=kafka_configuration_dict,
        connection_string=eventhub_connection_string,
        consumer_group="test_consumer_group",
    )
    # The transformation must reject the schema before any write is attempted.
    with pytest.raises(ValueError) as excinfo:
        destination._transform_to_eventhub_schema(df)
    assert str(excinfo.value) == "key and value in the headers column cannot be nullable"


def test_spark_kafka_fails_on_invalid_connection_string_malformed(
spark_session: SparkSession,
):
Expand Down