6 changes: 5 additions & 1 deletion src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py
@@ -143,9 +143,13 @@ def get_dbutils(
 KAFKA_EVENTHUB_SCHEMA = StructType(
     [StructField('body', BinaryType(), True),
      StructField('partition', StringType(), True),
      StructField('offset', StringType(), True),
      StructField('sequenceNumber', LongType(), True),
      StructField('enqueuedTime', TimestampType(), True),
-     StructField('properties', MapType(StringType(), StringType(), True), True)]
+     StructField('publisher', StringType(), True),
+     StructField('partitionKey', StringType(), True),
+     StructField('properties', MapType(StringType(), StringType(), True), True),
+     StructField('systemProperties', MapType(StringType(), StringType(), True), True)],
 )

 KINESIS_SCHEMA = StructType(
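For context, a minimal sketch (not part of this diff) that builds an empty DataFrame against the updated `KAFKA_EVENTHUB_SCHEMA` to show the resulting column set; the local SparkSession setup is illustrative only:

```python
# Sketch only: verify the shape of the updated KAFKA_EVENTHUB_SCHEMA.
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField, BinaryType, StringType,
                               LongType, TimestampType, MapType)

KAFKA_EVENTHUB_SCHEMA = StructType(
    [StructField('body', BinaryType(), True),
     StructField('partition', StringType(), True),
     StructField('offset', StringType(), True),
     StructField('sequenceNumber', LongType(), True),
     StructField('enqueuedTime', TimestampType(), True),
     StructField('publisher', StringType(), True),
     StructField('partitionKey', StringType(), True),
     StructField('properties', MapType(StringType(), StringType(), True), True),
     StructField('systemProperties', MapType(StringType(), StringType(), True), True)],
)

spark = SparkSession.builder.master("local[1]").getOrCreate()  # assumed local session
df = spark.createDataFrame([], schema=KAFKA_EVENTHUB_SCHEMA)
df.printSchema()  # now includes publisher, partitionKey and systemProperties
```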
45 changes: 39 additions & 6 deletions src/sdk/python/rtdip_sdk/pipelines/sources/spark/kafka_eventhub.py
@@ -16,8 +16,7 @@
 import logging
 from py4j.protocol import Py4JJavaError
 from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql.functions import col, map_from_entries, udf
-from pyspark.sql.types import MapType, StringType
+from pyspark.sql.functions import col, map_from_entries, map_filter
 from urllib.parse import urlparse

 from ..interfaces import SourceInterface
@@ -26,17 +25,35 @@
 from ..._pipeline_utils.constants import get_default_package
 from ..._pipeline_utils.amqp import decode_kafka_headers_to_amqp_properties

+eventhub_system_properties = [
+    "x-opt-enqueued-time",
+    "x-opt-sequence-number",
+    "x-opt-offset",
+    "x-opt-publisher",
+    "x-opt-partition-key",
+    "message-id",
+    "iothub-enqueuedtime",
+    "user-id",
+    "iothub-connection-device-id",
+    "iothub-connection-module-id",
+    "iothub-connection-auth-generation-id",
+    "iothub-connection-auth-method",
+    "iothub-app-iothub-creation-time-utc",
+    "iothub-creation-time-utc",
+    "dt-dataschema",
+    "dt-subject"
+]

 class SparkKafkaEventhubSource(SourceInterface):
     '''
     This Spark source class is used to read batch or streaming data from an Eventhub using the Kafka protocol. This enables Eventhubs to be used as a source in applications like Delta Live Tables or Databricks Serverless Jobs, as the Spark Eventhubs JAR is not supported in these scenarios.

     The dataframe returned is transformed to ensure the schema is as close to the Eventhub Spark source as possible. There are some minor differences:

-    - `offset` is not included in the Kafka source and therefore is not available in the returned Dataframe
-    - `publisher` is not included in the Kafka source and therefore is not available in the returned Dataframe
-    - `partitionKey` is not included in the Kafka source and therefore is not available in the returned Dataframe
-    - `systemProperties` and `properties` are merged in `properties` in the returned Dataframe as Kafka Headers returns them all in the same column with no way to differentiate between them
+    - `offset` is dependent on `x-opt-offset` being populated in the headers provided. If this is not found in the headers, the value will be null
+    - `publisher` is dependent on `x-opt-publisher` being populated in the headers provided. If this is not found in the headers, the value will be null
+    - `partitionKey` is dependent on `x-opt-partition-key` being populated in the headers provided. If this is not found in the headers, the value will be null
+    - `systemProperties` are identified according to the list provided in the [Eventhub documentation](https://learn.microsoft.com/en-us/azure/data-explorer/ingest-data-event-hub-overview#event-system-properties-mapping){ target="_blank" } and [IoT Hub documentation](https://learn.microsoft.com/en-us/azure/data-explorer/ingest-data-iot-hub-overview#event-system-properties-mapping){ target="_blank" }

     Default settings will be specified if not provided in the `options` parameter:
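To make the split between user and system properties concrete, here is a minimal sketch (not part of this diff) of how `map_filter` and the `eventhub_system_properties` list separate a single headers-derived map; the toy DataFrame and the abbreviated list are assumptions for illustration:

```python
# Sketch only: split one map column into user properties and system properties.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, map_filter

# Abbreviated stand-in for the full eventhub_system_properties list above.
eventhub_system_properties = ["x-opt-offset", "x-opt-publisher", "x-opt-partition-key"]

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [({"x-opt-offset": "42", "custom-header": "abc"},)],
    "properties map<string,string>",
)

split_df = (
    df
    .withColumn("systemProperties",
                map_filter(col("properties"), lambda k, _: k.isin(eventhub_system_properties)))
    .withColumn("properties",
                map_filter(col("properties"), lambda k, _: ~k.isin(eventhub_system_properties)))
)
split_df.show(truncate=False)
# systemProperties: {x-opt-offset -> 42}; properties: {custom-header -> abc}
```

Note that the Python lambda form of `map_filter` requires Spark 3.1 or later.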

@@ -226,6 +243,22 @@ def _transform_to_eventhub_schema(self, df: DataFrame) -> DataFrame:
col("timestamp").alias("enqueuedTime"),
decode_kafka_headers_to_amqp_properties(col("headers")).alias("properties")
)
.withColumn("offset", col("properties").getItem("x-opt-offset"))
.withColumn("publisher", col("properties").getItem("x-opt-publisher"))
.withColumn("partitionKey", col("properties").getItem("x-opt-partition-key"))
.withColumn("systemProperties", map_filter(col("properties"), lambda k, _: k.isin(eventhub_system_properties)))
.withColumn("properties", map_filter(col("properties"), lambda k, _: ~k.isin(eventhub_system_properties)))
.select(
col("body"),
col("partition"),
col("offset"),
col("sequenceNumber"),
col("enqueuedTime"),
col("publisher"),
col("partitionKey"),
col("properties"),
col("systemProperties")
)
)

def read_batch(self) -> DataFrame:
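One consequence of the `getItem` lookups above, sketched here with a toy DataFrame (not part of this diff): a missing header key yields null, which is why `offset`, `publisher` and `partitionKey` are nullable in the returned Dataframe:

```python
# Sketch only: getItem on a map column returns null for absent keys.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [({"x-opt-offset": "42"},), ({"custom-header": "abc"},)],
    "properties map<string,string>",
)

df.select(
    col("properties").getItem("x-opt-offset").alias("offset"),
    col("properties").getItem("x-opt-publisher").alias("publisher"),
).show()
# offset is "42" then null; publisher is null in both rows (key never present)
```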