From 5ab4f53222fb6f3bc35e32938d90ea8db3aef2ed Mon Sep 17 00:00:00 2001
From: GBBBAS <42962356+GBBBAS@users.noreply.github.com>
Date: Mon, 14 Aug 2023 08:33:19 +0100
Subject: [PATCH] Additional fields for Eventhub Kafka component

Signed-off-by: GBBBAS <42962356+GBBBAS@users.noreply.github.com>
---
 .../pipelines/_pipeline_utils/spark.py        |  6 ++-
 .../pipelines/sources/spark/kafka_eventhub.py | 45 ++++++++++++++++---
 2 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py b/src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py
index 618171cb0..9dadbbb67 100644
--- a/src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py
@@ -143,9 +143,13 @@ def get_dbutils(
 KAFKA_EVENTHUB_SCHEMA = StructType(
     [StructField('body', BinaryType(), True),
      StructField('partition', StringType(), True),
+     StructField('offset', StringType(), True),
      StructField('sequenceNumber', LongType(), True),
      StructField('enqueuedTime', TimestampType(), True),
-     StructField('properties', MapType(StringType(), StringType(), True), True)]
+     StructField('publisher', StringType(), True),
+     StructField('partitionKey', StringType(), True),
+     StructField('properties', MapType(StringType(), StringType(), True), True),
+     StructField('systemProperties', MapType(StringType(), StringType(), True), True)],
 )
 
 KINESIS_SCHEMA = StructType(
diff --git a/src/sdk/python/rtdip_sdk/pipelines/sources/spark/kafka_eventhub.py b/src/sdk/python/rtdip_sdk/pipelines/sources/spark/kafka_eventhub.py
index e61d67491..ff5242d28 100644
--- a/src/sdk/python/rtdip_sdk/pipelines/sources/spark/kafka_eventhub.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/sources/spark/kafka_eventhub.py
@@ -16,8 +16,7 @@
 import logging
 from py4j.protocol import Py4JJavaError
 from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql.functions import col, map_from_entries, udf
-from pyspark.sql.types import MapType, StringType
+from pyspark.sql.functions import col, map_from_entries, map_filter
 from urllib.parse import urlparse
 
 from ..interfaces import SourceInterface
@@ -26,6 +25,24 @@
 from ..._pipeline_utils.constants import get_default_package
 from ..._pipeline_utils.amqp import decode_kafka_headers_to_amqp_properties
 
+eventhub_system_properties = [
+    "x-opt-enqueued-time",
+    "x-opt-sequence-number",
+    "x-opt-offset",
+    "x-opt-publisher",
+    "x-opt-partition-key",
+    "message-id",
+    "iothub-enqueuedtime",
+    "user-id",
+    "iothub-connection-device-id",
+    "iothub-connection-module-id",
+    "iothub-connection-auth-generation-id",
+    "iothub-connection-auth-method",
+    "iothub-app-iothub-creation-time-utc",
+    "iothub-creation-time-utc",
+    "dt-dataschema",
+    "dt-subject"
+]
 
 class SparkKafkaEventhubSource(SourceInterface):
     '''
@@ -33,10 +50,10 @@ class SparkKafkaEventhubSource(SourceInterface):
     The dataframe returned is transformed to ensure the schema is as close to the Eventhub Spark source as possible.
 
     There are some minor differences:
-    - `offset` is not included in the Kafka source and therefore is not available in the returned Dataframe
-    - `publisher` is not included in the Kafka source and therefore is not available in the returned Dataframe
-    - `partitionKey` is not included in the Kafka source and therefore is not available in the returned Dataframe
-    - `systemProperties` and `properties` are merged in `properties` in the returned Dataframe as Kafka Headers returns them all in the same column with no way to differentiate between them
+    - `offset` depends on the `x-opt-offset` header being populated. If it is not found in the headers, the value will be null
+    - `publisher` depends on the `x-opt-publisher` header being populated. If it is not found in the headers, the value will be null
+    - `partitionKey` depends on the `x-opt-partition-key` header being populated. If it is not found in the headers, the value will be null
+    - `systemProperties` are identified according to the lists provided in the [Eventhub documentation](https://learn.microsoft.com/en-us/azure/data-explorer/ingest-data-event-hub-overview#event-system-properties-mapping){ target="_blank" } and the [IoT Hub documentation](https://learn.microsoft.com/en-us/azure/data-explorer/ingest-data-iot-hub-overview#event-system-properties-mapping){ target="_blank" }
 
     Default settings will be specified if not provided in the `options` parameter:
 
@@ -226,6 +243,22 @@ def _transform_to_eventhub_schema(self, df: DataFrame) -> DataFrame:
                 col("timestamp").alias("enqueuedTime"),
                 decode_kafka_headers_to_amqp_properties(col("headers")).alias("properties")
             )
+            .withColumn("offset", col("properties").getItem("x-opt-offset"))
+            .withColumn("publisher", col("properties").getItem("x-opt-publisher"))
+            .withColumn("partitionKey", col("properties").getItem("x-opt-partition-key"))
+            .withColumn("systemProperties", map_filter(col("properties"), lambda k, _: k.isin(eventhub_system_properties)))
+            .withColumn("properties", map_filter(col("properties"), lambda k, _: ~k.isin(eventhub_system_properties)))
+            .select(
+                col("body"),
+                col("partition"),
+                col("offset"),
+                col("sequenceNumber"),
+                col("enqueuedTime"),
+                col("publisher"),
+                col("partitionKey"),
+                col("properties"),
+                col("systemProperties")
+            )
         )
 
     def read_batch(self) -> DataFrame:
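Reviewer note: the split between `properties` and `systemProperties` is the core of this change, so here is a minimal, self-contained PySpark sketch of the same `map_filter` technique applied to hand-built data (requires Spark 3.1+ for the Python `map_filter` API). The sample row and the abbreviated system-properties list are hypothetical; only the column names and the filtering logic mirror the patch.

```python
# Standalone sketch (hypothetical data) of the header-splitting logic added in
# _transform_to_eventhub_schema: known Eventhub/IoT Hub system keys are moved
# to `systemProperties`, everything else stays in `properties`.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, map_filter

# Abbreviated for the example; the patch defines the full list.
eventhub_system_properties = ["x-opt-offset", "x-opt-publisher", "x-opt-partition-key"]

spark = SparkSession.builder.master("local[1]").appName("header-split-demo").getOrCreate()

# One row whose `properties` map mixes system headers with a user property.
df = spark.createDataFrame(
    [({"x-opt-offset": "4960", "x-opt-partition-key": "device-1", "app-custom": "abc"},)],
    "properties map<string,string>",
)

result = (
    df
    # Promote individual system headers to top-level columns; a missing key yields null.
    .withColumn("offset", col("properties").getItem("x-opt-offset"))
    .withColumn("publisher", col("properties").getItem("x-opt-publisher"))
    .withColumn("partitionKey", col("properties").getItem("x-opt-partition-key"))
    # Split the map: system keys into one column, user properties into the other.
    .withColumn("systemProperties", map_filter(col("properties"), lambda k, _: k.isin(eventhub_system_properties)))
    .withColumn("properties", map_filter(col("properties"), lambda k, _: ~k.isin(eventhub_system_properties)))
)

result.show(truncate=False)
# offset=4960, publisher=null, partitionKey=device-1,
# systemProperties={x-opt-offset -> 4960, x-opt-partition-key -> device-1},
# properties={app-custom -> abc}
```

Using the built-in `map_filter` higher-order function, rather than the Python `udf` this patch drops from the imports, keeps the filtering inside the JVM and avoids per-row Python serialization overhead.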