Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
d9bb204
typo change
JamesKnBr Apr 12, 2023
a773ac7
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 3, 2023
32ea5d4
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 7, 2023
f76aea4
Add Honeywell transformers
JamesKnBr Aug 18, 2023
5085d58
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 18, 2023
7563b8a
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 21, 2023
5b57d0c
Merge remote-tracking branch 'upstream/develop' into hotfix/00455
JamesKnBr Aug 23, 2023
1e2aec5
update eventhub destination and pcdm-honeywell
JamesKnBr Aug 23, 2023
3562afd
add eventhub destination and pcdm-honeywell
JamesKnBr Aug 23, 2023
bc8e6f5
Merge branch 'hotfix/00455' into develop
JamesKnBr Aug 23, 2023
d28338e
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Aug 30, 2023
1e3445f
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 6, 2023
18d1d01
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 8, 2023
6133317
Merge remote-tracking branch 'upstream/develop' into develop
JamesKnBr Sep 11, 2023
5f65524
Kafka eventhub destination transformation changes
JamesKnBr Sep 12, 2023
3058751
remove imports
JamesKnBr Sep 12, 2023
79e0b09
change test
JamesKnBr Sep 12, 2023
ff6a8b0
remove imports
JamesKnBr Sep 12, 2023
462dbbf
changes to kafka eventhub destination & hw transformer fix
JamesKnBr Sep 14, 2023
265164c
Merge remote-tracking branch 'upstream/develop' into hotfix/00485
JamesKnBr Sep 14, 2023
0125284
fix code smell
JamesKnBr Sep 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,16 @@
import logging
from py4j.protocol import Py4JJavaError
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, struct, to_json
from pyspark.sql.functions import col, struct, to_json, array
from urllib.parse import urlparse
from pyspark.sql.types import StringType, BinaryType
from pyspark.sql.types import (
StringType,
BinaryType,
ArrayType,
IntegerType,
StructType,
StructField,
)
import time

from ..interfaces import DestinationInterface
Expand Down Expand Up @@ -161,10 +168,8 @@ def _connection_string_builder(self, properties: dict) -> str:
return connection_string

def _configure_options(self, options: dict) -> dict:
if "subscribe" not in options:
options["subscribe"] = self.connection_string_properties.get(
"eventhub_name"
)
if "topic" not in options:
options["topic"] = self.connection_string_properties.get("eventhub_name")

if "kafka.bootstrap.servers" not in options:
options["kafka.bootstrap.servers"] = (
Expand Down Expand Up @@ -205,27 +210,26 @@ def _configure_options(self, options: dict) -> dict:
return options

def _transform_to_eventhub_schema(self, df: DataFrame) -> DataFrame:
if "value" in df.columns:
if df.schema["value"].dataType not in [StringType(), BinaryType()]:
try:
df.withColumn("value", col("value").cast(StringType()))
except Exception as e:
raise ValueError(
"Couldn't convert 'value' column to string or binary type", e
)
else:
column_list = ["key", "headers", "topic", "partition"]
if "value" not in df.columns:
df = df.withColumn(
"value",
to_json(
struct(
[
col(column).alias(column)
for column in df.columns
if column not in ["key", "headers", "topic", "partition"]
if column not in column_list
]
)
),
)
if "headers" in df.columns and (
df.schema["headers"].dataType.elementType["key"].nullable == True
or df.schema["headers"].dataType.elementType["value"].nullable == True
):
raise ValueError("key and value in the headers column cannot be nullable")

return df.select(
[
column
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,8 @@ def transform(self) -> DataFrame:
lit(self.quality).alias("Quality"),
col("EventTime").alias("Time"),
col("Value").alias("Value"),
).alias("HistorySamples"),
),
),
).alias("HistorySamples"),
),
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@
)
from pyspark.sql import SparkSession
from pytest_mock import MockerFixture
from pyspark.sql.types import (
StructField,
StructType,
StringType,
IntegerType,
BinaryType,
ArrayType,
)


kafka_configuration_dict = {"failOnDataLoss": "true", "startingOffsets": "earliest"}
Expand Down Expand Up @@ -84,7 +92,14 @@ def test_spark_kafka_write_batch(spark_session: SparkSession, mocker: MockerFixt
kafka_configuration = kafka_configuration_dict
kafka_destination = SparkKafkaEventhubDestination(
spark=spark_session,
data=spark_session.createDataFrame([{"value": 1}]),
data=spark_session.createDataFrame(
[
{"value": 1},
{"key": 2},
{"topic": 3},
{"partition": "1"},
]
),
options=kafka_configuration,
connection_string=eventhub_connection_string,
consumer_group="test_consumer_group",
Expand Down Expand Up @@ -167,6 +182,55 @@ def test_spark_kafka_write_batch_fails(
kafka_destination.write_batch()


def test_spark_kafka_fails_on_converting_column_type(
    spark_session: SparkSession,
):
    """A 'headers' column whose array elements have nullable key/value fields
    must make _transform_to_eventhub_schema raise a ValueError.

    NOTE(review): despite the test's name, the behavior exercised is the
    nullable-headers validation, not a column-type conversion failure.
    """
    # Element type for the headers array: 'key' is nullable on purpose,
    # which is the condition the destination rejects.
    header_entry = StructType(
        [
            StructField("key", StringType(), True),
            StructField("value", StringType(), False),
        ]
    )
    schema = StructType(
        [
            StructField("value", IntegerType(), True),
            StructField("key", IntegerType(), True),
            StructField("headers", ArrayType(header_entry, False), True),
            StructField("topic", IntegerType(), True),
            StructField("partition", StringType(), True),
        ]
    )
    sample_row = {
        "value": 1,
        "key": 2,
        "headers": [{"key": "testKey", "value": "strValue"}],
        "topic": 3,
        "partition": "nonInt",
    }
    df = spark_session.createDataFrame([sample_row], schema=schema)
    destination = SparkKafkaEventhubDestination(
        spark=spark_session,
        data=df,
        options=kafka_configuration_dict,
        connection_string=eventhub_connection_string,
        consumer_group="test_consumer_group",
    )
    # The transformation must reject the schema before any write is attempted.
    with pytest.raises(ValueError) as excinfo:
        destination._transform_to_eventhub_schema(df)
    assert str(excinfo.value) == "key and value in the headers column cannot be nullable"


def test_spark_kafka_fails_on_invalid_connection_string_malformed(
spark_session: SparkSession,
):
Expand Down