1 change: 1 addition & 0 deletions docs/sdk/code-reference/pipelines/utilities/spark/adls_gen2_spn_connect.md
@@ -0,0 +1 @@
::: src.sdk.python.rtdip_sdk.pipelines.utilities.spark.adls_gen2_spn_connect
2 changes: 1 addition & 1 deletion docs/sdk/pipelines/components.md
@@ -82,7 +82,7 @@ Utilities are components that perform utility functions such as logging, error handling
|[Delta Table Vacuum](../code-reference/pipelines/utilities/spark/delta_table_vacuum.md)||:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|
|[AWS S3 Bucket Policy](../code-reference/pipelines/utilities/aws/s3_bucket_policy.md)|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|
|[ADLS Gen 2 ACLs](../code-reference/pipelines/utilities/azure/adls_gen2_acl.md)|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|
|[Spark ADLS Gen 2 Service Principal Connect](../code-reference/pipelines/utilities/spark/adls_gen2_spn_connect.md)||:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|

!!! note "Note"
This list will dynamically change as the framework is further developed and new components are added.
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -154,6 +154,7 @@ nav:
        - Delta Table Create: sdk/code-reference/pipelines/utilities/spark/delta_table_create.md
        - Delta Table Optimize: sdk/code-reference/pipelines/utilities/spark/delta_table_optimize.md
        - Delta Table Vacuum: sdk/code-reference/pipelines/utilities/spark/delta_table_vacuum.md
        - ADLS Gen 2 Service Principal Connect: sdk/code-reference/pipelines/utilities/spark/adls_gen2_spn_connect.md
      - AWS:
        - S3 Bucket Policy: sdk/code-reference/pipelines/utilities/aws/s3_bucket_policy.md
      - Azure:
86 changes: 86 additions & 0 deletions src/sdk/python/rtdip_sdk/pipelines/utilities/spark/adls_gen2_spn_connect.py
@@ -0,0 +1,86 @@
# Copyright 2022 RTDIP
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

from pyspark.sql import SparkSession
from py4j.protocol import Py4JJavaError

from ..interfaces import UtilitiesInterface
from .configuration import SparkConfigurationUtility
from ..._pipeline_utils.models import Libraries, SystemType

class SparkADLSGen2SPNConnectUtility(UtilitiesInterface):
    '''
    Configures Spark to connect to an ADLS Gen 2 Storage Account using a Service Principal.

    Args:
        spark (SparkSession): Spark Session required to read data from cloud storage
        storage_account (str): Name of the ADLS Gen 2 Storage Account
        tenant_id (str): Tenant ID of the Service Principal
        client_id (str): Service Principal Client ID
        client_secret (str): Service Principal Client Secret
    '''
    spark: SparkSession
    storage_account: str
    tenant_id: str
    client_id: str
    client_secret: str

    def __init__(self, spark: SparkSession, storage_account: str, tenant_id: str, client_id: str, client_secret: str) -> None:
        self.spark = spark
        self.storage_account = storage_account
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret

    @staticmethod
    def system_type():
        '''
        Attributes:
            SystemType (Environment): Requires PYSPARK
        '''
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def execute(self) -> bool:
        try:
            # Sets the Hadoop ABFS OAuth options on the Spark session so that
            # abfss:// paths on this storage account authenticate through the
            # Service Principal's client credentials flow
            adls_gen2_config = SparkConfigurationUtility(
                spark=self.spark,
                config={
                    "fs.azure.account.auth.type.{}.dfs.core.windows.net".format(self.storage_account): "OAuth",
                    "fs.azure.account.oauth.provider.type.{}.dfs.core.windows.net".format(self.storage_account): "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
                    "fs.azure.account.oauth2.client.id.{}.dfs.core.windows.net".format(self.storage_account): self.client_id,
                    "fs.azure.account.oauth2.client.secret.{}.dfs.core.windows.net".format(self.storage_account): self.client_secret,
                    "fs.azure.account.oauth2.client.endpoint.{}.dfs.core.windows.net".format(self.storage_account): "https://login.microsoftonline.com/{}/oauth2/token".format(self.tenant_id)
                }
            )
            adls_gen2_config.execute()
            return True

        except Py4JJavaError as e:
            logging.exception(e.errmsg)
            raise e
        except Exception as e:
            logging.exception(str(e))
            raise e
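
For context, a minimal usage sketch of the new utility: it assumes a running SparkSession, the storage account, container, path, and credential values are placeholders, and the import mirrors the repo-root path used in the tests below.

from pyspark.sql import SparkSession
from src.sdk.python.rtdip_sdk.pipelines.utilities.spark.adls_gen2_spn_connect import SparkADLSGen2SPNConnectUtility

spark = SparkSession.builder.getOrCreate()

# Configure OAuth against the storage account via the Service Principal
adls_connect = SparkADLSGen2SPNConnectUtility(
    spark=spark,
    storage_account="mystorageaccount",  # placeholder
    tenant_id="00000000-0000-0000-0000-000000000000",  # placeholder
    client_id="00000000-0000-0000-0000-000000000000",  # placeholder
    client_secret="my-client-secret"  # placeholder
)
adls_connect.execute()

# With the OAuth options set, abfss:// paths on the account resolve through
# the Service Principal (container and table path are placeholders)
df = spark.read.format("delta").load(
    "abfss://mycontainer@mystorageaccount.dfs.core.windows.net/path/to/table"
)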
50 changes: 50 additions & 0 deletions tests/sdk/python/rtdip_sdk/pipelines/utilities/spark/test_adls_gen2_spn_connect.py
@@ -0,0 +1,50 @@
# Copyright 2022 RTDIP
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
sys.path.insert(0, '.')
from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import Libraries
from src.sdk.python.rtdip_sdk.pipelines.utilities.spark.adls_gen2_spn_connect import SparkADLSGen2SPNConnectUtility
from tests.sdk.python.rtdip_sdk.pipelines._pipeline_utils.spark_configuration_constants import spark_session
from pyspark.sql import SparkSession

def test_adls_gen2_spn_connect_setup(spark_session: SparkSession):
    adls_gen2_spn_connect_utility = SparkADLSGen2SPNConnectUtility(
        spark=spark_session,
        storage_account="test_storage_account",
        tenant_id="test_tenant_id",
        client_id="test_client_id",
        client_secret="test_client_secret"
    )

    assert adls_gen2_spn_connect_utility.system_type().value == 2
    assert adls_gen2_spn_connect_utility.libraries() == Libraries()
    assert isinstance(adls_gen2_spn_connect_utility.settings(), dict)

def test_adls_gen2_spn_connect_utility(spark_session: SparkSession):
    adls_gen2_spn_connect_utility = SparkADLSGen2SPNConnectUtility(
        spark=spark_session,
        storage_account="test_storage_account",
        tenant_id="test_tenant_id",
        client_id="test_client_id",
        client_secret="test_client_secret"
    )

    result = adls_gen2_spn_connect_utility.execute()
    assert result
    assert spark_session.conf.get("fs.azure.account.auth.type.test_storage_account.dfs.core.windows.net") == "OAuth"
    assert spark_session.conf.get("fs.azure.account.oauth.provider.type.test_storage_account.dfs.core.windows.net") == "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
    assert spark_session.conf.get("fs.azure.account.oauth2.client.id.test_storage_account.dfs.core.windows.net") == "test_client_id"
    assert spark_session.conf.get("fs.azure.account.oauth2.client.secret.test_storage_account.dfs.core.windows.net") == "test_client_secret"
    assert spark_session.conf.get("fs.azure.account.oauth2.client.endpoint.test_storage_account.dfs.core.windows.net") == "https://login.microsoftonline.com/test_tenant_id/oauth2/token"