4 changes: 3 additions & 1 deletion .devcontainer/devcontainer.json
@@ -28,13 +28,15 @@
"python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle",
"python.linting.pylintPath": "/opt/conda/bin/pylint",
"python.testing.pytestArgs": [
"--cov=.",
"--cov-report=xml:cov.xml",
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.testing.cwd": "${workspaceFolder}",
"python.analysis.extraPaths": ["${workspaceFolder}"],
"python.defaultInterpreterPath": "/opt/conda/envs/rtdip-sdk/bin/python",
"python.defaultInterpreterPath": "~/micromamba/envs/rtdip-sdk/bin/python",
"terminal.integrated.env.linux":{
"PYTHONPATH": "${workspaceFolder}:${env:PYTHONPATH}"
},
14 changes: 13 additions & 1 deletion .vscode/launch.json
@@ -7,6 +7,18 @@
"request": "attach",
"port": 9091,
"preLaunchTask": "func: host start",
}
},
{
"name": "Python: Debug Tests",
"type": "python",
"request": "launch",
"program": "${file}",
"purpose": ["debug-test"],
"console": "internalConsole",
"env": {
"PYTEST_ADDOPTS": "--no-cov"
},
"justMyCode": false
}
]
}
4 changes: 3 additions & 1 deletion .vscode/settings.json
@@ -12,13 +12,15 @@
"python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle",
"python.linting.pylintPath": "/opt/conda/bin/pylint",
"python.testing.pytestArgs": [
"--cov=.",
"--cov-report=xml:cov.xml",
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.testing.cwd": "${workspaceFolder}",
"python.analysis.extraPaths": ["${workspaceFolder}"],
"python.defaultInterpreterPath": "/opt/conda/envs/rtdip-sdk/bin/python",
"python.defaultInterpreterPath": "~/micromamba/envs/rtdip-sdk/bin/python",
"terminal.integrated.env.osx":{
"PYTHONPATH": "${workspaceFolder}:${env:PYTHONPATH}"
},
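
The coverage flags added to pytestArgs in both the devcontainer and workspace settings mean the IDE test runner now writes a cov.xml report on every run. For reference, a minimal sketch of the equivalent invocation from Python, assuming pytest and pytest-cov are installed in the rtdip-sdk environment:

import sys
import pytest

# Mirror of the pytestArgs above: collect tests from ./tests, measure coverage
# across the repository and write an XML coverage report to cov.xml.
sys.exit(pytest.main(["--cov=.", "--cov-report=xml:cov.xml", "tests"]))
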
2 changes: 2 additions & 0 deletions docs/sdk/code-reference/pipelines/secrets/databricks.md
@@ -0,0 +1,2 @@
# Databricks Secret Scope
::: src.sdk.python.rtdip_sdk.pipelines.secrets.databricks
2 changes: 2 additions & 0 deletions mkdocs.yml
@@ -139,6 +139,8 @@ nav:
- Utilities:
- Spark:
- Delta Table Create: sdk/code-reference/pipelines/utilities/spark/delta_table_create.md
- Secrets:
- Databricks: sdk/code-reference/pipelines/secrets/databricks.md
- API:
- Overview: api/overview.md
- Authentication: api/authentication.md
16 changes: 15 additions & 1 deletion src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py
@@ -55,9 +55,23 @@ def get_spark_session(self) -> SparkSession:
return spark_session

except Exception as e:
logging.exception('error with spark session function', e.__traceback__)
logging.exception(str(e))
raise e

def get_dbutils(
spark: SparkSession,
): # please note that this function is used in mocking by its name
try:
from pyspark.dbutils import DBUtils # noqa

if "dbutils" not in locals():
utils = DBUtils(spark)
return utils
else:
return locals().get("dbutils")
except ImportError:
return None

# # TODO: Implemented in DBR 11 but not yet available in open source pyspark
# from pyspark.sql.streaming import StreamingQueryListener
# class SparkStreamingListener(StreamingQueryListener):
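
The newly added get_dbutils helper returns a DBUtils handle when pyspark.dbutils is importable (i.e. on Databricks) and None otherwise, so callers can degrade gracefully. A minimal usage sketch, assuming the module is importable as rtdip_sdk.pipelines._pipeline_utils.spark once the package is installed; the secret scope and key names are illustrative:

from pyspark.sql import SparkSession
from rtdip_sdk.pipelines._pipeline_utils.spark import get_dbutils  # assumed installed import path

spark = SparkSession.builder.getOrCreate()
dbutils = get_dbutils(spark)

if dbutils is not None:
    # On Databricks, DBUtils(spark) is returned and the secrets API is available.
    connection_string = dbutils.secrets.get(scope="my-scope", key="my-key")  # illustrative names
else:
    # Locally, the ImportError branch returns None, so fall back to another configuration source.
    connection_string = None
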
3 changes: 0 additions & 3 deletions src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
@@ -64,9 +64,6 @@ def deploy(self) -> bool:
databricks_job_task.libraries.append(DatabricksLibraries(maven=DatabricksLibrariesMaven(coordinates=maven_library.to_string(), repo=maven_library.repo)))
for wheel_library in libraries.pythonwheel_libraries:
databricks_job_task.libraries.append(DatabricksLibraries(whl=wheel_library))

# convert to string for json conversion later
step.component = step.component.__name__

try:
rtdip_version = version("rtdip-sdk")
Changes to the module defining the Task base class (file name not captured):
@@ -6,21 +6,7 @@
from pyspark.sql import SparkSession
import sys


def get_dbutils(
spark: SparkSession,
): # please note that this function is used in mocking by its name
try:
from pyspark.dbutils import DBUtils # noqa

if "dbutils" not in locals():
utils = DBUtils(spark)
return utils
else:
return locals().get("dbutils")
except ImportError:
return None

from ....._pipeline_utils.spark import get_dbutils

class Task(ABC):
"""
Changes to the RTDIP pipeline task module (file name not captured):
@@ -24,10 +24,10 @@ class RTDIPPipelineTask(Task):
def launch(self):
self.logger.info("Launching RTDIP Pipeline Task")
self.logger.info("Job to execute {}".format(sys.argv[0]))
pipeline_job = PipelineJobFromJson(sys.argv[0])
pipeline_job = PipelineJobFromJson(sys.argv[0]).convert()
pipeline_job_execute = PipelineJobExecute(pipeline_job)
pipeline_job_execute.run()
self.logger.info("RTDIP Pipeline Task finished!")
self.logger.info("RTDIP Pipeline Task completed")

# if you're using python_wheel_task, you'll need the entrypoint function to be used in setup.py
def entrypoint(): # pragma: no cover
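
The .convert() fix above matters because PipelineJobFromJson only stores the JSON string; convert() is what deserialises it into a PipelineJob that PipelineJobExecute can run. A minimal sketch of the corrected flow, assuming the classes are importable from rtdip_sdk.pipelines.execute.job; the JSON payload arrives via sys.argv exactly as in launch() above:

import sys
from rtdip_sdk.pipelines.execute.job import PipelineJobFromJson, PipelineJobExecute  # assumed import path

pipeline_json = sys.argv[0]                                   # serialised pipeline job passed to the task
pipeline_job = PipelineJobFromJson(pipeline_json).convert()   # JSON string -> PipelineJob model
PipelineJobExecute(pipeline_job).run()                        # execute the deserialised job
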
Changes to the Spark Delta destination module (file name not captured):
@@ -56,6 +56,10 @@ def __init__(self, table_name:str, options: dict, mode: str = "append", trigger=

@staticmethod
def system_type():
'''
Attributes:
SystemType (Environment): Requires PYSPARK
'''
return SystemType.PYSPARK

@staticmethod
@@ -92,7 +96,7 @@ def write_batch(self, df: DataFrame):
)

except Py4JJavaError as e:
logging.exception('error with spark write batch delta function', e.errmsg)
logging.exception(e.errmsg)
raise e
except Exception as e:
logging.exception(str(e))
@@ -119,7 +123,7 @@ def write_stream(self, df: DataFrame) -> DataFrame:
time.sleep(30)

except Py4JJavaError as e:
logging.exception('error with spark write stream delta function', e.errmsg)
logging.exception(e.errmsg)
raise e
except Exception as e:
logging.exception(str(e))
Changes to the Spark Eventhub destination module (file name not captured):
@@ -43,6 +43,10 @@ def __init__(self, options: dict) -> None:

@staticmethod
def system_type():
'''
Attributes:
SystemType (Environment): Requires PYSPARK
'''
return SystemType.PYSPARK

@staticmethod
@@ -75,7 +79,7 @@ def write_batch(self, df: DataFrame):
)

except Py4JJavaError as e:
logging.exception('error with spark write batch eventhub function', e.errmsg)
logging.exception(e.errmsg)
raise e
except Exception as e:
logging.exception(str(e))
@@ -98,7 +102,7 @@ def write_stream(self, df: DataFrame):
time.sleep(30)

except Py4JJavaError as e:
logging.exception('error with spark write stream eventhub function', e.errmsg)
logging.exception(e.errmsg)
raise e
except Exception as e:
logging.exception(str(e))
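
The logging change repeated across the Delta and Eventhub writers drops the extra positional argument. A small standard-library-only illustration of why: logging.exception already attaches the active traceback, and any additional positional arguments are treated as %-format arguments for the message rather than extra context.

import logging

try:
    raise RuntimeError("simulated Py4J failure")  # stand-in for a Py4JJavaError
except Exception as e:
    # Logs at ERROR level with the full traceback appended automatically;
    # passing e.errmsg or e.__traceback__ as a second argument is unnecessary.
    logging.exception(str(e))
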
57 changes: 50 additions & 7 deletions src/sdk/python/rtdip_sdk/pipelines/execute/job.py
@@ -19,10 +19,18 @@
from .container import Clients, Configs
from .models import PipelineJob, PipelineTask, PipelineStep
from .._pipeline_utils.models import Libraries, SystemType
from ..sources import * # NOSONAR
from ..sources.interfaces import SourceInterface
from ..transformers import * # NOSONAR
from ..transformers.interfaces import TransformerInterface
from ..destinations import * # NOSONAR
from ..destinations.interfaces import DestinationInterface
from ..utilities import * # NOSONAR
from ..utilities.interfaces import UtilitiesInterface
from ..secrets import * # NOSONAR
from ..secrets.models import PipelineSecret



class PipelineJobExecute():
'''
@@ -37,6 +45,21 @@ class PipelineJobExecute():
def __init__(self, job: PipelineJob, batch_job: bool = False):
self.job = job

def _get_provider_attributes(self, provider: providers.Factory, component: object) -> providers.Factory:
attributes = getattr(component, '__annotations__', {}).items()
# add spark session, if needed
for key, value in attributes:
# if isinstance(value, SparkSession): # TODO: fix this as value does not seem to be an instance of SparkSession
if key == "spark":
provider.add_kwargs(spark=Clients.spark_client().spark_session)
return provider

def _get_secret_provider_attributes(self, pipeline_secret: PipelineSecret) -> providers.Factory:
secret_provider = providers.Factory(pipeline_secret.type)
secret_provider = self._get_provider_attributes(secret_provider, pipeline_secret.type)
secret_provider.add_kwargs(vault=pipeline_secret.vault, key=pipeline_secret.key)
return secret_provider

def _tasks_order(self, task_list: list[PipelineTask]):
'''
Orders tasks within a job
@@ -107,16 +130,23 @@ def _task_setup_dependency_injection(self, step_list: list[PipelineStep]):
for step in step_list:
# setup factory provider for component
provider = providers.Factory(step.component)
attributes = getattr(step.component, '__annotations__', {}).items()
# add spark session, if needed
for key, value in attributes:
# if isinstance(value, SparkSession): # TODO: fix this as value does not seem to be an instance of SparkSession
if key == "spark":
provider.add_kwargs(spark=Clients.spark_client().spark_session)
provider = self._get_provider_attributes(provider, step.component)

# add parameters
if isinstance(step.component, DestinationInterface):
step.component_parameters["query_name"] = step.name

# get secrets
for param_key, param_value in step.component_parameters.items():
if isinstance(param_value, PipelineSecret):
step.component_parameters[param_key] = self._get_secret_provider_attributes(param_value)().get()
if isinstance(param_value, dict):
for key, value in param_value.items():
if isinstance(value, PipelineSecret):
step.component_parameters[param_key][key] = self._get_secret_provider_attributes(value)().get()

provider.add_kwargs(**step.component_parameters)

# set provider
container.set_provider(
step.name,
@@ -175,8 +205,16 @@ class PipelineJobFromJson():
'''
pipeline_json: str

def init(self, pipeline_json: str):
def __init__(self, pipeline_json: str):
self.pipeline_json = pipeline_json

def _try_convert_to_pipeline_secret(self, value):
try:
if "type" in value:
value["type"] = getattr(sys.modules[__name__], value["type"])
return PipelineSecret.parse_obj(value)
except: # NOSONAR
return value

def convert(self) -> PipelineJob:
pipeline_job_dict = json.loads(self.pipeline_json)
@@ -185,6 +223,11 @@ def convert(self) -> PipelineJob:
for task in pipeline_job_dict["task_list"]:
for step in task["step_list"]:
step["component"] = getattr(sys.modules[__name__], step["component"])
for param_key, param_value in step["component_parameters"].items():
step["component_parameters"][param_key] = self._try_convert_to_pipeline_secret(param_value)
if not isinstance(step["component_parameters"][param_key], PipelineSecret) and isinstance(param_value, dict):
for key, value in param_value.items():
step["component_parameters"][param_key][key] = self._try_convert_to_pipeline_secret(value)

return PipelineJob(**pipeline_job_dict)
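
A self-contained sketch of the new secret handling in _task_setup_dependency_injection, using stand-in classes so it runs anywhere; the real secret implementations in the new secrets package expose the same vault/key constructor arguments and get() method that job.py relies on:

class FakeVaultSecret:
    """Stand-in secret implementation: constructed with vault and key, value returned by get()."""
    def __init__(self, vault: str, key: str):
        self.vault = vault
        self.key = key

    def get(self) -> str:
        return "secret-for-{}/{}".format(self.vault, self.key)


class PipelineSecret:
    """Stand-in mirroring the fields job.py relies on: type, vault and key."""
    def __init__(self, type: type, vault: str, key: str):
        self.type = type
        self.vault = vault
        self.key = key


# Parameters as a user might declare them, with a secret nested inside an options dict.
component_parameters = {
    "options": {
        "eventhubs.connectionString": PipelineSecret(
            type=FakeVaultSecret, vault="rtdip", key="eventhub-connection-string"
        )
    }
}

# Outline of what the dependency-injection step now does before adding kwargs:
# PipelineSecret values at the top level, or nested one dictionary level down,
# are replaced by the result of instantiating the secret class and calling get().
for param_key, param_value in component_parameters.items():
    if isinstance(param_value, PipelineSecret):
        component_parameters[param_key] = param_value.type(
            vault=param_value.vault, key=param_value.key
        ).get()
    elif isinstance(param_value, dict):
        for key, value in param_value.items():
            if isinstance(value, PipelineSecret):
                param_value[key] = value.type(vault=value.vault, key=value.key).get()

print(component_parameters)
# {'options': {'eventhubs.connectionString': 'secret-for-rtdip/eventhub-connection-string'}}
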

22 changes: 18 additions & 4 deletions src/sdk/python/rtdip_sdk/pipelines/execute/models.py
@@ -12,13 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional, List, Type, Union
from typing import Optional, Type, Union
from pydantic import BaseModel

from abc import ABCMeta
from ..sources.interfaces import SourceInterface
from ..transformers.interfaces import TransformerInterface
from ..destinations.interfaces import DestinationInterface
from ..utilities.interfaces import UtilitiesInterface
from ..secrets.models import PipelineSecret

class PipelineStep(BaseModel):
name: str
@@ -30,7 +31,8 @@ class PipelineStep(BaseModel):

class Config:
json_encoders = {
Union[Type[SourceInterface], Type[TransformerInterface], Type[DestinationInterface], Type[UtilitiesInterface]]: lambda x: x.__name__
ABCMeta: lambda x: x.__name__,
PipelineSecret: lambda x: {'__type__': "PipelineSecret", "__values__": x.dict()}
}

class PipelineTask(BaseModel):
@@ -40,8 +42,20 @@ class PipelineTask(BaseModel):
step_list: list[PipelineStep]
batch_task: Optional[bool]

class Config:
json_encoders = {
ABCMeta: lambda x: x.__name__,
PipelineSecret: lambda x: {'__type__': "PipelineSecret", "__values__": x.dict()}
}

class PipelineJob(BaseModel):
name: str
description: str
version: str
task_list: list[PipelineTask]
task_list: list[PipelineTask]

class Config:
json_encoders = {
ABCMeta: lambda x: x.__name__,
PipelineSecret: lambda x: {'__type__': "PipelineSecret", "__values__": x.dict()}
}
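
The ABCMeta encoder serialises component classes by name when a pipeline job is dumped to JSON, which is presumably why the manual step.component = step.component.__name__ conversion in deploy/databricks.py could be deleted. A tiny self-contained illustration of why keying the encoder on ABCMeta catches every component class:

from abc import ABC, ABCMeta

class SourceInterface(ABC):               # stand-in for the SDK's component interfaces
    pass

class MyEventhubSource(SourceInterface):  # stand-in concrete component
    pass

# Component classes inherit (indirectly) from ABC, so the class object itself
# has ABCMeta as its metaclass; an encoder keyed on ABCMeta therefore matches
# any component class and can serialise it as its name.
print(type(MyEventhubSource) is ABCMeta)  # True
print(MyEventhubSource.__name__)          # MyEventhubSource
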
15 changes: 15 additions & 0 deletions src/sdk/python/rtdip_sdk/pipelines/secrets/__init__.py
@@ -0,0 +1,15 @@
# Copyright 2022 RTDIP
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .databricks import *