Merged
3 changes: 2 additions & 1 deletion environment.yml
@@ -70,8 +70,9 @@ dependencies:
   - great-expectations>=0.18.8,<1.0.0
   - statsmodels>=0.14.1,<0.15.0
   - pmdarima>=2.0.4
+  - protobuf>=4.25.0,<5.0.0
   - pip:
-    - databricks-sdk>=0.20.0,<0.58.0
+    - databricks-sdk>=0.59.0,<1.0.0
     - dependency-injector>=4.41.0,<5.0.0
     - azure-functions>=1.15.0,<2.0.0
     - azure-mgmt-eventgrid>=10.2.0
2 changes: 1 addition & 1 deletion setup.py
@@ -57,7 +57,7 @@

 PIPELINE_PACKAGES = [
     "dependency-injector>=4.41.0,<5.0.0",
-    "databricks-sdk>=0.20.0,<0.58.0",
+    "databricks-sdk>=0.59.0,<1.0.0",
     "azure-storage-file-datalake>=12.12.0,<13.0.0",
     "azure-mgmt-storage>=21.0.0",
     "azure-mgmt-eventgrid>=10.2.0",
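Both files move the databricks-sdk pin to >=0.59.0,<1.0.0. A minimal sketch (not part of this PR) of how the resolved version can be sanity-checked at runtime, reusing the same importlib-metadata helpers that databricks.py already imports; the parsing assumes a plain MAJOR.MINOR.PATCH version string:

from importlib.metadata import PackageNotFoundError, version

try:
    installed = version("databricks-sdk")  # e.g. "0.59.0"
except PackageNotFoundError as exc:
    raise RuntimeError("databricks-sdk is not installed") from exc

# Naive check against the new constraint >=0.59.0,<1.0.0.
major, minor = (int(part) for part in installed.split(".")[:2])
assert (major, minor) >= (0, 59) and major < 1, f"unsupported databricks-sdk {installed}"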
267 changes: 260 additions & 7 deletions src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
@@ -11,16 +11,39 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from dataclasses import dataclass
 import sys
-from typing import Union
+from typing import List, Optional, Union
 from importlib_metadata import PackageNotFoundError, version
 from importlib.util import module_from_spec, spec_from_file_location
 from pathlib import Path
 from io import BytesIO
-
+from enum import Enum
+from typing import Any, Callable, Dict, Iterator, List, Optional
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.config import Config
-from databricks.sdk.service.jobs import CreateJob, JobSettings
+from databricks.sdk.service.jobs import (
+    JobSettings,
+    Continuous,
+    JobAccessControlRequest,
+    JobDeployment,
+    JobEditMode,
+    JobEmailNotifications,
+    JobEnvironment,
+    Format,
+    GitSource,
+    JobsHealthRules,
+    JobCluster,
+    JobNotificationSettings,
+    JobParameterDefinition,
+    PerformanceTarget,
+    QueueSettings,
+    JobRunAs,
+    CronSchedule,
+    Task,
+    WebhookNotifications,
+    TriggerSettings,
+)
 from databricks.sdk.service.compute import Library, PythonPyPiLibrary, MavenLibrary
 from .interfaces import DeployInterface
 from ..utilities.pipeline_components import PipelineComponentsGetUtility
@@ -30,6 +53,237 @@
 __description__: str


+@dataclass
+class CreateJob:
+    access_control_list: Optional[List[JobAccessControlRequest]] = None
+    """List of permissions to set on the job."""
+
+    budget_policy_id: Optional[str] = None
+    """The id of the user specified budget policy to use for this job. If not specified, a default
+    budget policy may be applied when creating or modifying the job. See
+    `effective_budget_policy_id` for the budget policy used by this workload."""
+
+    continuous: Optional[Continuous] = None
+    """An optional continuous property for this job. The continuous property will ensure that there is
+    always one run executing. Only one of `schedule` and `continuous` can be used."""
+
+    deployment: Optional[JobDeployment] = None
+    """Deployment information for jobs managed by external sources."""
+
+    description: Optional[str] = None
+    """An optional description for the job. The maximum length is 27700 characters in UTF-8 encoding."""
+
+    edit_mode: Optional[JobEditMode] = None
+    """Edit mode of the job.
+
+    * `UI_LOCKED`: The job is in a locked UI state and cannot be modified. * `EDITABLE`: The job is
+    in an editable state and can be modified."""
+
+    email_notifications: Optional[JobEmailNotifications] = None
+    """An optional set of email addresses that is notified when runs of this job begin or complete as
+    well as when this job is deleted."""
+
+    environments: Optional[List[JobEnvironment]] = None
+    """A list of task execution environment specifications that can be referenced by serverless tasks
+    of this job. An environment is required to be present for serverless tasks. For serverless
+    notebook tasks, the environment is accessible in the notebook environment panel. For other
+    serverless tasks, the task environment is required to be specified using environment_key in the
+    task settings."""
+
+    format: Optional[Format] = None
+    """Used to tell what is the format of the job. This field is ignored in Create/Update/Reset calls.
+    When using the Jobs API 2.1 this value is always set to `"MULTI_TASK"`."""
+
+    git_source: Optional[GitSource] = None
+    """An optional specification for a remote Git repository containing the source code used by tasks.
+    Version-controlled source code is supported by notebook, dbt, Python script, and SQL File tasks.
+
+    If `git_source` is set, these tasks retrieve the file from the remote repository by default.
+    However, this behavior can be overridden by setting `source` to `WORKSPACE` on the task.
+
+    Note: dbt and SQL File tasks support only version-controlled sources. If dbt or SQL File tasks
+    are used, `git_source` must be defined on the job."""
+
+    health: Optional[JobsHealthRules] = None
+
+    job_clusters: Optional[List[JobCluster]] = None
+    """A list of job cluster specifications that can be shared and reused by tasks of this job.
+    Libraries cannot be declared in a shared job cluster. You must declare dependent libraries in
+    task settings."""
+
+    max_concurrent_runs: Optional[int] = None
+    """An optional maximum allowed number of concurrent runs of the job. Set this value if you want to
+    be able to execute multiple runs of the same job concurrently. This is useful for example if you
+    trigger your job on a frequent schedule and want to allow consecutive runs to overlap with each
+    other, or if you want to trigger multiple runs which differ by their input parameters. This
+    setting affects only new runs. For example, suppose the job’s concurrency is 4 and there are 4
+    concurrent active runs. Then setting the concurrency to 3 won’t kill any of the active runs.
+    However, from then on, new runs are skipped unless there are fewer than 3 active runs. This
+    value cannot exceed 1000. Setting this value to `0` causes all new runs to be skipped."""
+
+    name: Optional[str] = None
+    """An optional name for the job. The maximum length is 4096 bytes in UTF-8 encoding."""
+
+    notification_settings: Optional[JobNotificationSettings] = None
+    """Optional notification settings that are used when sending notifications to each of the
+    `email_notifications` and `webhook_notifications` for this job."""
+
+    parameters: Optional[List[JobParameterDefinition]] = None
+    """Job-level parameter definitions"""
+
+    performance_target: Optional[PerformanceTarget] = None
+    """The performance mode on a serverless job. This field determines the level of compute performance
+    or cost-efficiency for the run.
+
+    * `STANDARD`: Enables cost-efficient execution of serverless workloads. *
+    `PERFORMANCE_OPTIMIZED`: Prioritizes fast startup and execution times through rapid scaling and
+    optimized cluster performance."""
+
+    queue: Optional[QueueSettings] = None
+    """The queue settings of the job."""
+
+    run_as: Optional[JobRunAs] = None
+
+    schedule: Optional[CronSchedule] = None
+    """An optional periodic schedule for this job. The default behavior is that the job only runs when
+    triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`."""
+
+    tags: Optional[Dict[str, str]] = None
+    """A map of tags associated with the job. These are forwarded to the cluster as cluster tags for
+    jobs clusters, and are subject to the same limitations as cluster tags. A maximum of 25 tags can
+    be added to the job."""
+
+    tasks: Optional[List[Task]] = None
+    """A list of task specifications to be executed by this job. It supports up to 1000 elements in
+    write endpoints (:method:jobs/create, :method:jobs/reset, :method:jobs/update,
+    :method:jobs/submit). Read endpoints return only 100 tasks. If more than 100 tasks are
+    available, you can paginate through them using :method:jobs/get. Use the `next_page_token` field
+    at the object root to determine if more results are available."""
+
+    timeout_seconds: Optional[int] = None
+    """An optional timeout applied to each run of this job. A value of `0` means no timeout."""
+
+    trigger: Optional[TriggerSettings] = None
+    """A configuration to trigger a run when certain conditions are met. The default behavior is that
+    the job runs only when triggered by clicking “Run Now” in the Jobs UI or sending an API
+    request to `runNow`."""
+
+    webhook_notifications: Optional[WebhookNotifications] = None
+    """A collection of system notification IDs to notify when runs of this job begin or complete."""
+
+    def as_dict(self) -> dict: # pragma: no cover
+        """Serializes the CreateJob into a dictionary suitable for use as a JSON request body."""
+        body = {}
+        if self.access_control_list:
+            body["access_control_list"] = [
+                v.as_dict() for v in self.access_control_list
+            ]
+        if self.budget_policy_id is not None:
+            body["budget_policy_id"] = self.budget_policy_id
+        if self.continuous:
+            body["continuous"] = self.continuous.as_dict()
+        if self.deployment:
+            body["deployment"] = self.deployment.as_dict()
+        if self.description is not None:
+            body["description"] = self.description
+        if self.edit_mode is not None:
+            body["edit_mode"] = self.edit_mode.value
+        if self.email_notifications:
+            body["email_notifications"] = self.email_notifications.as_dict()
+        if self.environments:
+            body["environments"] = [v.as_dict() for v in self.environments]
+        if self.format is not None:
+            body["format"] = self.format.value
+        if self.git_source:
+            body["git_source"] = self.git_source.as_dict()
+        if self.health:
+            body["health"] = self.health.as_dict()
+        if self.job_clusters:
+            body["job_clusters"] = [v.as_dict() for v in self.job_clusters]
+        if self.max_concurrent_runs is not None:
+            body["max_concurrent_runs"] = self.max_concurrent_runs
+        if self.name is not None:
+            body["name"] = self.name
+        if self.notification_settings:
+            body["notification_settings"] = self.notification_settings.as_dict()
+        if self.parameters:
+            body["parameters"] = [v.as_dict() for v in self.parameters]
+        if self.performance_target is not None:
+            body["performance_target"] = self.performance_target.value
+        if self.queue:
+            body["queue"] = self.queue.as_dict()
+        if self.run_as:
+            body["run_as"] = self.run_as.as_dict()
+        if self.schedule:
+            body["schedule"] = self.schedule.as_dict()
+        if self.tags:
+            body["tags"] = self.tags
+        if self.tasks:
+            body["tasks"] = [v.as_dict() for v in self.tasks]
+        if self.timeout_seconds is not None:
+            body["timeout_seconds"] = self.timeout_seconds
+        if self.trigger:
+            body["trigger"] = self.trigger.as_dict()
+        if self.webhook_notifications:
+            body["webhook_notifications"] = self.webhook_notifications.as_dict()
+        return body
+
+    def as_shallow_dict(self) -> dict: # pragma: no cover
+        """Serializes the CreateJob into a shallow dictionary of its immediate attributes."""
+        body = {}
+        if self.access_control_list:
+            body["access_control_list"] = self.access_control_list
+        if self.budget_policy_id is not None:
+            body["budget_policy_id"] = self.budget_policy_id
+        if self.continuous:
+            body["continuous"] = self.continuous
+        if self.deployment:
+            body["deployment"] = self.deployment
+        if self.description is not None:
+            body["description"] = self.description
+        if self.edit_mode is not None:
+            body["edit_mode"] = self.edit_mode
+        if self.email_notifications:
+            body["email_notifications"] = self.email_notifications
+        if self.environments:
+            body["environments"] = self.environments
+        if self.format is not None:
+            body["format"] = self.format
+        if self.git_source:
+            body["git_source"] = self.git_source
+        if self.health:
+            body["health"] = self.health
+        if self.job_clusters:
+            body["job_clusters"] = self.job_clusters
+        if self.max_concurrent_runs is not None:
+            body["max_concurrent_runs"] = self.max_concurrent_runs
+        if self.name is not None:
+            body["name"] = self.name
+        if self.notification_settings:
+            body["notification_settings"] = self.notification_settings
+        if self.parameters:
+            body["parameters"] = self.parameters
+        if self.performance_target is not None:
+            body["performance_target"] = self.performance_target
+        if self.queue:
+            body["queue"] = self.queue
+        if self.run_as:
+            body["run_as"] = self.run_as
+        if self.schedule:
+            body["schedule"] = self.schedule
+        if self.tags:
+            body["tags"] = self.tags
+        if self.tasks:
+            body["tasks"] = self.tasks
+        if self.timeout_seconds is not None:
+            body["timeout_seconds"] = self.timeout_seconds
+        if self.trigger:
+            body["trigger"] = self.trigger
+        if self.webhook_notifications:
+            body["webhook_notifications"] = self.webhook_notifications
+        return body
+
+
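A quick sketch (not from the diff) of how the two serializers above differ once a nested SDK object is set: as_dict() recurses into each value's own as_dict(), while as_shallow_dict() keeps the attribute objects as-is. The job below is hypothetical and exists only to illustrate the contrast:

from databricks.sdk.service.jobs import CronSchedule

job = CreateJob(
    name="demo",
    schedule=CronSchedule(quartz_cron_expression="0 0 12 * * ?", timezone_id="UTC"),
)

job.as_dict()
# {"name": "demo", "schedule": {"quartz_cron_expression": "0 0 12 * * ?", "timezone_id": "UTC"}}

job.as_shallow_dict()
# {"name": "demo", "schedule": CronSchedule(...)}, the nested object is kept as-is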
 class DatabricksSDKDeploy(DeployInterface):
     """
     Deploys an RTDIP Pipeline to Databricks Workflows leveraging the Databricks [SDK.](https://docs.databricks.com/dev-tools/sdk-python.html)
@@ -72,7 +326,6 @@ class DatabricksSDKDeploy(DeployInterface):
             notebook_path="/path/to/pipeline/rtdip_pipeline.py"
         )
     ))
-
     job = CreateJob(
         name="test_job_rtdip",
         job_clusters=cluster_list,
@@ -109,11 +362,11 @@ def __init__(
         self.token = token
         self.workspace_directory = workspace_directory

-    def _convert_file_to_binary(self, path) -> BytesIO:
+    def _convert_file_to_binary(self, path) -> BytesIO: # pragma: no cover
         with open(path, "rb") as f:
             return BytesIO(f.read())

-    def _load_module(self, module_name, path):
+    def _load_module(self, module_name, path): # pragma: no cover
         spec = spec_from_file_location(module_name, path)
         module = module_from_spec(spec)
         spec.loader.exec_module(module)
@@ -133,7 +386,7 @@ def deploy(self) -> Union[bool, ValueError]:
                 auth_type="pat",
             )
         )
-        for task in self.databricks_job.tasks:
+        for task in self.databricks_job.tasks: # pragma: no cover
             if task.notebook_task is None and task.spark_python_task is None:
                 return ValueError(
                     "A Notebook or Spark Python Task must be populated for each task in the Databricks Job"
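Putting the pieces together, a usage sketch under stated assumptions: DatabricksSDKDeploy is assumed to accept the job plus the host, token and workspace_directory attributes visible above (the full constructor signature is outside this hunk), the workspace URL and token are placeholders, and Task/NotebookTask come from databricks.sdk.service.jobs as in the class docstring:

from databricks.sdk.service.jobs import NotebookTask, Task

task_list = []
task_list.append(
    Task(
        task_key="test_task",
        notebook_task=NotebookTask(
            notebook_path="/path/to/pipeline/rtdip_pipeline.py"
        ),
    )
)

# The vendored CreateJob dataclass defined in this PR.
job = CreateJob(name="test_job_rtdip", tasks=task_list)

deploy = DatabricksSDKDeploy(
    databricks_job=job,  # assumed keyword, mirroring self.databricks_job in deploy()
    host="https://adb-0000000000000000.0.azuredatabricks.net",  # placeholder workspace URL
    token="<personal-access-token>",  # deploy() authenticates with auth_type="pat"
    workspace_directory="/rtdip",  # placeholder target directory
)
deploy.deploy()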