From 2d1ec8c6da72394d4144363844101fe7d7d4fd45 Mon Sep 17 00:00:00 2001 From: cching95 Date: Mon, 21 Jul 2025 14:44:12 +0100 Subject: [PATCH 01/13] refactor summary query Signed-off-by: cching95 --- .../time_series/_time_series_query_builder.py | 132 ++++++++++++------ .../rtdip_sdk/queries/time_series/summary.py | 10 +- 2 files changed, 98 insertions(+), 44 deletions(-) diff --git a/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py b/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py index a5a31ca2f..a03aa45f2 100644 --- a/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py +++ b/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py @@ -1654,47 +1654,7 @@ def _circular_stats_query(parameters_dict: dict) -> str: def _summary_query(parameters_dict: dict) -> str: - summary_query = ( - "WITH summary AS (SELECT `{{ tagname_column }}`, " - "count(`{{ value_column }}`) as Count, " - "CAST(Avg(`{{ value_column }}`) as decimal(10, 2)) as Avg, " - "CAST(Min(`{{ value_column }}`) as decimal(10, 2)) as Min, " - "CAST(Max(`{{ value_column }}`) as decimal(10, 2)) as Max, " - "CAST(stddev(`{{ value_column }}`) as decimal(10, 2)) as StDev, " - "CAST(sum(`{{ value_column }}`) as decimal(10, 2)) as Sum, " - "CAST(variance(`{{ value_column }}`) as decimal(10, 2)) as Var FROM " - "{% if source is defined and source is not none %}" - "`{{ source|lower }}` " - "{% else %}" - "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_events_{{ data_type|lower }}` " - "{% endif %}" - "{% if case_insensitivity_tag_search is defined and case_insensitivity_tag_search == true %}" - "WHERE `{{ timestamp_column }}` BETWEEN to_timestamp(\"{{ start_date }}\") AND to_timestamp(\"{{ end_date }}\") AND UPPER(`{{ tagname_column }}`) IN ('{{ tag_names | join('\\', \\'') | upper }}') " - "{% else %}" - "WHERE `{{ timestamp_column }}` BETWEEN to_timestamp(\"{{ start_date }}\") AND to_timestamp(\"{{ end_date }}\") AND `{{ tagname_column }}` IN ('{{ tag_names | join('\\', \\'') }}') " - "{% endif %}" - "{% if include_status is defined and include_status == true and include_bad_data is defined and include_bad_data == false %}" - "AND `{{ status_column }}` <> 'Bad'" - "{% endif %}" - "GROUP BY `{{ tagname_column }}`) " - "{% if display_uom is defined and display_uom == true %}" - 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(s.*, m.`UoM`), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}s.*, m.`UoM`{% endif %} FROM summary s ' - "LEFT OUTER JOIN " - "{% if metadata_source is defined and metadata_source is not none %}" - "`{{ metadata_source|lower }}` m ON s.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` " - "{% else %}" - "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_metadata` m ON s.`{{ tagname_column }}` = m.`{{ tagname_column }}` " - "{% endif %}" - "{% else%}" - 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(*), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}*{% endif %} FROM summary ' - "{% endif %}" - "{% if limit is defined and limit is not none %}" - "LIMIT {{ limit }} " - "{% endif %}" - "{% if offset is defined and offset is not none %}" - "OFFSET {{ offset }} " - "{% endif %}" - ) + sql_query_list = [] summary_parameters = { "source": parameters_dict.get("source", None), @@ -1737,8 +1697,94 @@ def _summary_query(parameters_dict: 
dict) -> str: "to_json": parameters_dict.get("to_json", False), } - sql_template = Template(summary_query) - return sql_template.render(summary_parameters) + summary_query = _build_summary_query() + + _build_summary_query = () + # summary_query = ( + # "WITH summary AS (SELECT `{{ tagname_column }}`, " + # "count(`{{ value_column }}`) as Count, " + # "CAST(Avg(`{{ value_column }}`) as decimal(10, 2)) as Avg, " + # "CAST(Min(`{{ value_column }}`) as decimal(10, 2)) as Min, " + # "CAST(Max(`{{ value_column }}`) as decimal(10, 2)) as Max, " + # "CAST(stddev(`{{ value_column }}`) as decimal(10, 2)) as StDev, " + # "CAST(sum(`{{ value_column }}`) as decimal(10, 2)) as Sum, " + # "CAST(variance(`{{ value_column }}`) as decimal(10, 2)) as Var FROM " + # "{% if source is defined and source is not none %}" + # "`{{ source|lower }}` " + # "{% else %}" + # "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_events_{{ data_type|lower }}` " + # "{% endif %}" + # "{% if case_insensitivity_tag_search is defined and case_insensitivity_tag_search == true %}" + # "WHERE `{{ timestamp_column }}` BETWEEN to_timestamp(\"{{ start_date }}\") AND to_timestamp(\"{{ end_date }}\") AND UPPER(`{{ tagname_column }}`) IN ('{{ tag_names | join('\\', \\'') | upper }}') " + # "{% else %}" + # "WHERE `{{ timestamp_column }}` BETWEEN to_timestamp(\"{{ start_date }}\") AND to_timestamp(\"{{ end_date }}\") AND `{{ tagname_column }}` IN ('{{ tag_names | join('\\', \\'') }}') " + # "{% endif %}" + # "{% if include_status is defined and include_status == true and include_bad_data is defined and include_bad_data == false %}" + # "AND `{{ status_column }}` <> 'Bad'" + # "{% endif %}" + # "GROUP BY `{{ tagname_column }}`) " + # "{% if display_uom is defined and display_uom == true %}" + # 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(s.*, m.`UoM`), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}s.*, m.`UoM`{% endif %} FROM summary s ' + # "LEFT OUTER JOIN " + # "{% if metadata_source is defined and metadata_source is not none %}" + # "`{{ metadata_source|lower }}` m ON s.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` " + # "{% else %}" + # "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_metadata` m ON s.`{{ tagname_column }}` = m.`{{ tagname_column }}` " + # "{% endif %}" + # "{% else%}" + # 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(*), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}*{% endif %} FROM summary ' + # "{% endif %}" + # "{% if limit is defined and limit is not none %}" + # "LIMIT {{ limit }} " + # "{% endif %}" + # "{% if offset is defined and offset is not none %}" + # "OFFSET {{ offset }} " + # "{% endif %}" + # ) + + # summary_parameters = { + # "source": parameters_dict.get("source", None), + # "metadata_source": parameters_dict.get("metadata_source", None), + # "business_unit": parameters_dict.get("business_unit"), + # "region": parameters_dict.get("region"), + # "asset": parameters_dict.get("asset"), + # "data_security_level": parameters_dict.get("data_security_level"), + # "data_type": parameters_dict.get("data_type"), + # "start_date": parameters_dict["start_date"], + # "end_date": parameters_dict["end_date"], + # "tag_names": list(dict.fromkeys(parameters_dict["tag_names"])), + # "include_bad_data": parameters_dict["include_bad_data"], + # "display_uom": parameters_dict.get("display_uom", False), + # 
"limit": parameters_dict.get("limit", None), + # "offset": parameters_dict.get("offset", None), + # "time_zone": parameters_dict["time_zone"], + # "tagname_column": parameters_dict.get("tagname_column", "TagName"), + # "timestamp_column": parameters_dict.get("timestamp_column", "EventTime"), + # "include_status": ( + # False + # if "status_column" in parameters_dict + # and parameters_dict.get("status_column") is None + # else True + # ), + # "status_column": ( + # "Status" + # if "status_column" in parameters_dict + # and parameters_dict.get("status_column") is None + # else parameters_dict.get("status_column", "Status") + # ), + # "value_column": parameters_dict.get("value_column", "Value"), + # "case_insensitivity_tag_search": parameters_dict.get( + # "case_insensitivity_tag_search", False + # ), + # "metadata_tagname_column": parameters_dict.get( + # "metadata_tagname_column", "TagName" + # ), + # "metadata_uom_column": parameters_dict.get("metadata_uom_column", "UoM"), + # "to_json": parameters_dict.get("to_json", False), + # } + + # sql_template = Template(summary_query) + # return sql_template.render(summary_parameters) def _query_builder(parameters_dict: dict, query_type: str) -> str: diff --git a/src/sdk/python/rtdip_sdk/queries/time_series/summary.py b/src/sdk/python/rtdip_sdk/queries/time_series/summary.py index 341767ff9..95179077d 100644 --- a/src/sdk/python/rtdip_sdk/queries/time_series/summary.py +++ b/src/sdk/python/rtdip_sdk/queries/time_series/summary.py @@ -14,7 +14,15 @@ import logging import pandas as pd -from ._time_series_query_builder import _query_builder + +# from ._time_series_query_builder import _query_builder + +import sys + +sys.path.insert(0, ".") +from src.sdk.python.rtdip_sdk.queries.time_series._time_series_query_builder import ( + _query_builder, +) def get(connection: object, parameters_dict: dict) -> pd.DataFrame: From 52a8488b32fd1ffe38060f49b0164dbfbdde20bd Mon Sep 17 00:00:00 2001 From: cching95 Date: Tue, 22 Jul 2025 11:37:13 +0100 Subject: [PATCH 02/13] refactor summary query and unit test Signed-off-by: cching95 --- .../time_series/_time_series_query_builder.py | 132 +++++++++++++++++- .../time_series/time_series_query_builder.py | 2 +- .../queries/_test_utils/sdk_test_objects.py | 6 +- .../queries/time_series/test_summary.py | 2 +- 4 files changed, 135 insertions(+), 7 deletions(-) diff --git a/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py b/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py index a03aa45f2..9fed59360 100644 --- a/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py +++ b/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py @@ -290,6 +290,87 @@ def _build_interpolate_query( return interpolate_query_sql + ")" +def _build_summary_query( + sql_query_name, + timestamp_column, + tagname_column, + status_column, + value_column, + start_date, + end_date, + source=None, + business_unit=None, + asset=None, + data_security_level=None, + data_type=None, + tag_names=None, + include_status=None, + include_bad_data=None, + case_insensitivity_tag_search=None, +): + + # Select + summary_query_sql = f"{sql_query_name} AS (SELECT `{tagname_column}`, " + summary_query_sql = " ".join( + [ + summary_query_sql, + f"count(`{value_column}`) as Count,", + f"CAST(Avg(`{value_column}`) as decimal(10, 2)) as Avg,", + f"CAST(Min(`{value_column}`) as decimal(10, 2)) as Min,", + f"CAST(Max(`{value_column}`) as decimal(10, 2)) as Max,", + 
f"CAST(stddev(`{value_column}`) as decimal(10, 2)) as StDev,", + f"CAST(sum(`{value_column}`) as decimal(10, 2)) as Sum,", + f"CAST(variance(`{value_column}`) as decimal(10, 2)) as Var FROM", + ] + ) + + # From + if source is not None: + summary_query_sql = " ".join([summary_query_sql, f"`{source.lower()}`"]) + else: + summary_query_sql = " ".join( + [ + summary_query_sql, + f"`{business_unit.lower()}`.`sensors`.`{asset.lower()}_{data_security_level.lower()}_events_{data_type.lower()}`", + ] + ) + + # Where EventTime + summary_query_sql = " ".join( + [ + summary_query_sql, + f"WHERE `{timestamp_column}` BETWEEN to_timestamp('{start_date}') AND to_timestamp('{end_date}') AND", + ] + ) + + # TagName + if case_insensitivity_tag_search == True: + quoted_tag_names = "', '".join([tag.upper() for tag in tag_names]) + summary_query_sql = " ".join( + [ + summary_query_sql, + f"UPPER(`{tagname_column}`) IN ('{quoted_tag_names}')", + ] + ) + else: + quoted_tag_names = "', '".join(tag_names) + summary_query_sql = " ".join( + [summary_query_sql, f"`{tagname_column}` IN ('{quoted_tag_names}')"] + ) + + # Optional bad data filtering + if include_status == True and include_bad_data == False: + summary_query_sql = " ".join( + [summary_query_sql, f"AND `{status_column}` <> 'Bad'"] + ) + + # Group by + summary_query_sql = " ".join([summary_query_sql, f"GROUP BY `{tagname_column}`"]) + summary_query_sql += ")" + + return summary_query_sql + + def _build_pivot_query( sql_query_list, sql_query_name, @@ -1697,9 +1778,56 @@ def _summary_query(parameters_dict: dict) -> str: "to_json": parameters_dict.get("to_json", False), } - summary_query = _build_summary_query() + summary_query = _build_summary_query( + sql_query_name="summary", + timestamp_column=summary_parameters["timestamp_column"], + tagname_column=summary_parameters["tagname_column"], + status_column=summary_parameters["status_column"], + value_column=summary_parameters["value_column"], + start_date=summary_parameters["start_date"], + end_date=summary_parameters["end_date"], + source=summary_parameters["source"], + business_unit=summary_parameters["business_unit"], + asset=summary_parameters["asset"], + data_security_level=summary_parameters["data_security_level"], + data_type=summary_parameters["data_type"], + tag_names=summary_parameters["tag_names"], + include_status=summary_parameters["include_status"], + include_bad_data=summary_parameters["include_bad_data"], + case_insensitivity_tag_search=summary_parameters[ + "case_insensitivity_tag_search" + ], + ) + + sql_query_list.append({"query_name": "summary", "sql_query": summary_query}) + + if summary_parameters["display_uom"] == True: + uom_query = _build_uom_query( + sql_query_list=sql_query_list, + sql_query_name="uom", + metadata_source=summary_parameters["metadata_source"], + business_unit=summary_parameters["business_unit"], + asset=summary_parameters["asset"], + data_security_level=summary_parameters["data_security_level"], + tagname_column=summary_parameters["tagname_column"], + metadata_tagname_column=summary_parameters["metadata_tagname_column"], + metadata_uom_column=summary_parameters["metadata_uom_column"], + ) + sql_query_list.append({"query_name": "uom", "sql_query": uom_query}) + + # Add output query + output_query = _build_output_query( + sql_query_list=sql_query_list, + to_json=summary_parameters["to_json"], + limit=summary_parameters["limit"], + offset=summary_parameters["offset"], + ) + sql_query_list.append({"query_name": "output", "sql_query": output_query}) + # Build final SQL 
using CTE statement builder + sql_query = _build_sql_cte_statement(sql_query_list) + + return sql_query - _build_summary_query = () # summary_query = ( # "WITH summary AS (SELECT `{{ tagname_column }}`, " # "count(`{{ value_column }}`) as Count, " diff --git a/src/sdk/python/rtdip_sdk/queries/time_series/time_series_query_builder.py b/src/sdk/python/rtdip_sdk/queries/time_series/time_series_query_builder.py index 383ab7fca..7f4ce20a0 100644 --- a/src/sdk/python/rtdip_sdk/queries/time_series/time_series_query_builder.py +++ b/src/sdk/python/rtdip_sdk/queries/time_series/time_series_query_builder.py @@ -162,7 +162,7 @@ def m_source( metadata_tagname_column (optional str): The column name in the source that contains the tagnames or series metadata_uom_column (optional str): The column name in the source that contains the unit of measure """ - self.metadata_source = "`.`".join(metadata_source.split(".")) + self.metadata_source = f"`{'`.`'.join(metadata_source.split('.'))}`" self.metadata_tagname_column = metadata_tagname_column self.metadata_uom_column = metadata_uom_column return self diff --git a/tests/sdk/python/rtdip_sdk/queries/_test_utils/sdk_test_objects.py b/tests/sdk/python/rtdip_sdk/queries/_test_utils/sdk_test_objects.py index 27fa00424..3a048991d 100644 --- a/tests/sdk/python/rtdip_sdk/queries/_test_utils/sdk_test_objects.py +++ b/tests/sdk/python/rtdip_sdk/queries/_test_utils/sdk_test_objects.py @@ -93,6 +93,6 @@ LATEST_MOCKED_QUERY_UOM = "WITH latest AS (SELECT * FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_latest` WHERE `TagName` IN ('mocked-TAGNAME') ORDER BY `TagName` ) SELECT l.*, m.`UoM` FROM latest l LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` m ON l.`TagName` = m.`TagName` " # Summary -SUMMARY_MOCKED_QUERY = 'WITH summary AS (SELECT `TagName`, count(`Value`) as Count, CAST(Avg(`Value`) as decimal(10, 2)) as Avg, CAST(Min(`Value`) as decimal(10, 2)) as Min, CAST(Max(`Value`) as decimal(10, 2)) as Max, CAST(stddev(`Value`) as decimal(10, 2)) as StDev, CAST(sum(`Value`) as decimal(10, 2)) as Sum, CAST(variance(`Value`) as decimal(10, 2)) as Var FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') GROUP BY `TagName`) SELECT * FROM summary ' -SUMMARY_MOCKED_QUERY_CHECK_TAGS = 'WITH summary AS (SELECT `TagName`, count(`Value`) as Count, CAST(Avg(`Value`) as decimal(10, 2)) as Avg, CAST(Min(`Value`) as decimal(10, 2)) as Min, CAST(Max(`Value`) as decimal(10, 2)) as Max, CAST(stddev(`Value`) as decimal(10, 2)) as StDev, CAST(sum(`Value`) as decimal(10, 2)) as Sum, CAST(variance(`Value`) as decimal(10, 2)) as Var FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND UPPER(`TagName`) IN (\'MOCKED-TAGNAME\') GROUP BY `TagName`) SELECT * FROM summary ' -SUMMARY_MOCKED_QUERY_UOM = 'WITH summary AS (SELECT `TagName`, count(`Value`) as Count, CAST(Avg(`Value`) as decimal(10, 2)) as Avg, CAST(Min(`Value`) as decimal(10, 2)) as Min, CAST(Max(`Value`) as decimal(10, 2)) as Max, CAST(stddev(`Value`) as decimal(10, 2)) as StDev, CAST(sum(`Value`) as decimal(10, 2)) as Sum, CAST(variance(`Value`) as decimal(10, 2)) as Var FROM 
`mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') GROUP BY `TagName`) SELECT s.*, m.`UoM` FROM summary s LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` m ON s.`TagName` = m.`TagName` ' +SUMMARY_MOCKED_QUERY = "WITH summary AS (SELECT `TagName`, count(`Value`) as Count, CAST(Avg(`Value`) as decimal(10, 2)) as Avg, CAST(Min(`Value`) as decimal(10, 2)) as Min, CAST(Max(`Value`) as decimal(10, 2)) as Max, CAST(stddev(`Value`) as decimal(10, 2)) as StDev, CAST(sum(`Value`) as decimal(10, 2)) as Sum, CAST(variance(`Value`) as decimal(10, 2)) as Var FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND to_timestamp('2011-01-02T23:59:59+00:00') AND `TagName` IN ('mocked-TAGNAME') GROUP BY `TagName`) SELECT * FROM summary" +SUMMARY_MOCKED_QUERY_CHECK_TAGS = "WITH summary AS (SELECT `TagName`, count(`Value`) as Count, CAST(Avg(`Value`) as decimal(10, 2)) as Avg, CAST(Min(`Value`) as decimal(10, 2)) as Min, CAST(Max(`Value`) as decimal(10, 2)) as Max, CAST(stddev(`Value`) as decimal(10, 2)) as StDev, CAST(sum(`Value`) as decimal(10, 2)) as Sum, CAST(variance(`Value`) as decimal(10, 2)) as Var FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND to_timestamp('2011-01-02T23:59:59+00:00') AND UPPER(`TagName`) IN ('MOCKED-TAGNAME') GROUP BY `TagName`) SELECT * FROM summary" +SUMMARY_MOCKED_QUERY_UOM = "WITH summary AS (SELECT `TagName`, count(`Value`) as Count, CAST(Avg(`Value`) as decimal(10, 2)) as Avg, CAST(Min(`Value`) as decimal(10, 2)) as Min, CAST(Max(`Value`) as decimal(10, 2)) as Max, CAST(stddev(`Value`) as decimal(10, 2)) as StDev, CAST(sum(`Value`) as decimal(10, 2)) as Sum, CAST(variance(`Value`) as decimal(10, 2)) as Var FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND to_timestamp('2011-01-02T23:59:59+00:00') AND `TagName` IN ('mocked-TAGNAME') GROUP BY `TagName`), uom AS (SELECT summary.*, metadata.`UoM` FROM summary LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` AS metadata ON summary.`TagName` = metadata.`TagName`) SELECT * FROM uom" diff --git a/tests/sdk/python/rtdip_sdk/queries/time_series/test_summary.py b/tests/sdk/python/rtdip_sdk/queries/time_series/test_summary.py index d706bf328..7d14dec43 100644 --- a/tests/sdk/python/rtdip_sdk/queries/time_series/test_summary.py +++ b/tests/sdk/python/rtdip_sdk/queries/time_series/test_summary.py @@ -77,7 +77,7 @@ def test_summary_offset_limit(mocker: MockerFixture): _test_base_succeed( mocker, MOCKED_SUMMARY_DICT, - SUMMARY_MOCKED_QUERY + MOCKED_QUERY_OFFSET_LIMIT, + SUMMARY_MOCKED_QUERY + " " + MOCKED_QUERY_OFFSET_LIMIT.strip(), summary_get, ) From f75353c6dc743fc53425685f27da9cfa6ad54bf3 Mon Sep 17 00:00:00 2001 From: cching95 Date: Wed, 23 Jul 2025 11:27:33 +0100 Subject: [PATCH 03/13] test fix Signed-off-by: cching95 --- .../rtdip_sdk/pipelines/deploy/test_databricks_deploy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py b/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py index d20dc35f8..46b9f1e8b 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys +# import sys -sys.path.insert(0, ".") +# sys.path.insert(0, ".") from pytest_mock import MockerFixture import pytest From 217c35e4405d005550dec9b22daeef1ea1dcb604 Mon Sep 17 00:00:00 2001 From: cching95 Date: Fri, 25 Jul 2025 12:04:40 +0100 Subject: [PATCH 04/13] undo test fix Signed-off-by: cching95 --- .../rtdip_sdk/pipelines/deploy/test_databricks_deploy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py b/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py index 46b9f1e8b..d20dc35f8 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -# import sys +import sys -# sys.path.insert(0, ".") +sys.path.insert(0, ".") from pytest_mock import MockerFixture import pytest From 26c4ccb67cd38ba11e8238e64be0e9eb95e5abfc Mon Sep 17 00:00:00 2001 From: cching95 Date: Mon, 4 Aug 2025 15:26:09 +0100 Subject: [PATCH 05/13] change CreateJob class to a dictionary and update code to get from dictionary Signed-off-by: cching95 --- .../rtdip_sdk/pipelines/deploy/databricks.py | 38 +++++++++++-------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py index 3fa53a3a2..9d8cc6cac 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py +++ b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py @@ -20,7 +20,7 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.config import Config -from databricks.sdk.service.jobs import CreateJob, JobSettings +from databricks.sdk.service.jobs import CreateResponse, JobSettings from databricks.sdk.service.compute import Library, PythonPyPiLibrary, MavenLibrary from .interfaces import DeployInterface from ..utilities.pipeline_components import PipelineComponentsGetUtility @@ -73,11 +73,11 @@ class DatabricksSDKDeploy(DeployInterface): ) )) - job = CreateJob( - name="test_job_rtdip", - job_clusters=cluster_list, - tasks=task_list - ) + job = { + "name": "test_job_rtdip", + "job_clusters": cluster_list, + "tasks": task_list + } databricks_job = DatabricksSDKDeploy(databricks_job=job, host="https://test.databricks.net", token="test_token") @@ -97,12 +97,12 @@ class DatabricksSDKDeploy(DeployInterface): def __init__( self, - databricks_job: CreateJob, + databricks_job: dict, host: str, token: str, workspace_directory: str = "/rtdip", ) -> None: - if databricks_job.name is None or databricks_job.name == "": + if databricks_job.get("name") is None or databricks_job.get("name") == "": raise ValueError("databricks_job.name cannot be empty") self.databricks_job = databricks_job self.host = host @@ -133,7 +133,7 @@ def deploy(self) -> Union[bool, ValueError]: auth_type="pat", ) ) - for task in self.databricks_job.tasks: + for task in 
self.databricks_job.get("tasks", []): if task.notebook_task is None and task.spark_python_task is None: return ValueError( "A Notebook or Spark Python Task must be populated for each task in the Databricks Job" @@ -230,7 +230,7 @@ def deploy(self) -> Union[bool, ValueError]: task.new_cluster.spark_conf = {} task.new_cluster.spark_conf.update(spark_configuration) elif task.job_cluster_key is not None: - for job_cluster in self.databricks_job.job_clusters: + for job_cluster in self.databricks_job.get("job_clusters", []): if job_cluster.job_cluster_key == task.job_cluster_key: if spark_configuration is not None: if job_cluster.new_cluster.spark_conf is None: @@ -240,7 +240,7 @@ def deploy(self) -> Union[bool, ValueError]: ) break elif task.compute_key is not None: - for compute in self.databricks_job.compute: + for compute in self.databricks_job.get("compute"): if compute.compute_key == task.compute_key: # TODO : Add spark config for compute. Does not seem to be currently available in the Databricks SDK # NOSONAR # compute.spark_conf.update(spark_configuration) @@ -248,9 +248,11 @@ def deploy(self) -> Union[bool, ValueError]: # Create Databricks Job job_found = False - for existing_job in workspace_client.jobs.list(name=self.databricks_job.name): + for existing_job in workspace_client.jobs.list( + name=self.databricks_job.get("name") + ): new_settings = JobSettings() - for key, value in self.databricks_job.__dict__.items(): + for key, value in self.databricks_job.items(): if key in new_settings.__dict__: setattr(new_settings, key, value) workspace_client.jobs.reset( @@ -260,7 +262,7 @@ def deploy(self) -> Union[bool, ValueError]: break if job_found == False: - workspace_client.jobs.create(**self.databricks_job.__dict__) + workspace_client.jobs.create(**self.databricks_job) return True @@ -277,7 +279,9 @@ def launch(self): ) ) job_found = False - for existing_job in workspace_client.jobs.list(name=self.databricks_job.name): + for existing_job in workspace_client.jobs.list( + name=self.databricks_job.get("name") + ): workspace_client.jobs.run_now(job_id=existing_job.job_id) job_found = True break @@ -300,7 +304,9 @@ def stop(self): ) ) job_found = False - for existing_job in workspace_client.jobs.list(name=self.databricks_job.name): + for existing_job in workspace_client.jobs.list( + name=self.databricks_job.get("name") + ): workspace_client.jobs.cancel_all_runs(job_id=existing_job.job_id) job_found = True break From 44577438976f7700e7e2060ccc8e63933e3e78ad Mon Sep 17 00:00:00 2001 From: cching95 Date: Mon, 4 Aug 2025 15:59:40 +0100 Subject: [PATCH 06/13] update unit tests to remove CreateJobs class Signed-off-by: cching95 --- docs/sdk/pipelines/deploy/databricks.md | 12 ++++++------ .../python/rtdip_sdk/pipelines/deploy/databricks.py | 5 ++--- .../pipelines/deploy/test_databricks_deploy.py | 9 ++++++--- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/docs/sdk/pipelines/deploy/databricks.md b/docs/sdk/pipelines/deploy/databricks.md index 1d87cdaa0..f1667b4ea 100644 --- a/docs/sdk/pipelines/deploy/databricks.md +++ b/docs/sdk/pipelines/deploy/databricks.md @@ -57,7 +57,7 @@ Parameters for a Databricks Job can be managed using the following Classes: |ClusterSpec| Provides Parameters for setting up a Databricks Cluster| |JobCluster| Sets up a Jobs Cluster as defined by the provided `DatabricksCluster`| |Task| Defines the setup of the Task at the Databricks Task level including Task specific Clusters, Libraries, Schedules, Notifications and Timeouts | -|CreateJob| Defines the setup 
at the Job level including Clusters, Libraries, Schedules, Notifications, Access Controls, Timeouts and Tags | + |NotebookTask| Provides the Notebook information to the `Task`| |DatabricksSDKDeploy|Leverages the Databricks SDK to deploy the job to Databricks Workflows| @@ -95,11 +95,11 @@ task_list.append(Task( )) # Create a Databricks Job for the Task -job = CreateJob( - name="test_job_rtdip", - job_clusters=cluster_list, - tasks=task_list -) +job = { + "name": "test_job_rtdip", + "job_clusters": cluster_list, + "tasks": task_list +} # Deploy to Databricks databricks_job = DatabricksSDKDeploy(databricks_job=job, host=databricks_host_name, token=access_token) diff --git a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py index 9d8cc6cac..5c6eb920e 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py +++ b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py @@ -20,7 +20,7 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.config import Config -from databricks.sdk.service.jobs import CreateResponse, JobSettings +from databricks.sdk.service.jobs import JobSettings from databricks.sdk.service.compute import Library, PythonPyPiLibrary, MavenLibrary from .interfaces import DeployInterface from ..utilities.pipeline_components import PipelineComponentsGetUtility @@ -41,7 +41,6 @@ class DatabricksSDKDeploy(DeployInterface): All options available in the [Databricks Jobs REST API v2.1](https://docs.databricks.com/dev-tools/api/latest/jobs.html) can be configured in the Databricks classes that have been defined in `rtdip_sdk.pipelines.deploy.models.databricks`, enabling full control of the configuration of the Databricks Workflow : - - `CreateJob` - `Task` RTDIP Pipeline Components provide Databricks with all the required Python packages and JARs to execute each component and these will be setup on the Workflow automatically during the Databricks Workflow creation. 
@@ -50,7 +49,7 @@ class DatabricksSDKDeploy(DeployInterface): This example assumes that a PipelineJob has already been defined by a variable called `pipeline_job` ```python - from rtdip_sdk.pipelines.deploy import DatabricksSDKDeploy, CreateJob, JobCluster, ClusterSpec, Task, NotebookTask, ComputeSpecKind, AutoScale, RuntimeEngine, DataSecurityMode + from rtdip_sdk.pipelines.deploy import DatabricksSDKDeploy, JobCluster, ClusterSpec, Task, NotebookTask, ComputeSpecKind, AutoScale, RuntimeEngine, DataSecurityMode cluster_list = [] cluster_list.append(JobCluster( diff --git a/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py b/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py index d20dc35f8..a1b3b9b95 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py @@ -20,7 +20,6 @@ from src.sdk.python.rtdip_sdk.pipelines.deploy import ( DatabricksSDKDeploy, - CreateJob, JobCluster, ClusterSpec, Task, @@ -124,7 +123,9 @@ def test_pipeline_job_deploy(mocker: MockerFixture): ) ) - job = CreateJob(name="test_job_rtdip", job_clusters=cluster_list, tasks=task_list) + job = {"name": "test_job_rtdip", "job_clusters": cluster_list, "tasks": task_list} + + # job = CreateJob(name="test_job_rtdip", job_clusters=cluster_list, tasks=task_list) databricks_job = DatabricksSDKDeploy( databricks_job=job, host="https://test.databricks.net", token="test_token" @@ -207,7 +208,9 @@ def test_pipeline_job_deploy_fails(mocker: MockerFixture): ) ) - job = CreateJob(name="test_job_rtdip", job_clusters=cluster_list, tasks=task_list) + job = {"name": "test_job_rtdip", "job_clusters": cluster_list, "tasks": task_list} + + # job = CreateJob(name="test_job_rtdip", job_clusters=cluster_list, tasks=task_list) databricks_job = DatabricksSDKDeploy( databricks_job=job, host="https://test.databricks.net", token="test_token" From 7526279896286d7b439ffb202e93b0c659109f0f Mon Sep 17 00:00:00 2001 From: cching95 Date: Mon, 4 Aug 2025 17:22:32 +0100 Subject: [PATCH 07/13] revert changes and add CreateJobs class Signed-off-by: cching95 --- docs/sdk/pipelines/deploy/databricks.md | 12 +- .../rtdip_sdk/pipelines/deploy/databricks.py | 297 ++++++++++++++++-- .../deploy/test_databricks_deploy.py | 9 +- 3 files changed, 281 insertions(+), 37 deletions(-) diff --git a/docs/sdk/pipelines/deploy/databricks.md b/docs/sdk/pipelines/deploy/databricks.md index f1667b4ea..1d87cdaa0 100644 --- a/docs/sdk/pipelines/deploy/databricks.md +++ b/docs/sdk/pipelines/deploy/databricks.md @@ -57,7 +57,7 @@ Parameters for a Databricks Job can be managed using the following Classes: |ClusterSpec| Provides Parameters for setting up a Databricks Cluster| |JobCluster| Sets up a Jobs Cluster as defined by the provided `DatabricksCluster`| |Task| Defines the setup of the Task at the Databricks Task level including Task specific Clusters, Libraries, Schedules, Notifications and Timeouts | - +|CreateJob| Defines the setup at the Job level including Clusters, Libraries, Schedules, Notifications, Access Controls, Timeouts and Tags | |NotebookTask| Provides the Notebook information to the `Task`| |DatabricksSDKDeploy|Leverages the Databricks SDK to deploy the job to Databricks Workflows| @@ -95,11 +95,11 @@ task_list.append(Task( )) # Create a Databricks Job for the Task -job = { - "name": "test_job_rtdip", - "job_clusters": cluster_list, - "tasks": task_list -} +job = CreateJob( + name="test_job_rtdip", + job_clusters=cluster_list, + 
tasks=task_list +) # Deploy to Databricks databricks_job = DatabricksSDKDeploy(databricks_job=job, host=databricks_host_name, token=access_token) diff --git a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py index 5c6eb920e..7a9886389 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py +++ b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py @@ -11,16 +11,39 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from dataclasses import dataclass import sys -from typing import Union +from typing import List, Optional, Union from importlib_metadata import PackageNotFoundError, version from importlib.util import module_from_spec, spec_from_file_location from pathlib import Path from io import BytesIO - +from enum import Enum +from typing import Any, Callable, Dict, Iterator, List, Optional from databricks.sdk import WorkspaceClient from databricks.sdk.config import Config -from databricks.sdk.service.jobs import JobSettings +from databricks.sdk.service.jobs import ( + JobSettings, + Continuous, + JobAccessControlRequest, + JobDeployment, + JobEditMode, + JobEmailNotifications, + JobEnvironment, + Format, + GitSource, + JobsHealthRules, + JobCluster, + JobNotificationSettings, + JobParameterDefinition, + PerformanceTarget, + QueueSettings, + JobRunAs, + CronSchedule, + Task, + WebhookNotifications, + TriggerSettings, +) from databricks.sdk.service.compute import Library, PythonPyPiLibrary, MavenLibrary from .interfaces import DeployInterface from ..utilities.pipeline_components import PipelineComponentsGetUtility @@ -30,6 +53,237 @@ __description__: str +@dataclass +class CreateJob: + access_control_list: Optional[List[JobAccessControlRequest]] = None + """List of permissions to set on the job.""" + + budget_policy_id: Optional[str] = None + """The id of the user specified budget policy to use for this job. If not specified, a default + budget policy may be applied when creating or modifying the job. See + `effective_budget_policy_id` for the budget policy used by this workload.""" + + continuous: Optional[Continuous] = None + """An optional continuous property for this job. The continuous property will ensure that there is + always one run executing. Only one of `schedule` and `continuous` can be used.""" + + deployment: Optional[JobDeployment] = None + """Deployment information for jobs managed by external sources.""" + + description: Optional[str] = None + """An optional description for the job. The maximum length is 27700 characters in UTF-8 encoding.""" + + edit_mode: Optional[JobEditMode] = None + """Edit mode of the job. + + * `UI_LOCKED`: The job is in a locked UI state and cannot be modified. * `EDITABLE`: The job is + in an editable state and can be modified.""" + + email_notifications: Optional[JobEmailNotifications] = None + """An optional set of email addresses that is notified when runs of this job begin or complete as + well as when this job is deleted.""" + + environments: Optional[List[JobEnvironment]] = None + """A list of task execution environment specifications that can be referenced by serverless tasks + of this job. An environment is required to be present for serverless tasks. For serverless + notebook tasks, the environment is accessible in the notebook environment panel. 
For other + serverless tasks, the task environment is required to be specified using environment_key in the + task settings.""" + + format: Optional[Format] = None + """Used to tell what is the format of the job. This field is ignored in Create/Update/Reset calls. + When using the Jobs API 2.1 this value is always set to `"MULTI_TASK"`.""" + + git_source: Optional[GitSource] = None + """An optional specification for a remote Git repository containing the source code used by tasks. + Version-controlled source code is supported by notebook, dbt, Python script, and SQL File tasks. + + If `git_source` is set, these tasks retrieve the file from the remote repository by default. + However, this behavior can be overridden by setting `source` to `WORKSPACE` on the task. + + Note: dbt and SQL File tasks support only version-controlled sources. If dbt or SQL File tasks + are used, `git_source` must be defined on the job.""" + + health: Optional[JobsHealthRules] = None + + job_clusters: Optional[List[JobCluster]] = None + """A list of job cluster specifications that can be shared and reused by tasks of this job. + Libraries cannot be declared in a shared job cluster. You must declare dependent libraries in + task settings.""" + + max_concurrent_runs: Optional[int] = None + """An optional maximum allowed number of concurrent runs of the job. Set this value if you want to + be able to execute multiple runs of the same job concurrently. This is useful for example if you + trigger your job on a frequent schedule and want to allow consecutive runs to overlap with each + other, or if you want to trigger multiple runs which differ by their input parameters. This + setting affects only new runs. For example, suppose the job’s concurrency is 4 and there are 4 + concurrent active runs. Then setting the concurrency to 3 won’t kill any of the active runs. + However, from then on, new runs are skipped unless there are fewer than 3 active runs. This + value cannot exceed 1000. Setting this value to `0` causes all new runs to be skipped.""" + + name: Optional[str] = None + """An optional name for the job. The maximum length is 4096 bytes in UTF-8 encoding.""" + + notification_settings: Optional[JobNotificationSettings] = None + """Optional notification settings that are used when sending notifications to each of the + `email_notifications` and `webhook_notifications` for this job.""" + + parameters: Optional[List[JobParameterDefinition]] = None + """Job-level parameter definitions""" + + performance_target: Optional[PerformanceTarget] = None + """The performance mode on a serverless job. This field determines the level of compute performance + or cost-efficiency for the run. + + * `STANDARD`: Enables cost-efficient execution of serverless workloads. * + `PERFORMANCE_OPTIMIZED`: Prioritizes fast startup and execution times through rapid scaling and + optimized cluster performance.""" + + queue: Optional[QueueSettings] = None + """The queue settings of the job.""" + + run_as: Optional[JobRunAs] = None + + schedule: Optional[CronSchedule] = None + """An optional periodic schedule for this job. The default behavior is that the job only runs when + triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`.""" + + tags: Optional[Dict[str, str]] = None + """A map of tags associated with the job. These are forwarded to the cluster as cluster tags for + jobs clusters, and are subject to the same limitations as cluster tags. 
A maximum of 25 tags can + be added to the job.""" + + tasks: Optional[List[Task]] = None + """A list of task specifications to be executed by this job. It supports up to 1000 elements in + write endpoints (:method:jobs/create, :method:jobs/reset, :method:jobs/update, + :method:jobs/submit). Read endpoints return only 100 tasks. If more than 100 tasks are + available, you can paginate through them using :method:jobs/get. Use the `next_page_token` field + at the object root to determine if more results are available.""" + + timeout_seconds: Optional[int] = None + """An optional timeout applied to each run of this job. A value of `0` means no timeout.""" + + trigger: Optional[TriggerSettings] = None + """A configuration to trigger a run when certain conditions are met. The default behavior is that + the job runs only when triggered by clicking “Run Now” in the Jobs UI or sending an API + request to `runNow`.""" + + webhook_notifications: Optional[WebhookNotifications] = None + """A collection of system notification IDs to notify when runs of this job begin or complete.""" + + def as_dict(self) -> dict: + """Serializes the CreateJob into a dictionary suitable for use as a JSON request body.""" + body = {} + if self.access_control_list: + body["access_control_list"] = [ + v.as_dict() for v in self.access_control_list + ] + if self.budget_policy_id is not None: + body["budget_policy_id"] = self.budget_policy_id + if self.continuous: + body["continuous"] = self.continuous.as_dict() + if self.deployment: + body["deployment"] = self.deployment.as_dict() + if self.description is not None: + body["description"] = self.description + if self.edit_mode is not None: + body["edit_mode"] = self.edit_mode.value + if self.email_notifications: + body["email_notifications"] = self.email_notifications.as_dict() + if self.environments: + body["environments"] = [v.as_dict() for v in self.environments] + if self.format is not None: + body["format"] = self.format.value + if self.git_source: + body["git_source"] = self.git_source.as_dict() + if self.health: + body["health"] = self.health.as_dict() + if self.job_clusters: + body["job_clusters"] = [v.as_dict() for v in self.job_clusters] + if self.max_concurrent_runs is not None: + body["max_concurrent_runs"] = self.max_concurrent_runs + if self.name is not None: + body["name"] = self.name + if self.notification_settings: + body["notification_settings"] = self.notification_settings.as_dict() + if self.parameters: + body["parameters"] = [v.as_dict() for v in self.parameters] + if self.performance_target is not None: + body["performance_target"] = self.performance_target.value + if self.queue: + body["queue"] = self.queue.as_dict() + if self.run_as: + body["run_as"] = self.run_as.as_dict() + if self.schedule: + body["schedule"] = self.schedule.as_dict() + if self.tags: + body["tags"] = self.tags + if self.tasks: + body["tasks"] = [v.as_dict() for v in self.tasks] + if self.timeout_seconds is not None: + body["timeout_seconds"] = self.timeout_seconds + if self.trigger: + body["trigger"] = self.trigger.as_dict() + if self.webhook_notifications: + body["webhook_notifications"] = self.webhook_notifications.as_dict() + return body + + def as_shallow_dict(self) -> dict: + """Serializes the CreateJob into a shallow dictionary of its immediate attributes.""" + body = {} + if self.access_control_list: + body["access_control_list"] = self.access_control_list + if self.budget_policy_id is not None: + body["budget_policy_id"] = self.budget_policy_id + if self.continuous: + 
body["continuous"] = self.continuous + if self.deployment: + body["deployment"] = self.deployment + if self.description is not None: + body["description"] = self.description + if self.edit_mode is not None: + body["edit_mode"] = self.edit_mode + if self.email_notifications: + body["email_notifications"] = self.email_notifications + if self.environments: + body["environments"] = self.environments + if self.format is not None: + body["format"] = self.format + if self.git_source: + body["git_source"] = self.git_source + if self.health: + body["health"] = self.health + if self.job_clusters: + body["job_clusters"] = self.job_clusters + if self.max_concurrent_runs is not None: + body["max_concurrent_runs"] = self.max_concurrent_runs + if self.name is not None: + body["name"] = self.name + if self.notification_settings: + body["notification_settings"] = self.notification_settings + if self.parameters: + body["parameters"] = self.parameters + if self.performance_target is not None: + body["performance_target"] = self.performance_target + if self.queue: + body["queue"] = self.queue + if self.run_as: + body["run_as"] = self.run_as + if self.schedule: + body["schedule"] = self.schedule + if self.tags: + body["tags"] = self.tags + if self.tasks: + body["tasks"] = self.tasks + if self.timeout_seconds is not None: + body["timeout_seconds"] = self.timeout_seconds + if self.trigger: + body["trigger"] = self.trigger + if self.webhook_notifications: + body["webhook_notifications"] = self.webhook_notifications + return body + + class DatabricksSDKDeploy(DeployInterface): """ Deploys an RTDIP Pipeline to Databricks Workflows leveraging the Databricks [SDK.](https://docs.databricks.com/dev-tools/sdk-python.html) @@ -71,12 +325,11 @@ class DatabricksSDKDeploy(DeployInterface): notebook_path="/path/to/pipeline/rtdip_pipeline.py" ) )) - - job = { - "name": "test_job_rtdip", - "job_clusters": cluster_list, - "tasks": task_list - } + job = CreateJob( + name="test_job_rtdip", + job_clusters=cluster_list, + tasks=task_list + ) databricks_job = DatabricksSDKDeploy(databricks_job=job, host="https://test.databricks.net", token="test_token") @@ -96,12 +349,12 @@ class DatabricksSDKDeploy(DeployInterface): def __init__( self, - databricks_job: dict, + databricks_job: CreateJob, host: str, token: str, workspace_directory: str = "/rtdip", ) -> None: - if databricks_job.get("name") is None or databricks_job.get("name") == "": + if databricks_job.name is None or databricks_job.name == "": raise ValueError("databricks_job.name cannot be empty") self.databricks_job = databricks_job self.host = host @@ -132,7 +385,7 @@ def deploy(self) -> Union[bool, ValueError]: auth_type="pat", ) ) - for task in self.databricks_job.get("tasks", []): + for task in self.databricks_job.tasks: if task.notebook_task is None and task.spark_python_task is None: return ValueError( "A Notebook or Spark Python Task must be populated for each task in the Databricks Job" @@ -229,7 +482,7 @@ def deploy(self) -> Union[bool, ValueError]: task.new_cluster.spark_conf = {} task.new_cluster.spark_conf.update(spark_configuration) elif task.job_cluster_key is not None: - for job_cluster in self.databricks_job.get("job_clusters", []): + for job_cluster in self.databricks_job.job_clusters: if job_cluster.job_cluster_key == task.job_cluster_key: if spark_configuration is not None: if job_cluster.new_cluster.spark_conf is None: @@ -239,7 +492,7 @@ def deploy(self) -> Union[bool, ValueError]: ) break elif task.compute_key is not None: - for compute in 
self.databricks_job.get("compute"): + for compute in self.databricks_job.compute: if compute.compute_key == task.compute_key: # TODO : Add spark config for compute. Does not seem to be currently available in the Databricks SDK # NOSONAR # compute.spark_conf.update(spark_configuration) @@ -247,11 +500,9 @@ def deploy(self) -> Union[bool, ValueError]: # Create Databricks Job job_found = False - for existing_job in workspace_client.jobs.list( - name=self.databricks_job.get("name") - ): + for existing_job in workspace_client.jobs.list(name=self.databricks_job.name): new_settings = JobSettings() - for key, value in self.databricks_job.items(): + for key, value in self.databricks_job.__dict__.items(): if key in new_settings.__dict__: setattr(new_settings, key, value) workspace_client.jobs.reset( @@ -261,7 +512,7 @@ def deploy(self) -> Union[bool, ValueError]: break if job_found == False: - workspace_client.jobs.create(**self.databricks_job) + workspace_client.jobs.create(**self.databricks_job.__dict__) return True @@ -278,9 +529,7 @@ def launch(self): ) ) job_found = False - for existing_job in workspace_client.jobs.list( - name=self.databricks_job.get("name") - ): + for existing_job in workspace_client.jobs.list(name=self.databricks_job.name): workspace_client.jobs.run_now(job_id=existing_job.job_id) job_found = True break @@ -303,9 +552,7 @@ def stop(self): ) ) job_found = False - for existing_job in workspace_client.jobs.list( - name=self.databricks_job.get("name") - ): + for existing_job in workspace_client.jobs.list(name=self.databricks_job.name): workspace_client.jobs.cancel_all_runs(job_id=existing_job.job_id) job_found = True break diff --git a/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py b/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py index a1b3b9b95..2802b0430 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py @@ -19,6 +19,7 @@ import pytest from src.sdk.python.rtdip_sdk.pipelines.deploy import ( + CreateJob, DatabricksSDKDeploy, JobCluster, ClusterSpec, @@ -123,9 +124,7 @@ def test_pipeline_job_deploy(mocker: MockerFixture): ) ) - job = {"name": "test_job_rtdip", "job_clusters": cluster_list, "tasks": task_list} - - # job = CreateJob(name="test_job_rtdip", job_clusters=cluster_list, tasks=task_list) + job = CreateJob(name="test_job_rtdip", job_clusters=cluster_list, tasks=task_list) databricks_job = DatabricksSDKDeploy( databricks_job=job, host="https://test.databricks.net", token="test_token" @@ -208,9 +207,7 @@ def test_pipeline_job_deploy_fails(mocker: MockerFixture): ) ) - job = {"name": "test_job_rtdip", "job_clusters": cluster_list, "tasks": task_list} - - # job = CreateJob(name="test_job_rtdip", job_clusters=cluster_list, tasks=task_list) + job = CreateJob(name="test_job_rtdip", job_clusters=cluster_list, tasks=task_list) databricks_job = DatabricksSDKDeploy( databricks_job=job, host="https://test.databricks.net", token="test_token" From 1c688c4914ec18cab511fc06999fb23b2d65dc01 Mon Sep 17 00:00:00 2001 From: cching95 Date: Mon, 4 Aug 2025 17:25:05 +0100 Subject: [PATCH 08/13] update changes to example Signed-off-by: cching95 --- src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py index 7a9886389..ad47a050f 100644 --- 
a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
@@ -295,6 +295,7 @@ class DatabricksSDKDeploy(DeployInterface):
     All options available in the [Databricks Jobs REST API v2.1](https://docs.databricks.com/dev-tools/api/latest/jobs.html) can be configured in the Databricks classes that have been defined in `rtdip_sdk.pipelines.deploy.models.databricks`, enabling full control of the configuration of the Databricks Workflow :
 
+    - `CreateJob`
     - `Task`
 
     RTDIP Pipeline Components provide Databricks with all the required Python packages and JARs to execute each component and these will be setup on the Workflow automatically during the Databricks Workflow creation.
@@ -303,7 +304,7 @@ class DatabricksSDKDeploy(DeployInterface):
     This example assumes that a PipelineJob has already been defined by a variable called `pipeline_job`
 
     ```python
-    from rtdip_sdk.pipelines.deploy import DatabricksSDKDeploy, JobCluster, ClusterSpec, Task, NotebookTask, ComputeSpecKind, AutoScale, RuntimeEngine, DataSecurityMode
+    from rtdip_sdk.pipelines.deploy import DatabricksSDKDeploy, CreateJob, JobCluster, ClusterSpec, Task, NotebookTask, ComputeSpecKind, AutoScale, RuntimeEngine, DataSecurityMode
 
     cluster_list = []
     cluster_list.append(JobCluster(

From 34749a1aa104e21762f999eafb7406c5215cfc39 Mon Sep 17 00:00:00 2001
From: cching95
Date: Mon, 4 Aug 2025 17:27:05 +0100
Subject: [PATCH 09/13] update databricks sdk version imports

Signed-off-by: cching95
---
 environment.yml | 2 +-
 setup.py        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/environment.yml b/environment.yml
index ab6486628..8897ab8a7 100644
--- a/environment.yml
+++ b/environment.yml
@@ -71,7 +71,7 @@ dependencies:
   - statsmodels>=0.14.1,<0.15.0
   - pmdarima>=2.0.4
   - pip:
-      - databricks-sdk>=0.20.0,<1.0.0
+      - databricks-sdk>=0.59.0,<1.0.0
       - dependency-injector>=4.41.0,<5.0.0
       - azure-functions>=1.15.0,<2.0.0
       - azure-mgmt-eventgrid>=10.2.0
diff --git a/setup.py b/setup.py
index 37835c7ad..bff07fd8b 100644
--- a/setup.py
+++ b/setup.py
@@ -57,7 +57,7 @@
 
 PIPELINE_PACKAGES = [
     "dependency-injector>=4.41.0,<5.0.0",
-    "databricks-sdk>=0.20.0,<1.0.0",
+    "databricks-sdk>=0.59.0,<1.0.0",
     "azure-storage-file-datalake>=12.12.0,<13.0.0",
     "azure-mgmt-storage>=21.0.0",
     "azure-mgmt-eventgrid>=10.2.0",

From 5f8e570a599df96c14bded8fbec19785b3a4ef35 Mon Sep 17 00:00:00 2001
From: cching95
Date: Tue, 5 Aug 2025 10:24:12 +0100
Subject: [PATCH 10/13] add sonar exceptions

Signed-off-by: cching95
---
 .../python/rtdip_sdk/pipelines/deploy/databricks.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
index ad47a050f..9f892da75 100644
--- a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
@@ -177,7 +177,7 @@ def as_dict(self) -> dict:
         if self.access_control_list:
             body["access_control_list"] = [
                 v.as_dict() for v in self.access_control_list
-            ]
+            ]  # NOSONAR
         if self.budget_policy_id is not None:
             body["budget_policy_id"] = self.budget_policy_id
         if self.continuous:
@@ -226,7 +226,7 @@ def as_dict(self) -> dict:
             body["trigger"] = self.trigger.as_dict()
         if self.webhook_notifications:
             body["webhook_notifications"] = self.webhook_notifications.as_dict()
-        return body
+        return body  # NOSONAR
 
     def as_shallow_dict(self) -> dict:
         """Serializes the CreateJob into a shallow dictionary of its immediate attributes."""
@@ -281,7 +281,7 @@ def as_shallow_dict(self) -> dict:
             body["trigger"] = self.trigger
         if self.webhook_notifications:
             body["webhook_notifications"] = self.webhook_notifications
-        return body
+        return body  # NOSONAR
 
 
 class DatabricksSDKDeploy(DeployInterface):
@@ -364,14 +364,14 @@ def __init__(
 
     def _convert_file_to_binary(self, path) -> BytesIO:
         with open(path, "rb") as f:
-            return BytesIO(f.read())
+            return BytesIO(f.read())  # NOSONAR
 
     def _load_module(self, module_name, path):
         spec = spec_from_file_location(module_name, path)
         module = module_from_spec(spec)
         spec.loader.exec_module(module)
         sys.modules[module.__name__] = module
-        return module
+        return module  # NOSONAR
 
     def deploy(self) -> Union[bool, ValueError]:
         """

From 3dcb3d0d652c3ff82dccfcc478abbd4373ea9b02 Mon Sep 17 00:00:00 2001
From: cching95
Date: Tue, 5 Aug 2025 13:13:21 +0100
Subject: [PATCH 11/13] add protobuf version and sonar exceptions

Signed-off-by: cching95
---
 environment.yml                                         | 1 +
 src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/environment.yml b/environment.yml
index 3d4c26034..5536214b5 100644
--- a/environment.yml
+++ b/environment.yml
@@ -70,6 +70,7 @@ dependencies:
   - great-expectations>=0.18.8,<1.0.0
   - statsmodels>=0.14.1,<0.15.0
   - pmdarima>=2.0.4
+  - protobuf>=4.25.0,<5.0.0
   - pip:
       - databricks-sdk>=0.59.0,<1.0.0
       - dependency-injector>=4.41.0,<5.0.0
diff --git a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
index 9f892da75..b8526c9d8 100644
--- a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
@@ -171,7 +171,7 @@ class CreateJob:
     webhook_notifications: Optional[WebhookNotifications] = None
     """A collection of system notification IDs to notify when runs of this job begin or complete."""
 
-    def as_dict(self) -> dict:
+    def as_dict(self) -> dict:  # NOSONAR
        """Serializes the CreateJob into a dictionary suitable for use as a JSON request body."""
         body = {}
         if self.access_control_list:
@@ -362,11 +362,11 @@ def __init__(
         self.token = token
         self.workspace_directory = workspace_directory
 
-    def _convert_file_to_binary(self, path) -> BytesIO:
+    def _convert_file_to_binary(self, path) -> BytesIO:  # NOSONAR
         with open(path, "rb") as f:
             return BytesIO(f.read())  # NOSONAR
 
-    def _load_module(self, module_name, path):
+    def _load_module(self, module_name, path):  # NOSONAR
         spec = spec_from_file_location(module_name, path)
         module = module_from_spec(spec)
         spec.loader.exec_module(module)
@@ -386,7 +386,7 @@ def deploy(self) -> Union[bool, ValueError]:
                 auth_type="pat",
             )
         )
-        for task in self.databricks_job.tasks:
+        for task in self.databricks_job.tasks:  # NOSONAR
             if task.notebook_task is None and task.spark_python_task is None:
                 return ValueError(
                     "A Notebook or Spark Python Task must be populated for each task in the Databricks Job"

From ccdd0d540542dee0aed4b15b0d652ba7ee2a9dcb Mon Sep 17 00:00:00 2001
From: cching95
Date: Tue, 5 Aug 2025 14:37:38 +0100
Subject: [PATCH 12/13] update sonar exclusions

Signed-off-by: cching95
---
 .../rtdip_sdk/pipelines/deploy/databricks.py | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
index b8526c9d8..fb3f2617f 100644
--- a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py
@@ -171,13 +171,13 @@ class CreateJob:
     webhook_notifications: Optional[WebhookNotifications] = None
     """A collection of system notification IDs to notify when runs of this job begin or complete."""
 
-    def as_dict(self) -> dict:  # NOSONAR
+    def as_dict(self) -> dict:  # pragma: no cover
         """Serializes the CreateJob into a dictionary suitable for use as a JSON request body."""
         body = {}
         if self.access_control_list:
             body["access_control_list"] = [
                 v.as_dict() for v in self.access_control_list
-            ]  # NOSONAR
+            ]
         if self.budget_policy_id is not None:
             body["budget_policy_id"] = self.budget_policy_id
         if self.continuous:
@@ -226,9 +226,9 @@ def as_dict(self) -> dict:  # NOSONAR
             body["trigger"] = self.trigger.as_dict()
         if self.webhook_notifications:
             body["webhook_notifications"] = self.webhook_notifications.as_dict()
-        return body  # NOSONAR
+        return body
 
-    def as_shallow_dict(self) -> dict:
+    def as_shallow_dict(self) -> dict:  # pragma: no cover
         """Serializes the CreateJob into a shallow dictionary of its immediate attributes."""
         body = {}
         if self.access_control_list:
@@ -281,7 +281,7 @@ def as_shallow_dict(self) -> dict:
             body["trigger"] = self.trigger
         if self.webhook_notifications:
             body["webhook_notifications"] = self.webhook_notifications
-        return body  # NOSONAR
+        return body
 
 
 class DatabricksSDKDeploy(DeployInterface):
@@ -362,16 +362,16 @@ def __init__(
         self.token = token
         self.workspace_directory = workspace_directory
 
-    def _convert_file_to_binary(self, path) -> BytesIO:  # NOSONAR
+    def _convert_file_to_binary(self, path) -> BytesIO:  # pragma: no cover
         with open(path, "rb") as f:
-            return BytesIO(f.read())  # NOSONAR
+            return BytesIO(f.read())
 
-    def _load_module(self, module_name, path):  # NOSONAR
+    def _load_module(self, module_name, path):  # pragma: no cover
         spec = spec_from_file_location(module_name, path)
         module = module_from_spec(spec)
         spec.loader.exec_module(module)
         sys.modules[module.__name__] = module
-        return module  # NOSONAR
+        return module
 
     def deploy(self) -> Union[bool, ValueError]:
         """
@@ -386,7 +386,7 @@ def deploy(self) -> Union[bool, ValueError]:
                 auth_type="pat",
             )
         )
-        for task in self.databricks_job.tasks:  # NOSONAR
+        for task in self.databricks_job.tasks:  # pragma: no cover
             if task.notebook_task is None and task.spark_python_task is None:
                 return ValueError(
                     "A Notebook or Spark Python Task must be populated for each task in the Databricks Job"

From 5c2a12be259bbe1385481afd41148d7e3ff64295 Mon Sep 17 00:00:00 2001
From: cching95
Date: Tue, 5 Aug 2025 16:03:23 +0100
Subject: [PATCH 13/13] remove commented out code in summary query

Signed-off-by: cching95
---
 .../time_series/_time_series_query_builder.py | 86 -------------------
 1 file changed, 86 deletions(-)

diff --git a/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py b/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py
index 5d9f79ccd..0cd24988c 100644
--- a/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py
+++ b/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py
@@ -1644,92 +1644,6 @@ def _summary_query(parameters_dict: dict) -> str:
 
     return sql_query
 
-    # summary_query = (
-    # "WITH summary AS (SELECT `{{ tagname_column }}`, "
-    # "count(`{{ value_column }}`) as Count, "
-    # "CAST(Avg(`{{ value_column }}`) as decimal(10, 2)) as Avg, "
-    # "CAST(Min(`{{ value_column }}`) as decimal(10, 2)) as Min, "
-    # "CAST(Max(`{{ value_column }}`) as decimal(10, 2)) as Max, "
-    # "CAST(stddev(`{{ value_column }}`) as decimal(10, 2)) as StDev, "
-    # "CAST(sum(`{{ value_column }}`) as decimal(10, 2)) as Sum, "
-    # "CAST(variance(`{{ value_column }}`) as decimal(10, 2)) as Var FROM "
-    # "{% if source is defined and source is not none %}"
-    # "`{{ source|lower }}` "
-    # "{% else %}"
-    # "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_events_{{ data_type|lower }}` "
-    # "{% endif %}"
-    # "{% if case_insensitivity_tag_search is defined and case_insensitivity_tag_search == true %}"
-    # "WHERE `{{ timestamp_column }}` BETWEEN to_timestamp(\"{{ start_date }}\") AND to_timestamp(\"{{ end_date }}\") AND UPPER(`{{ tagname_column }}`) IN ('{{ tag_names | join('\\', \\'') | upper }}') "
-    # "{% else %}"
-    # "WHERE `{{ timestamp_column }}` BETWEEN to_timestamp(\"{{ start_date }}\") AND to_timestamp(\"{{ end_date }}\") AND `{{ tagname_column }}` IN ('{{ tag_names | join('\\', \\'') }}') "
-    # "{% endif %}"
-    # "{% if include_status is defined and include_status == true and include_bad_data is defined and include_bad_data == false %}"
-    # "AND `{{ status_column }}` <> 'Bad'"
-    # "{% endif %}"
-    # "GROUP BY `{{ tagname_column }}`) "
-    # "{% if display_uom is defined and display_uom == true %}"
-    # 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(s.*, m.`UoM`), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}s.*, m.`UoM`{% endif %} FROM summary s '
-    # "LEFT OUTER JOIN "
-    # "{% if metadata_source is defined and metadata_source is not none %}"
-    # "`{{ metadata_source|lower }}` m ON s.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` "
-    # "{% else %}"
-    # "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_metadata` m ON s.`{{ tagname_column }}` = m.`{{ tagname_column }}` "
-    # "{% endif %}"
-    # "{% else%}"
-    # 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(*), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}*{% endif %} FROM summary '
-    # "{% endif %}"
-    # "{% if limit is defined and limit is not none %}"
-    # "LIMIT {{ limit }} "
-    # "{% endif %}"
-    # "{% if offset is defined and offset is not none %}"
-    # "OFFSET {{ offset }} "
-    # "{% endif %}"
-    # )
-
-    # summary_parameters = {
-    # "source": parameters_dict.get("source", None),
-    # "metadata_source": parameters_dict.get("metadata_source", None),
-    # "business_unit": parameters_dict.get("business_unit"),
-    # "region": parameters_dict.get("region"),
-    # "asset": parameters_dict.get("asset"),
-    # "data_security_level": parameters_dict.get("data_security_level"),
-    # "data_type": parameters_dict.get("data_type"),
-    # "start_date": parameters_dict["start_date"],
-    # "end_date": parameters_dict["end_date"],
-    # "tag_names": list(dict.fromkeys(parameters_dict["tag_names"])),
-    # "include_bad_data": parameters_dict["include_bad_data"],
-    # "display_uom": parameters_dict.get("display_uom", False),
-    # "limit": parameters_dict.get("limit", None),
-    # "offset": parameters_dict.get("offset", None),
-    # "time_zone": parameters_dict["time_zone"],
-    # "tagname_column": parameters_dict.get("tagname_column", "TagName"),
-    # "timestamp_column": parameters_dict.get("timestamp_column", "EventTime"),
-    # "include_status": (
-    # False
-    # if "status_column" in parameters_dict
-    # and parameters_dict.get("status_column") is None
-    # else True
-    # ),
-    # "status_column": (
-    # "Status"
-    # if "status_column" in parameters_dict
-    # and parameters_dict.get("status_column") is None
-    # else parameters_dict.get("status_column", "Status")
-    # ),
-    # "value_column": parameters_dict.get("value_column", "Value"),
-    # "case_insensitivity_tag_search": parameters_dict.get(
-    # "case_insensitivity_tag_search", False
-    # ),
-    # "metadata_tagname_column": parameters_dict.get(
-    # "metadata_tagname_column", "TagName"
-    # ),
-    # "metadata_uom_column": parameters_dict.get("metadata_uom_column", "UoM"),
-    # "to_json": parameters_dict.get("to_json", False),
-    # }
-
-    # sql_template = Template(summary_query)
-    # return sql_template.render(summary_parameters)
-
 
 def _query_builder(parameters_dict: dict, query_type: str) -> str:
     if "supress_warning" not in parameters_dict: