From d3778dcab5eb56479e5f066e42e19800eb2a3dfc Mon Sep 17 00:00:00 2001 From: Shivam Saxena Date: Mon, 11 Dec 2023 00:56:15 +0530 Subject: [PATCH 1/5] Adding ERCOT to MDM Transfomer class Signed-off-by: Shivam Saxena --- .../transformers/spark/iso/__init__.py | 1 + .../transformers/spark/iso/ercot_to_mdm.py | 78 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/ercot_to_mdm.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/__init__.py index fb1d22d4a..583a5bd99 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/__init__.py @@ -1,2 +1,3 @@ +from .ercot_to_mdm import ERCOTToMDMTransformer from .miso_to_mdm import MISOToMDMTransformer from .pjm_to_mdm import PJMToMDMTransformer diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/ercot_to_mdm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/ercot_to_mdm.py new file mode 100644 index 000000000..a7c1cbaf7 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/ercot_to_mdm.py @@ -0,0 +1,78 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pyspark.sql import DataFrame, SparkSession, functions as F + +from ..base_raw_to_mdm import BaseRawToMDMTransformer +from ...._pipeline_utils.iso import ERCOT_SCHEMA, melt +from .....data_models.timeseries import SeriesType, ModelType, ValueType + + +class ERCOTToMDMTransformer(BaseRawToMDMTransformer): + """ + Converts ERCOT Raw data into Meters Data Model. + + Please check the BaseRawToMDMTransformer for the required arguments and methods. + + ``` + + BaseRawToMDMTransformer: + ::: src.sdk.python.rtdip_sdk.pipelines.transformers.spark.base_raw_to_mdm + """ + + spark: SparkSession + data: DataFrame + input_schema = ERCOT_SCHEMA + uid_col = "variable" + series_id_col = "'series_std_001'" + timestamp_col = "to_utc_timestamp(StartTime, 'America/Chicago')" + interval_timestamp_col = "Timestamp + INTERVAL 1 HOURS" + value_col = "value" + series_parent_id_col = "'series_parent_std_001'" + name_col = "'ERCOT API'" + uom_col = "'mwh'" + description_col = "'ERCOT data pulled from ERCOT ISO API'" + timestamp_start_col = "StartTime" + timestamp_end_col = "StartTime + INTERVAL 1 HOURS" + time_zone_col = "'America/Chicago'" + version_col = "'1'" + series_type = SeriesType.Hour + model_type = ModelType.Default + value_type = ValueType.Usage + properties_col = "null" + + def _pre_process(self) -> DataFrame: + df: DataFrame = super(ERCOTToMDMTransformer, self)._pre_process() + df = melt( + df, + id_vars=["Date", "HourEnding", "DstFlag"], + value_vars=[ + "Coast", + "East", + "FarWest", + "North", + "NorthCentral", + "SouthCentral", + "Southern", + "West", + "SystemTotal", + ], + ) + df = df.withColumn( + "StartTime", + F.expr( + "Date + MAKE_INTERVAL(0,0,0,0,cast(split(HourEnding,':')[0] as integer),0,0)" + ), + ) + return df From 788682629eeaef2d26bd23af7819786a05924cd3 Mon Sep 17 00:00:00 2001 From: Shivam Saxena Date: Mon, 11 Dec 2023 01:33:21 +0530 Subject: [PATCH 2/5] Adding ERCOT Raw to MDM Transformer Signed-off-by: Shivam Saxena --- .../spark/iso/test_data/ercot_meta/input.csv | 3 + .../iso/test_data/ercot_meta/output.json | 18 ++++ .../spark/iso/test_data/ercot_usage/input.csv | 3 + .../iso/test_data/ercot_usage/output.csv | 19 +++++ .../spark/iso/test_ercot_to_mdm.py | 85 +++++++++++++++++++ 5 files changed, 128 insertions(+) create mode 100644 tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_meta/input.csv create mode 100644 tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_meta/output.json create mode 100644 tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_usage/input.csv create mode 100644 tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_usage/output.csv create mode 100644 tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_ercot_to_mdm.py diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_meta/input.csv b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_meta/input.csv new file mode 100644 index 000000000..9ff017a08 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_meta/input.csv @@ -0,0 +1,3 @@ +Date,HourEnding,Coast,East,FarWest,North,NorthCentral,SouthCentral,Southern,West,SystemTotal,DstFlag +2023-12-10T00:00:00,1:00,10173.7002,1384.3101,6210.73,1213.63,12670.0996,6350.9702,3160.3601,1292.41,42456.2102,N +2023-12-10T00:00:00,2:00,9950.5098,1389.42,6276.6201,1215.9399,12627.5,6270.8701,3046.97,1288.1801,42066.01,N diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_meta/output.json b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_meta/output.json new file mode 100644 index 000000000..e019308e6 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_meta/output.json @@ -0,0 +1,18 @@ +{"Uid":"Coast","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T01:00:00","TimestampEnd":"2023-12-10T02:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"East","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T01:00:00","TimestampEnd":"2023-12-10T02:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"FarWest","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T01:00:00","TimestampEnd":"2023-12-10T02:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"North","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T01:00:00","TimestampEnd":"2023-12-10T02:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"NorthCentral","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T01:00:00","TimestampEnd":"2023-12-10T02:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"SouthCentral","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T01:00:00","TimestampEnd":"2023-12-10T02:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"Southern","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T01:00:00","TimestampEnd":"2023-12-10T02:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"West","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T01:00:00","TimestampEnd":"2023-12-10T02:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"SystemTotal","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T01:00:00","TimestampEnd":"2023-12-10T02:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"Coast","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T02:00:00","TimestampEnd":"2023-12-10T03:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"East","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T02:00:00","TimestampEnd":"2023-12-10T03:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"FarWest","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T02:00:00","TimestampEnd":"2023-12-10T03:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"North","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T02:00:00","TimestampEnd":"2023-12-10T03:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"NorthCentral","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T02:00:00","TimestampEnd":"2023-12-10T03:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"SouthCentral","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T02:00:00","TimestampEnd":"2023-12-10T03:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"Southern","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T02:00:00","TimestampEnd":"2023-12-10T03:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"West","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T02:00:00","TimestampEnd":"2023-12-10T03:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} +{"Uid":"SystemTotal","SeriesId":"series_std_001","SeriesParentId":"series_parent_std_001","Name":"ERCOT API","Uom":"mwh","Description":"ERCOT data pulled from ERCOT ISO API","TimestampStart":"2023-12-10T02:00:00","TimestampEnd":"2023-12-10T03:00:00","Timezone":"America/Chicago","Version":"1","SeriesType":64,"ModelType":1,"ValueType":16} diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_usage/input.csv b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_usage/input.csv new file mode 100644 index 000000000..9ff017a08 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_usage/input.csv @@ -0,0 +1,3 @@ +Date,HourEnding,Coast,East,FarWest,North,NorthCentral,SouthCentral,Southern,West,SystemTotal,DstFlag +2023-12-10T00:00:00,1:00,10173.7002,1384.3101,6210.73,1213.63,12670.0996,6350.9702,3160.3601,1292.41,42456.2102,N +2023-12-10T00:00:00,2:00,9950.5098,1389.42,6276.6201,1215.9399,12627.5,6270.8701,3046.97,1288.1801,42066.01,N diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_usage/output.csv b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_usage/output.csv new file mode 100644 index 000000000..7fd1770a3 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_data/ercot_usage/output.csv @@ -0,0 +1,19 @@ +Uid,SeriesId,Timestamp,IntervalTimestamp,Value +Coast,series_std_001,2023-12-10T07:00:00,2023-12-10T08:00:00,10173.7002 +East,series_std_001,2023-12-10T07:00:00,2023-12-10T08:00:00,1384.3101 +FarWest,series_std_001,2023-12-10T07:00:00,2023-12-10T08:00:00,6210.73 +North,series_std_001,2023-12-10T07:00:00,2023-12-10T08:00:00,1213.63 +NorthCentral,series_std_001,2023-12-10T07:00:00,2023-12-10T08:00:00,12670.0996 +SouthCentral,series_std_001,2023-12-10T07:00:00,2023-12-10T08:00:00,6350.9702 +Southern,series_std_001,2023-12-10T07:00:00,2023-12-10T08:00:00,3160.3601 +West,series_std_001,2023-12-10T07:00:00,2023-12-10T08:00:00,1292.41 +SystemTotal,series_std_001,2023-12-10T07:00:00,2023-12-10T08:00:00,42456.2102 +Coast,series_std_001,2023-12-10T08:00:00,2023-12-10T09:00:00,9950.5098 +East,series_std_001,2023-12-10T08:00:00,2023-12-10T09:00:00,1389.42 +FarWest,series_std_001,2023-12-10T08:00:00,2023-12-10T09:00:00,6276.6201 +North,series_std_001,2023-12-10T08:00:00,2023-12-10T09:00:00,1215.9399 +NorthCentral,series_std_001,2023-12-10T08:00:00,2023-12-10T09:00:00,12627.5 +SouthCentral,series_std_001,2023-12-10T08:00:00,2023-12-10T09:00:00,6270.8701 +Southern,series_std_001,2023-12-10T08:00:00,2023-12-10T09:00:00,3046.97 +West,series_std_001,2023-12-10T08:00:00,2023-12-10T09:00:00,1288.1801 +SystemTotal,series_std_001,2023-12-10T08:00:00,2023-12-10T09:00:00,42066.01 diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_ercot_to_mdm.py b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_ercot_to_mdm.py new file mode 100644 index 000000000..5cbc13d17 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_ercot_to_mdm.py @@ -0,0 +1,85 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from pyspark.sql import SparkSession, DataFrame + +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.iso import ERCOT_SCHEMA +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.mdm import ( + MDM_USAGE_SCHEMA, + MDM_META_SCHEMA, +) +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) +from src.sdk.python.rtdip_sdk.pipelines.transformers import ERCOTToMDMTransformer +from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.base_raw_to_mdm import ( + BaseRawToMDMTransformer, +) + +parent_base_path: str = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "test_data" +) + + +def compare_dataframes(transformer, actual_df, expected_df, order_cols): + actual_df = actual_df.orderBy(*order_cols) + expected_df = expected_df.orderBy(*order_cols) + + assert transformer.system_type() == SystemType.PYSPARK + assert isinstance(transformer.libraries(), Libraries) + assert transformer.settings() == dict() + assert str(actual_df.schema) == str(expected_df.schema) + assert str(actual_df.collect()) == str(expected_df.collect()) + + +def test_ercot_to_mdm_usage(spark_session: SparkSession): + base_path: str = os.path.join(parent_base_path, "ercot_usage") + + expected_df: DataFrame = spark_session.read.csv( + f"{base_path}/output.csv", header=True, schema=MDM_USAGE_SCHEMA + ) + input_df: DataFrame = spark_session.read.csv( + f"{base_path}/input.csv", header=True, schema=ERCOT_SCHEMA + ) + + transformer: BaseRawToMDMTransformer = ERCOTToMDMTransformer( + spark_session, input_df, output_type="usage" + ) + actual_df = transformer.transform() + + order_cols = ["Uid", "Timestamp"] + compare_dataframes(transformer, actual_df, expected_df, order_cols) + + +def test_ercot_to_mdm_meta(spark_session: SparkSession): + base_path: str = os.path.join(parent_base_path, "ercot_meta") + + expected_df: DataFrame = spark_session.read.json( + f"{base_path}/output.json", schema=MDM_META_SCHEMA + ) + input_df: DataFrame = spark_session.read.csv( + f"{base_path}/input.csv", header=True, schema=ERCOT_SCHEMA + ) + + transformer: BaseRawToMDMTransformer = ERCOTToMDMTransformer( + spark_session, input_df, output_type="meta" + ) + + actual_df = transformer.transform() + + order_cols = ["Uid", "TimestampStart"] + compare_dataframes(transformer, actual_df, expected_df, order_cols) From 2970e71457109738e6bbbb2f60b41507135ae30b Mon Sep 17 00:00:00 2001 From: Shivam Saxena Date: Mon, 11 Dec 2023 01:43:27 +0530 Subject: [PATCH 3/5] Adding ERCOT to MDM Transformer related docs Signed-off-by: Shivam Saxena --- .../pipelines/transformers/spark/iso/ercot_to_mdm.md | 1 + docs/sdk/pipelines/components.md | 1 + mkdocs.yml | 1 + 3 files changed, 3 insertions(+) create mode 100644 docs/sdk/code-reference/pipelines/transformers/spark/iso/ercot_to_mdm.md diff --git a/docs/sdk/code-reference/pipelines/transformers/spark/iso/ercot_to_mdm.md b/docs/sdk/code-reference/pipelines/transformers/spark/iso/ercot_to_mdm.md new file mode 100644 index 000000000..7b01b5ec5 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/transformers/spark/iso/ercot_to_mdm.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.transformers.spark.iso.ercot_to_mdm diff --git a/docs/sdk/pipelines/components.md b/docs/sdk/pipelines/components.md index 40f7bd98f..4d06716ff 100644 --- a/docs/sdk/pipelines/components.md +++ b/docs/sdk/pipelines/components.md @@ -69,6 +69,7 @@ Transformers are components that perform transformations on data. These will tar |[MISO To Meters Data Model](../code-reference/pipelines/transformers/spark/iso/miso_to_mdm.md)||:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:| |[Raw Forecast to Weather Data Model](../code-reference/pipelines/transformers/spark/the_weather_company/raw_forecast_to_weather_data_model.md)||:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:| |[PJM To Meters Data Model](../code-reference/pipelines/transformers/spark/iso/pjm_to_mdm.md)||:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:| +|[ERCOT To Meters Data Model](../code-reference/pipelines/transformers/spark/iso/ercot_to_mdm.md)||:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:| !!! note "Note" This list will dynamically change as the framework is further developed and new components are added. diff --git a/mkdocs.yml b/mkdocs.yml index b27f43d5e..4e7c97054 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -199,6 +199,7 @@ nav: - ISO: - MISO To Meters Data Model: sdk/code-reference/pipelines/transformers/spark/iso/miso_to_mdm.md - PJM To Meters Data Model: sdk/code-reference/pipelines/transformers/spark/iso/pjm_to_mdm.md + - ERCOT To Meters Data Model: sdk/code-reference/pipelines/transformers/spark/iso/ercot_to_mdm.md - The Weather Company: - Raw Forecast To Weather Data Model: sdk/code-reference/pipelines/transformers/spark/the_weather_company/raw_forecast_to_weather_data_model.md - ECMWF: From ccc9b9848e41f2a56678eb1066812a8d32473c1d Mon Sep 17 00:00:00 2001 From: Shivam Saxena Date: Mon, 11 Dec 2023 02:19:45 +0530 Subject: [PATCH 4/5] Improving ERCOT Raw to MDM Transformer test cases Signed-off-by: Shivam Saxena --- .../transformers/spark/iso/ercot_to_mdm.py | 25 ++++++++-- .../spark/iso/test_ercot_to_mdm.py | 48 ++++++++++++------- 2 files changed, 52 insertions(+), 21 deletions(-) diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/ercot_to_mdm.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/ercot_to_mdm.py index a7c1cbaf7..99f01285e 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/ercot_to_mdm.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/ercot_to_mdm.py @@ -25,10 +25,29 @@ class ERCOTToMDMTransformer(BaseRawToMDMTransformer): Please check the BaseRawToMDMTransformer for the required arguments and methods. - ``` + Example + -------- + ```python + from rtdip_sdk.pipelines.transformers import ERCOTToMDMTransformer + from rtdip_sdk.pipelines.utilities import SparkSessionUtility + + # Not required if using Databricks + spark = SparkSessionUtility(config={}).execute() - BaseRawToMDMTransformer: - ::: src.sdk.python.rtdip_sdk.pipelines.transformers.spark.base_raw_to_mdm + ercot_to_mdm_transformer = ERCOTToMDMTransformer( + spark=spark, + data=df, + output_type="usage", + name=None, + description=None, + value_type=None, + version=None, + series_id=None, + series_parent_id=None + ) + + result = ercot_to_mdm_transformer.transform() + ``` """ spark: SparkSession diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_ercot_to_mdm.py b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_ercot_to_mdm.py index 5cbc13d17..e2b2ff394 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_ercot_to_mdm.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_ercot_to_mdm.py @@ -46,40 +46,52 @@ def compare_dataframes(transformer, actual_df, expected_df, order_cols): assert str(actual_df.collect()) == str(expected_df.collect()) -def test_ercot_to_mdm_usage(spark_session: SparkSession): - base_path: str = os.path.join(parent_base_path, "ercot_usage") - - expected_df: DataFrame = spark_session.read.csv( - f"{base_path}/output.csv", header=True, schema=MDM_USAGE_SCHEMA - ) +def ercot_to_mdm_test( + spark_session: SparkSession, + output_type: str, + order_cols: list, + expected_df: DataFrame, + base_path: str, +): input_df: DataFrame = spark_session.read.csv( f"{base_path}/input.csv", header=True, schema=ERCOT_SCHEMA ) transformer: BaseRawToMDMTransformer = ERCOTToMDMTransformer( - spark_session, input_df, output_type="usage" + spark_session, input_df, output_type=output_type ) actual_df = transformer.transform() - order_cols = ["Uid", "Timestamp"] compare_dataframes(transformer, actual_df, expected_df, order_cols) +def test_ercot_to_mdm_usage(spark_session: SparkSession): + base_path: str = os.path.join(parent_base_path, "ercot_usage") + + expected_df: DataFrame = spark_session.read.csv( + f"{base_path}/output.csv", header=True, schema=MDM_USAGE_SCHEMA + ) + + ercot_to_mdm_test( + spark_session=spark_session, + output_type="usage", + order_cols=["Uid", "Timestamp"], + expected_df=expected_df, + base_path=base_path, + ) + + def test_ercot_to_mdm_meta(spark_session: SparkSession): base_path: str = os.path.join(parent_base_path, "ercot_meta") expected_df: DataFrame = spark_session.read.json( f"{base_path}/output.json", schema=MDM_META_SCHEMA ) - input_df: DataFrame = spark_session.read.csv( - f"{base_path}/input.csv", header=True, schema=ERCOT_SCHEMA - ) - transformer: BaseRawToMDMTransformer = ERCOTToMDMTransformer( - spark_session, input_df, output_type="meta" + ercot_to_mdm_test( + spark_session=spark_session, + output_type="meta", + order_cols=["Uid", "TimestampStart"], + expected_df=expected_df, + base_path=base_path, ) - - actual_df = transformer.transform() - - order_cols = ["Uid", "TimestampStart"] - compare_dataframes(transformer, actual_df, expected_df, order_cols) From 235bcf7cbf799f8143fdd2b8a562612c66eebddb Mon Sep 17 00:00:00 2001 From: Shivam Saxena Date: Mon, 11 Dec 2023 18:34:27 +0530 Subject: [PATCH 5/5] Improving ERCOT to MDM Transformer Test Code Structure Signed-off-by: Shivam Saxena --- .../spark/iso/test_ercot_to_mdm.py | 58 +++++++------------ 1 file changed, 20 insertions(+), 38 deletions(-) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_ercot_to_mdm.py b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_ercot_to_mdm.py index e2b2ff394..b0db47eab 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_ercot_to_mdm.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/transformers/spark/iso/test_ercot_to_mdm.py @@ -26,30 +26,15 @@ SystemType, ) from src.sdk.python.rtdip_sdk.pipelines.transformers import ERCOTToMDMTransformer -from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.base_raw_to_mdm import ( - BaseRawToMDMTransformer, -) parent_base_path: str = os.path.join( os.path.dirname(os.path.realpath(__file__)), "test_data" ) -def compare_dataframes(transformer, actual_df, expected_df, order_cols): - actual_df = actual_df.orderBy(*order_cols) - expected_df = expected_df.orderBy(*order_cols) - - assert transformer.system_type() == SystemType.PYSPARK - assert isinstance(transformer.libraries(), Libraries) - assert transformer.settings() == dict() - assert str(actual_df.schema) == str(expected_df.schema) - assert str(actual_df.collect()) == str(expected_df.collect()) - - def ercot_to_mdm_test( spark_session: SparkSession, output_type: str, - order_cols: list, expected_df: DataFrame, base_path: str, ): @@ -57,41 +42,38 @@ def ercot_to_mdm_test( f"{base_path}/input.csv", header=True, schema=ERCOT_SCHEMA ) - transformer: BaseRawToMDMTransformer = ERCOTToMDMTransformer( + transformer = ERCOTToMDMTransformer( spark_session, input_df, output_type=output_type ) + assert transformer.system_type() == SystemType.PYSPARK + assert isinstance(transformer.libraries(), Libraries) + assert transformer.settings() == dict() + actual_df = transformer.transform() - compare_dataframes(transformer, actual_df, expected_df, order_cols) + cols = list( + filter( + lambda column: column in expected_df.columns, + ["Uid", "Timestamp", "TimestampStart"], + ) + ) + assert actual_df.orderBy(cols).collect() == expected_df.orderBy(cols).collect() def test_ercot_to_mdm_usage(spark_session: SparkSession): - base_path: str = os.path.join(parent_base_path, "ercot_usage") - - expected_df: DataFrame = spark_session.read.csv( - f"{base_path}/output.csv", header=True, schema=MDM_USAGE_SCHEMA + usage_path = os.path.join(parent_base_path, "ercot_usage") + usage_expected_df: DataFrame = spark_session.read.csv( + f"{usage_path}/output.csv", header=True, schema=MDM_USAGE_SCHEMA ) - ercot_to_mdm_test( - spark_session=spark_session, - output_type="usage", - order_cols=["Uid", "Timestamp"], - expected_df=expected_df, - base_path=base_path, - ) + ercot_to_mdm_test(spark_session, "usage", usage_expected_df, usage_path) def test_ercot_to_mdm_meta(spark_session: SparkSession): - base_path: str = os.path.join(parent_base_path, "ercot_meta") + meta_path: str = os.path.join(parent_base_path, "ercot_meta") - expected_df: DataFrame = spark_session.read.json( - f"{base_path}/output.json", schema=MDM_META_SCHEMA + meta_expected_df: DataFrame = spark_session.read.json( + f"{meta_path}/output.json", schema=MDM_META_SCHEMA ) - ercot_to_mdm_test( - spark_session=spark_session, - output_type="meta", - order_cols=["Uid", "TimestampStart"], - expected_df=expected_df, - base_path=base_path, - ) + ercot_to_mdm_test(spark_session, "meta", meta_expected_df, meta_path)