diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml index 3476ae2f6..cc272246e 100644 --- a/.github/workflows/develop.yml +++ b/.github/workflows/develop.yml @@ -48,7 +48,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.11 + python-version: 3.12 - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 815f58056..f223d44f9 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -27,7 +27,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.11 + python-version: 3.12 - name: Install dependencies run: | python -m pip install --upgrade pip @@ -101,7 +101,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.11 + python-version: 3.12 - name: Install Boost run: | diff --git a/.github/workflows/sonarcloud_reusable.yml b/.github/workflows/sonarcloud_reusable.yml index 3c358efd1..73154f827 100644 --- a/.github/workflows/sonarcloud_reusable.yml +++ b/.github/workflows/sonarcloud_reusable.yml @@ -47,7 +47,7 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: ["3.11"] + python-version: ["3.12"] pyspark: ["3.5.3"] delta-spark: ["3.0.0"] runs-on: ${{ matrix.os }} @@ -102,13 +102,13 @@ jobs: sed -i "s/\/home\/runner\/work\/core\/core<\/source>/\/github\/workspace<\/source>/g" /home/runner/work/core/core/coverage-reports/coverage-unittests.xml - name: SonarCloud Scan - uses: SonarSource/sonarcloud-github-action@master + uses: SonarSource/sonarqube-scan-action@master with: args: > -Dsonar.organization=rtdip -Dsonar.projectKey=rtdip_core -Dsonar.python.coverage.reportPaths=coverage-reports/coverage-unittests.xml - -Dsoner.python.version=3.11 + -Dsoner.python.version=3.12 -Dsonar.scm.revision=${{ inputs.HEAD_SHA }} -Dsonar.pullrequest.key=${{ inputs.PR_NUMBER }} -Dsonar.pullrequest.branch=${{ inputs.PR_HEAD_REF }} diff --git a/.github/workflows/stale_issues.yml b/.github/workflows/stale_issues.yml new file mode 100644 index 000000000..d5cd3cf15 --- /dev/null +++ b/.github/workflows/stale_issues.yml @@ -0,0 +1,22 @@ +name: Close inactive issues +on: + schedule: + - cron: "30 1 * * *" + +jobs: + close-issues: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v5 + with: + days-before-issue-stale: 30 + days-before-issue-close: 14 + stale-issue-label: "stale" + stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." + close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." 
+ days-before-pr-stale: -1 + days-before-pr-close: -1 + repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 53af11eeb..aba251bcd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -26,8 +26,8 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: ["3.9", "3.10", "3.11"] - pyspark: ["3.3.0", "3.3.1", "3.3.2", "3.4.0", "3.4.1", "3.5.0", "3.5.1"] + python-version: ["3.9", "3.10", "3.11", "3.12"] + pyspark: ["3.3.0", "3.3.1", "3.3.2", "3.4.0", "3.4.1", "3.5.0", "3.5.1"] # 3.5.2 does not work with conda exclude: - pyspark: "3.5.1" python-version: "3.9" @@ -51,6 +51,16 @@ jobs: python-version: "3.11" - pyspark: "3.3.0" python-version: "3.11" + - pyspark: "3.4.1" + python-version: "3.12" + - pyspark: "3.4.0" + python-version: "3.12" + - pyspark: "3.3.2" + python-version: "3.12" + - pyspark: "3.3.1" + python-version: "3.12" + - pyspark: "3.3.0" + python-version: "3.12" include: - pyspark: "3.3.0" delta-spark: "2.2.0" @@ -108,7 +118,7 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: ["3.11"] + python-version: ["3.12"] pyspark: ["3.5.3"] delta-spark: ["3.0.0"] runs-on: ${{ matrix.os }} diff --git a/.gitignore b/.gitignore index 92c504d28..5d0e761ce 100644 --- a/.gitignore +++ b/.gitignore @@ -136,4 +136,7 @@ spark-warehouse/ spark-checkpoints/ # Delta Sharing -config.share \ No newline at end of file +config.share + +# JetBrains +.idea/ diff --git a/README.md b/README.md index 6e3883197..e6730513c 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ [![PyPI version](https://img.shields.io/pypi/v/rtdip-sdk.svg?logo=pypi&logoColor=FFE873)](https://pypi.org/project/rtdip-sdk/) [![Supported Python versions](https://img.shields.io/pypi/pyversions/rtdip-sdk.svg?logo=python&logoColor=FFE873)](https://pypi.org/project/rtdip-sdk/) [![PyPI downloads](https://img.shields.io/pypi/dm/rtdip-sdk.svg)](https://pypistats.org/packages/rtdip-sdk) +![PyPI Downloads](https://static.pepy.tech/badge/rtdip-sdk) [![OpenSSF Best Practices](https://bestpractices.coreinfrastructure.org/projects/7557/badge)](https://bestpractices.coreinfrastructure.org/projects/7557) [![Code Style Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) @@ -115,4 +116,4 @@ Distributed under the Apache License Version 2.0. See [LICENSE.md](https://githu * Check previous questions and answers or ask new ones on our slack channel [**#rtdip**](https://lfenergy.slack.com/archives/C0484R9Q6A0) ### Community -* Chat with other community members by joining the **#rtdip** Slack channel. [Click here to join our slack community](https://lfenergy.slack.com/archives/C0484R9Q6A0) \ No newline at end of file +* Chat with other community members by joining the **#rtdip** Slack channel. 
[Click here to join our slack community](https://lfenergy.slack.com/archives/C0484R9Q6A0)
diff --git a/docs/blog/.authors.yml b/docs/blog/.authors.yml
index 966175639..ff16faf83 100644
--- a/docs/blog/.authors.yml
+++ b/docs/blog/.authors.yml
@@ -24,4 +24,8 @@ authors:
   GBARAS:
     name: Amber Rigg
     description: Contributor
-    avatar: https://github.com/Amber-Rigg.png
\ No newline at end of file
+    avatar: https://github.com/Amber-Rigg.png
+  TUBCM:
+    name: Christian Munz
+    description: Contributor
+    avatar: https://github.com/chris-1187.png
\ No newline at end of file
diff --git a/docs/blog/images/agile.svg b/docs/blog/images/agile.svg
new file mode 100644
index 000000000..8f206ff30
--- /dev/null
+++ b/docs/blog/images/agile.svg
@@ -0,0 +1,1827 @@
+[1,827 lines of SVG markup omitted]
diff --git a/docs/blog/images/amos_mvi.png b/docs/blog/images/amos_mvi.png
new file mode 100644
index 000000000..93fd89a78
Binary files /dev/null and b/docs/blog/images/amos_mvi.png differ
diff --git a/docs/blog/images/amos_mvi_raw.png b/docs/blog/images/amos_mvi_raw.png
new file mode 100644
index 000000000..bcb1105b5
Binary files /dev/null and b/docs/blog/images/amos_mvi_raw.png differ
diff --git a/docs/blog/posts/enhancing_data_quality_amos.md b/docs/blog/posts/enhancing_data_quality_amos.md
new file mode 100644
index 000000000..af4e117a7
--- /dev/null
+++ b/docs/blog/posts/enhancing_data_quality_amos.md
@@ -0,0 +1,94 @@
+---
+date: 2025-02-05
+authors:
+  - TUBCM
+---
+
+# Enhancing Data Quality in Real-Time: Our Experience with RTDIP and the AMOS Project
+
+
+![blog](../images/agile.svg){width=60%}
+<sup>1</sup>
+
+
+Real-time data integration and preparation are crucial in today's data-driven world, especially when dealing with time series data from often distributed heterogeneous data sources. As data scientists often spend no less than 80%<sup>2</sup> of their time finding, integrating, and cleaning datasets, the importance of automated ingestion pipelines inevitably rises. Building such ingestion and integration frameworks can be challenging and can entail all sorts of technical debt like glue code, pipeline jungles, or dead code paths, which calls for precise conception and development of such systems. Modern software development approaches try to mitigate technical debt and enhance quality results by introducing and utilizing agile and more iterative methodologies, which are designed to foster rapid feedback and continuous progress.
+
+
+
+As part of the Agile Methods and Open Source (AMOS) project, we had the unique opportunity to work in a SCRUM team consisting of students from TU Berlin and FAU Erlangen-Nürnberg to build data quality measures for the RTDIP Ingestion Pipeline framework. With the goal of enhancing data quality, we got to work and built modular pipeline components that aim to help data scientists and engineers with data integration, data cleaning, and data preparation.
+
+But what does it mean to work in an agile framework? The Agile Manifesto is above all a set of guiding values, principles, ideals, and goals. The overarching goal is to gain performance and be most effective while adding business value. By prioritizing the right fundamentals like individuals and interactions, working software, customer collaboration, and responding to change, cross-functional teams can ship viable products more easily and faster.
+
+How did that work out for us in building data quality measures? True to the motto "User stories drive everything," we got together with contributors from the RTDIP Team to hear about concepts, the end users' stake in the project, and the current state to get a grasp on the expectations we could set for ourselves. With that, we got to work and planned our first sprint, and soon we got a sense of how agile implementation is there to point out deficiencies in our processes. Through regular team meetings, we fostered a culture of continuous feedback and testing, leveraging reviews and retrospectives to identify roadblocks and drive necessary changes that enhance the overall development process.
+
+## Enhancing Data Quality in RTDIP's Pipeline Framework
+
+Coming up with modular steps that enhance data quality was the initial and arguably most critical step to start off a successful development process. So the question was: what exactly do the terms data integration, data cleaning, and data preparation entail? To expand on the key parts of that, here is how we turned these aspects into RTDIP components.
+
+### Data Validation and Schema Alignment
+
+Data validation and schema alignment are critical for ensuring the reliability and usability of data, serving as a foundational step before implementing other quality measures. For the time series data at hand, we developed an InputValidator component to verify that incoming data adheres to predefined quality standards, including compliance with an expected schema, correct PySpark data types, and proper handling of null values, raising exceptions when inconsistencies are detected. Additionally, the component enforces schema integration, harmonizing data from multiple sources into a unified, predefined structure. To maintain a consistent and efficient workflow, we required all data quality components to inherit the validation functionality of the InputValidator.
+
+### Data Cleansing
+
+Data cleansing is a vital process in enhancing the quality of data within a data integration pipeline, ensuring consistency, reliability, and usability. We implemented functionalities such as duplicate detection, which identifies and removes redundant records to prevent skewed analysis, and flatline filters, which eliminate constant, non-informative data points. Interval and range filters are employed to validate the time series data against predefined temporal or value ranges, ensuring conformity with expected patterns. Additionally, a K-sigma anomaly detection component identifies outliers based on statistical deviations, enabling the isolation of erroneous or anomalous values. Together, these methods ensure the pipeline delivers high-quality, actionable data for downstream processes.
+
+### Missing Value Imputation
+
+With a dataset refined to exclude unwanted data points and accounting for potential sensor failures, the next step toward ensuring high-quality data is to address any missing values through imputation. The component we developed first identifies and flags missing values by leveraging PySpark’s capabilities in windowing and UDF operations. With these techniques, we are able to dynamically determine the expected interval for each sensor by analyzing historical data patterns within defined partitions. Spline interpolation allows us to estimate missing values in time series data, seamlessly filling gaps with plausible and mathematically derived substitutes. By doing so, data scientists can not only improve the consistency of integrated datasets but also prevent errors or biases in analytics and machine learning models.
+To actually show how this is realized with this new RTDIP component, let me show you a short example of how a few lines of code can enhance an exemplary time series load profile:
+```python
+from rtdip_sdk.pipelines.data_quality import MissingValueImputation
+from pyspark.sql import SparkSession
+import pandas as pd
+
+spark_session = SparkSession.builder.master("local[2]").appName("test").getOrCreate()
+
+source_df = pd.read_csv('./solar_energy_production_germany_April02.csv')
+incomplete_spark_df = spark_session.createDataFrame(source_df, ['Value', 'EventTime', 'TagName', 'Status'])
+
+# Before Missing Value Imputation
+incomplete_spark_df.show()
+
+# Execute RTDIP Pipeline component
+clean_df = MissingValueImputation(spark_session, df=incomplete_spark_df).filter_data()
+
+# After Missing Value Imputation
+clean_df.show()
+```
+To illustrate this visually, plotting the before-and-after DataFrames reveals that all gaps have been successfully filled with meaningful data.
+
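+If you want to reproduce such a comparison yourself, a minimal plotting sketch could look like the following (it uses matplotlib and a pandas conversion purely for illustration and is not part of the RTDIP component):
+```python
+import matplotlib.pyplot as plt
+
+# Convert both Spark DataFrames to pandas for plotting
+raw_pd = incomplete_spark_df.orderBy("EventTime").toPandas()
+imputed_pd = clean_df.orderBy("EventTime").toPandas()
+
+fig, (ax_raw, ax_clean) = plt.subplots(2, 1, sharex=True, figsize=(10, 6))
+ax_raw.plot(raw_pd["EventTime"], raw_pd["Value"].astype(float), ".", markersize=3)
+ax_raw.set_title("Raw load profile with gaps")
+ax_clean.plot(imputed_pd["EventTime"], imputed_pd["Value"].astype(float), ".", markersize=3)
+ax_clean.set_title("After missing value imputation")
+plt.tight_layout()
+plt.show()
+```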
+ +![blog](../images/amos_mvi_raw.png){width=70%} + +![blog](../images/amos_mvi.png){width=70%} + +
+
+
+### Normalization
+
+Normalization is a critical step in ensuring data quality within data integration pipelines that combine various sources. Techniques like mean normalization, min-max scaling, and z-score standardization help transform raw time series data onto a consistent scale, eliminating biases caused by differing units or magnitudes across features. Normalization enables fair comparisons between variables, accelerates algorithm convergence, and ensures that data from diverse sources aligns seamlessly, supporting possible downstream processes such as entity resolution, data augmentation, and machine learning. To offer a variety of use cases within the RTDIP pipeline, we implemented normalization techniques like mean normalization, min-max scaling, and z-score standardization as well as their respective denormalization methods.
+
+### Data Monitoring
+
+Data monitoring is another aspect of enhancing data quality within the RTDIP pipeline, ensuring the reliability and consistency of incoming data streams. Techniques such as flatline detection identify periods of unchanging values, which may indicate sensor malfunctions or stale data. Missing data identification leverages predefined intervals or historical patterns to detect and flag gaps, enabling proactive resolution. By continuously monitoring for these anomalies, the pipeline maintains high data integrity, supporting accurate analysis that is not skewed by inconsistencies.
+
+### Data Prediction
+
+Forecasting based on historical data patterns is essential for making informed decisions at the business level. Linear regression is a simple yet powerful approach for predicting continuous outcomes by establishing a relationship between input features and the target variable. However, for time series data, the ARIMA (Autoregressive Integrated Moving Average) model is often preferred due to its ability to model temporal dependencies and trends in the data. The ARIMA model combines autoregressive (AR) and moving average (MA) components, along with differencing to stabilize the variance and trends in the time series. ARIMA with autonomous parameter selection takes this a step further by automatically optimizing the model’s parameters (p, d, q) using techniques like grid search or other statistical criteria, ensuring that the model is well-suited to the data’s underlying structure for more accurate predictions. To support both approaches, we incorporated an ARIMA component and an auto-ARIMA component, enabling the prediction of future time series data points for each sensor.
+
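+As a rough illustration of the idea (not the RTDIP component API itself), the sketch below calls statsmodels and pmdarima directly, the libraries added as dependencies for these components, with illustrative data and orders:
+```python
+import pandas as pd
+from statsmodels.tsa.arima.model import ARIMA
+import pmdarima as pm
+
+# Hourly load values for a single sensor (illustrative data)
+history = pd.Series([42.0, 41.5, 43.2, 44.0, 43.1, 42.8, 44.5, 45.1, 44.9, 45.6])
+
+# Fixed-order ARIMA: the (p, d, q) order is chosen manually
+manual_model = ARIMA(history, order=(1, 1, 1)).fit()
+manual_forecast = manual_model.forecast(steps=3)
+
+# Auto-ARIMA: pmdarima searches for a suitable (p, d, q) automatically
+auto_model = pm.auto_arima(history, seasonal=False, suppress_warnings=True)
+auto_forecast = auto_model.predict(n_periods=3)
+
+print(manual_forecast)
+print(auto_forecast)
+```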
+ +Working on the RTDIP Project within AMOS has been a fantastic journey, highlighting the importance of people and teamwork in agile development. By focusing on enhancing data quality, we’ve significantly boosted the reliability, consistency, and usability of the data going through the RTDIP pipeline. + +To look back, our regular team meetings were the key to our success. Through open communication and collaboration, we tackled challenges and kept improving our processes. This showed us the power of working together in an agile framework and growing as a dedicated SCRUM team. + +We’re excited about the future and how these advancements will help data scientists and engineers make better decisions. + +
+
+<sup>1</sup> Designed by Freepik
+2 Michael Stonebraker, Ihab F. Ilyas: Data Integration: The Current Status and the Way Forward. IEEE Data Eng. Bull. 41(2) (2018) \ No newline at end of file diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index 6cf65bae9..9045d2f6a 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -90,7 +90,7 @@ To use RTDIP Pipelines components in your own environment that leverages [pyspar - conda-forge - defaults dependencies: - - python==3.11 + - python==3.12 - pip - openjdk==11.0.15 - pip: @@ -158,7 +158,7 @@ The following provides examples of how to install the RTDIP SDK package with Pip - conda-forge - defaults dependencies: - - python==3.11 + - python==3.12 - pip - pip: - rtdip-sdk @@ -182,7 +182,7 @@ The following provides examples of how to install the RTDIP SDK package with Pip - conda-forge - defaults dependencies: - - python==3.11 + - python==3.12 - pip - pip: - rtdip-sdk diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/dimensionality_reduction.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/dimensionality_reduction.md new file mode 100644 index 000000000..f3ef84937 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/dimensionality_reduction.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.dimensionality_reduction diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/duplicate_detection.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/duplicate_detection.md new file mode 100644 index 000000000..a76a79164 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/duplicate_detection.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.duplicate_detection \ No newline at end of file diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/flatline_filter.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/flatline_filter.md new file mode 100644 index 000000000..5c82a11d3 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/flatline_filter.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.flatline_filter diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/gaussian_smoothing.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/gaussian_smoothing.md new file mode 100644 index 000000000..3a4018f46 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/gaussian_smoothing.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.gaussian_smoothing diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/interval_filtering.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/interval_filtering.md new file mode 100644 index 000000000..fe5f3e968 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/interval_filtering.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.interval_filtering diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/k_sigma_anomaly_detection.md 
b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/k_sigma_anomaly_detection.md new file mode 100644 index 000000000..70e69b3ea --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/k_sigma_anomaly_detection.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.k_sigma_anomaly_detection diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/missing_value_imputation.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/missing_value_imputation.md new file mode 100644 index 000000000..23e7fd491 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/missing_value_imputation.md @@ -0,0 +1,2 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.missing_value_imputation + diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/denormalization.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/denormalization.md new file mode 100644 index 000000000..c2d5a19cb --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/denormalization.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.denormalization diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization.md new file mode 100644 index 000000000..2483f8dc8 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_mean.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_mean.md new file mode 100644 index 000000000..84cb4c997 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_mean.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization_mean diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_minmax.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_minmax.md new file mode 100644 index 000000000..b0ca874ad --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_minmax.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization_minmax diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_zscore.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_zscore.md new file mode 100644 index 000000000..509474b78 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_zscore.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization_zscore diff --git 
a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/out_of_range_value_filter.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/out_of_range_value_filter.md new file mode 100644 index 000000000..af684fb77 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/out_of_range_value_filter.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.out_of_range_value_filter \ No newline at end of file diff --git a/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/check_value_ranges.md b/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/check_value_ranges.md new file mode 100644 index 000000000..c3cf7dd82 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/check_value_ranges.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.check_value_ranges \ No newline at end of file diff --git a/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/flatline_detection.md b/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/flatline_detection.md new file mode 100644 index 000000000..0b1965ff1 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/flatline_detection.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.flatline_detection \ No newline at end of file diff --git a/docs/sdk/code-reference/pipelines/monitoring/spark/data_quality/great_expectations.md b/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/great_expectations.md similarity index 71% rename from docs/sdk/code-reference/pipelines/monitoring/spark/data_quality/great_expectations.md rename to docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/great_expectations.md index 8f26a67bf..1f2dfd23c 100644 --- a/docs/sdk/code-reference/pipelines/monitoring/spark/data_quality/great_expectations.md +++ b/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/great_expectations.md @@ -2,4 +2,4 @@ Great Expectations is a Python-based open-source library for validating, documenting, and profiling your data. It helps you to maintain data quality and improve communication about data between teams. 
-::: src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.great_expectations_data_quality \ No newline at end of file +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.great_expectations_data_quality \ No newline at end of file diff --git a/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.md b/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.md new file mode 100644 index 000000000..91215567e --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_interval \ No newline at end of file diff --git a/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.md b/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.md new file mode 100644 index 000000000..26d3b7fec --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_pattern \ No newline at end of file diff --git a/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/moving_average.md b/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/moving_average.md new file mode 100644 index 000000000..0b13b472d --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/monitoring/spark/moving_average.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.moving_average \ No newline at end of file diff --git a/docs/sdk/code-reference/pipelines/forecasting/spark/arima.md b/docs/sdk/code-reference/pipelines/forecasting/spark/arima.md new file mode 100644 index 000000000..c0052fccd --- /dev/null +++ b/docs/sdk/code-reference/pipelines/forecasting/spark/arima.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.forecasting.spark.arima diff --git a/docs/sdk/code-reference/pipelines/forecasting/spark/auto_arima.md b/docs/sdk/code-reference/pipelines/forecasting/spark/auto_arima.md new file mode 100644 index 000000000..dd27e599a --- /dev/null +++ b/docs/sdk/code-reference/pipelines/forecasting/spark/auto_arima.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.forecasting.spark.auto_arima diff --git a/docs/sdk/code-reference/pipelines/forecasting/spark/data_binning.md b/docs/sdk/code-reference/pipelines/forecasting/spark/data_binning.md new file mode 100644 index 000000000..a64da6b3d --- /dev/null +++ b/docs/sdk/code-reference/pipelines/forecasting/spark/data_binning.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.forecasting.spark.data_binning diff --git a/docs/sdk/code-reference/pipelines/forecasting/spark/k_nearest_neighbors.md b/docs/sdk/code-reference/pipelines/forecasting/spark/k_nearest_neighbors.md new file mode 100644 index 000000000..215a2c4b0 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/forecasting/spark/k_nearest_neighbors.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.forecasting.spark.k_nearest_neighbors \ No newline at end of file diff --git a/docs/sdk/code-reference/pipelines/forecasting/spark/linear_regression.md b/docs/sdk/code-reference/pipelines/forecasting/spark/linear_regression.md new file mode 100644 index 000000000..653fc5400 --- /dev/null +++ 
b/docs/sdk/code-reference/pipelines/forecasting/spark/linear_regression.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.forecasting.spark.linear_regression diff --git a/docs/sdk/queries/databricks/databricks-sql.md b/docs/sdk/queries/databricks/databricks-sql.md index 3a8933b19..3f964bc9f 100644 --- a/docs/sdk/queries/databricks/databricks-sql.md +++ b/docs/sdk/queries/databricks/databricks-sql.md @@ -160,8 +160,6 @@ parameters = { "end_date": "2022-03-10", #end_date can be a date in the format "YYYY-MM-DD" or a datetime in the format "YYYY-MM-DDTHH:MM:SS" "time_interval_rate": "1", #numeric input "time_interval_unit": "hour", #options are second, minute, day or hour - "agg_method": "first", #options are first, last, avg, min, max - "interpolation_method": "forward_fill", #options are forward_fill, backward_fill or linear "include_bad_data": True #boolean options are True or False } diff --git a/docs/sdk/queries/databricks/troubleshooting.md b/docs/sdk/queries/databricks/troubleshooting.md index 867b62012..44209e0b6 100644 --- a/docs/sdk/queries/databricks/troubleshooting.md +++ b/docs/sdk/queries/databricks/troubleshooting.md @@ -40,8 +40,6 @@ dict = { "end_date": "2022-03-10", #end_date can be a date in the format "YYYY-MM-DD" or a datetime in the format "YYYY-MM-DDTHH:MM:SS" "time_interval_rate": "1", #numeric input "time_interval_unit": "hour", #options are second, minute, day, hour - "agg_method": "first", #options are first, last, avg, min, max - "interpolation_method": "forward_fill", #options are forward_fill or backward_fill "include_bad_data": True #boolean options are True or False } diff --git a/docs/university/essentials/rtdip/introduction/prerequisites.md b/docs/university/essentials/rtdip/introduction/prerequisites.md index 7868e45ae..7bb1bd49e 100644 --- a/docs/university/essentials/rtdip/introduction/prerequisites.md +++ b/docs/university/essentials/rtdip/introduction/prerequisites.md @@ -3,7 +3,7 @@ Before you begin the course, ensure you obtain the following prerequisites(from your istructor or from your environment if you are doing this on your own): ## Development Environment -- Python >=3.9,<=3.11 +- Python >=3.9,<=3.12 - An IDE such as Visual Studio Code or PyCharm - Postman via the app, web browser or as an extension on Visual Studio Code diff --git a/environment.yml b/environment.yml index d9d65e0a8..be6597fd3 100644 --- a/environment.yml +++ b/environment.yml @@ -18,9 +18,9 @@ channels: - conda-forge - defaults dependencies: - - python>=3.9,<3.12 + - python>=3.9,<3.13 - importlib-metadata>=7.0.0 - - jinja2>=3.1.4 + - jinja2>=3.1.5 - pytest==7.4.0 - pytest-mock==3.11.1 - pytest-cov==4.1.0 @@ -30,13 +30,13 @@ dependencies: - numpy>=1.23.4,<2.0.0 - oauthlib>=3.2.2,<4.0.0 - cryptography>=38.0.3 - - azure-identity>=1.12.0,<2.0.0 + - azure-identity>=1.20.0,<2.0.0 - azure-storage-file-datalake>=12.12.0,<13.0.0 - azure-keyvault-secrets>=4.7.0,<5.0.0 - azure-mgmt-storage>=21.0.0 - boto3>=1.28.2,<2.0.0 - - pyodbc>=4.0.39,<5.0.0 - - fastapi>=0.110.0,<1.0.0 + - pyodbc>=5.2.0,<6.0.0 + - fastapi>=0.115.6,<1.0.0 - httpx>=0.24.1,<1.0.0 - pyspark>=3.3.0,<3.6.0 - delta-spark>=2.2.0,<3.3.0 @@ -44,7 +44,7 @@ dependencies: - grpcio-status>=1.48.1 - googleapis-common-protos>=1.56.4 - openjdk>=11.0.15,<12.0.0 - - openai>=1.13.3,<2.0.0 + - openai>=1.59.0,<2.0.0 - mkdocs-material==9.5.20 - mkdocs-material-extensions==1.3.1 - mkdocstrings==0.25.0 @@ -53,12 +53,12 @@ dependencies: - mkdocs-autorefs>=1.0.0,<1.1.0 - pygments==2.16.1 - pymdown-extensions==10.8.1 - - 
databricks-sql-connector>=3.1.0,<4.0.0 + - databricks-sql-connector>=3.6.0,<3.7.0 - semver>=3.0.0,<4.0.0 - xlrd>=2.0.1 - pygithub>=1.59.0 - - pydantic>=2.6.0,<3.0.0 - - pyjwt>=2.8.0,<3.0.0 + - pydantic>=2.10.0,<3.0.0 + - pyjwt>=2.10.0,<3.0.0 - twine==4.0.2 - delta-sharing-python>=1.0.0,<2.0.0 - polars>=0.18.8,<1.0.0 @@ -68,21 +68,25 @@ dependencies: - black>=24.1.0 - joblib==1.3.2,<2.0.0 - great-expectations>=0.18.8,<1.0.0 + - statsmodels>=0.14.1,<0.15.0 + - pmdarima>=2.0.4 + - protobuf>=5.28.2,<5.29.0 - pip: - - databricks-sdk>=0.20.0,<1.0.0 + - databricks-sdk>=0.59.0,<1.0.0 - dependency-injector>=4.41.0,<5.0.0 - - azure-functions>=1.15.0,<2.0.0 + - azure-functions>=1.20.0,<2.0.0 - azure-mgmt-eventgrid>=10.2.0 - - hvac>=1.1.1 - - langchain>=0.2.0,<0.3.0 - - langchain-community>=0.2.0,<0.3.0 + - hvac>=2.3.0 + - langchain>=0.3.27,<0.4.0 + - langchain-community>=0.3.27,<0.4.0 + - langchain-core>=0.3.28,<0.4.0 + - langchain-text-splitters>=0.3.5,<0.4.0 - build==0.10.0 - deltalake>=0.10.1,<1.0.0 - trio>=0.22.1 - sqlparams>=5.1.0,<6.0.0 - entsoe-py>=0.5.10,<1.0.0 - - web3>=6.18.0,<7.0.0 - - eth-typing>=4.2.3,<5.0.0 - - pandas>=1.5.2,<2.2.0 + - web3>=7.7.0,<8.0.0 + - eth-typing>=5.0.1,<6.0.0 + - pandas>=2.0.1,<2.3.0 - moto[s3]>=5.0.16,<6.0.0 - - pyarrow>=14.0.1,<17.0.0 diff --git a/mkdocs.yml b/mkdocs.yml index 6d9b13888..834d47916 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -235,10 +235,38 @@ nav: - Azure Key Vault: sdk/code-reference/pipelines/secrets/azure_key_vault.md - Deploy: - Databricks: sdk/code-reference/pipelines/deploy/databricks.md - - Monitoring: - - Data Quality: - - Great Expectations: - - Data Quality Monitoring: sdk/code-reference/pipelines/monitoring/spark/data_quality/great_expectations.md + - Data Quality: + - Monitoring: + - Check Value Ranges: sdk/code-reference/pipelines/data_quality/monitoring/spark/check_value_ranges.md + - Great Expectations: + - Data Quality Monitoring: sdk/code-reference/pipelines/data_quality/monitoring/spark/great_expectations.md + - Flatline Detection: sdk/code-reference/pipelines/data_quality/monitoring/spark/flatline_detection.md + - Identify Missing Data: + - Interval Based: sdk/code-reference/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.md + - Pattern Based: sdk/code-reference/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.md + - Moving Average: sdk/code-reference/pipelines/data_quality/monitoring/spark/moving_average.md + - Data Manipulation: + - Duplicate Detetection: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/duplicate_detection.md + - Out of Range Value Filter: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/out_of_range_value_filter.md + - Flatline Filter: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/flatline_filter.md + - Gaussian Smoothing: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/gaussian_smoothing.md + - Dimensionality Reduction: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/dimensionality_reduction.md + - Interval Filtering: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/interval_filtering.md + - K-Sigma Anomaly Detection: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/k_sigma_anomaly_detection.md + - Missing Value Imputation: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/missing_value_imputation.md + - Normalization: + - Normalization: 
sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization.md + - Normalization Mean: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_mean.md + - Normalization MinMax: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_minmax.md + - Normalization ZScore: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_zscore.md + - Denormalization: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/denormalization.md + - Forecasting: + - Data Binning: sdk/code-reference/pipelines/forecasting/spark/data_binning.md + - Linear Regression: sdk/code-reference/pipelines/forecasting/spark/linear_regression.md + - Arima: sdk/code-reference/pipelines/forecasting/spark/arima.md + - Auto Arima: sdk/code-reference/pipelines/forecasting/spark/auto_arima.md + - K Nearest Neighbors: sdk/code-reference/pipelines/forecasting/spark/k_nearest_neighbors.md + - Jobs: sdk/pipelines/jobs.md - Deploy: - Databricks Workflows: sdk/pipelines/deploy/databricks.md @@ -330,4 +358,4 @@ nav: - blog/index.md - University: - University: university/overview.md - \ No newline at end of file + diff --git a/setup.py b/setup.py index 980168aa5..91a1abe2b 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2025 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,21 +28,29 @@ long_description = (here / "PYPI-README.md").read_text() INSTALL_REQUIRES = [ - "databricks-sql-connector>=3.1.0,<4.0.0", + "databricks-sql-connector>=3.1.0,<3.7.0", "pyarrow>=14.0.1,<17.0.0", "azure-identity>=1.12.0,<2.0.0", - "pandas>=1.5.2,<2.2.0", - "jinja2>=3.1.2,<4.0.0", + "pandas>=2.0.1,<2.3.0", + "jinja2>=3.1.5,<4.0.0", "importlib_metadata>=7.0.0,<8.0.0", "semver>=3.0.0,<4.0.0", "xlrd>=2.0.1,<3.0.0", "grpcio>=1.48.1", "grpcio-status>=1.48.1", "googleapis-common-protos>=1.56.4", - "langchain>=0.2.0,<0.3.0", - "langchain-community>=0.2.0,<0.3.0", - "openai>=1.13.3,<2.0.0", "pydantic>=2.6.0,<3.0.0", + "protobuf>=5.28.2,<5.29.0", +] + +LANGCHAIN_PACKAGES = [ + "langchain>=0.3.27,<0.4.0", + "langchain-community>=0.3.27,<0.4.0", + "langchain-core>=0.3.28,<0.4.0", + "langchain-text-splitters>=0.3.5,<0.4.0", + "openai>=1.59.0,<2.0.0", + "statsmodels>=0.14.1,<0.15.0", + "pmdarima>=2.0.4", ] PYSPARK_PACKAGES = [ @@ -52,14 +60,15 @@ PIPELINE_PACKAGES = [ "dependency-injector>=4.41.0,<5.0.0", - "databricks-sdk>=0.20.0,<1.0.0", + "databricks-sdk>=0.59.0,<1.0.0", "azure-storage-file-datalake>=12.12.0,<13.0.0", "azure-mgmt-storage>=21.0.0", "azure-mgmt-eventgrid>=10.2.0", "boto3>=1.28.2,<2.0.0", - "hvac>=1.1.1", + "hvac>=2.3.0", "azure-keyvault-secrets>=4.7.0,<5.0.0", - "web3>=6.18.0,<7.0.0", + "web3>=7.7.0,<8.0.0", + "eth-typing>=5.0.1,<6.0.0", "polars[deltalake]>=0.18.8,<1.0.0", "delta-sharing>=1.0.0,<1.1.0", "xarray>=2023.1.0,<2023.8.0", @@ -68,9 +77,11 @@ "joblib>=1.3.2,<2.0.0", "sqlparams>=5.1.0,<6.0.0", "entsoe-py>=0.5.10,<1.0.0", + "numpy>=1.23.4,<2.0.0", ] EXTRAS_DEPENDENCIES: dict[str, list[str]] = { + "langchain": LANGCHAIN_PACKAGES, "pipelines": PIPELINE_PACKAGES, "pyspark": PYSPARK_PACKAGES, } @@ -86,6 +97,7 @@ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ], project_urls={ "Issue Tracker": 
"https://github.com/rtdip/core/issues", @@ -96,7 +108,7 @@ package_dir={"": "src/sdk/python"}, include_package_data=True, packages=find_packages(where="src/sdk/python"), - python_requires=">=3.9, <3.12", + python_requires=">=3.9, <3.13", install_requires=INSTALL_REQUIRES, extras_require=EXTRAS_DEPENDENCIES, setup_requires=["pytest-runner", "setuptools_scm"], diff --git a/src/api/requirements.txt b/src/api/requirements.txt index 9e30382de..0a1201e3a 100644 --- a/src/api/requirements.txt +++ b/src/api/requirements.txt @@ -1,24 +1,27 @@ # Do not include azure-functions-worker as it may conflict with the Azure Functions platform -azure-functions==1.18.0 -fastapi==0.110.0 -pydantic==2.6.0 +azure-functions==1.20.0 +fastapi==0.115.6 +pydantic==2.10.0 # turbodbc==4.11.0 -pyodbc==4.0.39 -importlib_metadata>=1.0.0 -databricks-sql-connector==2.9.3 -azure-identity==1.17.0 +pyodbc==5.2.0 +importlib_metadata>=7.0.0 +databricks-sql-connector==3.6.0 +azure-identity==1.20.0 oauthlib>=3.2.2 -pandas>=2.0.1,<3.0.0 +pandas>=2.0.1,<2.3.0 numpy==1.26.4 -jinja2==3.1.4 -pytz==2024.1 +jinja2==3.1.5 +pytz==2024.2 semver==3.0.2 xlrd==2.0.1 -packaging==23.2 +packaging==24.2 grpcio>=1.48.1 grpcio-status>=1.48.1 googleapis-common-protos>=1.56.4 -langchain>=0.2.0,<0.3.0 -langchain-community>=0.2.0,<0.3.0 -openai==1.13.3 -pyjwt==2.8.0 +protobuf>=5.28.2,<5.29.0 +langchain>=0.3.27,<0.4.0 +langchain-community>=0.3.27,<0.4.0 +langchain-core>=0.3.28,<0.4.0 +langchain-text-splitters>=0.3.5,<0.4.0 +openai>=1.59.0,<2.0.0 +pyjwt==2.10.0 diff --git a/src/api/v1/interpolate.py b/src/api/v1/interpolate.py index a25d097b1..0a14feac2 100644 --- a/src/api/v1/interpolate.py +++ b/src/api/v1/interpolate.py @@ -28,7 +28,6 @@ RawQueryParams, TagsQueryParams, TagsBodyParams, - ResampleQueryParams, InterpolateQueryParams, PivotQueryParams, LimitOffsetQueryParams, @@ -40,7 +39,6 @@ def interpolate_events_get( base_query_parameters, raw_query_parameters, tag_query_parameters, - resample_parameters, interpolate_parameters, pivot_parameters, limit_offset_parameters, @@ -50,7 +48,6 @@ def interpolate_events_get( (connection, parameters) = common_api_setup_tasks( base_query_parameters, raw_query_parameters=raw_query_parameters, - resample_query_parameters=resample_parameters, tag_query_parameters=tag_query_parameters, interpolate_query_parameters=interpolate_parameters, pivot_query_parameters=pivot_parameters, @@ -101,7 +98,6 @@ async def interpolate_get( base_query_parameters: BaseQueryParams = Depends(), raw_query_parameters: RawQueryParams = Depends(), tag_query_parameters: TagsQueryParams = Depends(), - resample_parameters: ResampleQueryParams = Depends(), interpolate_parameters: InterpolateQueryParams = Depends(), pivot_parameters: PivotQueryParams = Depends(), limit_offset_query_parameters: LimitOffsetQueryParams = Depends(), @@ -111,7 +107,6 @@ async def interpolate_get( base_query_parameters, raw_query_parameters, tag_query_parameters, - resample_parameters, interpolate_parameters, pivot_parameters, limit_offset_query_parameters, @@ -146,7 +141,6 @@ async def interpolate_post( base_query_parameters: BaseQueryParams = Depends(), raw_query_parameters: RawQueryParams = Depends(), tag_query_parameters: TagsBodyParams = Body(default=...), - resample_parameters: ResampleQueryParams = Depends(), interpolate_parameters: InterpolateQueryParams = Depends(), pivot_parameters: PivotQueryParams = Depends(), limit_offset_query_parameters: LimitOffsetQueryParams = Depends(), @@ -156,7 +150,6 @@ async def interpolate_post( base_query_parameters, 
raw_query_parameters, tag_query_parameters, - resample_parameters, interpolate_parameters, pivot_parameters, limit_offset_query_parameters, diff --git a/src/api/v1/models.py b/src/api/v1/models.py index d07fc9b4c..000a517a5 100644 --- a/src/api/v1/models.py +++ b/src/api/v1/models.py @@ -350,23 +350,9 @@ class TagsBodyParams(BaseModel): class PlotQueryParams: def __init__( self, - sample_rate: str = Query( - ..., - description="sample_rate is deprecated and will be removed in v1.0.0. Please use time_interval_rate instead.", - examples=[5], - deprecated=True, - ), - sample_unit: str = Query( - ..., - description="sample_unit is deprecated and will be removed in v1.0.0. Please use time_interval_unit instead.", - examples=["second", "minute", "hour", "day"], - deprecated=True, - ), time_interval_rate: str = DuplicatedQueryParameters.time_interval_rate, time_interval_unit: str = DuplicatedQueryParameters.time_interval_unit, ): - self.sample_rate = sample_rate - self.sample_unit = sample_unit self.time_interval_rate = time_interval_rate self.time_interval_unit = time_interval_unit @@ -374,18 +360,6 @@ def __init__( class ResampleQueryParams: def __init__( self, - sample_rate: str = Query( - ..., - description="sample_rate is deprecated and will be removed in v1.0.0. Please use time_interval_rate instead.", - examples=[5], - deprecated=True, - ), - sample_unit: str = Query( - ..., - description="sample_unit is deprecated and will be removed in v1.0.0. Please use time_interval_unit instead.", - examples=["second", "minute", "hour", "day"], - deprecated=True, - ), time_interval_rate: str = DuplicatedQueryParameters.time_interval_rate, time_interval_unit: str = DuplicatedQueryParameters.time_interval_unit, agg_method: str = Query( @@ -394,8 +368,6 @@ def __init__( examples=["first", "last", "avg", "min", "max"], ), ): - self.sample_rate = sample_rate - self.sample_unit = sample_unit self.time_interval_rate = time_interval_rate self.time_interval_unit = time_interval_unit self.agg_method = agg_method @@ -431,13 +403,11 @@ def __init__( class InterpolateQueryParams: def __init__( self, - interpolation_method: str = Query( - ..., - description="Interpolation Method can e one of the following [forward_fill, backward_fill, linear]", - examples=["forward_fill", "backward_fill", "linear"], - ), + time_interval_rate: str = DuplicatedQueryParameters.time_interval_rate, + time_interval_unit: str = DuplicatedQueryParameters.time_interval_unit, ): - self.interpolation_method = interpolation_method + self.time_interval_rate = time_interval_rate + self.time_interval_unit = time_interval_unit class InterpolationAtTimeQueryParams: @@ -450,8 +420,8 @@ def __init__( ), timestamps: List[Union[date, datetime]] = Query( ..., - description="Timestamps in format YYYY-MM-DD or YYYY-MM-DDTHH:mm:ss or YYYY-MM-DDTHH:mm:ss+zz:zz", - examples=[EXAMPLE_DATE, EXAMPLE_DATETIME, EXAMPLE_DATETIME_TIMEZOME], + description="Timestamps in format YYYY-MM-DDTHH:mm:ss or YYYY-MM-DDTHH:mm:ss+zz:zz", + examples=[EXAMPLE_DATETIME, EXAMPLE_DATETIME_TIMEZOME], ), window_length: int = Query( ..., description="Window Length in days", examples=[1] @@ -474,24 +444,17 @@ def __init__( class TimeWeightedAverageQueryParams: def __init__( self, - window_size_mins: int = Query( - ..., - description="window_size_mins is deprecated and will be removed in v1.0.0. 
Please use time_interval_rate and time_interval_unit instead.", - examples=[20], - deprecated=True, - ), time_interval_rate: str = DuplicatedQueryParameters.time_interval_rate, time_interval_unit: str = DuplicatedQueryParameters.time_interval_unit, window_length: int = Query( ..., description="Window Length in days", examples=[1] ), step: str = Query( - ..., + default="metadata", description='Step can be "true", "false" or "metadata". "metadata" will retrieve the step value from the metadata table.', examples=["true", "false", "metadata"], ), ): - self.window_size_mins = window_size_mins self.time_interval_rate = time_interval_rate self.time_interval_unit = time_interval_unit self.window_length = window_length diff --git a/src/sdk/python/rtdip_sdk/connectors/__init__.py b/src/sdk/python/rtdip_sdk/connectors/__init__.py index 824c69ba2..927a934c5 100644 --- a/src/sdk/python/rtdip_sdk/connectors/__init__.py +++ b/src/sdk/python/rtdip_sdk/connectors/__init__.py @@ -21,5 +21,9 @@ from .odbc.turbodbc_sql_connector import * if importlib.util.find_spec("pyspark") != None: from .grpc.spark_connector import * -from .llm.chatopenai_databricks_connector import * +if ( + importlib.util.find_spec("langchain") != None + and importlib.util.find_spec("langchain_community") != None +): + from .llm.chatopenai_databricks_connector import * from .models import * diff --git a/src/sdk/python/rtdip_sdk/connectors/llm/chatopenai_databricks_connector.py b/src/sdk/python/rtdip_sdk/connectors/llm/chatopenai_databricks_connector.py index c0a639573..5f607e9b7 100644 --- a/src/sdk/python/rtdip_sdk/connectors/llm/chatopenai_databricks_connector.py +++ b/src/sdk/python/rtdip_sdk/connectors/llm/chatopenai_databricks_connector.py @@ -31,6 +31,11 @@ class ChatOpenAIDatabricksConnection(ConnectionInterface): The connection class represents a connection to a database and uses the Databricks SQL Connector API's for Python to interact with cluster/jobs and langchain to connect to Chat Open AI(Chat GPT) LLM. To find details for SQL warehouses server_hostname and http_path location to the SQL Warehouse tab in the documentation. + Ensure that you install the langchain packages by running the following command: + ``` + pip install rtdip-sdk[langchain] + ``` + Args: catalog: Catalog name in Databricks schema: Schema name in Databricks diff --git a/src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py b/src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py index 198b91431..71b960d7b 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py +++ b/src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2025 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ # limitations under the License. 
import logging -from pyspark.sql import SparkSession +from pyspark.sql import SparkSession, DataFrame from pyspark.sql.types import ( StructType, StructField, @@ -28,6 +28,7 @@ DoubleType, FloatType, ) +from pyspark.sql.functions import col from .models import Libraries from ..._sdk_utils.compare_versions import _package_version_meets_minimum @@ -117,6 +118,96 @@ def get_dbutils( # def onQueryTerminated(self, event): # logging.info("Query terminated: {} {}".format(event.id, event.name)) + +def is_dataframe_partially_conformed_in_schema( + dataframe: DataFrame, schema: StructType, throw_error: bool = True +) -> bool: + """ + Checks if all columns in the dataframe are contained in the schema with appropriate types. + + Parameters: + dataframe (DataFrame): The dataframe to check. + schema (StructType): The schema to conform to. + throw_error (bool): If True, raises an error on non-conformance. Defaults to True. + + Returns: + bool: True if the dataframe conforms to the schema, False otherwise. + """ + for column in dataframe.schema: + if column.name in schema.names: + schema_field = schema[column.name] + if not isinstance(column.dataType, type(schema_field.dataType)): + if throw_error: + raise ValueError( + "Column {0} is of Type {1}, expected Type {2}".format( + column, column.dataType, schema_field.dataType + ) + ) + return False + else: + # dataframe contains column not expected ins schema + if not throw_error: + return False + else: + raise ValueError( + "Column {0} is not expected in dataframe".format(column) + ) + return True + + +def conform_dataframe_to_schema( + dataframe: DataFrame, schema: StructType, throw_error: bool = True +) -> DataFrame: + """ + Tries to convert all columns to the given schema. + + Parameters: + dataframe (DataFrame): The dataframe to conform. + schema (StructType): The schema to conform to. + throw_error (bool): If True, raises an error on non-conformance. Defaults to True. + + Returns: + DataFrame: The conformed dataframe. + """ + for column in dataframe.schema: + c_name = column.name + if c_name in schema.names: + schema_field = schema[c_name] + if not isinstance(column.dataType, type(schema_field.dataType)): + dataframe = dataframe.withColumn( + c_name, dataframe[c_name].cast(schema_field.dataType) + ) + else: + if throw_error: + raise ValueError(f"Column '{c_name}' is not expected in the dataframe") + else: + dataframe = dataframe.drop(c_name) + return dataframe + + +def split_by_source(df: DataFrame, split_by_col: str, timestamp_col: str) -> dict: + """ + + Helper method to separate individual time series based on their source. + + Parameters: + df (DataFrame): The input DataFrame. + split_by_col (str): The column name to split the DataFrame by. + timestamp_col (str): The column name to order the DataFrame by. + + Returns: + dict: A dictionary where keys are distinct values from split_by_col and values are DataFrames filtered and ordered by timestamp_col. 
+ """ + tag_names = df.select(split_by_col).distinct().collect() + tag_names = [row[split_by_col] for row in tag_names] + source_dict = { + tag: df.filter(col(split_by_col) == tag).orderBy(timestamp_col) + for tag in tag_names + } + + return source_dict + + EVENTHUB_SCHEMA = StructType( [ StructField("body", BinaryType(), True), @@ -469,6 +560,15 @@ def get_dbutils( ] ) +PROCESS_DATA_MODEL_EVENT_SCHEMA = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), True), + StructField("Value", StringType(), True), + ] +) + KAFKA_SCHEMA = StructType( [ StructField("key", BinaryType(), True), diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py similarity index 86% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py index 17e525274..734152471 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2025 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,4 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .spark.data_quality.great_expectations_data_quality import * + +from .data_manipulation import * +from .monitoring import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py new file mode 100644 index 000000000..76bb6a388 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .spark import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/interfaces.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/interfaces.py new file mode 100644 index 000000000..2e226f20d --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/interfaces.py @@ -0,0 +1,24 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from abc import abstractmethod + +from pyspark.sql import DataFrame +from ...interfaces import PipelineComponentBaseInterface + + +class DataManipulationBaseInterface(PipelineComponentBaseInterface): + @abstractmethod + def filter_data(self) -> DataFrame: + pass diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py new file mode 100644 index 000000000..0d716ab8a --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .normalization import * +from .dimensionality_reduction import DimensionalityReduction +from .duplicate_detection import DuplicateDetection +from .interval_filtering import IntervalFiltering +from .k_sigma_anomaly_detection import KSigmaAnomalyDetection +from .missing_value_imputation import MissingValueImputation +from .out_of_range_value_filter import OutOfRangeValueFilter +from .flatline_filter import FlatlineFilter diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/dimensionality_reduction.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/dimensionality_reduction.py new file mode 100644 index 000000000..2009e5145 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/dimensionality_reduction.py @@ -0,0 +1,157 @@ +# Copyright 2025 Project Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pyspark.sql import DataFrame as PySparkDataFrame +from pyspark.ml.stat import Correlation +from pyspark.sql.functions import col +from pyspark.ml.feature import VectorAssembler + +from ..interfaces import DataManipulationBaseInterface +from ...._pipeline_utils.models import ( + Libraries, + SystemType, +) + + +class DimensionalityReduction(DataManipulationBaseInterface): + """ + Detects and combines columns based on correlation or exact duplicates. 
+ + Example + -------- + ```python + from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.dimensionality_reduction import DimensionalityReduction + + from pyspark.sql import SparkSession + + column_correlation_monitor = DimensionalityReduction( + df, + columns=['column1', 'column2'], + threshold=0.95, + combination_method='mean' + ) + + result = column_correlation_monitor.filter_data() + ``` + + Parameters: + df (DataFrame): PySpark DataFrame to be analyzed and transformed. + columns (list): List of column names to check for correlation. Only two columns are supported. + threshold (float, optional): Correlation threshold for column combination [0-1]. If the absolute value of the correlation is equal or bigger, than the columns are combined. Defaults to 0.9. + combination_method (str, optional): Method to combine correlated columns. + Supported methods: + - 'mean': Average the values of both columns and write the result to the first column + (New value = (column1 + column2) / 2) + - 'sum': Sum the values of both columns and write the result to the first column + (New value = column1 + column2) + - 'first': Keep the first column, drop the second column + - 'second': Keep the second column, drop the first column + - 'delete': Remove both columns entirely from the DataFrame + Defaults to 'mean'. + """ + + df: PySparkDataFrame + columns_to_check: list + threshold: float + combination_method: str + + def __init__( + self, + df: PySparkDataFrame, + columns: list, + threshold: float = 0.9, + combination_method: str = "mean", + ) -> None: + # Validate inputs + if not columns or not isinstance(columns, list): + raise ValueError("columns must be a non-empty list of column names.") + if len(columns) != 2: + raise ValueError( + "columns must contain exactly two columns for correlation." + ) + + if not 0 <= threshold <= 1: + raise ValueError("Threshold must be between 0 and 1.") + + valid_methods = ["mean", "sum", "first", "second", "delete"] + if combination_method not in valid_methods: + raise ValueError(f"combination_method must be one of {valid_methods}") + + self.df = df + self.columns_to_check = columns + self.threshold = threshold + self.combination_method = combination_method + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def _calculate_correlation(self) -> float: + """ + Calculate correlation between specified columns. + + Returns: + float: Correlation matrix between columns + """ + assembler = VectorAssembler( + inputCols=self.columns_to_check, outputCol="features" + ) + vector_df = assembler.transform(self.df) + + correlation_matrix = Correlation.corr( + vector_df, "features", method="pearson" + ).collect()[0][0] + + # Correlation between first and second column + return correlation_matrix.toArray()[0][1] + + def filter_data(self) -> PySparkDataFrame: + """ + Process DataFrame by detecting and combining correlated columns. 
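A quick worked illustration of the combination step (illustrative values; assumes an active SparkSession `spark`): the two columns below are perfectly correlated, so with the default threshold they are merged.

```python
df = spark.createDataFrame([(1.0, 3.0), (2.0, 4.0)], ["column1", "column2"])

reducer = DimensionalityReduction(
    df, columns=["column1", "column2"], threshold=0.9, combination_method="mean"
)
result = reducer.filter_data()
# Pearson correlation is 1.0 >= 0.9, so "column2" is dropped and
# "column1" now holds the row-wise means: [2.0, 3.0]
```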
+
+        Returns:
+            PySparkDataFrame: Transformed PySpark DataFrame
+        """
+        correlation = self._calculate_correlation()
+
+        # If correlation is below threshold, return original DataFrame
+        if correlation < self.threshold:
+            return self.df
+
+        col1, col2 = self.columns_to_check
+        if self.combination_method == "mean":
+            return self.df.withColumn(col1, (col(col1) + col(col2)) / 2).drop(col2)
+        elif self.combination_method == "sum":
+            return self.df.withColumn(col1, col(col1) + col(col2)).drop(col2)
+        elif self.combination_method == "first":
+            return self.df.drop(col2)
+        elif self.combination_method == "second":
+            return self.df.drop(col1)
+        elif self.combination_method == "delete":
+            return self.df.drop(col1).drop(col2)
+        else:
+            return self.df
diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/duplicate_detection.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/duplicate_detection.py
new file mode 100644
index 000000000..20df3eded
--- /dev/null
+++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/duplicate_detection.py
@@ -0,0 +1,81 @@
+# Copyright 2025 RTDIP
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pyspark.sql.functions import desc
+from pyspark.sql import DataFrame as PySparkDataFrame
+
+from ..interfaces import DataManipulationBaseInterface
+from ...input_validator import InputValidator
+from ...._pipeline_utils.models import (
+    Libraries,
+    SystemType,
+)
+
+
+class DuplicateDetection(DataManipulationBaseInterface, InputValidator):
+    """
+    Cleanses a PySpark DataFrame from duplicates.
+
+    Example
+    --------
+    ```python
+    from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.duplicate_detection import DuplicateDetection
+
+    from pyspark.sql import SparkSession
+    from pyspark.sql.dataframe import DataFrame
+
+    duplicate_detection_monitor = DuplicateDetection(df, primary_key_columns=["TagName", "EventTime"])
+
+    result = duplicate_detection_monitor.filter_data()
+    ```
+
+    Parameters:
+        df (DataFrame): PySpark DataFrame to be cleansed.
+        primary_key_columns (list): List of column names that serve as primary key for duplicate detection.
+    """
+
+    df: PySparkDataFrame
+    primary_key_columns: list
+
+    def __init__(self, df: PySparkDataFrame, primary_key_columns: list) -> None:
+        if not primary_key_columns or not isinstance(primary_key_columns, list):
+            raise ValueError(
+                "primary_key_columns must be a non-empty list of column names."
+            )
+        self.df = df
+        self.primary_key_columns = primary_key_columns
+
+    @staticmethod
+    def system_type():
+        """
+        Attributes:
+            SystemType (Environment): Requires PYSPARK
+        """
+        return SystemType.PYSPARK
+
+    @staticmethod
+    def libraries():
+        libraries = Libraries()
+        return libraries
+
+    @staticmethod
+    def settings() -> dict:
+        return {}
+
+    def filter_data(self) -> PySparkDataFrame:
+        """
+        Returns:
+            PySparkDataFrame: A cleansed PySpark DataFrame from all duplicates based on primary key columns.
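The class-level example above leaves `df` undefined; a concrete sketch of the behaviour (illustrative data; assumes an active SparkSession `spark`):

```python
data = [
    ("TAG_A", "2024-01-02 20:03:46.000", "Good", 1.0),
    ("TAG_A", "2024-01-02 20:03:46.000", "Good", 1.0),  # duplicate primary key
    ("TAG_A", "2024-01-02 20:09:58.053", "Good", 2.0),
]
df = spark.createDataFrame(data, ["TagName", "EventTime", "Status", "Value"])

deduplicated = DuplicateDetection(
    df, primary_key_columns=["TagName", "EventTime"]
).filter_data()
# deduplicated.count() == 2: one row is kept per (TagName, EventTime) pair
```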
+ """ + cleansed_df = self.df.dropDuplicates(self.primary_key_columns) + return cleansed_df diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/flatline_filter.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/flatline_filter.py new file mode 100644 index 000000000..4809dde0b --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/flatline_filter.py @@ -0,0 +1,92 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pyspark.sql import DataFrame as PySparkDataFrame + +from ...monitoring.spark.flatline_detection import FlatlineDetection +from ..interfaces import DataManipulationBaseInterface +from ...._pipeline_utils.models import ( + Libraries, + SystemType, +) + + +class FlatlineFilter(DataManipulationBaseInterface): + """ + Removes and logs rows with flatlining detected in specified columns of a PySpark DataFrame. + + Args: + df (pyspark.sql.DataFrame): The input DataFrame to process. + watch_columns (list): List of column names to monitor for flatlining (null or zero values). + tolerance_timespan (int): Maximum allowed consecutive flatlining period. Rows exceeding this period are removed. + + Example: + ```python + from pyspark.sql import SparkSession + from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.flatline_filter import FlatlineFilter + + + spark = SparkSession.builder.master("local[1]").appName("FlatlineFilterExample").getOrCreate() + + # Example DataFrame + data = [ + (1, "2024-01-02 03:49:45.000", 0.0), + (1, "2024-01-02 03:50:45.000", 0.0), + (1, "2024-01-02 03:51:45.000", 0.0), + (2, "2024-01-02 03:49:45.000", 5.0), + ] + columns = ["TagName", "EventTime", "Value"] + df = spark.createDataFrame(data, columns) + + filter_flatlining_rows = FlatlineFilter( + df=df, + watch_columns=["Value"], + tolerance_timespan=2, + ) + + result_df = filter_flatlining_rows.filter_data() + result_df.show() + ``` + """ + + def __init__( + self, df: PySparkDataFrame, watch_columns: list, tolerance_timespan: int + ) -> None: + self.df = df + self.flatline_detection = FlatlineDetection( + df=df, watch_columns=watch_columns, tolerance_timespan=tolerance_timespan + ) + + @staticmethod + def system_type(): + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def filter_data(self) -> PySparkDataFrame: + """ + Removes rows with flatlining detected. + + Returns: + pyspark.sql.DataFrame: A DataFrame without rows with flatlining detected. 
+ """ + flatlined_rows = self.flatline_detection.check_for_flatlining() + flatlined_rows = flatlined_rows.select(*self.df.columns) + return self.df.subtract(flatlined_rows) diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/gaussian_smoothing.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/gaussian_smoothing.py new file mode 100644 index 000000000..49a0cd8f7 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/gaussian_smoothing.py @@ -0,0 +1,146 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +from pyspark.sql.types import FloatType +from scipy.ndimage import gaussian_filter1d +from pyspark.sql import DataFrame as PySparkDataFrame, Window +from pyspark.sql import functions as F + +from ...._pipeline_utils.models import ( + Libraries, + SystemType, +) +from ..interfaces import DataManipulationBaseInterface + + +class GaussianSmoothing(DataManipulationBaseInterface): + """ + Applies Gaussian smoothing to a PySpark DataFrame. This method smooths the values in a specified column + using a Gaussian filter, which helps reduce noise and fluctuations in time-series or spatial data. + + The smoothing can be performed in two modes: + - **Temporal mode**: Applies smoothing along the time axis within each unique ID. + - **Spatial mode**: Applies smoothing across different IDs for the same timestamp. + + Example + -------- + ```python + from pyspark.sql import SparkSession + from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.gaussian_smoothing import GaussianSmoothing + + + spark = SparkSession.builder.getOrCreate() + df = ... # Load your PySpark DataFrame + + smoothed_df = GaussianSmoothing( + df=df, + sigma=2.0, + mode="temporal", + id_col="sensor_id", + timestamp_col="timestamp", + value_col="measurement" + ).filter_data() + + smoothed_df.show() + ``` + + Parameters: + df (PySparkDataFrame): The input PySpark DataFrame. + sigma (float): The standard deviation for the Gaussian kernel, controlling the amount of smoothing. + mode (str, optional): The smoothing mode, either `"temporal"` (default) or `"spatial"`. + id_col (str, optional): The name of the column representing unique entity IDs (default: `"id"`). + timestamp_col (str, optional): The name of the column representing timestamps (default: `"timestamp"`). + value_col (str, optional): The name of the column containing the values to be smoothed (default: `"value"`). + + Raises: + TypeError: If `df` is not a PySpark DataFrame. + ValueError: If `sigma` is not a positive number. + ValueError: If `mode` is not `"temporal"` or `"spatial"`. + ValueError: If `id_col`, `timestamp_col`, or `value_col` are not found in the DataFrame. 
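The `df = ...` placeholder in the example above could be filled in as follows (illustrative data matching the column names used there; assumes an active SparkSession `spark`):

```python
from pyspark.sql import functions as F

data = [
    ("sensor_1", "2024-01-01 00:00:00", 1.0),
    ("sensor_1", "2024-01-01 00:01:00", 5.0),
    ("sensor_1", "2024-01-01 00:02:00", 1.0),
    ("sensor_2", "2024-01-01 00:00:00", 2.0),
]
df = spark.createDataFrame(data, ["sensor_id", "timestamp", "measurement"]).withColumn(
    "timestamp", F.to_timestamp("timestamp")
)
```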
+ """ + + def __init__( + self, + df: PySparkDataFrame, + sigma: float, + mode: str = "temporal", + id_col: str = "id", + timestamp_col: str = "timestamp", + value_col: str = "value", + ) -> None: + if not isinstance(df, PySparkDataFrame): + raise TypeError("df must be a PySpark DataFrame") + if not isinstance(sigma, (int, float)) or sigma <= 0: + raise ValueError("sigma must be a positive number") + if mode not in ["temporal", "spatial"]: + raise ValueError("mode must be either 'temporal' or 'spatial'") + + if id_col not in df.columns: + raise ValueError(f"Column {id_col} not found in DataFrame") + if timestamp_col not in df.columns: + raise ValueError(f"Column {timestamp_col} not found in DataFrame") + if value_col not in df.columns: + raise ValueError(f"Column {value_col} not found in DataFrame") + + self.df = df + self.sigma = sigma + self.mode = mode + self.id_col = id_col + self.timestamp_col = timestamp_col + self.value_col = value_col + + @staticmethod + def system_type(): + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + @staticmethod + def create_gaussian_smoother(sigma_value): + def apply_gaussian(values): + if not values: + return None + values_array = np.array([float(v) for v in values]) + smoothed = gaussian_filter1d(values_array, sigma=sigma_value) + return float(smoothed[-1]) + + return apply_gaussian + + def filter_data(self) -> PySparkDataFrame: + + smooth_udf = F.udf(self.create_gaussian_smoother(self.sigma), FloatType()) + + if self.mode == "temporal": + window = ( + Window.partitionBy(self.id_col) + .orderBy(self.timestamp_col) + .rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing) + ) + else: # spatial mode + window = ( + Window.partitionBy(self.timestamp_col) + .orderBy(self.id_col) + .rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing) + ) + + collect_list_expr = F.collect_list(F.col(self.value_col)).over(window) + + return self.df.withColumn(self.value_col, smooth_udf(collect_list_expr)) diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/interval_filtering.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/interval_filtering.py new file mode 100644 index 000000000..35cf723e0 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/interval_filtering.py @@ -0,0 +1,184 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
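The `IntervalFiltering` component defined below ships without a usage example in its docstring; a minimal sketch (illustrative data; assumes an active SparkSession `spark` and string `EventTime` values):

```python
data = [
    ("TAG_A", "2024-01-02 00:00:00.000", "Good", 1.0),
    ("TAG_A", "2024-01-02 00:00:10.000", "Good", 2.0),
    ("TAG_A", "2024-01-02 00:01:00.000", "Good", 3.0),
]
df = spark.createDataFrame(data, ["TagName", "EventTime", "Status", "Value"])

interval_filtering = IntervalFiltering(
    spark=spark,
    df=df,
    interval=1,
    interval_unit="minutes",
    time_stamp_column_name="EventTime",
)
result = interval_filtering.filter_data()
# Keeps the first row and any row at least one interval after the last kept row,
# so the 00:00:10 row is dropped here.
```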
+from datetime import timedelta + +import pandas as pd +from pyspark.sql.types import StringType +from pyspark.sql import functions as F +from pyspark.sql import SparkSession +from pyspark.sql import DataFrame + +from ...._pipeline_utils.models import ( + Libraries, + SystemType, +) +from ..interfaces import DataManipulationBaseInterface +from ...input_validator import InputValidator + + +class IntervalFiltering(DataManipulationBaseInterface, InputValidator): + """ + Cleanses a DataFrame by removing rows outside a specified interval window. Supported time stamp columns are DateType and StringType. + + Parameters: + spark (SparkSession): A SparkSession object. + df (DataFrame): PySpark DataFrame to be converted + interval (int): The interval length for cleansing. + interval_unit (str): 'hours', 'minutes', 'seconds' or 'milliseconds' to specify the unit of the interval. + time_stamp_column_name (str): The name of the column containing the time stamps. Default is 'EventTime'. + tolerance (int): The tolerance for the interval. Default is None. + """ + + """ Default time stamp column name if not set in the constructor """ + DEFAULT_TIME_STAMP_COLUMN_NAME: str = "EventTime" + + def __init__( + self, + spark: SparkSession, + df: DataFrame, + interval: int, + interval_unit: str, + time_stamp_column_name: str = None, + tolerance: int = None, + ) -> None: + self.spark = spark + self.df = df + self.interval = interval + self.interval_unit = interval_unit + self.tolerance = tolerance + if time_stamp_column_name is None: + self.time_stamp_column_name = self.DEFAULT_TIME_STAMP_COLUMN_NAME + else: + self.time_stamp_column_name = time_stamp_column_name + + def filter_data(self) -> DataFrame: + """ + Filters the DataFrame based on the interval + """ + + if self.time_stamp_column_name not in self.df.columns: + raise ValueError( + f"Column {self.time_stamp_column_name} not found in the DataFrame." 
+ ) + is_string_time_stamp = isinstance( + self.df.schema[self.time_stamp_column_name].dataType, StringType + ) + + original_schema = self.df.schema + self.df = self.convert_column_to_timestamp().orderBy( + self.time_stamp_column_name + ) + + tolerance_in_ms = None + if self.tolerance is not None: + tolerance_in_ms = self.get_time_delta(self.tolerance).total_seconds() * 1000 + + time_delta_in_ms = self.get_time_delta(self.interval).total_seconds() * 1000 + + rows = self.df.collect() + last_time_stamp = rows[0][self.time_stamp_column_name] + first_row = rows[0].asDict() + + first_row[self.time_stamp_column_name] = ( + self.format_date_time_to_string(first_row[self.time_stamp_column_name]) + if is_string_time_stamp + else first_row[self.time_stamp_column_name] + ) + + cleansed_df = [first_row] + + for i in range(1, len(rows)): + current_row = rows[i] + current_time_stamp = current_row[self.time_stamp_column_name] + + if self.check_outside_of_interval( + current_time_stamp, last_time_stamp, time_delta_in_ms, tolerance_in_ms + ): + current_row_dict = current_row.asDict() + current_row_dict[self.time_stamp_column_name] = ( + self.format_date_time_to_string( + current_row_dict[self.time_stamp_column_name] + ) + if is_string_time_stamp + else current_row_dict[self.time_stamp_column_name] + ) + + cleansed_df.append(current_row_dict) + last_time_stamp = current_time_stamp + + result_df = self.spark.createDataFrame(cleansed_df, schema=original_schema) + + return result_df + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def convert_column_to_timestamp(self) -> DataFrame: + try: + return self.df.withColumn( + self.time_stamp_column_name, F.to_timestamp(self.time_stamp_column_name) + ) + except Exception as e: + raise ValueError( + f"Error converting column {self.time_stamp_column_name} to timestamp: {e}" + f"{self.df.schema[self.time_stamp_column_name].dataType} might be unsupported!" 
+ ) + + def get_time_delta(self, value: int) -> timedelta: + if self.interval_unit == "minutes": + return timedelta(minutes=value) + elif self.interval_unit == "days": + return timedelta(days=value) + elif self.interval_unit == "hours": + return timedelta(hours=value) + elif self.interval_unit == "seconds": + return timedelta(seconds=value) + elif self.interval_unit == "milliseconds": + return timedelta(milliseconds=value) + else: + raise ValueError( + "interval_unit must be either 'days', 'hours', 'minutes', 'seconds' or 'milliseconds'" + ) + + def check_outside_of_interval( + self, + current_time_stamp: pd.Timestamp, + last_time_stamp: pd.Timestamp, + time_delta_in_ms: float, + tolerance_in_ms: float, + ) -> bool: + time_difference = (current_time_stamp - last_time_stamp).total_seconds() * 1000 + if not tolerance_in_ms is None: + time_difference += tolerance_in_ms + return time_difference >= time_delta_in_ms + + def format_date_time_to_string(self, time_stamp: pd.Timestamp) -> str: + try: + return time_stamp.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + except Exception as e: + raise ValueError(f"Error converting timestamp to string: {e}") diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/k_sigma_anomaly_detection.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/k_sigma_anomaly_detection.py new file mode 100644 index 000000000..090c149a5 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/k_sigma_anomaly_detection.py @@ -0,0 +1,142 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql.functions import mean, stddev, abs, col +from ..interfaces import DataManipulationBaseInterface +from ...input_validator import InputValidator +from ...._pipeline_utils.models import ( + Libraries, + SystemType, +) +from pyspark.sql.types import ( + DoubleType, + StructType, + StructField, +) + + +class KSigmaAnomalyDetection(DataManipulationBaseInterface, InputValidator): + """ + Anomaly detection with the k-sigma method. This method either computes the mean and standard deviation, or the median and the median absolute deviation (MAD) of the data. + The k-sigma method then filters out all data points that are k times the standard deviation away from the mean, or k times the MAD away from the median. + Assuming a normal distribution, this method keeps around 99.7% of the data points when k=3 and use_median=False. + + Example + -------- + ```python + from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.k_sigma_anomaly_detection import KSigmaAnomalyDetection + + + spark = ... # SparkSession + df = ... # Get a PySpark DataFrame + + filtered_df = KSigmaAnomalyDetection( + spark, df, [""] + ).filter_data() + + filtered_df.show() + ``` + + Parameters: + spark (SparkSession): A SparkSession object. + df (DataFrame): Dataframe containing the raw data. 
+ column_names (list[str]): The names of the columns to be filtered (currently only one column is supported). + k_value (float): The number of deviations to build the threshold. + use_median (book): If True the median and the median absolute deviation (MAD) are used, instead of the mean and standard deviation. + """ + + def __init__( + self, + spark: SparkSession, + df: DataFrame, + column_names: list[str], + k_value: float = 3.0, + use_median: bool = False, + ) -> None: + if len(column_names) == 0: + raise Exception("You must provide at least one column name") + if len(column_names) > 1: + raise NotImplementedError("Multiple columns are not supported yet") + + self.column_names = column_names + self.use_median = use_median + self.spark = spark + self.df = df + self.k_value = k_value + + self.validate( + StructType( + [StructField(column, DoubleType(), True) for column in column_names] + ) + ) + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def filter_data(self) -> DataFrame: + """ + Filter anomalies based on the k-sigma rule + """ + + column_name = self.column_names[0] + mean_value, deviation = 0, 0 + + if self.use_median: + mean_value = self.df.approxQuantile(column_name, [0.5], 0.0)[0] + if mean_value is None: + raise Exception("Failed to calculate the mean value") + + df_with_deviation = self.df.withColumn( + "absolute_deviation", abs(col(column_name) - mean_value) + ) + deviation = df_with_deviation.approxQuantile( + "absolute_deviation", [0.5], 0.0 + )[0] + if deviation is None: + raise Exception("Failed to calculate the deviation value") + else: + stats = self.df.select( + mean(column_name), stddev(self.column_names[0]) + ).first() + if stats is None: + raise Exception( + "Failed to calculate the mean value and the standard deviation value" + ) + + mean_value = stats[0] + deviation = stats[1] + + shift = self.k_value * deviation + lower_bound = mean_value - shift + upper_bound = mean_value + shift + + return self.df.filter( + (self.df[column_name] >= lower_bound) + & (self.df[column_name] <= upper_bound) + ) diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/missing_value_imputation.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/missing_value_imputation.py new file mode 100644 index 000000000..955d49ea2 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/missing_value_imputation.py @@ -0,0 +1,290 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
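As a numeric illustration of the k-sigma rule implemented above (illustrative values; the median/MAD variant is shown because it is robust against the outlier):

```python
df = spark.createDataFrame([(1.0,), (2.0,), (3.0,), (4.0,), (100.0,)], ["Value"])

filtered = KSigmaAnomalyDetection(
    spark, df, column_names=["Value"], k_value=3.0, use_median=True
).filter_data()
# median = 3.0 and MAD = 1.0, so the kept range is [0.0, 6.0];
# the 100.0 outlier is removed and four rows remain.
```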
+ +from pyspark.sql import SparkSession, DataFrame as PySparkDataFrame, functions as F, Row +from pyspark.sql.functions import col, udf +from pyspark.sql.types import StringType, TimestampType, FloatType, ArrayType +from pyspark.sql.window import Window +from scipy.interpolate import UnivariateSpline +import numpy as np +from datetime import timedelta +from typing import List +from ..interfaces import DataManipulationBaseInterface +from ...input_validator import InputValidator +from ...._pipeline_utils.models import ( + Libraries, + SystemType, +) + + +class MissingValueImputation(DataManipulationBaseInterface, InputValidator): + """ + Imputes missing values in a univariate time series creating a continuous curve of data points. For that, the + time intervals of each individual source is calculated, to then insert empty records at the missing timestamps with + NaN values. Through spline interpolation the missing NaN values are calculated resulting in a consistent data set + and thus enhance your data quality. + + Example + -------- + ```python + from pyspark.sql import SparkSession + from pyspark.sql.dataframe import DataFrame + from pyspark.sql.types import StructType, StructField, StringType + from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.missing_value_imputation import ( + MissingValueImputation, + ) + + spark = spark_session() + + schema = StructType([ + StructField("TagName", StringType(), True), + StructField("EventTime", StringType(), True), + StructField("Status", StringType(), True), + StructField("Value", StringType(), True) + ]) + + data = [ + ("A2PS64V0J.:ZUX09R", "2024-01-01 03:29:21.000", "Good", "1.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 07:32:55.000", "Good", "2.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 11:36:29.000", "Good", "3.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 15:39:03.000", "Good", "4.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 19:42:37.000", "Good", "5.0"), + #("A2PS64V0J.:ZUX09R", "2024-01-01 23:46:11.000", "Good", "6.0"), # Test values + #("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "7.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "8.0"), + ] + df = spark.createDataFrame(data, schema=schema) + + missing_value_imputation = MissingValueImputation(spark, df) + result = missing_value_imputation.filter_data() + ``` + + Parameters: + df (DataFrame): Dataframe containing the raw data. 
+ tolerance_percentage (int): Percentage value that indicates how much the time series data points may vary + in each interval + """ + + df: PySparkDataFrame + + def __init__( + self, + spark: SparkSession, + df: PySparkDataFrame, + tolerance_percentage: int = 5, + ) -> None: + self.spark = spark + self.df = df + self.tolerance_percentage = tolerance_percentage + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + @staticmethod + def _impute_missing_values_sp(df) -> PySparkDataFrame: + """ + Imputes missing values by Spline Interpolation + """ + data = np.array( + df.select("Value").rdd.flatMap(lambda x: x).collect(), dtype=float + ) + mask = np.isnan(data) + + x_data = np.arange(len(data)) + y_data = data[~mask] + + spline = UnivariateSpline(x_data[~mask], y_data, s=0) + + data_imputed = data.copy() + data_imputed[mask] = spline(x_data[mask]) + data_imputed_list = data_imputed.tolist() + + imputed_rdd = df.rdd.zipWithIndex().map( + lambda row: Row( + TagName=row[0][0], + EventTime=row[0][1], + Status=row[0][2], + Value=float(data_imputed_list[row[1]]), + ) + ) + imputed_df = imputed_rdd.toDF(df.schema) + + return imputed_df + + @staticmethod + def _flag_missing_values(df, tolerance_percentage) -> PySparkDataFrame: + """ + Determines intervals of each respective source time series and inserts empty records at missing timestamps + with NaN values + """ + window_spec = Window.partitionBy("TagName").orderBy("EventTime") + + df = df.withColumn("prev_event_time", F.lag("EventTime").over(window_spec)) + df = df.withColumn( + "time_diff_seconds", + (F.unix_timestamp("EventTime") - F.unix_timestamp("prev_event_time")), + ) + + df_diff = df.filter(F.col("time_diff_seconds").isNotNull()) + interval_counts = df_diff.groupBy("time_diff_seconds").count() + most_frequent_interval = interval_counts.orderBy(F.desc("count")).first() + expected_interval = ( + most_frequent_interval["time_diff_seconds"] + if most_frequent_interval + else None + ) + + tolerance = ( + (expected_interval * tolerance_percentage) / 100 if expected_interval else 0 + ) + + existing_timestamps = ( + df.select("TagName", "EventTime") + .rdd.map(lambda row: (row["TagName"], row["EventTime"])) + .groupByKey() + .collectAsMap() + ) + + def generate_missing_timestamps(prev_event_time, event_time, tag_name): + # Check for first row + if ( + prev_event_time is None + or event_time is None + or expected_interval is None + ): + return [] + + # Check against existing timestamps to avoid duplicates + tag_timestamps = set(existing_timestamps.get(tag_name, [])) + missing_timestamps = [] + current_time = prev_event_time + + while current_time < event_time: + next_expected_time = current_time + timedelta(seconds=expected_interval) + time_diff = abs((next_expected_time - event_time).total_seconds()) + if time_diff <= tolerance: + break + if next_expected_time not in tag_timestamps: + missing_timestamps.append(next_expected_time) + current_time = next_expected_time + + return missing_timestamps + + generate_missing_timestamps_udf = udf( + generate_missing_timestamps, ArrayType(TimestampType()) + ) + + df_with_missing = df.withColumn( + "missing_timestamps", + generate_missing_timestamps_udf("prev_event_time", "EventTime", "TagName"), + ) + + df_missing_entries = df_with_missing.select( + "TagName", + 
F.explode("missing_timestamps").alias("EventTime"), + F.lit("Good").alias("Status"), + F.lit(float("nan")).cast(FloatType()).alias("Value"), + ) + + df_combined = ( + df.select("TagName", "EventTime", "Status", "Value") + .union(df_missing_entries) + .orderBy("EventTime") + ) + + return df_combined + + @staticmethod + def _is_column_type(df, column_name, data_type): + """ + Helper method for data type checking + """ + type_ = df.schema[column_name] + + return isinstance(type_.dataType, data_type) + + def filter_data(self) -> PySparkDataFrame: + """ + Imputate missing values based on [Spline Interpolation, ] + """ + if not all( + col_ in self.df.columns + for col_ in ["TagName", "EventTime", "Value", "Status"] + ): + raise ValueError("Columns not as expected") + + if not self._is_column_type(self.df, "EventTime", TimestampType): + if self._is_column_type(self.df, "EventTime", StringType): + # Attempt to parse the first format, then fallback to the second + self.df = self.df.withColumn( + "EventTime", + F.coalesce( + F.to_timestamp("EventTime", "yyyy-MM-dd HH:mm:ss.SSS"), + F.to_timestamp("EventTime", "dd.MM.yyyy HH:mm:ss"), + ), + ) + if not self._is_column_type(self.df, "Value", FloatType): + self.df = self.df.withColumn("Value", self.df["Value"].cast(FloatType())) + + dfs_by_source = self._split_by_source() + + imputed_dfs: List[PySparkDataFrame] = [] + + for source, df in dfs_by_source.items(): + # Determine, insert and flag all the missing entries + flagged_df = self._flag_missing_values(df, self.tolerance_percentage) + + # Impute the missing values of flagged entries + try: + imputed_df_sp = self._impute_missing_values_sp(flagged_df) + except Exception as e: + if flagged_df.count() != 1: # Account for single entries + raise Exception( + "Something went wrong while imputing missing values" + ) + + imputed_dfs.append(imputed_df_sp) + + result_df = imputed_dfs[0] + for df in imputed_dfs[1:]: + result_df = result_df.unionByName(df) + + return result_df + + def _split_by_source(self) -> dict: + """ + Helper method to separate individual time series based on their source + """ + tag_names = self.df.select("TagName").distinct().collect() + tag_names = [row["TagName"] for row in tag_names] + source_dict = { + tag: self.df.filter(col("TagName") == tag).orderBy("EventTime") + for tag in tag_names + } + + return source_dict diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/__init__.py new file mode 100644 index 000000000..672fdd6d3 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
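The spline step in `_impute_missing_values_sp` above can be sketched in isolation (illustrative values; `s=0` makes the spline pass through the known points):

```python
import numpy as np
from scipy.interpolate import UnivariateSpline

data = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
mask = np.isnan(data)
x = np.arange(len(data))

spline = UnivariateSpline(x[~mask], data[~mask], s=0)
data[mask] = spline(x[mask])
# data is now approximately [1.0, 2.0, 3.0, 4.0, 5.0]
```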
+ +from .denormalization import Denormalization +from .normalization_mean import NormalizationMean +from .normalization_minmax import NormalizationMinMax +from .normalization_zscore import NormalizationZScore diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/denormalization.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/denormalization.py new file mode 100644 index 000000000..3e7a7fc8b --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/denormalization.py @@ -0,0 +1,75 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pyspark.sql import DataFrame as PySparkDataFrame +from ....input_validator import InputValidator +from ...interfaces import ( + DataManipulationBaseInterface, +) +from ....._pipeline_utils.models import ( + Libraries, + SystemType, +) +from .normalization import ( + NormalizationBaseClass, +) + + +class Denormalization(DataManipulationBaseInterface, InputValidator): + """ + Applies the appropriate denormalization method to revert values to their original scale. + + Example + -------- + ```python + from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.denormalization import Denormalization + from pyspark.sql import SparkSession + from pyspark.sql.dataframe import DataFrame + + denormalization = Denormalization(normalized_df, normalization) + denormalized_df = denormalization.filter_data() + ``` + + Parameters: + df (DataFrame): PySpark DataFrame to be reverted to its original scale. + normalization_to_revert (NormalizationBaseClass): An instance of the specific normalization subclass (NormalizationZScore, NormalizationMinMax, NormalizationMean) that was originally used to normalize the data. + """ + + df: PySparkDataFrame + normalization_to_revert: NormalizationBaseClass + + def __init__( + self, df: PySparkDataFrame, normalization_to_revert: NormalizationBaseClass + ) -> None: + self.df = df + self.normalization_to_revert = normalization_to_revert + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def filter_data(self) -> PySparkDataFrame: + return self.normalization_to_revert.denormalize(self.df) diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/normalization.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/normalization.py new file mode 100644 index 000000000..dd4c3cad3 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/normalization.py @@ -0,0 +1,149 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from abc import abstractmethod +from pyspark.sql import DataFrame as PySparkDataFrame +from typing import List +from pyspark.sql.types import DoubleType, StructField, StructType +from ....input_validator import InputValidator +from ...interfaces import ( + DataManipulationBaseInterface, +) +from ....._pipeline_utils.models import ( + Libraries, + SystemType, +) + + +class NormalizationBaseClass(DataManipulationBaseInterface, InputValidator): + """ + A base class for applying normalization techniques to multiple columns in a PySpark DataFrame. + This class serves as a framework to support various normalization methods (e.g., Z-Score, Min-Max, and Mean), + with specific implementations in separate subclasses for each normalization type. + + Subclasses should implement specific normalization and denormalization methods by inheriting from this base class. + + + Example + -------- + ```python + from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization import NormalizationZScore + + from pyspark.sql import SparkSession + from pyspark.sql.dataframe import DataFrame + + normalization = NormalizationZScore(df, column_names=["value_column_1", "value_column_2"], in_place=False) + normalized_df = normalization.filter_data() + ``` + + Parameters: + df (DataFrame): PySpark DataFrame to be normalized. + column_names (List[str]): List of columns in the DataFrame to be normalized. + in_place (bool): If true, then result of normalization is stored in the same column. + + Attributes: + NORMALIZATION_NAME_POSTFIX : str + Suffix added to the column name if a new column is created for normalized values. + + """ + + df: PySparkDataFrame + column_names: List[str] + in_place: bool + + reversal_value: List[float] + + # Appended to column name if new column is added + NORMALIZATION_NAME_POSTFIX: str = "normalization" + + def __init__( + self, df: PySparkDataFrame, column_names: List[str], in_place: bool = False + ) -> None: + self.df = df + self.column_names = column_names + self.in_place = in_place + + EXPECTED_SCHEMA = StructType( + [StructField(column_name, DoubleType()) for column_name in column_names] + ) + self.validate(EXPECTED_SCHEMA) + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def filter_data(self): + return self.normalize() + + def normalize(self) -> PySparkDataFrame: + """ + Applies the specified normalization to each column in column_names. + + Returns: + DataFrame: A PySpark DataFrame with the normalized values. + """ + normalized_df = self.df + for column in self.column_names: + normalized_df = self._normalize_column(normalized_df, column) + return normalized_df + + def denormalize(self, input_df) -> PySparkDataFrame: + """ + Denormalizes the input DataFrame. Intended to be used by the denormalization component. + + Parameters: + input_df (DataFrame): Dataframe containing the current data. 
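A typical round trip via the `Denormalization` component (illustrative; `df` is assumed to hold a numeric `Value` column):

```python
normalization = NormalizationZScore(df, column_names=["Value"], in_place=True)
normalized_df = normalization.filter_data()

denormalized_df = Denormalization(normalized_df, normalization).filter_data()
# denormalized_df holds the original scale again
```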
+ """ + denormalized_df = input_df + if not self.in_place: + for column in self.column_names: + denormalized_df = denormalized_df.drop( + self._get_norm_column_name(column) + ) + else: + for column in self.column_names: + denormalized_df = self._denormalize_column(denormalized_df, column) + return denormalized_df + + @property + @abstractmethod + def NORMALIZED_COLUMN_NAME(self): ... + + @abstractmethod + def _normalize_column(self, df: PySparkDataFrame, column: str) -> PySparkDataFrame: + pass + + @abstractmethod + def _denormalize_column( + self, df: PySparkDataFrame, column: str + ) -> PySparkDataFrame: + pass + + def _get_norm_column_name(self, column_name: str) -> str: + if not self.in_place: + return f"{column_name}_{self.NORMALIZED_COLUMN_NAME}_{self.NORMALIZATION_NAME_POSTFIX}" + else: + return column_name diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/normalization_mean.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/normalization_mean.py new file mode 100644 index 000000000..55f29de37 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/normalization_mean.py @@ -0,0 +1,81 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +from .normalization import NormalizationBaseClass +from pyspark.sql import DataFrame as PySparkDataFrame +from pyspark.sql import functions as F + + +class NormalizationMean(NormalizationBaseClass): + """ + Implements mean normalization for specified columns in a PySpark DataFrame. + + Example + -------- + ```python + from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization_mean import NormalizationMean + from pyspark.sql import SparkSession + from pyspark.sql.dataframe import DataFrame + + normalization = NormalizationMean(df, column_names=["value_column_1", "value_column_2"], in_place=False) + normalized_df = normalization.filter_data() + ``` + + Parameters: + df (DataFrame): PySpark DataFrame to be normalized. + column_names (List[str]): List of columns in the DataFrame to be normalized. + in_place (bool): If true, then result of normalization is stored in the same column. + """ + + NORMALIZED_COLUMN_NAME = "mean" + + def _normalize_column(self, df: PySparkDataFrame, column: str) -> PySparkDataFrame: + """ + Private method to apply Mean normalization to the specified column. 
+ Mean normalization: (value - mean) / (max - min) + """ + mean_val = df.select(F.mean(F.col(column))).collect()[0][0] + min_val = df.select(F.min(F.col(column))).collect()[0][0] + max_val = df.select(F.max(F.col(column))).collect()[0][0] + + divisor = max_val - min_val + if math.isclose(divisor, 0.0, abs_tol=10e-8) or not math.isfinite(divisor): + raise ZeroDivisionError("Division by Zero in Mean") + + store_column = self._get_norm_column_name(column) + self.reversal_value = [mean_val, min_val, max_val] + + return df.withColumn( + store_column, + (F.col(column) - F.lit(mean_val)) / (F.lit(max_val) - F.lit(min_val)), + ) + + def _denormalize_column( + self, df: PySparkDataFrame, column: str + ) -> PySparkDataFrame: + """ + Private method to revert Mean normalization to the specified column. + Mean denormalization: normalized_value * (max - min) + mean = value + """ + mean_val = self.reversal_value[0] + min_val = self.reversal_value[1] + max_val = self.reversal_value[2] + + store_column = self._get_norm_column_name(column) + + return df.withColumn( + store_column, + F.col(column) * (F.lit(max_val) - F.lit(min_val)) + F.lit(mean_val), + ) diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/normalization_minmax.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/normalization_minmax.py new file mode 100644 index 000000000..0c2ad583a --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/normalization_minmax.py @@ -0,0 +1,79 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +from .normalization import NormalizationBaseClass +from pyspark.sql import DataFrame as PySparkDataFrame +from pyspark.sql import functions as F + + +class NormalizationMinMax(NormalizationBaseClass): + """ + Implements Min-Max normalization for specified columns in a PySpark DataFrame. + + Example + -------- + ```python + from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization_minmax import NormalizationMinMax + from pyspark.sql import SparkSession + from pyspark.sql.dataframe import DataFrame + + normalization = NormalizationMinMax(df, column_names=["value_column_1", "value_column_2"], in_place=False) + normalized_df = normalization.filter_data() + ``` + + Parameters: + df (DataFrame): PySpark DataFrame to be normalized. + column_names (List[str]): List of columns in the DataFrame to be normalized. + in_place (bool): If true, then result of normalization is stored in the same column. + """ + + NORMALIZED_COLUMN_NAME = "minmax" + + def _normalize_column(self, df: PySparkDataFrame, column: str) -> PySparkDataFrame: + """ + Private method to revert Min-Max normalization to the specified column. 
+ Min-Max denormalization: normalized_value * (max - min) + min = value + """ + min_val = df.select(F.min(F.col(column))).collect()[0][0] + max_val = df.select(F.max(F.col(column))).collect()[0][0] + + divisor = max_val - min_val + if math.isclose(divisor, 0.0, abs_tol=10e-8) or not math.isfinite(divisor): + raise ZeroDivisionError("Division by Zero in MinMax") + + store_column = self._get_norm_column_name(column) + self.reversal_value = [min_val, max_val] + + return df.withColumn( + store_column, + (F.col(column) - F.lit(min_val)) / (F.lit(max_val) - F.lit(min_val)), + ) + + def _denormalize_column( + self, df: PySparkDataFrame, column: str + ) -> PySparkDataFrame: + """ + Private method to revert Z-Score normalization to the specified column. + Z-Score denormalization: normalized_value * std_dev + mean = value + """ + min_val = self.reversal_value[0] + max_val = self.reversal_value[1] + + store_column = self._get_norm_column_name(column) + + return df.withColumn( + store_column, + (F.col(column) * (F.lit(max_val) - F.lit(min_val))) + F.lit(min_val), + ) diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/normalization_zscore.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/normalization_zscore.py new file mode 100644 index 000000000..da13aaac9 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/normalization_zscore.py @@ -0,0 +1,78 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +from .normalization import NormalizationBaseClass +from pyspark.sql import DataFrame as PySparkDataFrame +from pyspark.sql import functions as F + + +class NormalizationZScore(NormalizationBaseClass): + """ + Implements Z-Score normalization for specified columns in a PySpark DataFrame. + + Example + -------- + ```python + from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization_zscore import NormalizationZScore + from pyspark.sql import SparkSession + from pyspark.sql.dataframe import DataFrame + + normalization = NormalizationZScore(df, column_names=["value_column_1", "value_column_2"], in_place=False) + normalized_df = normalization.filter_data() + ``` + + Parameters: + df (DataFrame): PySpark DataFrame to be normalized. + column_names (List[str]): List of columns in the DataFrame to be normalized. + in_place (bool): If true, then result of normalization is stored in the same column. + """ + + NORMALIZED_COLUMN_NAME = "zscore" + + def _normalize_column(self, df: PySparkDataFrame, column: str) -> PySparkDataFrame: + """ + Private method to apply Z-Score normalization to the specified column. 
+ Z-Score normalization: (value - mean) / std_dev + """ + mean_val = df.select(F.mean(F.col(column))).collect()[0][0] + std_dev_val = df.select(F.stddev(F.col(column))).collect()[0][0] + + if math.isclose(std_dev_val, 0.0, abs_tol=10e-8) or not math.isfinite( + std_dev_val + ): + raise ZeroDivisionError("Division by Zero in ZScore") + + store_column = self._get_norm_column_name(column) + self.reversal_value = [mean_val, std_dev_val] + + return df.withColumn( + store_column, (F.col(column) - F.lit(mean_val)) / F.lit(std_dev_val) + ) + + def _denormalize_column( + self, df: PySparkDataFrame, column: str + ) -> PySparkDataFrame: + """ + Private method to revert Z-Score normalization to the specified column. + Z-Score denormalization: normalized_value * std_dev + mean = value + """ + mean_val = self.reversal_value[0] + std_dev_val = self.reversal_value[1] + + store_column = self._get_norm_column_name(column) + + return df.withColumn( + store_column, F.col(column) * F.lit(std_dev_val) + F.lit(mean_val) + ) diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/out_of_range_value_filter.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/out_of_range_value_filter.py new file mode 100644 index 000000000..8f9b80115 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/out_of_range_value_filter.py @@ -0,0 +1,127 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from pyspark.sql import DataFrame as PySparkDataFrame +from ...monitoring.spark.check_value_ranges import CheckValueRanges +from ..interfaces import DataManipulationBaseInterface +from ...._pipeline_utils.models import ( + Libraries, + SystemType, +) + + +class OutOfRangeValueFilter(DataManipulationBaseInterface): + """ + Filters data in a DataFrame by checking the 'Value' column against expected ranges for specified TagNames. + Logs events when 'Value' exceeds the defined ranges for any TagName and deletes the rows. + + Args: + df (pyspark.sql.DataFrame): The DataFrame to monitor. + tag_ranges (dict): A dictionary where keys are TagNames and values are dictionaries specifying 'min' and/or + 'max', and optionally 'inclusive_bounds' values. 
+ Example: + { + 'A2PS64V0J.:ZUX09R': {'min': 0, 'max': 100, 'inclusive_bounds': True}, + 'B3TS64V0K.:ZUX09R': {'min': 10, 'max': 200, 'inclusive_bounds': False}, + } + + Example: + ```python + from pyspark.sql import SparkSession + from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.out_of_range_value_filter import OutOfRangeValueFilter + + + spark = SparkSession.builder.master("local[1]").appName("DeleteOutOfRangeValuesExample").getOrCreate() + + data = [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", 25.0), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", -5.0), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", 50.0), + ("B3TS64V0K.:ZUX09R", "2024-01-02 16:00:12.000", "Good", 80.0), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", 100.0), + ] + + columns = ["TagName", "EventTime", "Status", "Value"] + + df = spark.createDataFrame(data, columns) + + tag_ranges = { + "A2PS64V0J.:ZUX09R": {"min": 0, "max": 50, "inclusive_bounds": True}, + "B3TS64V0K.:ZUX09R": {"min": 50, "max": 100, "inclusive_bounds": False}, + } + + out_of_range_value_filter = OutOfRangeValueFilter( + df=df, + tag_ranges=tag_ranges, + ) + + result_df = out_of_range_value_filter.filter_data() + ``` + """ + + df: PySparkDataFrame + + def __init__( + self, + df: PySparkDataFrame, + tag_ranges: dict, + ) -> None: + self.df = df + self.check_value_ranges = CheckValueRanges(df=df, tag_ranges=tag_ranges) + + # Configure logging + self.logger = logging.getLogger(self.__class__.__name__) + if not self.logger.handlers: + handler = logging.StreamHandler() + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + handler.setFormatter(formatter) + self.logger.addHandler(handler) + self.logger.setLevel(logging.INFO) + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def filter_data(self) -> PySparkDataFrame: + """ + Executes the value range checking logic for the specified TagNames. Identifies, logs and deletes any rows + where 'Value' exceeds the defined ranges for each TagName. + + Returns: + pyspark.sql.DataFrame: + Returns a PySpark DataFrame without the rows that were out of range. + """ + out_of_range_df = self.check_value_ranges.check_for_out_of_range() + + if out_of_range_df.count() > 0: + self.check_value_ranges.log_out_of_range_values(out_of_range_df) + else: + self.logger.info(f"No out of range values found in 'Value' column.") + return self.df.subtract(out_of_range_df) diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/input_validator.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/input_validator.py new file mode 100644 index 000000000..434113cf0 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/input_validator.py @@ -0,0 +1,171 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pyspark.sql.types import DataType, StructType
+from pyspark.sql import functions as F
+from pyspark.sql import DataFrame as SparkDataFrame
+from ..interfaces import PipelineComponentBaseInterface
+from .._pipeline_utils.models import (
+    Libraries,
+    SystemType,
+)
+
+
+class InputValidator(PipelineComponentBaseInterface):
+    """
+    Validates the PySpark DataFrame of the respective child class instance against a schema dictionary or pyspark
+    StructType. Checks for column availability and column data types. If data types differ, it tries to cast the
+    column into the expected data type. Casts "None", "none", "Null", "null" and "" to None. Raises an error if any step fails.
+
+    Example:
+    --------
+    ```python
+    from pyspark.sql import SparkSession
+    from pyspark.sql.types import StructType, StructField, StringType, TimestampType, FloatType
+    from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.missing_value_imputation import (
+        MissingValueImputation,
+    )
+
+    spark = SparkSession.builder.master("local[2]").appName("test").getOrCreate()
+
+    test_schema = StructType(
+        [
+            StructField("TagName", StringType(), True),
+            StructField("EventTime", StringType(), True),
+            StructField("Status", StringType(), True),
+            StructField("Value", StringType(), True),
+        ]
+    )
+    expected_schema = StructType(
+        [
+            StructField("TagName", StringType(), True),
+            StructField("EventTime", TimestampType(), True),
+            StructField("Status", StringType(), True),
+            StructField("Value", FloatType(), True),
+        ]
+    )
+
+    test_data = [
+        ("A2PS64V0J.:ZUX09R", "2024-01-01 03:29:21.000", "Good", "1.0"),
+        ("A2PS64V0J.:ZUX09R", "2024-01-01 07:32:55.000", "Good", "2.0"),
+        ("A2PS64V0J.:ZUX09R", "2024-01-01 11:36:29.000", "Good", "3.0"),
+    ]
+
+    test_df = spark.createDataFrame(test_data, schema=test_schema)
+    test_component = MissingValueImputation(spark, test_df)
+
+    print(test_component.validate(expected_schema)) # True
+    ```
+
+    Parameters:
+        schema_dict: dict or pyspark StructType
+            A dictionary where keys are column names, and values are expected PySpark data types.
+            Example: {"column1": StringType(), "column2": IntegerType()}
+
+    Returns:
+        True: if the data is valid; otherwise an error is raised.
+
+    Raises:
+        ValueError: If a column is missing or has a mismatched pyspark data type.
+        TypeError: If a column does not hold or specify a pyspark data type.
+    """
+
+    @staticmethod
+    def system_type():
+        """
+        Attributes:
+            SystemType (Environment): Requires PYSPARK
+        """
+        return SystemType.PYSPARK
+
+    @staticmethod
+    def libraries():
+        libraries = Libraries()
+        return libraries
+
+    @staticmethod
+    def settings() -> dict:
+        return {}
+
+    def validate(self, schema_dict, df: SparkDataFrame = None):
+        """
+        Used by child data quality utility classes to validate the input data.
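+        The validated (and cast) DataFrame is written back to self.df; if df is not supplied, the instance's own df attribute is validated. Returns True on success and raises ValueError or TypeError otherwise.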
+ """ + if df is None: + dataframe = getattr(self, "df", None) + + if isinstance(schema_dict, StructType): + schema_dict = {field.name: field.dataType for field in schema_dict.fields} + + dataframe_schema = { + field.name: field.dataType for field in dataframe.schema.fields + } + + for column, expected_type in schema_dict.items(): + if column in dataframe.columns: + dataframe = dataframe.withColumn( + column, + F.when( + F.col(column).isin("None", "none", "null", "Null", ""), None + ).otherwise(F.col(column)), + ) + + for column, expected_type in schema_dict.items(): + # Check if the column exists + if column not in dataframe_schema: + raise ValueError(f"Column '{column}' is missing in the DataFrame.") + + # Check if both types are of a pyspark data type + actual_type = dataframe_schema[column] + if not isinstance(actual_type, DataType) or not isinstance( + expected_type, DataType + ): + raise TypeError( + "Expected and actual types must be instances of pyspark.sql.types.DataType." + ) + + # Check if actual type is expected type, try to cast else + dataframe = self.cast_column_if_needed( + dataframe, column, expected_type, actual_type + ) + + self.df = dataframe + return True + + def cast_column_if_needed(self, dataframe, column, expected_type, actual_type): + if not isinstance(actual_type, type(expected_type)): + try: + original_null_count = dataframe.filter(F.col(column).isNull()).count() + casted_column = dataframe.withColumn( + column, F.col(column).cast(expected_type) + ) + new_null_count = casted_column.filter(F.col(column).isNull()).count() + + if new_null_count > original_null_count: + raise ValueError( + f"Column '{column}' cannot be cast to {expected_type}." + ) + dataframe = casted_column + except Exception as e: + raise ValueError( + f"Error during casting column '{column}' to {expected_type}: {str(e)}" + ) + + return dataframe diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__.py new file mode 100644 index 000000000..76bb6a388 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .spark import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/interfaces.py similarity index 80% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/interfaces.py index 2c446c5bc..34176beeb 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/interfaces.py @@ -13,8 +13,13 @@ # limitations under the License. 
from abc import abstractmethod -from ..interfaces import PipelineComponentBaseInterface + +from pyspark.sql import DataFrame + +from ...interfaces import PipelineComponentBaseInterface class MonitoringBaseInterface(PipelineComponentBaseInterface): - pass + @abstractmethod + def check(self) -> DataFrame: + pass diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py new file mode 100644 index 000000000..50c574207 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys + +from .check_value_ranges import CheckValueRanges +from .flatline_detection import FlatlineDetection + +if "great_expectations" in sys.modules: + from .great_expectations_data_quality import GreatExpectationsDataQuality +from .identify_missing_data_interval import IdentifyMissingDataInterval +from .identify_missing_data_pattern import IdentifyMissingDataPattern diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/check_value_ranges.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/check_value_ranges.py new file mode 100644 index 000000000..f226f4561 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/check_value_ranges.py @@ -0,0 +1,260 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from pyspark.sql import DataFrame as PySparkDataFrame +from pyspark.sql.functions import col +from pyspark.sql.types import ( + StructType, + StructField, + StringType, + TimestampType, + FloatType, +) +from functools import reduce +from operator import or_ +from ..interfaces import MonitoringBaseInterface +from ...._pipeline_utils.models import ( + Libraries, + SystemType, +) +from ...input_validator import InputValidator + + +class CheckValueRanges(MonitoringBaseInterface, InputValidator): + """ + Monitors data in a DataFrame by checking the 'Value' column against expected ranges for specified TagNames. + Logs events when 'Value' exceeds the defined ranges for any TagName. + + Args: + df (pyspark.sql.DataFrame): The DataFrame to monitor. + tag_ranges (dict): A dictionary where keys are TagNames and values are dictionaries specifying 'min' and/or + 'max', and optionally 'inclusive_bounds' values. 
+ Example: + { + 'A2PS64V0J.:ZUX09R': {'min': 0, 'max': 100, 'inclusive_bounds': True}, + 'B3TS64V0K.:ZUX09R': {'min': 10, 'max': 200, 'inclusive_bounds': False}, + } + + Example: + ```python + from pyspark.sql import SparkSession + from rtdip_sdk.pipelines.data_quality.monitoring.spark.check_value_ranges import CheckValueRanges + + + spark = SparkSession.builder.master("local[1]").appName("CheckValueRangesExample").getOrCreate() + + data = [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", 25.0), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", -5.0), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", 50.0), + ("B3TS64V0K.:ZUX09R", "2024-01-02 16:00:12.000", "Good", 80.0), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", 100.0), + ] + + columns = ["TagName", "EventTime", "Status", "Value"] + + df = spark.createDataFrame(data, columns) + + tag_ranges = { + "A2PS64V0J.:ZUX09R": {"min": 0, "max": 50, "inclusive_bounds": True}, + "B3TS64V0K.:ZUX09R": {"min": 50, "max": 100, "inclusive_bounds": False}, + } + + check_value_ranges = CheckValueRanges( + df=df, + tag_ranges=tag_ranges, + ) + + result_df = check_value_ranges.check() + ``` + """ + + df: PySparkDataFrame + tag_ranges: dict + EXPECTED_SCHEMA = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] + ) + + def __init__( + self, + df: PySparkDataFrame, + tag_ranges: dict, + ) -> None: + self.df = df + self.validate(self.EXPECTED_SCHEMA) + self.tag_ranges = tag_ranges + + # Configure logging + self.logger = logging.getLogger(self.__class__.__name__) + if not self.logger.handlers: + handler = logging.StreamHandler() + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + handler.setFormatter(formatter) + self.logger.addHandler(handler) + self.logger.setLevel(logging.INFO) + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def check(self) -> PySparkDataFrame: + """ + Executes the value range checking logic for the specified TagNames. Identifies and logs any rows + where 'Value' exceeds the defined ranges for each TagName. + + Returns: + pyspark.sql.DataFrame: + Returns the original PySpark DataFrame without changes. + """ + out_of_range_df = self.check_for_out_of_range() + + if out_of_range_df.count() > 0: + self.log_out_of_range_values(out_of_range_df) + else: + self.logger.info(f"No out of range values found in 'Value' column.") + + return self.df + + def check_for_out_of_range(self) -> PySparkDataFrame: + """ + Identifies rows where 'Value' exceeds defined ranges. + + Returns: + pyspark.sql.DataFrame: A DataFrame containing rows with out-of-range values. 
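+            The result keeps the input schema, so callers such as OutOfRangeValueFilter can subtract it from the original DataFrame.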
+ """ + + self._validate_inputs() + + out_of_range_df = self.df.filter("1=0") + + for tag_name, range_dict in self.tag_ranges.items(): + df = self.df.filter(col("TagName") == tag_name) + + if df.count() == 0: + self.logger.warning(f"No data found for TagName '{tag_name}'.") + continue + + min_value = range_dict.get("min", None) + max_value = range_dict.get("max", None) + inclusive_bounds = range_dict.get("inclusive_bounds", True) + + conditions = [] + + # Build minimum value condition + self.add_min_value_condition(min_value, inclusive_bounds, conditions) + + # Build maximum value condition + self.add_max_value_condition(max_value, inclusive_bounds, conditions) + + if conditions: + condition = reduce(or_, conditions) + tag_out_of_range_df = df.filter(condition) + out_of_range_df = out_of_range_df.union(tag_out_of_range_df) + + return out_of_range_df + + def add_min_value_condition(self, min_value, inclusive_bounds, conditions): + if min_value is not None: + if inclusive_bounds: + min_condition = col("Value") < min_value + else: + min_condition = col("Value") <= min_value + conditions.append(min_condition) + + def add_max_value_condition(self, max_value, inclusive_bounds, conditions): + if max_value is not None: + if inclusive_bounds: + max_condition = col("Value") > max_value + else: + max_condition = col("Value") >= max_value + conditions.append(max_condition) + + def log_out_of_range_values(self, out_of_range_df: PySparkDataFrame): + """ + Logs out-of-range values for all TagNames. + """ + for tag_name in ( + out_of_range_df.select("TagName") + .distinct() + .rdd.map(lambda row: row[0]) + .collect() + ): + tag_out_of_range_df = out_of_range_df.filter(col("TagName") == tag_name) + count = tag_out_of_range_df.count() + self.logger.info( + f"Found {count} rows in 'Value' column for TagName '{tag_name}' out of range." + ) + for row in tag_out_of_range_df.collect(): + self.logger.info(f"Out of range row for TagName '{tag_name}': {row}") + + def _validate_inputs(self): + if not isinstance(self.tag_ranges, dict): + raise TypeError("tag_ranges must be a dictionary.") + + available_tags = ( + self.df.select("TagName").distinct().rdd.map(lambda row: row[0]).collect() + ) + + for tag_name, range_dict in self.tag_ranges.items(): + self.validate_tag_name(available_tags, tag_name, range_dict) + + inclusive_bounds = range_dict.get("inclusive_bounds", True) + if not isinstance(inclusive_bounds, bool): + raise ValueError( + f"Inclusive_bounds for TagName '{tag_name}' must be a boolean." + ) + + min_value = range_dict.get("min", None) + max_value = range_dict.get("max", None) + if min_value is not None and not isinstance(min_value, (int, float)): + raise ValueError( + f"Minimum value for TagName '{tag_name}' must be a number." + ) + if max_value is not None and not isinstance(max_value, (int, float)): + raise ValueError( + f"Maximum value for TagName '{tag_name}' must be a number." + ) + + def validate_tag_name(self, available_tags, tag_name, range_dict): + if not isinstance(tag_name, str): + raise ValueError(f"TagName '{tag_name}' must be a string.") + + if tag_name not in available_tags: + raise ValueError(f"TagName '{tag_name}' not found in DataFrame.") + + if "min" not in range_dict and "max" not in range_dict: + raise ValueError( + f"TagName '{tag_name}' must have at least 'min' or 'max' specified." 
+ ) diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/flatline_detection.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/flatline_detection.py new file mode 100644 index 000000000..41e75c10c --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/flatline_detection.py @@ -0,0 +1,234 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +import logging +from pyspark.sql import DataFrame as PySparkDataFrame +from pyspark.sql.functions import col, when, lag, sum, lit, abs +from pyspark.sql.window import Window +from pyspark.sql.types import ( + StructType, + StructField, + StringType, + TimestampType, + FloatType, +) + +from ..interfaces import MonitoringBaseInterface +from ...._pipeline_utils.models import ( + Libraries, + SystemType, +) +from ...input_validator import InputValidator + + +class FlatlineDetection(MonitoringBaseInterface, InputValidator): + """ + Detects flatlining in specified columns of a PySpark DataFrame and logs warnings. + + Flatlining occurs when a column contains consecutive null or zero values exceeding a specified tolerance period. + This class identifies such occurrences and logs the rows where flatlining is detected. + + Args: + df (pyspark.sql.DataFrame): The input DataFrame to monitor for flatlining. + watch_columns (list): List of column names to monitor for flatlining (null or zero values). + tolerance_timespan (int): Maximum allowed consecutive flatlining period. If exceeded, a warning is logged. 
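+            The period is counted in consecutive rows per TagName (ordered by EventTime), not in wall-clock time.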
+
+    Example:
+    ```python
+    from rtdip_sdk.pipelines.data_quality.monitoring.spark.flatline_detection import FlatlineDetection
+
+    from pyspark.sql import SparkSession
+
+    spark = SparkSession.builder.master("local[1]").appName("FlatlineDetectionExample").getOrCreate()
+
+    # Example DataFrame
+    data = [
+        ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", 1.0),
+        ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", 0.0),
+        ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", 0.0),
+        ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", 0.0),
+        ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", 5.0),
+    ]
+    columns = ["TagName", "EventTime", "Status", "Value"]
+    df = spark.createDataFrame(data, columns)
+
+    # Initialize FlatlineDetection
+    flatline_detection = FlatlineDetection(
+        df,
+        watch_columns=["Value"],
+        tolerance_timespan=2
+    )
+
+    # Detect flatlining
+    flatline_detection.check()
+    ```
+    """
+
+    df: PySparkDataFrame
+    watch_columns: list
+    tolerance_timespan: int
+    EXPECTED_SCHEMA = StructType(
+        [
+            StructField("TagName", StringType(), True),
+            StructField("EventTime", TimestampType(), True),
+            StructField("Status", StringType(), True),
+            StructField("Value", FloatType(), True),
+        ]
+    )
+
+    def __init__(
+        self, df: PySparkDataFrame, watch_columns: list, tolerance_timespan: int
+    ) -> None:
+        if not watch_columns or not isinstance(watch_columns, list):
+            raise ValueError("watch_columns must be a non-empty list of column names.")
+        if not isinstance(tolerance_timespan, int) or tolerance_timespan <= 0:
+            raise ValueError("tolerance_timespan must be a positive integer.")
+
+        self.df = df
+        self.validate(self.EXPECTED_SCHEMA)
+        self.watch_columns = watch_columns
+        self.tolerance_timespan = tolerance_timespan
+
+        self.logger = logging.getLogger(self.__class__.__name__)
+        if not self.logger.handlers:
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter(
+                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+            )
+            handler.setFormatter(formatter)
+            self.logger.addHandler(handler)
+            self.logger.setLevel(logging.INFO)
+
+    @staticmethod
+    def system_type():
+        """
+        Attributes:
+            SystemType (Environment): Requires PYSPARK
+        """
+        return SystemType.PYSPARK
+
+    @staticmethod
+    def libraries():
+        libraries = Libraries()
+        return libraries
+
+    @staticmethod
+    def settings() -> dict:
+        return {}
+
+    def check(self) -> PySparkDataFrame:
+        """
+        Detects flatlining and logs relevant rows.
+
+        Returns:
+            pyspark.sql.DataFrame: Returns the original PySpark DataFrame without changes.
+        """
+        flatlined_rows = self.check_for_flatlining()
+        print("Flatlined Rows:")
+        flatlined_rows.show(truncate=False)
+        self.log_flatlining_rows(flatlined_rows)
+        return self.df
+
+    def check_for_flatlining(self) -> PySparkDataFrame:
+        """
+        Identifies rows with flatlining based on the specified columns and tolerance.
+
+        Returns:
+            pyspark.sql.DataFrame: A DataFrame containing rows with flatlining detected.
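+            Rows are returned together with the flatline flag and group helper columns used during detection.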
+ """ + partition_column = "TagName" + sort_column = "EventTime" + window_spec = Window.partitionBy(partition_column).orderBy(sort_column) + + # Start with an empty DataFrame, ensure it has the required schema + flatlined_rows = ( + self.df.withColumn("Value_flatline_flag", lit(None).cast("int")) + .withColumn("Value_group", lit(None).cast("bigint")) + .filter("1=0") + ) + + for column in self.watch_columns: + flagged_column = f"{column}_flatline_flag" + group_column = f"{column}_group" + + # Add flag and group columns + df_with_flags = self.df.withColumn( + flagged_column, + when( + (col(column).isNull()) | (abs(col(column) - 0.0) <= 1e-09), + 1, + ).otherwise(0), + ).withColumn( + group_column, + sum( + when( + col(flagged_column) + != lag(col(flagged_column), 1, 0).over(window_spec), + 1, + ).otherwise(0) + ).over(window_spec), + ) + + # Identify flatlining groups + group_counts = ( + df_with_flags.filter(col(flagged_column) == 1) + .groupBy(group_column) + .count() + ) + large_groups = group_counts.filter(col("count") > self.tolerance_timespan) + large_group_ids = [row[group_column] for row in large_groups.collect()] + + if large_group_ids: + relevant_rows = df_with_flags.filter( + col(group_column).isin(large_group_ids) + ) + + # Ensure both DataFrames have the same columns + for col_name in flatlined_rows.columns: + if col_name not in relevant_rows.columns: + relevant_rows = relevant_rows.withColumn(col_name, lit(None)) + + flatlined_rows = flatlined_rows.union(relevant_rows) + + return flatlined_rows + + def log_flatlining_rows(self, flatlined_rows: PySparkDataFrame): + """ + Logs flatlining rows for all monitored columns. + + Args: + flatlined_rows (pyspark.sql.DataFrame): The DataFrame containing rows with flatlining detected. + """ + if flatlined_rows.count() == 0: + self.logger.info("No flatlining detected.") + return + + for column in self.watch_columns: + flagged_column = f"{column}_flatline_flag" + + if flagged_column not in flatlined_rows.columns: + self.logger.warning( + f"Expected column '{flagged_column}' not found in DataFrame." + ) + continue + + relevant_rows = flatlined_rows.filter(col(flagged_column) == 1).collect() + + if relevant_rows: + for row in relevant_rows: + self.logger.warning( + f"Flatlining detected in column '{column}' at row: {row}." 
+ ) + else: + self.logger.info(f"No flatlining detected in column '{column}'.") diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/great_expectations_data_quality.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/great_expectations_data_quality.py similarity index 93% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/great_expectations_data_quality.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/great_expectations_data_quality.py index f8022e41c..4aed6a90c 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/great_expectations_data_quality.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/great_expectations_data_quality.py @@ -14,25 +14,29 @@ import great_expectations as gx from pyspark.sql import DataFrame, SparkSession -from ...interfaces import MonitoringBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from ..interfaces import MonitoringBaseInterface +from ...._pipeline_utils.models import ( + Libraries, + SystemType, +) from great_expectations.checkpoint import ( Checkpoint, ) from great_expectations.expectations.expectation import ( ExpectationConfiguration, ) +from ...input_validator import InputValidator # Create a new context -class GreatExpectationsDataQuality(MonitoringBaseInterface): +class GreatExpectationsDataQuality(MonitoringBaseInterface, InputValidator): """ Data Quality Monitoring using Great Expectations allowing you to create and check your data quality expectations. Example -------- ```python - from src.sdk.python.rtdip_sdk.monitoring.data_quality.great_expectations.python.great_expectations_data_quality import GreatExpectationsDataQuality + from src.sdk.python.rtdip_sdk.monitoring.data_manipulation.great_expectations.python.great_expectations_data_quality import GreatExpectationsDataQuality from rtdip_sdk.pipelines.utilities import SparkSessionUtility import json @@ -74,7 +78,7 @@ class GreatExpectationsDataQuality(MonitoringBaseInterface): GX.display_expectations(suite) - #Run the Data Quality Check by Validating your data against set expecations in the suite + #Run the Data Quality Check by Validating your data against set expectations in the suite checkpoint_name = "checkpoint_name" run_name_template = "run_name_template" @@ -215,7 +219,7 @@ def check( action_list: list, ): """ - Validate your data against set expecations in the suite + Validate your data against set expectations in the suite Args: checkpoint_name (str): The name of the checkpoint. run_name_template (str): The name of the run. diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.py new file mode 100644 index 000000000..f91ce5f17 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.py @@ -0,0 +1,218 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from pyspark.sql import DataFrame as PySparkDataFrame +from pyspark.sql import functions as F +from pyspark.sql.window import Window +from pyspark.sql.types import ( + StructType, + StructField, + StringType, + TimestampType, + FloatType, +) + +from ..interfaces import MonitoringBaseInterface +from ...._pipeline_utils.models import ( + Libraries, + SystemType, +) +from ....utilities.spark.time_string_parsing import parse_time_string_to_ms +from ...input_validator import InputValidator +from ....logging.logger_manager import LoggerManager + + +class IdentifyMissingDataInterval(MonitoringBaseInterface, InputValidator): + """ + Detects missing data intervals in a DataFrame by identifying time differences between consecutive + measurements that exceed a specified tolerance or a multiple of the Median Absolute Deviation (MAD). + Logs the start and end times of missing intervals along with their durations. + + + Args: + df (pyspark.sql.Dataframe): DataFrame containing at least the 'EventTime' column. + interval (str, optional): Expected interval between data points (e.g., '10ms', '500ms'). If not specified, the median of time differences is used. + tolerance (str, optional): Tolerance time beyond which an interval is considered missing (e.g., '10ms'). If not specified, it defaults to 'mad_multiplier' times the Median Absolute Deviation (MAD) of time differences. + mad_multiplier (float, optional): Multiplier for MAD to calculate tolerance. Default is 3. + min_tolerance (str, optional): Minimum tolerance for pattern-based detection (e.g., '100ms'). Default is '10ms'. + + Returns: + df (pyspark.sql.Dataframe): Returns the original PySparkDataFrame without changes. + + Example + -------- + ```python + from rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_interval import IdentifyMissingDataInterval + + from pyspark.sql import SparkSession + + missing_data_monitor = IdentifyMissingDataInterval( + df=df, + interval='100ms', + tolerance='10ms', + ) + + df_result = missing_data_monitor.check() + ``` + + """ + + df: PySparkDataFrame + EXPECTED_SCHEMA = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] + ) + + def __init__( + self, + df: PySparkDataFrame, + interval: str = None, + tolerance: str = None, + mad_multiplier: float = 3, + min_tolerance: str = "10ms", + ) -> None: + + self.df = df + self.interval = interval + self.tolerance = tolerance + self.mad_multiplier = mad_multiplier + self.min_tolerance = min_tolerance + self.validate(self.EXPECTED_SCHEMA) + + # Use global pipeline logger + self.logger_manager = LoggerManager() + self.logger = self.logger_manager.create_logger("IdentifyMissingDataInterval") + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def check(self) -> PySparkDataFrame: + """ + Executes the identify missing data logic. + + Returns: + pyspark.sql.DataFrame: + Returns the original PySpark DataFrame without changes. 
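+            A missing interval is logged whenever the gap between consecutive EventTime values exceeds interval + tolerance.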
+ """ + if "EventTime" not in self.df.columns: + self.logger.error("The DataFrame must contain an 'EventTime' column.") + raise ValueError("The DataFrame must contain an 'EventTime' column.") + + df = self.df.withColumn("EventTime", F.to_timestamp("EventTime")) + df_sorted = df.orderBy("EventTime") + # Calculate time difference in milliseconds between consecutive rows + df_with_diff = df_sorted.withColumn( + "TimeDeltaMs", + ( + F.col("EventTime").cast("double") + - F.lag("EventTime").over(Window.orderBy("EventTime")).cast("double") + ) + * 1000, + ).withColumn( + "StartMissing", F.lag("EventTime").over(Window.orderBy("EventTime")) + ) + # Parse interval to milliseconds if given + if self.interval is not None: + try: + interval_ms = parse_time_string_to_ms(self.interval) + self.logger.info(f"Using provided expected interval: {interval_ms} ms") + except ValueError as e: + self.logger.error(e) + raise + else: + # Calculate interval based on median of time differences + median_expr = F.expr("percentile_approx(TimeDeltaMs, 0.5)") + median_row = df_with_diff.select(median_expr.alias("median")).collect()[0] + interval_ms = median_row["median"] + self.logger.info( + f"Using median of time differences as expected interval: {interval_ms} ms" + ) + # Parse tolernace to milliseconds if given + if self.tolerance is not None: + try: + tolerance_ms = parse_time_string_to_ms(self.tolerance) + self.logger.info(f"Using provided tolerance: {tolerance_ms} ms") + except ValueError as e: + self.logger.error(e) + raise + else: + # Calculate tolerance based on MAD + mad_expr = F.expr( + f"percentile_approx(abs(TimeDeltaMs - {interval_ms}), 0.5)" + ) + mad_row = df_with_diff.select(mad_expr.alias("mad")).collect()[0] + mad = mad_row["mad"] + calculated_tolerance_ms = self.mad_multiplier * mad + min_tolerance_ms = parse_time_string_to_ms(self.min_tolerance) + tolerance_ms = max(calculated_tolerance_ms, min_tolerance_ms) + self.logger.info(f"Calculated tolerance: {tolerance_ms} ms (MAD-based)") + # Calculate the maximum acceptable interval with tolerance + max_interval_with_tolerance_ms = interval_ms + tolerance_ms + self.logger.info( + f"Maximum acceptable interval with tolerance: {max_interval_with_tolerance_ms} ms" + ) + + # Identify missing intervals + missing_intervals_df = df_with_diff.filter( + (F.col("TimeDeltaMs") > max_interval_with_tolerance_ms) + & (F.col("StartMissing").isNotNull()) + ).select( + "TagName", + "StartMissing", + F.col("EventTime").alias("EndMissing"), + "TimeDeltaMs", + ) + # Convert time delta to readable format + missing_intervals_df = missing_intervals_df.withColumn( + "DurationMissing", + F.concat( + F.floor(F.col("TimeDeltaMs") / 3600000).cast("string"), + F.lit("h "), + F.floor((F.col("TimeDeltaMs") % 3600000) / 60000).cast("string"), + F.lit("m "), + F.floor(((F.col("TimeDeltaMs") % 3600000) % 60000) / 1000).cast( + "string" + ), + F.lit("s"), + ), + ).select("TagName", "StartMissing", "EndMissing", "DurationMissing") + missing_intervals = missing_intervals_df.collect() + if missing_intervals: + self.logger.info("Detected Missing Intervals:") + for row in missing_intervals: + self.logger.info( + f"Tag: {row['TagName']} Missing Interval from {row['StartMissing']} to {row['EndMissing']} " + f"Duration: {row['DurationMissing']}" + ) + else: + self.logger.info("No missing intervals detected.") + return self.df diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.py 
b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.py new file mode 100644 index 000000000..debb59b1e --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.py @@ -0,0 +1,362 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +import pandas as pd +from pyspark.sql import DataFrame as PySparkDataFrame +from pyspark.sql import functions as F +from pyspark.sql.types import ( + StructType, + StructField, + StringType, + TimestampType, + FloatType, +) + + +from ....logging.logger_manager import LoggerManager +from ...input_validator import InputValidator +from ..interfaces import MonitoringBaseInterface +from ...._pipeline_utils.models import ( + Libraries, + SystemType, +) +from ....utilities.spark.time_string_parsing import parse_time_string_to_ms + + +class IdentifyMissingDataPattern(MonitoringBaseInterface, InputValidator): + """ + Identifies missing data in a DataFrame based on specified time patterns. + Logs the expected missing times. + + Args: + df (pyspark.sql.Dataframe): DataFrame containing at least the 'EventTime' column. + patterns (list of dict): List of dictionaries specifying the time patterns. + - For 'minutely' frequency: Specify 'second' and optionally 'millisecond'. + Example: [{'second': 0}, {'second': 13}, {'second': 49}] + - For 'hourly' frequency: Specify 'minute', 'second', and optionally 'millisecond'. + Example: [{'minute': 0, 'second': 0}, {'minute': 30, 'second': 30}] + frequency (str): Frequency of the patterns. Must be either 'minutely' or 'hourly'. + - 'minutely': Patterns are checked every minute at specified seconds. + - 'hourly': Patterns are checked every hour at specified minutes and seconds. + tolerance (str, optional): Maximum allowed deviation from the pattern (e.g., '1s', '500ms'). + Default is '10ms'. 
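+            An expected time is reported as missing only when no actual EventTime falls within +/- tolerance of it.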
+ + Example: + ```python + from pyspark.sql import SparkSession + + spark = SparkSession.builder.master("local[1]").appName("IdentifyMissingDataPatternExample").getOrCreate() + + patterns = [ + {"second": 0}, + {"second": 20}, + ] + + frequency = "minutely" + tolerance = "1s" + + identify_missing_data = IdentifyMissingDataPattern( + df=df, + patterns=patterns, + frequency=frequency, + tolerance=tolerance, + ) + + identify_missing_data.check() + ``` + + """ + + df: PySparkDataFrame + EXPECTED_SCHEMA = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] + ) + + def __init__( + self, + df: PySparkDataFrame, + patterns: list, + frequency: str = "minutely", + tolerance: str = "10ms", + ) -> None: + + self.df = df + self.patterns = patterns + self.frequency = frequency.lower() + self.tolerance = tolerance + self.validate(self.EXPECTED_SCHEMA) + + # Configure logging + self.logger = LoggerManager().create_logger(self.__class__.__name__) + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def check(self) -> PySparkDataFrame: + """ + Executes the missing pattern detection logic. Identifies and logs any missing patterns + based on the provided patterns and frequency within the specified tolerance. + + Returns: + pyspark.sql.DataFrame: + Returns the original PySpark DataFrame without changes. + """ + self._validate_inputs() + df = self.df.withColumn("EventTime", F.to_timestamp("EventTime")) + df_sorted = df.orderBy("EventTime") + # Determine if the DataFrame is empty + count = df_sorted.count() + if count == 0: + self.logger.info("Generated 0 expected times based on patterns.") + self.logger.info("DataFrame is empty. No missing patterns to detect.") + return self.df + # Determine the time range of the data + min_time, max_time = df_sorted.agg( + F.min("EventTime"), F.max("EventTime") + ).first() + if not min_time or not max_time: + self.logger.info("Generated 0 expected times based on patterns.") + self.logger.info("DataFrame is empty. No missing patterns to detect.") + return self.df + # Generate all expected times based on patterns and frequency + expected_times_df = self._generate_expected_times(min_time, max_time) + # Identify missing patterns by left joining expected times with actual EventTimes within tolerance + missing_patterns_df = self._find_missing_patterns(expected_times_df, df_sorted) + self._log_missing_patterns(missing_patterns_df) + return self.df + + def _validate_inputs(self): + if self.frequency not in ["minutely", "hourly"]: + error_msg = "Frequency must be either 'minutely' or 'hourly'." 
+ self.logger.error(error_msg) + raise ValueError(error_msg) + for pattern in self.patterns: + if self.frequency == "minutely": + self.validate_minutely_pattern(pattern) + elif self.frequency == "hourly": + self.validate_hourly_patterns(pattern) + try: + self.tolerance_ms = parse_time_string_to_ms(self.tolerance) + self.tolerance_seconds = self.tolerance_ms / 1000 + self.logger.info( + f"Using tolerance: {self.tolerance_ms} ms ({self.tolerance_seconds} seconds)" + ) + except ValueError as e: + error_msg = f"Invalid tolerance format: {self.tolerance}" + self.logger.error(error_msg) + raise ValueError(error_msg) from e + + def validate_hourly_patterns(self, pattern): + if "minute" not in pattern or "second" not in pattern: + raise ValueError( + "Each pattern must have 'minute' and 'second' keys for 'hourly' frequency." + ) + if pattern.get("minute", 0) >= 60: + raise ValueError("For 'hourly' frequency, 'minute' must be less than 60.") + if "hour" in pattern: + raise ValueError( + "For 'hourly' frequency, pattern should not contain 'hour'." + ) + + def validate_minutely_pattern(self, pattern): + if "second" not in pattern: + raise ValueError( + "Each pattern must have a 'second' key for 'minutely' frequency." + ) + if pattern.get("second", 0) >= 60: + raise ValueError("For 'minutely' frequency, 'second' must be less than 60.") + if "minute" in pattern or "hour" in pattern: + raise ValueError( + "For 'minutely' frequency, pattern should not contain 'minute' or 'hour'." + ) + + def _generate_expected_times(self, min_time, max_time) -> PySparkDataFrame: + floor_min_time = self._get_floor_min_time(min_time) + ceil_max_time = self._get_ceil_max_time(max_time) + base_times_df = self._create_base_times_df(floor_min_time, ceil_max_time) + expected_times_df = self._apply_patterns( + base_times_df, floor_min_time, max_time + ) + return expected_times_df + + def _get_floor_min_time(self, min_time): + if self.frequency == "minutely": + return min_time.replace(second=0, microsecond=0) + elif self.frequency == "hourly": + return min_time.replace(minute=0, second=0, microsecond=0) + + def _get_ceil_max_time(self, max_time): + if self.frequency == "minutely": + return (max_time + pd.Timedelta(minutes=1)).replace(second=0, microsecond=0) + elif self.frequency == "hourly": + return (max_time + pd.Timedelta(hours=1)).replace( + minute=0, second=0, microsecond=0 + ) + + def _create_base_times_df(self, floor_min_time, ceil_max_time): + step = F.expr(f"INTERVAL 1 {self.frequency.upper()[:-2]}") + return self.df.sparkSession.createDataFrame( + [(floor_min_time, ceil_max_time)], ["start", "end"] + ).select( + F.explode( + F.sequence( + F.col("start").cast("timestamp"), + F.col("end").cast("timestamp"), + step, + ) + ).alias("BaseTime") + ) + + def _apply_patterns(self, base_times_df, floor_min_time, max_time): + expected_times = [] + for pattern in self.patterns: + expected_time = self._calculate_expected_time(base_times_df, pattern) + expected_times.append(expected_time) + expected_times_df = ( + base_times_df.withColumn( + "ExpectedTime", F.explode(F.array(*expected_times)) + ) + .select("ExpectedTime") + .distinct() + .filter( + (F.col("ExpectedTime") >= F.lit(floor_min_time)) + & (F.col("ExpectedTime") <= F.lit(max_time)) + ) + ) + return expected_times_df + + def _calculate_expected_time(self, base_times_df, pattern): + if self.frequency == "minutely": + seconds = pattern.get("second", 0) + milliseconds = pattern.get("millisecond", 0) + return ( + F.col("BaseTime") + + F.expr(f"INTERVAL {seconds} SECOND") + + 
F.expr(f"INTERVAL {milliseconds} MILLISECOND") + ) + elif self.frequency == "hourly": + minutes = pattern.get("minute", 0) + seconds = pattern.get("second", 0) + milliseconds = pattern.get("millisecond", 0) + return ( + F.col("BaseTime") + + F.expr(f"INTERVAL {minutes} MINUTE") + + F.expr(f"INTERVAL {seconds} SECOND") + + F.expr(f"INTERVAL {milliseconds} MILLISECOND") + ) + + def _find_missing_patterns( + self, expected_times_df: PySparkDataFrame, actual_df: PySparkDataFrame + ) -> PySparkDataFrame: + """ + Finds missing patterns by comparing expected times with actual EventTimes within tolerance. + + Args: + expected_times_df (PySparkDataFrame): DataFrame with expected 'ExpectedTime'. + actual_df (PySparkDataFrame): Actual DataFrame with 'EventTime'. + + Returns: + PySparkDataFrame: DataFrame with missing 'ExpectedTime'. + """ + # Format tolerance for SQL INTERVAL + tolerance_str = self._format_timedelta_for_sql(self.tolerance_ms) + # Perform left join with tolerance window + actual_event_time = "at.EventTime" + missing_patterns_df = ( + expected_times_df.alias("et") + .join( + actual_df.alias("at"), + ( + F.col(actual_event_time) + >= F.expr(f"et.ExpectedTime - INTERVAL {tolerance_str}") + ) + & ( + F.col(actual_event_time) + <= F.expr(f"et.ExpectedTime + INTERVAL {tolerance_str}") + ), + how="left", + ) + .filter(F.col(actual_event_time).isNull()) + .select(F.col("et.ExpectedTime")) + ) + self.logger.info(f"Identified {missing_patterns_df.count()} missing patterns.") + return missing_patterns_df + + def _log_missing_patterns(self, missing_patterns_df: PySparkDataFrame): + """ + Logs the missing patterns. + + Args: + missing_patterns_df (PySparkDataFrame): DataFrame with missing 'ExpectedTime'. + """ + missing_patterns = missing_patterns_df.collect() + if missing_patterns: + self.logger.info("Detected Missing Patterns:") + # Sort missing patterns by ExpectedTime + sorted_missing_patterns = sorted( + missing_patterns, key=lambda row: row["ExpectedTime"] + ) + for row in sorted_missing_patterns: + # Format ExpectedTime to include milliseconds correctly + formatted_time = row["ExpectedTime"].strftime("%Y-%m-%d %H:%M:%S.%f")[ + :-3 + ] + self.logger.info(f"Missing Pattern at {formatted_time}") + else: + self.logger.info("No missing patterns detected.") + + @staticmethod + def _format_timedelta_for_sql(tolerance_ms: float) -> str: + """ + Formats a tolerance in milliseconds to a string suitable for SQL INTERVAL. + + Args: + tolerance_ms (float): Tolerance in milliseconds. + + Returns: + str: Formatted string (e.g., '1 SECOND', '500 MILLISECONDS'). + """ + if tolerance_ms >= 3600000: + hours = int(tolerance_ms // 3600000) + return f"{hours} HOURS" + elif tolerance_ms >= 60000: + minutes = int(tolerance_ms // 60000) + return f"{minutes} MINUTES" + elif tolerance_ms >= 1000: + seconds = int(tolerance_ms // 1000) + return f"{seconds} SECONDS" + else: + milliseconds = int(tolerance_ms) + return f"{milliseconds} MILLISECONDS" diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/moving_average.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/moving_average.py new file mode 100644 index 000000000..ac9e096f6 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/moving_average.py @@ -0,0 +1,146 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from pyspark.sql import DataFrame as PySparkDataFrame
+from pyspark.sql.functions import col, avg
+from pyspark.sql.window import Window
+from pyspark.sql.types import (
+    StructType,
+    StructField,
+    StringType,
+    TimestampType,
+    FloatType,
+)
+
+from ..interfaces import MonitoringBaseInterface
+from ...._pipeline_utils.models import (
+    Libraries,
+    SystemType,
+)
+from ...input_validator import InputValidator
+
+
+class MovingAverage(MonitoringBaseInterface, InputValidator):
+    """
+    Computes and logs the moving average over a specified window size for a given PySpark DataFrame.
+
+    Args:
+        df (pyspark.sql.DataFrame): The DataFrame to process.
+        window_size (int): The size of the moving window.
+
+    Example:
+    ```python
+    from pyspark.sql import SparkSession
+    from rtdip_sdk.pipelines.data_quality.monitoring.spark.moving_average import MovingAverage
+
+    spark = SparkSession.builder.master("local[1]").appName("MovingAverageExample").getOrCreate()
+
+    data = [
+        ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", 1.0),
+        ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", 2.0),
+        ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", 3.0),
+        ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", 4.0),
+        ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", 5.0),
+    ]
+
+    columns = ["TagName", "EventTime", "Status", "Value"]
+
+    df = spark.createDataFrame(data, columns)
+
+    moving_avg = MovingAverage(
+        df=df,
+        window_size=3,
+    )
+
+    moving_avg.check()
+    ```
+    """
+
+    df: PySparkDataFrame
+    window_size: int
+    EXPECTED_SCHEMA = StructType(
+        [
+            StructField("TagName", StringType(), True),
+            StructField("EventTime", TimestampType(), True),
+            StructField("Status", StringType(), True),
+            StructField("Value", FloatType(), True),
+        ]
+    )
+
+    def __init__(
+        self,
+        df: PySparkDataFrame,
+        window_size: int,
+    ) -> None:
+        if not isinstance(window_size, int) or window_size <= 0:
+            raise ValueError("window_size must be a positive integer.")
+
+        self.df = df
+        self.validate(self.EXPECTED_SCHEMA)
+        self.window_size = window_size
+
+        self.logger = logging.getLogger(self.__class__.__name__)
+        if not self.logger.handlers:
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter(
+                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+            )
+            handler.setFormatter(formatter)
+            self.logger.addHandler(handler)
+            self.logger.setLevel(logging.INFO)
+
+    @staticmethod
+    def system_type():
+        """
+        Attributes:
+            SystemType (Environment): Requires PYSPARK
+        """
+        return SystemType.PYSPARK
+
+    @staticmethod
+    def libraries():
+        libraries = Libraries()
+        return libraries
+
+    @staticmethod
+    def settings() -> dict:
+        return {}
+
+    def check(self) -> None:
+        """
+        Computes and logs the moving average using a specified window size.
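+        The window covers the current row and the preceding window_size - 1 rows for each TagName, ordered by EventTime; results are logged rather than returned.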
+ """ + + self._validate_inputs() + + window_spec = ( + Window.partitionBy("TagName") + .orderBy("EventTime") + .rowsBetween(-(self.window_size - 1), 0) + ) + + self.logger.info("Computing moving averages:") + + for row in ( + self.df.withColumn("MovingAverage", avg(col("Value")).over(window_spec)) + .select("TagName", "EventTime", "Value", "MovingAverage") + .collect() + ): + self.logger.info( + f"Tag: {row.TagName}, Time: {row.EventTime}, Value: {row.Value}, Moving Avg: {row.MovingAverage}" + ) + + def _validate_inputs(self): + if not isinstance(self.window_size, int) or self.window_size <= 0: + raise ValueError("window_size must be a positive integer.") diff --git a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py index 3fa53a3a2..fb3f2617f 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py +++ b/src/sdk/python/rtdip_sdk/pipelines/deploy/databricks.py @@ -11,16 +11,39 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from dataclasses import dataclass import sys -from typing import Union +from typing import List, Optional, Union from importlib_metadata import PackageNotFoundError, version from importlib.util import module_from_spec, spec_from_file_location from pathlib import Path from io import BytesIO - +from enum import Enum +from typing import Any, Callable, Dict, Iterator, List, Optional from databricks.sdk import WorkspaceClient from databricks.sdk.config import Config -from databricks.sdk.service.jobs import CreateJob, JobSettings +from databricks.sdk.service.jobs import ( + JobSettings, + Continuous, + JobAccessControlRequest, + JobDeployment, + JobEditMode, + JobEmailNotifications, + JobEnvironment, + Format, + GitSource, + JobsHealthRules, + JobCluster, + JobNotificationSettings, + JobParameterDefinition, + PerformanceTarget, + QueueSettings, + JobRunAs, + CronSchedule, + Task, + WebhookNotifications, + TriggerSettings, +) from databricks.sdk.service.compute import Library, PythonPyPiLibrary, MavenLibrary from .interfaces import DeployInterface from ..utilities.pipeline_components import PipelineComponentsGetUtility @@ -30,6 +53,237 @@ __description__: str +@dataclass +class CreateJob: + access_control_list: Optional[List[JobAccessControlRequest]] = None + """List of permissions to set on the job.""" + + budget_policy_id: Optional[str] = None + """The id of the user specified budget policy to use for this job. If not specified, a default + budget policy may be applied when creating or modifying the job. See + `effective_budget_policy_id` for the budget policy used by this workload.""" + + continuous: Optional[Continuous] = None + """An optional continuous property for this job. The continuous property will ensure that there is + always one run executing. Only one of `schedule` and `continuous` can be used.""" + + deployment: Optional[JobDeployment] = None + """Deployment information for jobs managed by external sources.""" + + description: Optional[str] = None + """An optional description for the job. The maximum length is 27700 characters in UTF-8 encoding.""" + + edit_mode: Optional[JobEditMode] = None + """Edit mode of the job. + + * `UI_LOCKED`: The job is in a locked UI state and cannot be modified. 
* `EDITABLE`: The job is + in an editable state and can be modified.""" + + email_notifications: Optional[JobEmailNotifications] = None + """An optional set of email addresses that is notified when runs of this job begin or complete as + well as when this job is deleted.""" + + environments: Optional[List[JobEnvironment]] = None + """A list of task execution environment specifications that can be referenced by serverless tasks + of this job. An environment is required to be present for serverless tasks. For serverless + notebook tasks, the environment is accessible in the notebook environment panel. For other + serverless tasks, the task environment is required to be specified using environment_key in the + task settings.""" + + format: Optional[Format] = None + """Used to tell what is the format of the job. This field is ignored in Create/Update/Reset calls. + When using the Jobs API 2.1 this value is always set to `"MULTI_TASK"`.""" + + git_source: Optional[GitSource] = None + """An optional specification for a remote Git repository containing the source code used by tasks. + Version-controlled source code is supported by notebook, dbt, Python script, and SQL File tasks. + + If `git_source` is set, these tasks retrieve the file from the remote repository by default. + However, this behavior can be overridden by setting `source` to `WORKSPACE` on the task. + + Note: dbt and SQL File tasks support only version-controlled sources. If dbt or SQL File tasks + are used, `git_source` must be defined on the job.""" + + health: Optional[JobsHealthRules] = None + + job_clusters: Optional[List[JobCluster]] = None + """A list of job cluster specifications that can be shared and reused by tasks of this job. + Libraries cannot be declared in a shared job cluster. You must declare dependent libraries in + task settings.""" + + max_concurrent_runs: Optional[int] = None + """An optional maximum allowed number of concurrent runs of the job. Set this value if you want to + be able to execute multiple runs of the same job concurrently. This is useful for example if you + trigger your job on a frequent schedule and want to allow consecutive runs to overlap with each + other, or if you want to trigger multiple runs which differ by their input parameters. This + setting affects only new runs. For example, suppose the job’s concurrency is 4 and there are 4 + concurrent active runs. Then setting the concurrency to 3 won’t kill any of the active runs. + However, from then on, new runs are skipped unless there are fewer than 3 active runs. This + value cannot exceed 1000. Setting this value to `0` causes all new runs to be skipped.""" + + name: Optional[str] = None + """An optional name for the job. The maximum length is 4096 bytes in UTF-8 encoding.""" + + notification_settings: Optional[JobNotificationSettings] = None + """Optional notification settings that are used when sending notifications to each of the + `email_notifications` and `webhook_notifications` for this job.""" + + parameters: Optional[List[JobParameterDefinition]] = None + """Job-level parameter definitions""" + + performance_target: Optional[PerformanceTarget] = None + """The performance mode on a serverless job. This field determines the level of compute performance + or cost-efficiency for the run. + + * `STANDARD`: Enables cost-efficient execution of serverless workloads. 
* + `PERFORMANCE_OPTIMIZED`: Prioritizes fast startup and execution times through rapid scaling and + optimized cluster performance.""" + + queue: Optional[QueueSettings] = None + """The queue settings of the job.""" + + run_as: Optional[JobRunAs] = None + + schedule: Optional[CronSchedule] = None + """An optional periodic schedule for this job. The default behavior is that the job only runs when + triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`.""" + + tags: Optional[Dict[str, str]] = None + """A map of tags associated with the job. These are forwarded to the cluster as cluster tags for + jobs clusters, and are subject to the same limitations as cluster tags. A maximum of 25 tags can + be added to the job.""" + + tasks: Optional[List[Task]] = None + """A list of task specifications to be executed by this job. It supports up to 1000 elements in + write endpoints (:method:jobs/create, :method:jobs/reset, :method:jobs/update, + :method:jobs/submit). Read endpoints return only 100 tasks. If more than 100 tasks are + available, you can paginate through them using :method:jobs/get. Use the `next_page_token` field + at the object root to determine if more results are available.""" + + timeout_seconds: Optional[int] = None + """An optional timeout applied to each run of this job. A value of `0` means no timeout.""" + + trigger: Optional[TriggerSettings] = None + """A configuration to trigger a run when certain conditions are met. The default behavior is that + the job runs only when triggered by clicking “Run Now” in the Jobs UI or sending an API + request to `runNow`.""" + + webhook_notifications: Optional[WebhookNotifications] = None + """A collection of system notification IDs to notify when runs of this job begin or complete.""" + + def as_dict(self) -> dict: # pragma: no cover + """Serializes the CreateJob into a dictionary suitable for use as a JSON request body.""" + body = {} + if self.access_control_list: + body["access_control_list"] = [ + v.as_dict() for v in self.access_control_list + ] + if self.budget_policy_id is not None: + body["budget_policy_id"] = self.budget_policy_id + if self.continuous: + body["continuous"] = self.continuous.as_dict() + if self.deployment: + body["deployment"] = self.deployment.as_dict() + if self.description is not None: + body["description"] = self.description + if self.edit_mode is not None: + body["edit_mode"] = self.edit_mode.value + if self.email_notifications: + body["email_notifications"] = self.email_notifications.as_dict() + if self.environments: + body["environments"] = [v.as_dict() for v in self.environments] + if self.format is not None: + body["format"] = self.format.value + if self.git_source: + body["git_source"] = self.git_source.as_dict() + if self.health: + body["health"] = self.health.as_dict() + if self.job_clusters: + body["job_clusters"] = [v.as_dict() for v in self.job_clusters] + if self.max_concurrent_runs is not None: + body["max_concurrent_runs"] = self.max_concurrent_runs + if self.name is not None: + body["name"] = self.name + if self.notification_settings: + body["notification_settings"] = self.notification_settings.as_dict() + if self.parameters: + body["parameters"] = [v.as_dict() for v in self.parameters] + if self.performance_target is not None: + body["performance_target"] = self.performance_target.value + if self.queue: + body["queue"] = self.queue.as_dict() + if self.run_as: + body["run_as"] = self.run_as.as_dict() + if self.schedule: + body["schedule"] = self.schedule.as_dict() + if 
self.tags: + body["tags"] = self.tags + if self.tasks: + body["tasks"] = [v.as_dict() for v in self.tasks] + if self.timeout_seconds is not None: + body["timeout_seconds"] = self.timeout_seconds + if self.trigger: + body["trigger"] = self.trigger.as_dict() + if self.webhook_notifications: + body["webhook_notifications"] = self.webhook_notifications.as_dict() + return body + + def as_shallow_dict(self) -> dict: # pragma: no cover + """Serializes the CreateJob into a shallow dictionary of its immediate attributes.""" + body = {} + if self.access_control_list: + body["access_control_list"] = self.access_control_list + if self.budget_policy_id is not None: + body["budget_policy_id"] = self.budget_policy_id + if self.continuous: + body["continuous"] = self.continuous + if self.deployment: + body["deployment"] = self.deployment + if self.description is not None: + body["description"] = self.description + if self.edit_mode is not None: + body["edit_mode"] = self.edit_mode + if self.email_notifications: + body["email_notifications"] = self.email_notifications + if self.environments: + body["environments"] = self.environments + if self.format is not None: + body["format"] = self.format + if self.git_source: + body["git_source"] = self.git_source + if self.health: + body["health"] = self.health + if self.job_clusters: + body["job_clusters"] = self.job_clusters + if self.max_concurrent_runs is not None: + body["max_concurrent_runs"] = self.max_concurrent_runs + if self.name is not None: + body["name"] = self.name + if self.notification_settings: + body["notification_settings"] = self.notification_settings + if self.parameters: + body["parameters"] = self.parameters + if self.performance_target is not None: + body["performance_target"] = self.performance_target + if self.queue: + body["queue"] = self.queue + if self.run_as: + body["run_as"] = self.run_as + if self.schedule: + body["schedule"] = self.schedule + if self.tags: + body["tags"] = self.tags + if self.tasks: + body["tasks"] = self.tasks + if self.timeout_seconds is not None: + body["timeout_seconds"] = self.timeout_seconds + if self.trigger: + body["trigger"] = self.trigger + if self.webhook_notifications: + body["webhook_notifications"] = self.webhook_notifications + return body + + class DatabricksSDKDeploy(DeployInterface): """ Deploys an RTDIP Pipeline to Databricks Workflows leveraging the Databricks [SDK.](https://docs.databricks.com/dev-tools/sdk-python.html) @@ -72,7 +326,6 @@ class DatabricksSDKDeploy(DeployInterface): notebook_path="/path/to/pipeline/rtdip_pipeline.py" ) )) - job = CreateJob( name="test_job_rtdip", job_clusters=cluster_list, @@ -109,11 +362,11 @@ def __init__( self.token = token self.workspace_directory = workspace_directory - def _convert_file_to_binary(self, path) -> BytesIO: + def _convert_file_to_binary(self, path) -> BytesIO: # pragma: no cover with open(path, "rb") as f: return BytesIO(f.read()) - def _load_module(self, module_name, path): + def _load_module(self, module_name, path): # pragma: no cover spec = spec_from_file_location(module_name, path) module = module_from_spec(spec) spec.loader.exec_module(module) @@ -133,7 +386,7 @@ def deploy(self) -> Union[bool, ValueError]: auth_type="pat", ) ) - for task in self.databricks_job.tasks: + for task in self.databricks_job.tasks: # pragma: no cover if task.notebook_task is None and task.spark_python_task is None: return ValueError( "A Notebook or Spark Python Task must be populated for each task in the Databricks Job" diff --git 
a/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py b/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py index 97ceec5ff..d69b1f0a6 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py +++ b/src/sdk/python/rtdip_sdk/pipelines/destinations/spark/pcdm_to_delta.py @@ -61,7 +61,7 @@ class SparkPCDMToDeltaDestination(DestinationInterface): merge=True, try_broadcast_join=False, remove_nanoseconds=False, - remove_duplicates-True + remove_duplicates=True ) pcdm_to_delta_destination.write_stream() @@ -86,7 +86,7 @@ class SparkPCDMToDeltaDestination(DestinationInterface): merge=True, try_broadcast_join=False, remove_nanoseconds=False, - remove_duplicates-True + remove_duplicates=True ) pcdm_to_delta_destination.write_batch() @@ -105,7 +105,7 @@ class SparkPCDMToDeltaDestination(DestinationInterface): merge (bool): Use Delta Merge to perform inserts, updates and deletes try_broadcast_join (bool): Attempts to perform a broadcast join in the merge which can leverage data skipping using partition pruning and file pruning automatically. Can fail if dataframe being merged is large and therefore more suitable for streaming merges than batch merges remove_nanoseconds (bool): Removes nanoseconds from the EventTime column and replaces with zeros - remove_duplicates (bool: Removes duplicates before writing the data + remove_duplicates (bool): Removes duplicates before writing the data Attributes: checkpointLocation (str): Path to checkpoint files. (Streaming) diff --git a/src/sdk/python/rtdip_sdk/pipelines/forecasting/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/forecasting/__init__.py new file mode 100644 index 000000000..76bb6a388 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/forecasting/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .spark import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/forecasting/interfaces.py b/src/sdk/python/rtdip_sdk/pipelines/forecasting/interfaces.py new file mode 100644 index 000000000..f79a36232 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/forecasting/interfaces.py @@ -0,0 +1,32 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from abc import abstractmethod + +from great_expectations.compatibility.pyspark import DataFrame + +from ..interfaces import PipelineComponentBaseInterface + + +class MachineLearningInterface(PipelineComponentBaseInterface): + @abstractmethod + def __init__(self): + pass + + @abstractmethod + def train(self, train_df: DataFrame): + return self + + @abstractmethod + def predict(self, predict_df: DataFrame, *args, **kwargs) -> DataFrame: + pass diff --git a/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/__init__.py new file mode 100644 index 000000000..e2ca763d4 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .data_binning import DataBinning +from .linear_regression import LinearRegression +from .arima import ArimaPrediction +from .auto_arima import ArimaAutoPrediction +from .k_nearest_neighbors import KNearestNeighbors diff --git a/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/arima.py b/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/arima.py new file mode 100644 index 000000000..f92f00135 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/arima.py @@ -0,0 +1,446 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import statistics +from enum import Enum +from typing import List, Tuple + +import pandas as pd +from pandas import DataFrame +from pyspark.sql import ( + DataFrame as PySparkDataFrame, + SparkSession, + functions as F, + DataFrame as SparkDataFrame, +) +from pyspark.sql.functions import col, lit +from pyspark.sql.types import StringType, StructField, StructType +from regex import regex +from statsmodels.tsa.arima.model import ARIMA +import numpy as np + +from ...data_quality.data_manipulation.interfaces import DataManipulationBaseInterface +from ...data_quality.input_validator import InputValidator +from ...._sdk_utils.pandas import _prepare_pandas_to_convert_to_spark +from ..._pipeline_utils.models import ( + Libraries, + SystemType, +) + + +class ArimaPrediction(DataManipulationBaseInterface, InputValidator): + """ + Extends the timeseries data in given DataFrame with forecasted values from an ARIMA model. + It forecasts a value column of the given time series dataframe based on the historical data points and constructs + full entries based on the preceding timestamps. 
It is advised to place this step after the missing value imputation + to prevent learning on dirty data. + + It supports dataframes in a source-based format (where each row is an event by a single sensor) and column-based format (where each row is a point in time). + + The similar component AutoArimaPrediction wraps around this component and needs less manual parameters set. + + ARIMA-Specific parameters can be viewed at the following statsmodels documentation page: + [ARIMA Documentation](https://www.statsmodels.org/dev/generated/statsmodels.tsa.arima.model.ARIMA.html) + + Example + ------- + ```python + import numpy as np + import matplotlib.pyplot as plt + import numpy.random + import pandas + from pyspark.sql import SparkSession + + from rtdip_sdk.pipelines.forecasting.spark.arima import ArimaPrediction + + import rtdip_sdk.pipelines._pipeline_utils.spark as spark_utils + + spark_session = SparkSession.builder.master("local[2]").appName("test").getOrCreate() + df = pandas.DataFrame() + + numpy.random.seed(0) + arr_len = 250 + h_a_l = int(arr_len / 2) + df['Value'] = np.random.rand(arr_len) + np.sin(np.linspace(0, arr_len / 10, num=arr_len)) + df['Value2'] = np.random.rand(arr_len) + np.cos(np.linspace(0, arr_len / 2, num=arr_len)) + 5 + df['index'] = np.asarray(pandas.date_range(start='1/1/2024', end='2/1/2024', periods=arr_len)) + df = df.set_index(pandas.DatetimeIndex(df['index'])) + + learn_df = df.head(h_a_l) + + # plt.plot(df['Value']) + # plt.show() + + input_df = spark_session.createDataFrame( + learn_df, + ['Value', 'Value2', 'index'], + ) + arima_comp = ArimaPrediction(input_df, to_extend_name='Value', number_of_data_points_to_analyze=h_a_l, number_of_data_points_to_predict=h_a_l, + order=(3,0,0), seasonal_order=(3,0,0,62)) + forecasted_df = arima_comp.filter_data().toPandas() + print('Done') + ``` + + Parameters: + past_data (PySparkDataFrame): PySpark DataFrame which contains training data + to_extend_name (str): Column or source to forecast on + past_data_style (InputStyle): In which format is past_data formatted + value_name (str): Name of column in source-based format, where values are stored + timestamp_name (str): Name of column, where event timestamps are stored + source_name (str): Name of column in source-based format, where source of events are stored + status_name (str): Name of column in source-based format, where status of events are stored + external_regressor_names (List[str]): Currently not working. Names of the columns with data to use for prediction, but not extend + number_of_data_points_to_predict (int): Amount of points to forecast + number_of_data_points_to_analyze (int): Amount of most recent points to train on + order (tuple): ARIMA-Specific setting + seasonal_order (tuple): ARIMA-Specific setting + trend (str): ARIMA-Specific setting + enforce_stationarity (bool): ARIMA-Specific setting + enforce_invertibility (bool): ARIMA-Specific setting + concentrate_scale (bool): ARIMA-Specific setting + trend_offset (int): ARIMA-Specific setting + missing (str): ARIMA-Specific setting + """ + + df: PySparkDataFrame = None + pd_df: DataFrame = None + spark_session: SparkSession + + column_to_predict: str + rows_to_predict: int + rows_to_analyze: int + + value_name: str + timestamp_name: str + source_name: str + external_regressor_names: List[str] + + class InputStyle(Enum): + """ + Used to describe style of a dataframe + """ + + COLUMN_BASED = 1 # Schema: [EventTime, FirstSource, SecondSource, ...] 
+ SOURCE_BASED = 2 # Schema: [EventTime, NameSource, Value, OptionalStatus] + + def __init__( + self, + past_data: PySparkDataFrame, + to_extend_name: str, # either source or column + # Metadata about past_date + past_data_style: InputStyle = None, + value_name: str = None, + timestamp_name: str = None, + source_name: str = None, + status_name: str = None, + # Options for ARIMA + external_regressor_names: List[str] = None, + number_of_data_points_to_predict: int = 50, + number_of_data_points_to_analyze: int = None, + order: tuple = (0, 0, 0), + seasonal_order: tuple = (0, 0, 0, 0), + trend=None, + enforce_stationarity: bool = True, + enforce_invertibility: bool = True, + concentrate_scale: bool = False, + trend_offset: int = 1, + missing: str = "None", + ) -> None: + self.past_data = past_data + # Convert dataframe to general column-based format for internal processing + self._initialize_self_df( + past_data, + past_data_style, + source_name, + status_name, + timestamp_name, + to_extend_name, + value_name, + ) + + if number_of_data_points_to_analyze > self.df.count(): + raise ValueError( + "Number of data points to analyze exceeds the number of rows present" + ) + + self.spark_session = past_data.sparkSession + self.column_to_predict = to_extend_name + self.rows_to_predict = number_of_data_points_to_predict + self.rows_to_analyze = number_of_data_points_to_analyze or past_data.count() + self.order = order + self.seasonal_order = seasonal_order + self.trend = trend + self.enforce_stationarity = enforce_stationarity + self.enforce_invertibility = enforce_invertibility + self.concentrate_scale = concentrate_scale + self.trend_offset = trend_offset + self.missing = missing + self.external_regressor_names = external_regressor_names + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + @staticmethod + def _is_column_type(df, column_name, data_type): + """ + Helper method for data type checking + """ + type_ = df.schema[column_name] + + return isinstance(type_.dataType, data_type) + + def _initialize_self_df( + self, + past_data, + past_data_style, + source_name, + status_name, + timestamp_name, + to_extend_name, + value_name, + ): + # Initialize self.df with meta parameters if not already done by previous constructor + if self.df is None: + ( + self.past_data_style, + self.value_name, + self.timestamp_name, + self.source_name, + self.status_name, + ) = self._constructor_handle_input_metadata( + past_data, + past_data_style, + value_name, + timestamp_name, + source_name, + status_name, + ) + + if self.past_data_style == self.InputStyle.COLUMN_BASED: + self.df = past_data + elif self.past_data_style == self.InputStyle.SOURCE_BASED: + self.df = ( + past_data.groupby(self.timestamp_name) + .pivot(self.source_name) + .agg(F.first(self.value_name)) + ) + if not to_extend_name in self.df.columns: + raise ValueError("{} not found in the DataFrame.".format(to_extend_name)) + + def _constructor_handle_input_metadata( + self, + past_data: PySparkDataFrame, + past_data_style: InputStyle, + value_name: str, + timestamp_name: str, + source_name: str, + status_name: str, + ) -> Tuple[InputStyle, str, str, str, str]: + # Infer names of columns from past_data schema. If nothing is found, leave self parameters at None. 
+ if past_data_style is not None: + return past_data_style, value_name, timestamp_name, source_name, status_name + # Automatic calculation part + schema_names = past_data.schema.names.copy() + + assumed_past_data_style = None + value_name = None + timestamp_name = None + source_name = None + status_name = None + + def pickout_column( + rem_columns: List[str], regex_string: str + ) -> (str, List[str]): + rgx = regex.compile(regex_string) + sus_columns = list(filter(rgx.search, rem_columns)) + found_column = sus_columns[0] if len(sus_columns) == 1 else None + return found_column + + # Is there a status column? + status_name = pickout_column(schema_names, r"(?i)status") + # Is there a source name / tag + source_name = pickout_column(schema_names, r"(?i)tag") + # Is there a timestamp column? + timestamp_name = pickout_column(schema_names, r"(?i)time|index") + # Is there a value column? + value_name = pickout_column(schema_names, r"(?i)value") + + if source_name is not None: + assumed_past_data_style = self.InputStyle.SOURCE_BASED + else: + assumed_past_data_style = self.InputStyle.COLUMN_BASED + + # if self.past_data_style is None: + # raise ValueError( + # "Automatic determination of past_data_style failed, must be specified in parameter instead.") + return ( + assumed_past_data_style, + value_name, + timestamp_name, + source_name, + status_name, + ) + + def filter_data(self) -> PySparkDataFrame: + """ + Forecasts a value column of a given time series dataframe based on the historical data points using ARIMA. + + Constructs full entries based on the preceding timestamps. It is advised to place this step after the missing + value imputation to prevent learning on dirty data. + + Returns: + DataFrame: A PySpark DataFrame with forecasted value entries depending on constructor parameters. 
+ """ + # expected_scheme = StructType( + # [ + # StructField("TagName", StringType(), True), + # StructField("EventTime", TimestampType(), True), + # StructField("Status", StringType(), True), + # StructField("Value", NumericType(), True), + # ] + # ) + pd_df = self.df.toPandas() + pd_df.loc[:, self.timestamp_name] = pd.to_datetime( + pd_df[self.timestamp_name], format="mixed" + ).astype("datetime64[ns]") + pd_df.loc[:, self.column_to_predict] = pd_df.loc[ + :, self.column_to_predict + ].astype(float) + pd_df.sort_values(self.timestamp_name, inplace=True) + pd_df.reset_index(drop=True, inplace=True) + # self.validate(expected_scheme) + + # limit df to specific data points + pd_to_train_on = pd_df[pd_df[self.column_to_predict].notna()].tail( + self.rows_to_analyze + ) + pd_to_predict_on = pd_df[pd_df[self.column_to_predict].isna()].head( + self.rows_to_predict + ) + pd_df = pd.concat([pd_to_train_on, pd_to_predict_on]) + + main_signal_df = pd_df[pd_df[self.column_to_predict].notna()] + + input_data = main_signal_df[self.column_to_predict].astype(float) + exog_data = None + # if self.external_regressor_names is not None: + # exog_data = [] + # for column_name in self.external_regressor_names: + # signal_df = pd.concat([pd_to_train_on[column_name], pd_to_predict_on[column_name]]) + # exog_data.append(signal_df) + + source_model = ARIMA( + endog=input_data, + exog=exog_data, + order=self.order, + seasonal_order=self.seasonal_order, + trend=self.trend, + enforce_stationarity=self.enforce_stationarity, + enforce_invertibility=self.enforce_invertibility, + concentrate_scale=self.concentrate_scale, + trend_offset=self.trend_offset, + missing=self.missing, + ).fit() + + forecast = source_model.forecast(steps=self.rows_to_predict) + inferred_freq = pd.Timedelta( + value=statistics.mode(np.diff(main_signal_df[self.timestamp_name].values)) + ) + + pd_forecast_df = pd.DataFrame( + { + self.timestamp_name: pd.date_range( + start=main_signal_df[self.timestamp_name].max() + inferred_freq, + periods=self.rows_to_predict, + freq=inferred_freq, + ), + self.column_to_predict: forecast, + } + ) + + pd_df = pd.concat([pd_df, pd_forecast_df]) + + if self.past_data_style == self.InputStyle.COLUMN_BASED: + for obj in self.past_data.schema: + simple_string_type = obj.dataType.simpleString() + if simple_string_type == "timestamp": + continue + pd_df.loc[:, obj.name] = pd_df.loc[:, obj.name].astype( + simple_string_type + ) + # Workaround needed for PySpark versions <3.4 + pd_df = _prepare_pandas_to_convert_to_spark(pd_df) + predicted_source_pyspark_dataframe = self.spark_session.createDataFrame( + pd_df, schema=copy.deepcopy(self.past_data.schema) + ) + return predicted_source_pyspark_dataframe + elif self.past_data_style == self.InputStyle.SOURCE_BASED: + data_to_add = pd_forecast_df[[self.column_to_predict, self.timestamp_name]] + data_to_add = data_to_add.rename( + columns={ + self.timestamp_name: self.timestamp_name, + self.column_to_predict: self.value_name, + } + ) + data_to_add[self.source_name] = self.column_to_predict + data_to_add[self.timestamp_name] = data_to_add[ + self.timestamp_name + ].dt.strftime("%Y-%m-%dT%H:%M:%S.%f") + + pd_df_schema = StructType( + [ + StructField(self.source_name, StringType(), True), + StructField(self.timestamp_name, StringType(), True), + StructField(self.value_name, StringType(), True), + ] + ) + + # Workaround needed for PySpark versions <3.4 + data_to_add = _prepare_pandas_to_convert_to_spark(data_to_add) + + predicted_source_pyspark_dataframe = 
self.spark_session.createDataFrame( + _prepare_pandas_to_convert_to_spark( + data_to_add[ + [self.source_name, self.timestamp_name, self.value_name] + ] + ), + schema=pd_df_schema, + ) + + if self.status_name is not None: + predicted_source_pyspark_dataframe = ( + predicted_source_pyspark_dataframe.withColumn( + self.status_name, lit("Predicted") + ) + ) + + to_return = self.past_data.unionByName(predicted_source_pyspark_dataframe) + return to_return + + def validate(self, schema_dict, df: SparkDataFrame = None): + return super().validate(schema_dict, self.past_data) diff --git a/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/auto_arima.py b/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/auto_arima.py new file mode 100644 index 000000000..a47ff7a77 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/auto_arima.py @@ -0,0 +1,151 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import statistics +from typing import List, Tuple + +from pyspark.sql import DataFrame as PySparkDataFrame, SparkSession, functions as F +from pmdarima import auto_arima + +from .arima import ArimaPrediction + + +class ArimaAutoPrediction(ArimaPrediction): + """ + A wrapper for ArimaPrediction which uses pmdarima auto_arima for data prediction. + It selectively tries various sets of p and q (also P and Q for seasonal models) parameters and selects the model with the minimal AIC. 
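Before the full wrapper example below, a minimal standalone sketch of the stepwise selection described above, using `pmdarima` directly on a synthetic series; the selected `order` and `seasonal_order` are what `ArimaAutoPrediction` forwards to the parent `ArimaPrediction`. The series and seed here are made up for illustration.

```python
# Hedged sketch: pmdarima's stepwise, AIC-based order selection that
# ArimaAutoPrediction relies on. The input series is synthetic.
import numpy as np
import pandas as pd
from pmdarima import auto_arima

rng = np.random.default_rng(0)
y = pd.Series(np.sin(np.linspace(0, 25, 250)) + rng.normal(scale=0.3, size=250))

model = auto_arima(
    y,
    seasonal=False,         # matches the component's default seasonal=False
    stepwise=True,
    suppress_warnings=True,
    error_action="ignore",
)

# These orders are passed on to the statsmodels ARIMA fit in ArimaPrediction.
print(model.order, model.seasonal_order)
```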
+ + Example + ------- + ```python + import numpy as np + import matplotlib.pyplot as plt + import numpy.random + import pandas + from pyspark.sql import SparkSession + + from rtdip_sdk.pipelines.data_quality.forecasting.spark.arima import ArimaPrediction + + import rtdip_sdk.pipelines._pipeline_utils.spark as spark_utils + from rtdip_sdk.pipelines.data_quality.forecasting.spark.auto_arima import ArimaAutoPrediction + + spark_session = SparkSession.builder.master("local[2]").appName("test").getOrCreate() + df = pandas.DataFrame() + + numpy.random.seed(0) + arr_len = 250 + h_a_l = int(arr_len / 2) + df['Value'] = np.random.rand(arr_len) + np.sin(np.linspace(0, arr_len / 10, num=arr_len)) + df['Value2'] = np.random.rand(arr_len) + np.cos(np.linspace(0, arr_len / 2, num=arr_len)) + 5 + df['index'] = np.asarray(pandas.date_range(start='1/1/2024', end='2/1/2024', periods=arr_len)) + df = df.set_index(pandas.DatetimeIndex(df['index'])) + + learn_df = df.head(h_a_l) + + # plt.plot(df['Value']) + # plt.show() + + input_df = spark_session.createDataFrame( + learn_df, + ['Value', 'Value2', 'index'], + ) + arima_comp = ArimaAutoPrediction(input_df, to_extend_name='Value', number_of_data_points_to_analyze=h_a_l, number_of_data_points_to_predict=h_a_l, + seasonal=True) + forecasted_df = arima_comp.filter_data().toPandas() + print('Done') + ``` + + Parameters: + past_data (PySparkDataFrame): PySpark DataFrame which contains training data + to_extend_name (str): Column or source to forecast on + past_data_style (InputStyle): In which format is past_data formatted + value_name (str): Name of column in source-based format, where values are stored + timestamp_name (str): Name of column, where event timestamps are stored + source_name (str): Name of column in source-based format, where source of events are stored + status_name (str): Name of column in source-based format, where status of events are stored + external_regressor_names (List[str]): Currently not working. Names of the columns with data to use for prediction, but not extend + number_of_data_points_to_predict (int): Amount of points to forecast + number_of_data_points_to_analyze (int): Amount of most recent points to train on + seasonal (bool): Setting for AutoArima, is past_data seasonal? 
+ enforce_stationarity (bool): ARIMA-Specific setting + enforce_invertibility (bool): ARIMA-Specific setting + concentrate_scale (bool): ARIMA-Specific setting + trend_offset (int): ARIMA-Specific setting + missing (str): ARIMA-Specific setting + """ + + def __init__( + self, + past_data: PySparkDataFrame, + past_data_style: ArimaPrediction.InputStyle = None, + to_extend_name: str = None, + value_name: str = None, + timestamp_name: str = None, + source_name: str = None, + status_name: str = None, + external_regressor_names: List[str] = None, + number_of_data_points_to_predict: int = 50, + number_of_data_points_to_analyze: int = None, + seasonal: bool = False, + enforce_stationarity: bool = True, + enforce_invertibility: bool = True, + concentrate_scale: bool = False, + trend_offset: int = 1, + missing: str = "None", + ) -> None: + # Convert source-based dataframe to column-based if necessary + self._initialize_self_df( + past_data, + past_data_style, + source_name, + status_name, + timestamp_name, + to_extend_name, + value_name, + ) + # Prepare Input data + input_data = self.df.toPandas() + input_data = input_data[input_data[to_extend_name].notna()].tail( + number_of_data_points_to_analyze + )[to_extend_name] + + auto_model = auto_arima( + y=input_data, + seasonal=seasonal, + stepwise=True, + suppress_warnings=True, + trace=False, # Set to true if to debug + error_action="ignore", + max_order=None, + ) + + super().__init__( + past_data=past_data, + past_data_style=self.past_data_style, + to_extend_name=to_extend_name, + value_name=self.value_name, + timestamp_name=self.timestamp_name, + source_name=self.source_name, + status_name=self.status_name, + external_regressor_names=external_regressor_names, + number_of_data_points_to_predict=number_of_data_points_to_predict, + number_of_data_points_to_analyze=number_of_data_points_to_analyze, + order=auto_model.order, + seasonal_order=auto_model.seasonal_order, + trend="c" if auto_model.order[1] == 0 else "t", + enforce_stationarity=enforce_stationarity, + enforce_invertibility=enforce_invertibility, + concentrate_scale=concentrate_scale, + trend_offset=trend_offset, + missing=missing, + ) diff --git a/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/data_binning.py b/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/data_binning.py new file mode 100644 index 000000000..7138c547f --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/data_binning.py @@ -0,0 +1,91 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pyspark.ml.clustering as clustering +from pyspark.sql import DataFrame +from ..interfaces import MachineLearningInterface +from ..._pipeline_utils.models import Libraries, SystemType + + +class DataBinning(MachineLearningInterface): + """ + Data binning using clustering methods. This method partitions the data points into a specified number of clusters (bins) + based on the specified column. Each data point is assigned to the nearest cluster center. 
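The example below leaves the construction of the vector `features` column as `df = ...`; a minimal sketch of producing such a column with `pyspark.ml.feature.VectorAssembler` is given here. The `Value` column and sample data are hypothetical.

```python
# Hedged sketch: building the vector "features" column that DataBinning's
# KMeans-based binning expects. "Value" is a hypothetical numeric column.
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.master("local[2]").appName("binning-prep").getOrCreate()
df = spark.createDataFrame([(1.0,), (2.5,), (10.0,), (11.2,)], ["Value"])

assembler = VectorAssembler(inputCols=["Value"], outputCol="features")
features_df = assembler.transform(df)  # adds the "features" vector column
features_df.show(truncate=False)
```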
+ + Example + -------- + ```python + from src.sdk.python.rtdip_sdk.pipelines.forecasting.spark.data_binning import DataBinning + + df = ... # Get a PySpark DataFrame with features column + + binning = DataBinning( + column_name="features", + bins=3, + output_column_name="bin", + method="kmeans" + ) + binned_df = binning.train(df).predict(df) + binned_df.show() + ``` + + Parameters: + column_name (str): The name of the input column to be binned (default: "features"). + bins (int): The number of bins/clusters to create (default: 2). + output_column_name (str): The name of the output column containing bin assignments (default: "bin"). + method (str): The binning method to use. Currently only supports "kmeans". + """ + + def __init__( + self, + column_name: str = "features", + bins: int = 2, + output_column_name: str = "bin", + method: str = "kmeans", + ) -> None: + self.column_name = column_name + + if method == "kmeans": + self.method = clustering.KMeans( + featuresCol=column_name, predictionCol=output_column_name, k=bins + ) + else: + raise ValueError("Unknown method: {}".format(method)) + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def train(self, train_df): + """ + Filter anomalies based on the k-sigma rule + """ + self.model = self.method.fit(train_df) + return self + + def predict(self, predict_df): + return self.model.transform(predict_df) diff --git a/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/k_nearest_neighbors.py b/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/k_nearest_neighbors.py new file mode 100644 index 000000000..da4a7cd86 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/k_nearest_neighbors.py @@ -0,0 +1,205 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pyspark.sql import DataFrame +from pyspark.sql.functions import col, udf +from pyspark.sql.types import DoubleType +from ..interfaces import MachineLearningInterface +from ..._pipeline_utils.models import Libraries, SystemType +import numpy as np + + +class KNearestNeighbors(MachineLearningInterface): + """ + Implements the K-Nearest Neighbors (KNN) algorithm to predict missing values in a dataset. + This component is compatible with time series data and supports customizable weighted or unweighted averaging for predictions. 
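Before the full usage example below, a NumPy-only sketch of the inverse-distance vote that `predict` applies when `weighted=True`: nearer neighbours get larger weights, and the label with the largest summed weight wins. The distances and labels here are made up.

```python
# Hedged sketch of the weighted vote used by KNearestNeighbors when weighted=True.
import numpy as np

k_distances = np.array([0.1, 0.4, 0.5])   # distances of the k nearest neighbours
k_labels = np.array([1.0, 0.0, 1.0])      # their labels

weights = 1 / (k_distances + 1e-10)       # inverse-distance weighting
weights /= weights.sum()

votes = {label: weights[k_labels == label].sum() for label in np.unique(k_labels)}
prediction = float(max(votes.items(), key=lambda x: x[1])[0])
print(prediction)  # 1.0: the closest and third-closest neighbours outweigh the second
```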
+ + Example: + ```python + from pyspark.ml.feature import StandardScaler, VectorAssembler + from pyspark.sql import SparkSession + from src.sdk.python.rtdip_sdk.pipelines.forecasting.spark.k_nearest_neighbors import KNearestNeighbors + spark_session = SparkSession.builder.master("local[2]").appName("KNN").getOrCreate() + data = [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", 25.0), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", -5.0), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", 50.0), + ("B3TS64V0K.:ZUX09R", "2024-01-02 16:00:12.000", "Good", 80.0), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", 100.0), + ] + columns = ["TagName", "EventTime", "Status", "Value"] + raw_df = = spark.createDataFrame(data, columns) + assembler = VectorAssembler(inputCols=["feature1", "feature2"], outputCol="assembled_features") + df = assembler.transform(raw_df) + scaler = StandardScaler(inputCol="assembled_features", outputCol="features", withStd=True, withMean=True) + scaled_df = scaler.fit(df).transform(df) + knn = KNearestNeighbors( + features_col="features", + label_col="label", + timestamp_col="timestamp", + k=3, + weighted=True, + distance_metric="combined", # Options: "euclidean", "temporal", "combined" + temporal_weight=0.3 # Weight for temporal distance when using combined metric + ) + train_df, test_df = knn.randomSplit([0.8, 0.2], seed=42) + knn.train(train_df) + predictions = knn.predict(test_df) + ``` + + Parameters: + features_col (str): Name of the column containing the features (the input). Default is 'features' + label_col (str): Name of the column containing the label (the input). Default is 'label' + timestamp_col (str, optional): Name of the column containing timestamps + k (int): The number of neighbors to consider in the KNN algorithm. Default is 3 + weighted (bool): Whether to use weighted averaging based on distance. Default is False (unweighted averaging) + distance_metric (str): Type of distance calculation ("euclidean", "temporal", or "combined") + temporal_weight (float): Weight for temporal distance in combined metric (0 to 1) + """ + + def __init__( + self, + features_col, + label_col, + timestamp_col=None, + k=3, + weighted=False, + distance_metric="euclidean", + temporal_weight=0.5, + ): + self.features_col = features_col + self.label_col = label_col + self.timestamp_col = timestamp_col + self.k = k + self.weighted = weighted + self.distance_metric = distance_metric + self.temporal_weight = temporal_weight + self.train_features = None + self.train_labels = None + self.train_timestamps = None + + if distance_metric not in ["euclidean", "temporal", "combined"]: + raise ValueError( + "distance_metric must be 'euclidean', 'temporal', or 'combined'" + ) + + if distance_metric in ["temporal", "combined"] and timestamp_col is None: + raise ValueError( + "timestamp_col must be provided when using temporal or combined distance metrics" + ) + + @staticmethod + def system_type(): + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def train(self, train_df: DataFrame): + """ + Sets up the training DataFrame including temporal information if specified. 
+ """ + if self.timestamp_col: + df = train_df.select( + self.features_col, self.label_col, self.timestamp_col + ).collect() + self.train_timestamps = np.array( + [row[self.timestamp_col].timestamp() for row in df] + ) + else: + df = train_df.select(self.features_col, self.label_col).collect() + + self.train_features = np.array([row[self.features_col] for row in df]) + self.train_labels = np.array([row[self.label_col] for row in df]) + return self + + def predict(self, test_df: DataFrame) -> DataFrame: + """ + Predicts labels using the specified distance metric. + """ + train_features = self.train_features + train_labels = self.train_labels + train_timestamps = self.train_timestamps + k = self.k + weighted = self.weighted + distance_metric = self.distance_metric + temporal_weight = self.temporal_weight + + def calculate_distances(features, timestamp=None): + test_point = np.array(features) + + if distance_metric == "euclidean": + return np.sqrt(np.sum((train_features - test_point) ** 2, axis=1)) + + elif distance_metric == "temporal": + return np.abs(train_timestamps - timestamp) + + else: # combined + feature_distances = np.sqrt( + np.sum((train_features - test_point) ** 2, axis=1) + ) + temporal_distances = np.abs(train_timestamps - timestamp) + + # Normalize distances to [0, 1] range + feature_distances = (feature_distances - feature_distances.min()) / ( + feature_distances.max() - feature_distances.min() + 1e-10 + ) + temporal_distances = (temporal_distances - temporal_distances.min()) / ( + temporal_distances.max() - temporal_distances.min() + 1e-10 + ) + + # Combine distances with weights + return ( + 1 - temporal_weight + ) * feature_distances + temporal_weight * temporal_distances + + def knn_predict(features, timestamp=None): + distances = calculate_distances(features, timestamp) + k_nearest_indices = np.argsort(distances)[:k] + k_nearest_labels = train_labels[k_nearest_indices] + + if weighted: + k_distances = distances[k_nearest_indices] + weights = 1 / (k_distances + 1e-10) + weights /= np.sum(weights) + unique_labels = np.unique(k_nearest_labels) + weighted_votes = { + label: np.sum(weights[k_nearest_labels == label]) + for label in unique_labels + } + return float(max(weighted_votes.items(), key=lambda x: x[1])[0]) + else: + return float( + max(set(k_nearest_labels), key=list(k_nearest_labels).count) + ) + + if self.distance_metric in ["temporal", "combined"]: + predict_udf = udf( + lambda features, timestamp: knn_predict( + features, timestamp.timestamp() + ), + DoubleType(), + ) + return test_df.withColumn( + "prediction", + predict_udf(col(self.features_col), col(self.timestamp_col)), + ) + else: + predict_udf = udf(lambda features: knn_predict(features), DoubleType()) + return test_df.withColumn("prediction", predict_udf(col(self.features_col))) diff --git a/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/linear_regression.py b/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/linear_regression.py new file mode 100644 index 000000000..b4195c37c --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/linear_regression.py @@ -0,0 +1,159 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pyspark.sql import DataFrame +import pyspark.ml as ml +from pyspark.ml.evaluation import RegressionEvaluator +from ..interfaces import MachineLearningInterface +from ..._pipeline_utils.models import Libraries, SystemType +from typing import Optional + + +class LinearRegression(MachineLearningInterface): + """ + This class uses pyspark.ml.LinearRegression to train a linear regression model on time data + and then uses the model to predict next values in the time series. + + Args: + features_col (str): Name of the column containing the features (the input). Default is 'features'. + label_col (str): Name of the column containing the label (the input). Default is 'label'. + prediction_col (str): Name of the column to which the prediction will be written. Default is 'prediction'. + + Example: + -------- + ```python + from pyspark.sql import SparkSession + from pyspark.ml.feature import VectorAssembler + from rtdip_sdk.pipelines.forecasting.spark.linear_regression import LinearRegression + + spark = SparkSession.builder.master("local[2]").appName("LinearRegressionExample").getOrCreate() + + data = [ + (1, 2.0, 3.0), + (2, 3.0, 4.0), + (3, 4.0, 5.0), + (4, 5.0, 6.0), + (5, 6.0, 7.0), + ] + columns = ["id", "feature1", "label"] + df = spark.createDataFrame(data, columns) + + assembler = VectorAssembler(inputCols=["feature1"], outputCol="features") + df = assembler.transform(df) + + lr = LinearRegression(features_col="features", label_col="label", prediction_col="prediction") + train_df, test_df = lr.split_data(df, train_ratio=0.8) + lr.train(train_df) + predictions = lr.predict(test_df) + rmse, r2 = lr.evaluate(predictions) + print(f"RMSE: {rmse}, R²: {r2}") + ``` + + """ + + def __init__( + self, + features_col: str = "features", + label_col: str = "label", + prediction_col: str = "prediction", + ) -> None: + self.features_col = features_col + self.label_col = label_col + self.prediction_col = prediction_col + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def split_data( + self, df: DataFrame, train_ratio: float = 0.8 + ) -> tuple[DataFrame, DataFrame]: + """ + Splits the dataset into training and testing sets. + + Args: + train_ratio (float): The ratio of the data to be used for training. Default is 0.8 (80% for training). + + Returns: + tuple[DataFrame, DataFrame]: Returns the training and testing datasets. + """ + train_df, test_df = df.randomSplit([train_ratio, 1 - train_ratio], seed=42) + return train_df, test_df + + def train(self, train_df: DataFrame): + """ + Trains a linear regression model on the provided data. + """ + linear_regression = ml.regression.LinearRegression( + featuresCol=self.features_col, + labelCol=self.label_col, + predictionCol=self.prediction_col, + ) + + self.model = linear_regression.fit(train_df) + return self + + def predict(self, prediction_df: DataFrame): + """ + Predicts the next values in the time series. 
+ """ + + return self.model.transform( + prediction_df, + ) + + def evaluate(self, test_df: DataFrame) -> Optional[float]: + """ + Evaluates the trained model using RMSE. + + Args: + test_df (DataFrame): The testing dataset to evaluate the model. + + Returns: + Optional[float]: The Root Mean Squared Error (RMSE) of the model or None if the prediction columnd doesn't exist. + """ + + if self.prediction_col not in test_df.columns: + print( + f"Error: '{self.prediction_col}' column is missing in the test DataFrame." + ) + return None + + # Evaluator for RMSE + evaluator_rmse = RegressionEvaluator( + labelCol=self.label_col, + predictionCol=self.prediction_col, + metricName="rmse", + ) + rmse = evaluator_rmse.evaluate(test_df) + + # Evaluator for R² + evaluator_r2 = RegressionEvaluator( + labelCol=self.label_col, predictionCol=self.prediction_col, metricName="r2" + ) + r2 = evaluator_r2.evaluate(test_df) + + return rmse, r2 diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/logging/__init__.py similarity index 95% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/logging/__init__.py index 5305a429e..1832b01ae 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/logging/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2025 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/sdk/python/rtdip_sdk/pipelines/logging/interfaces.py b/src/sdk/python/rtdip_sdk/pipelines/logging/interfaces.py new file mode 100644 index 000000000..f72d565be --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/logging/interfaces.py @@ -0,0 +1,24 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod + +from pyspark.sql import DataFrame +from ..interfaces import PipelineComponentBaseInterface + + +class LoggingBaseInterface(PipelineComponentBaseInterface): + @abstractmethod + def get_logs_as_df(self, logger_name: str) -> DataFrame: + pass diff --git a/src/sdk/python/rtdip_sdk/pipelines/logging/logger_manager.py b/src/sdk/python/rtdip_sdk/pipelines/logging/logger_manager.py new file mode 100644 index 000000000..1e68e181f --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/logging/logger_manager.py @@ -0,0 +1,82 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + + +from pyspark.pandas.usage_logging.usage_logger import get_logger + + +class LoggerManager: + """ + Manages creation and storage of all loggers in the application. This is a singleton class. + Please create loggers with the LoggerManager if you want your logs to be handled and stored properly. + + + Example Usage + -------- + ```python + logger_manager = LoggerManager() + logger = logger_manager.create_logger("my_logger") + logger.info("This is a log message") + my_logger = logger_manager.get_logger("my_logger") + ``` + """ + + _instance = None + _initialized = False + + # dictionary to store all loggers + loggers = {} + + def __new__(cls): + if cls._instance is None: + cls._instance = super(LoggerManager, cls).__new__(cls) + return cls._instance + + def __init__(self): + if not LoggerManager._initialized: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + LoggerManager._initialized = True + + @classmethod + def create_logger(cls, name: str): + """ + Creates a logger with the specified name. + + Args: + name (str): The name of the logger. + + Returns: + logging.Logger: Configured logger instance. + """ + if name not in cls.loggers: + logger = logging.getLogger(name) + cls.loggers[name] = logger + return logger + + return cls.get_logger(name) + + @classmethod + def get_logger(cls, name: str): + if name not in cls.loggers: + return None + return cls.loggers[name] + + @classmethod + def get_all_loggers(cls) -> dict: + return cls.loggers diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/logging/spark/__init__.py similarity index 95% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/logging/spark/__init__.py index 5305a429e..1832b01ae 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/logging/spark/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2025 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/logging/spark/dataframe/__init__.py similarity index 95% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/logging/spark/dataframe/__init__.py index 5305a429e..1832b01ae 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/logging/spark/dataframe/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2025 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
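A small sketch of the singleton behaviour documented above: every `LoggerManager()` call returns the same instance, and repeated `create_logger` calls for the same name hand back the already-registered logger. The import path assumes the installed `rtdip_sdk` package.

```python
# Hedged sketch of LoggerManager's singleton and shared-registry behaviour.
from rtdip_sdk.pipelines.logging.logger_manager import LoggerManager

manager_a = LoggerManager()
manager_b = LoggerManager()
assert manager_a is manager_b                 # single shared instance

logger = manager_a.create_logger("pipeline_logger")
same_logger = manager_b.create_logger("pipeline_logger")
assert logger is same_logger                  # logger registry is shared too

logger.info("This message goes through the shared logging configuration")
print(LoggerManager.get_all_loggers())        # {'pipeline_logger': <Logger ...>}
```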
diff --git a/src/sdk/python/rtdip_sdk/pipelines/logging/spark/dataframe/dataframe_log_handler.py b/src/sdk/python/rtdip_sdk/pipelines/logging/spark/dataframe/dataframe_log_handler.py new file mode 100644 index 000000000..f0d8ebdb6 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/logging/spark/dataframe/dataframe_log_handler.py @@ -0,0 +1,72 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging + +from pyspark.sql import DataFrame as PySparkDataFrame, SparkSession +from datetime import datetime + + +from pyspark.sql.types import StructField, TimestampType, StringType, StructType, Row + + +class DataFrameLogHandler(logging.Handler): + """ + Handles logs from attached logger and stores them in a DataFrame at runtime + Uses the following format: {Timestamp, Logger Name, Logging Level, Log Message} + + Args: + logging.Handler: Inherits from logging.Handler + + Returns: + returns a DataFrame with logs stored in it + + Example + -------- + ```python + import logging + + log_manager = logging.getLogger('log_manager') + + """ + + logs_df: PySparkDataFrame = None + spark: SparkSession + + def __init__(self, spark: SparkSession): + self.spark = spark + schema = StructType( + [ + StructField("timestamp", TimestampType(), True), + StructField("name", StringType(), True), + StructField("level", StringType(), True), + StructField("message", StringType(), True), + ] + ) + + self.logs_df = self.spark.createDataFrame([], schema) + super().__init__() + + def emit(self, record: logging.LogRecord) -> None: + """Process and store a log record""" + new_log_entry = Row( + timestamp=datetime.fromtimestamp(record.created), + name=record.name, + level=record.levelname, + message=record.msg, + ) + + self.logs_df = self.logs_df.union(self.spark.createDataFrame([new_log_entry])) + + def get_logs_as_df(self) -> PySparkDataFrame: + return self.logs_df diff --git a/src/sdk/python/rtdip_sdk/pipelines/logging/spark/log_file/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/logging/spark/log_file/__init__.py new file mode 100644 index 000000000..1832b01ae --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/logging/spark/log_file/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
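The `DataFrameLogHandler` above is a standard `logging.Handler`, so it can be attached to any logger and queried for the collected records as a PySpark DataFrame. A hedged usage sketch follows; the import path assumes the installed `rtdip_sdk` package.

```python
# Hedged sketch: attach DataFrameLogHandler to a logger and read the collected
# log records back as a PySpark DataFrame.
import logging
from pyspark.sql import SparkSession
from rtdip_sdk.pipelines.logging.spark.dataframe.dataframe_log_handler import (
    DataFrameLogHandler,
)

spark = SparkSession.builder.master("local[2]").appName("df-logs").getOrCreate()

logger = logging.getLogger("pipeline_logger")
logger.setLevel(logging.INFO)
handler = DataFrameLogHandler(spark)
logger.addHandler(handler)

logger.info("Pipeline step started")
logger.warning("Something looks off")

handler.get_logs_as_df().show(truncate=False)  # columns: timestamp, name, level, message
```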
diff --git a/src/sdk/python/rtdip_sdk/pipelines/logging/spark/log_file/file_log_handler.py b/src/sdk/python/rtdip_sdk/pipelines/logging/spark/log_file/file_log_handler.py
new file mode 100644
index 000000000..d820348a9
--- /dev/null
+++ b/src/sdk/python/rtdip_sdk/pipelines/logging/spark/log_file/file_log_handler.py
@@ -0,0 +1,61 @@
+# Copyright 2025 RTDIP
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+
+
+from pandas import DataFrame
+from datetime import datetime
+
+
+class FileLogHandler(logging.Handler):
+    """
+    Handles logs from an attached logger and stores them in a .log file
+
+    Args:
+        file_path (str): Path of the log file to write to
+        mode (str): File opening mode ('a' for append, 'w' for write),
+            by default 'a'
+
+    Example
+    --------
+    ```python
+    import logging
+
+    log_manager = logging.getLogger('log_manager')
+    handler = FileLogHandler('my_logs.log')
+    log_manager.addHandler(handler)
+    ```
+    """
+
+    logs_df: DataFrame = None
+
+    def __init__(self, file_path: str, mode: str = "a"):
+        super().__init__()
+        self.mode = mode
+        self.file_path = file_path
+
+    def emit(self, record: logging.LogRecord) -> None:
+        """Process and store a log record in the log file"""
+        try:
+            log_entry = (
+                f"{datetime.fromtimestamp(record.created).isoformat()} | "
+                f"{record.name} | "
+                f"{record.levelname} | "
+                f"{record.msg}\n"
+            )
+            with open(self.file_path, self.mode, encoding="utf-8") as log_file:
+                log_file.write(log_entry)
+
+        except Exception as e:
+            print(f"Error writing log entry to file: {e}")
diff --git a/src/sdk/python/rtdip_sdk/pipelines/logging/spark/runtime_log_collector.py b/src/sdk/python/rtdip_sdk/pipelines/logging/spark/runtime_log_collector.py
new file mode 100644
index 000000000..7b3ad84fb
--- /dev/null
+++ b/src/sdk/python/rtdip_sdk/pipelines/logging/spark/runtime_log_collector.py
@@ -0,0 +1,73 @@
+# Copyright 2025 RTDIP
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os + +from pyspark.sql import SparkSession + +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) + +from src.sdk.python.rtdip_sdk.pipelines.logging.logger_manager import LoggerManager +from src.sdk.python.rtdip_sdk.pipelines.logging.spark.dataframe.dataframe_log_handler import ( + DataFrameLogHandler, +) +from src.sdk.python.rtdip_sdk.pipelines.logging.spark.log_file.file_log_handler import ( + FileLogHandler, +) + + +class RuntimeLogCollector: + """Collects logs from all loggers in the LoggerManager at runtime.""" + + logger_manager: LoggerManager = LoggerManager() + + spark: SparkSession + + def __init__(self, spark: SparkSession): + self.spark = spark + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def _attach_dataframe_handler_to_logger( + self, logger_name: str + ) -> DataFrameLogHandler: + """Attaches the DataFrameLogHandler to the logger. Returns True if the handler was attached, False otherwise.""" + logger = self.logger_manager.get_logger(logger_name) + df_log_handler = DataFrameLogHandler(self.spark) + if logger is not None: + if df_log_handler not in logger.handlers: + logger.addHandler(df_log_handler) + return df_log_handler + + def _attach_file_handler_to_loggers( + self, filename: str, path: str = ".", mode: str = "a" + ) -> None: + """Attaches the FileLogHandler to the logger.""" + + loggers = self.logger_manager.get_all_loggers() + file_path = os.path.join(path, filename) + file_handler = FileLogHandler(file_path, mode) + for logger in loggers.values(): + # avoid duplicate handlers + if file_handler not in logger.handlers: + logger.addHandler(file_handler) diff --git a/src/sdk/python/rtdip_sdk/pipelines/sources/spark/ecmwf/base_mars.py b/src/sdk/python/rtdip_sdk/pipelines/sources/spark/ecmwf/base_mars.py index 4021e9cef..1dec7866a 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/sources/spark/ecmwf/base_mars.py +++ b/src/sdk/python/rtdip_sdk/pipelines/sources/spark/ecmwf/base_mars.py @@ -82,11 +82,9 @@ def retrieve( Parameters: mars_dict (dict): Dictionary of mars parameters. n_jobs (int, optional): Download in parallel? by default None, i.e. no parallelization - backend (str, optional) : Specify the parallelization backend implementation in joblib, by default "loky" + backend (str, optional): Specify the parallelization backend implementation in joblib, by default "loky" tries (int, optional): Number of tries for each request if it fails, by default 5 - cost (bool, optional): Pass a cost request to mars to estimate the size and efficiency of your request, - but not actually download the data. Can be useful for defining requests, - by default False. + cost (bool, optional): Pass a cost request to mars to estimate the size and efficiency of your request, but not actually download the data. Can be useful for defining requests, by default False. """ chk = ["date", "target", "time", "format", "output"] for i in chk: diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/__init__.py new file mode 100644 index 000000000..9a4ecff83 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .columns_to_vector import * +from .polynomial_features import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/columns_to_vector.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/columns_to_vector.py new file mode 100644 index 000000000..df856bf57 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/columns_to_vector.py @@ -0,0 +1,86 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pyspark.ml.feature import VectorAssembler +from pyspark.sql import DataFrame +from ...._pipeline_utils.models import Libraries, SystemType +from ...interfaces import TransformerInterface + + +class ColumnsToVector(TransformerInterface): + """ + Converts columns containing numbers to a column containing a vector. + + Parameters: + df (DataFrame): PySpark DataFrame + input_cols (list[str]): List of columns to convert to a vector. + output_col (str): Name of the output column where the vector will be stored. + override_col (bool): If True, the output column can override an existing column. + """ + + def __init__( + self, + df: DataFrame, + input_cols: list[str], + output_col: str, + override_col: bool = False, + ) -> None: + self.input_cols = input_cols + self.output_col = output_col + self.override_col = override_col + self.df = df + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def pre_transform_validation(self): + if self.output_col in self.df.columns and not self.override_col: + return False + return True + + def post_transform_validation(self): + return True + + def transform(self): + if not self.pre_transform_validation(): + raise ValueError( + f"Output column {self.output_col} already exists and override_col is set to False." 
+ ) + + temp_col = ( + f"{self.output_col}_temp" if self.output_col in self.df.columns else None + ) + transformed_df = VectorAssembler( + inputCols=self.input_cols, outputCol=(temp_col or self.output_col) + ).transform(self.df) + + if temp_col: + return transformed_df.drop(self.output_col).withColumnRenamed( + temp_col, self.output_col + ) + return transformed_df diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/one_hot_encoding.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/one_hot_encoding.py new file mode 100644 index 000000000..37a0d2ae1 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/one_hot_encoding.py @@ -0,0 +1,135 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pyspark.sql import DataFrame as PySparkDataFrame +from pyspark.sql import functions as F +from ...interfaces import TransformerInterface +from ...._pipeline_utils.models import Libraries, SystemType + + +class OneHotEncoding(TransformerInterface): + """ + Performs One-Hot Encoding on a specified column of a PySpark DataFrame. + + Example + -------- + ```python + from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.machine_learning.one_hot_encoding import OneHotEncoding + from pyspark.sql import SparkSession + + + spark = ... # SparkSession + df = ... # Get a PySpark DataFrame + + one_hot_encoder = OneHotEncoding(df, "column_name", ["list_of_distinct_values"]) + result_df = one_hot_encoder.encode() + result_df.show() + ``` + + Parameters: + df (DataFrame): The PySpark DataFrame to apply encoding on. + column (str): The name of the column to apply the encoding to. + values (list, optional): A list of distinct values to encode. If not provided, + the distinct values from the data will be used. + """ + + df: PySparkDataFrame + column: str + values: list + + def __init__(self, df: PySparkDataFrame, column: str, values: list = None) -> None: + self.df = df + self.column = column + self.values = values + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def pre_transform_validation(self): + """ + Validate the input data before transformation. + - Check if the specified column exists in the DataFrame. + - If no values are provided, check if the distinct values can be computed. + - Ensure the DataFrame is not empty. 
+ """ + if self.df is None or self.df.count() == 0: + raise ValueError("The DataFrame is empty.") + + if self.column not in self.df.columns: + raise ValueError(f"Column '{self.column}' does not exist in the DataFrame.") + + if not self.values: + distinct_values = [ + row[self.column] + for row in self.df.select(self.column).distinct().collect() + ] + if not distinct_values: + raise ValueError(f"No distinct values found in column '{self.column}'.") + self.values = distinct_values + + def post_transform_validation(self): + """ + Validate the result after transformation. + - Ensure that new columns have been added based on the distinct values. + - Verify the transformed DataFrame contains the expected number of columns. + """ + expected_columns = [ + f"{self.column}_{value if value is not None else 'None'}" + for value in self.values + ] + missing_columns = [ + col for col in expected_columns if col not in self.df.columns + ] + + if missing_columns: + raise ValueError( + f"Missing columns in the transformed DataFrame: {missing_columns}" + ) + + if self.df.count() == 0: + raise ValueError("The transformed DataFrame is empty.") + + def transform(self) -> PySparkDataFrame: + + self.pre_transform_validation() + + if not self.values: + self.values = [ + row[self.column] + for row in self.df.select(self.column).distinct().collect() + ] + + for value in self.values: + self.df = self.df.withColumn( + f"{self.column}_{value if value is not None else 'None'}", + F.when(F.col(self.column) == value, 1).otherwise(0), + ) + + self.post_transform_validation() + + return self.df diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/polynomial_features.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/polynomial_features.py new file mode 100644 index 000000000..b3456fe65 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/polynomial_features.py @@ -0,0 +1,110 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pyspark.ml as ml +from pyspark.sql import DataFrame + +from ...._pipeline_utils.models import Libraries, SystemType +from ...interfaces import TransformerInterface + + +class PolynomialFeatures(TransformerInterface): + """ + This transformer takes a vector column and generates polynomial combinations of the input features + up to the specified degree. For example, if the input vector is [a, b] and degree=2, + the output features will be [a, b, a^2, ab, b^2]. + + Parameters: + df (DataFrame): PySpark DataFrame + input_col (str): Name of the input column in the DataFrame that contains the feature vectors + output_col (str): + poly_degree (int): The degree of the polynomial features to generate + override_col (bool): If True, the output column can override an existing column. 
+ """ + + def __init__( + self, + df: DataFrame, + input_col: str, + output_col: str, + poly_degree: int, + override_col: bool = False, + ): + self.df = df + self.input_col = input_col + self.output_col = output_col + self.poly_degree = poly_degree + self.override_col = override_col + + @staticmethod + def system_type(): + """ + Attributes: + SystemType (Environment): Requires PYSPARK + """ + return SystemType.PYSPARK + + @staticmethod + def libraries(): + libraries = Libraries() + return libraries + + @staticmethod + def settings() -> dict: + return {} + + def pre_transform_validation(self): + if not (self.input_col in self.df.columns): + raise ValueError( + f"Input column '{self.input_col}' does not exist in the DataFrame." + ) + if self.output_col in self.df.columns and not self.override_col: + raise ValueError( + f"Output column '{self.output_col}' already exists in the DataFrame and override_col is set to False." + ) + if not isinstance(self.df.schema[self.input_col].dataType, ml.linalg.VectorUDT): + raise ValueError( + f"Input column '{self.input_col}' is not of type VectorUDT." + ) + return True + + def post_transform_validation(self): + if self.output_col not in self.df.columns: + raise ValueError( + f"Output column '{self.output_col}' does not exist in the transformed DataFrame." + ) + return True + + def transform(self): + + self.pre_transform_validation() + + temp_col = ( + f"{self.output_col}_temp" if self.output_col in self.df.columns else None + ) + transformed_df = ml.feature.PolynomialExpansion( + degree=self.poly_degree, + inputCol=self.input_col, + outputCol=(temp_col or self.output_col), + ).transform(self.df) + + if temp_col: + return transformed_df.drop(self.output_col).withColumnRenamed( + temp_col, self.output_col + ) + + self.df = transformed_df + self.post_transform_validation() + + return transformed_df diff --git a/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/time_string_parsing.py b/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/time_string_parsing.py new file mode 100644 index 000000000..0bad557a7 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/utilities/spark/time_string_parsing.py @@ -0,0 +1,46 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + + +def parse_time_string_to_ms(time_str: str) -> float: + """ + Parses a time string and returns the total time in milliseconds. + + Args: + time_str (str): Time string (e.g., '10ms', '1s', '2m', '1h'). + + Returns: + float: Total time in milliseconds. + + Raises: + ValueError: If the format is invalid. 
+ """ + pattern = re.compile(r"^(\d+(?:\.\d+)?)(ms|s|m|h)$") + match = pattern.match(time_str) + if not match: + raise ValueError(f"Invalid time format: {time_str}") + value, unit = match.groups() + value = float(value) + if unit == "ms": + return value + elif unit == "s": + return value * 1000 + elif unit == "m": + return value * 60 * 1000 + elif unit == "h": + return value * 3600 * 1000 + else: + raise ValueError(f"Unsupported time unit in time: {unit}") diff --git a/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py b/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py index e89c7b07c..3797e5877 100644 --- a/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py +++ b/src/sdk/python/rtdip_sdk/queries/time_series/_time_series_query_builder.py @@ -27,43 +27,473 @@ seconds_per_unit = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800} -def _raw_query(parameters_dict: dict) -> str: - raw_query = ( - 'WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`{{ timestamp_column }}`), "{{ time_zone }}") AS `{{ timestamp_column }}`, `{{ tagname_column }}`, {% if include_status is defined and include_status == true %} `{{ status_column }}`, {% endif %} `{{ value_column }}` FROM ' - "{% if source is defined and source is not none %}" - "`{{ source|lower }}` " - "{% else %}" - "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_events_{{ data_type|lower }}` " - "{% endif %}" - "{% if case_insensitivity_tag_search is defined and case_insensitivity_tag_search == true %}" - "WHERE `{{ timestamp_column }}` BETWEEN to_timestamp(\"{{ start_date }}\") AND to_timestamp(\"{{ end_date }}\") AND UPPER(`{{ tagname_column }}`) IN ('{{ tag_names | join('\\', \\'') | upper }}') " - "{% else %}" - "WHERE `{{ timestamp_column }}` BETWEEN to_timestamp(\"{{ start_date }}\") AND to_timestamp(\"{{ end_date }}\") AND `{{ tagname_column }}` IN ('{{ tag_names | join('\\', \\'') }}') " - "{% endif %}" - "{% if include_status is defined and include_status == true and include_bad_data is defined and include_bad_data == false %}" - "AND `{{ status_column }}` <> 'Bad'" - "{% endif %}" - "ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` " - ") " - "{% if display_uom is defined and display_uom == true %}" - 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(e.`EventTime`, e.`TagName`, e.`Status`, e.`Value`, m.`UOM`), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}e.`EventTime`, e.`TagName`, e.`Status`, e.`Value`, m.`UOM`{% endif %} FROM raw_events e ' - "LEFT OUTER JOIN " - "{% if metadata_source is defined and metadata_source is not none %}" - "`{{ metadata_source|lower }}` m ON e.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` " - "{% else %}" - "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_metadata` m ON e.`{{ tagname_column }}` = m.`{{ tagname_column }}` " - "{% endif %}" - "{% else %}" - 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(*), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}*{% endif %} FROM raw_events ' - "{% endif %}" - "{% if limit is defined and limit is not none %}" - "LIMIT {{ limit }} " - "{% endif %}" - "{% if offset is defined and offset is not none %}" - "OFFSET {{ offset }} " - "{% endif %}" +def _build_sql_cte_statement(sql_query_list): + sql_cte_query = ", ".join( + [sql_query["sql_query"] for 
sql_query in sql_query_list[:-1]], + ) + + sql_cte_query = " ".join(["WITH", sql_cte_query]) + + if len(sql_cte_query) > 1: + sql_cte_query = " ".join([sql_cte_query, sql_query_list[-1]["sql_query"]]) + + return sql_cte_query + + +def _window_start_time_offset(start_date, time_interval_rate, time_interval_unit: str): + time_interval_rate_number = float(time_interval_rate) + + if "day" in time_interval_unit: + time_interval_rate_seconds = time_interval_rate_number * 24 * 3600 + elif "hour" in time_interval_unit: + time_interval_rate_seconds = time_interval_rate_number * 3600 + elif "minute" in time_interval_unit: + time_interval_rate_seconds = time_interval_rate_number * 60 + elif "second" in time_interval_unit: + time_interval_rate_seconds = time_interval_rate_number + + # Calculate Offset for startTime parameter + + offset_start_time = ( + datetime.strptime(start_date, TIMESTAMP_FORMAT).timestamp() + % time_interval_rate_seconds + ) + + offset_start_time = f"{int(offset_start_time)} second" + return offset_start_time + + +def _build_raw_query( + sql_query_name, + timestamp_column, + tagname_column, + status_column, + value_column, + start_date, + end_date, + time_zone, + time_interval_rate=None, + time_interval_unit=None, + agg_method=None, + deduplicate=None, + source=None, + business_unit=None, + asset=None, + data_security_level=None, + data_type=None, + tag_names=None, + include_status=None, + include_bad_data=None, + case_insensitivity_tag_search=None, + sort=True, +): + # Select + raw_query_sql = f"{sql_query_name} AS (SELECT" + if agg_method == "avg" or deduplicate == True: + raw_query_sql = " ".join([raw_query_sql, "DISTINCT"]) + + # Event Time + raw_query_sql = " ".join( + [ + raw_query_sql, + f"from_utc_timestamp(date_trunc('millisecond',`{timestamp_column}`), '{time_zone}') AS `{timestamp_column}`,", + ] + ) + if time_interval_rate is not None: + window_offset_start_time = _window_start_time_offset( + start_date=start_date, + time_interval_rate=time_interval_rate, + time_interval_unit=time_interval_unit, + ) + raw_query_sql = " ".join( + [ + raw_query_sql, + f"window(from_utc_timestamp(date_trunc('millisecond',`{timestamp_column}`), '{time_zone}'), '{time_interval_rate} {time_interval_unit}', '{time_interval_rate} {time_interval_unit}', '{window_offset_start_time}') AS `window`,", + ] + ) + + # Tag Name + raw_query_sql = " ".join([raw_query_sql, f"`{tagname_column}`,"]) + + # Status + if include_status == True: + raw_query_sql = " ".join([raw_query_sql, f"`{status_column}`,"]) + else: + raw_query_sql = " ".join([raw_query_sql, "'Good' AS `Status`,"]) + + # Value + raw_query_sql = " ".join([raw_query_sql, f"`{value_column}` FROM"]) + + if source is not None: + raw_query_sql = " ".join([raw_query_sql, f"`{source.lower()}`"]) + else: + raw_query_sql = " ".join( + [ + raw_query_sql, + f"`{business_unit.lower()}`.`sensors`.`{asset.lower()}_{data_security_level.lower()}_events_{data_type.lower()}`", + ] + ) + + # Where + to_timestamp = ( + f"to_timestamp('{end_date}')" + if time_interval_rate is None + else f"timestampadd({time_interval_unit}, {time_interval_rate}, to_timestamp('{end_date}'))" + ) + + raw_query_sql = " ".join( + [ + raw_query_sql, + f"WHERE `{timestamp_column}` BETWEEN to_timestamp('{start_date}') AND {to_timestamp} AND", + ] + ) + + if case_insensitivity_tag_search == True: + quoted_tag_names = "', '".join([tag.upper() for tag in tag_names]) + raw_query_sql = " ".join( + [ + raw_query_sql, + f"UPPER(`{tagname_column}`) IN ('{quoted_tag_names}')", + ] + ) + else: + 
quoted_tag_names = "', '".join(tag_names) + raw_query_sql = " ".join( + [ + raw_query_sql, + f"`{tagname_column}` IN ('{quoted_tag_names}')", + ] + ) + + if include_status == True and include_bad_data == False: + raw_query_sql = " ".join([raw_query_sql, f"AND `{status_column}` <> 'Bad'"]) + + if sort == True: + raw_query_sql = " ".join( + [ + raw_query_sql, + f"ORDER BY `{tagname_column}`, `{timestamp_column}`", + ] + ) + raw_query_sql += ")" + + return raw_query_sql + + +def _build_resample_query( + sql_query_list, + sql_query_name, + timestamp_column, + tagname_column, + value_column, + tag_names, + start_date, + end_date, + time_zone, + time_interval_rate, + time_interval_unit, + agg_method, + case_insensitivity_tag_search, + fill=False, + sort=True, +): + parent_sql_query_name = sql_query_list[-1]["query_name"] + + from_sql = parent_sql_query_name + timestamp_sql = f"{parent_sql_query_name}.`window`.start" + tagname_sql = f"{parent_sql_query_name}.`{tagname_column}`" + groupby_sql = f"{parent_sql_query_name}.`{tagname_column}`, {parent_sql_query_name}.`window`.start" + + if fill == True: + quoted_tag_names = ( + "', '".join([tag.upper() for tag in tag_names]) + if case_insensitivity_tag_search == True + else "', '".join(tag_names) + ) + date_fill_query = f"fill_intervals AS (SELECT DISTINCT explode(sequence(from_utc_timestamp(to_timestamp('{start_date}'), '{time_zone}'), from_utc_timestamp(to_timestamp('{end_date}'), '{time_zone}'), INTERVAL '{time_interval_rate} {time_interval_unit}')) AS `{timestamp_column}`, explode(array('{quoted_tag_names}')) AS `{tagname_column}`)" + from_sql = f"fill_intervals LEFT OUTER JOIN {parent_sql_query_name} ON fill_intervals.`{timestamp_column}` = {parent_sql_query_name}.`window`.start AND fill_intervals.`{tagname_column}` = {parent_sql_query_name}.`{tagname_column}`" + timestamp_sql = f"fill_intervals.`{timestamp_column}`" + tagname_sql = f"fill_intervals.`{tagname_column}`" + groupby_sql = ( + f"fill_intervals.`{tagname_column}`, fill_intervals.`{timestamp_column}`" + ) + + resample_query_sql = f"{sql_query_name} AS (SELECT {tagname_sql}, {timestamp_sql} AS `{timestamp_column}`, {agg_method}({parent_sql_query_name}.`{value_column}`) AS `{value_column}` FROM {from_sql} GROUP BY {groupby_sql}" + + if fill == True: + resample_query_sql = ", ".join( + [ + date_fill_query, + resample_query_sql, + ] + ) + + if sort == True: + resample_query_sql = " ".join( + [ + resample_query_sql, + f"ORDER BY `{tagname_column}`, `{timestamp_column}`", + ] + ) + + return resample_query_sql + ")" + + +def _build_fill_intervals_query( + sql_query_list, + sql_query_name, + timestamp_column, + tagname_column, + value_column, + tag_names, + start_date, + end_date, + time_zone, + time_interval_rate, + time_interval_unit, + case_insensitivity_tag_search, +): + parent_sql_query_name = sql_query_list[-1]["query_name"] + quoted_tag_names = ( + "', '".join([tag.upper() for tag in tag_names]) + if case_insensitivity_tag_search == True + else "', '".join(tag_names) + ) + intervals_query = f"intervals AS (SELECT DISTINCT explode(sequence(from_utc_timestamp(to_timestamp('{start_date}'), '{time_zone}'), from_utc_timestamp(to_timestamp('{end_date}'), '{time_zone}'), INTERVAL '{time_interval_rate} {time_interval_unit}')) AS `{timestamp_column}`, explode(array('{quoted_tag_names}')) AS `{tagname_column}`), " + fill_intervals_query = f"{sql_query_name} as (SELECT intervals.`{tagname_column}`, intervals.`{timestamp_column}` as `{timestamp_column}`, raw. 
`{timestamp_column}` as `Original{timestamp_column}`, raw.`{value_column}`, CASE WHEN raw.`{value_column}` IS NULL THEN NULL ELSE struct(raw.`{timestamp_column}`, raw.`{value_column}`) END AS `{timestamp_column}_{value_column}` " + from_sql = f"FROM intervals LEFT OUTER JOIN {parent_sql_query_name} ON intervals.`{timestamp_column}` = {parent_sql_query_name}.`window`.start AND intervals.`{tagname_column}` = {parent_sql_query_name}.`{tagname_column}`" + + return intervals_query + fill_intervals_query + from_sql + ")" + + +def _build_interpolate_query( + sql_query_list, + sql_query_name, + tagname_column, + timestamp_column, + value_column, + sort=True, +): + parent_sql_query_name = sql_query_list[-1]["query_name"] + + interpolate_calc_query_sql = f"{sql_query_name}_calculate AS (SELECT `Original{timestamp_column}`, `{timestamp_column}`, `{tagname_column}`, " + lag_value_query_sql = f"CASE WHEN `{value_column}` IS NOT NULL THEN NULL ELSE LAG(`{timestamp_column}_{value_column}`) IGNORE NULLS OVER (PARTITION BY `{tagname_column}` ORDER BY `{timestamp_column}`) END AS Prev{timestamp_column}{value_column}, " + lead_value_query_sql = f"CASE WHEN `{value_column}` IS NOT NULL THEN NULL ELSE LEAD(`{timestamp_column}_{value_column}`) IGNORE NULLS OVER (PARTITION BY `{tagname_column}` ORDER BY `{timestamp_column}`) END AS Next{timestamp_column}{value_column}, " + value_query_sql = f"CASE WHEN `Original{timestamp_column}` = `{timestamp_column}` THEN `{value_column}` WHEN `Prev{timestamp_column}{value_column}` IS NOT NULL AND `Next{timestamp_column}{value_column}` IS NOT NULL THEN `Prev{timestamp_column}{value_column}`.`{value_column}` + ((`Next{timestamp_column}{value_column}`.`{value_column}` - `Prev{timestamp_column}{value_column}`.`{value_column}`) * (unix_timestamp(`{timestamp_column}`) - unix_timestamp(`Prev{timestamp_column}{value_column}`.`{timestamp_column}`)) / (unix_timestamp(`Next{timestamp_column}{value_column}`.`{timestamp_column}`) - unix_timestamp(`Prev{timestamp_column}{value_column}`.`{timestamp_column}`))) WHEN `Prev{timestamp_column}{value_column}` IS NOT NULL THEN `Prev{timestamp_column}{value_column}`.`{value_column}` ELSE NULL END as `{value_column}` FROM {parent_sql_query_name} " + interpolate_project_query_sql = f"), {sql_query_name} AS (SELECT `{timestamp_column}`, `{tagname_column}`, `{value_column}` FROM {sql_query_name}_calculate WHERE `Original{timestamp_column}` IS NULL OR `Original{timestamp_column}` = `{timestamp_column}` " + + interpolate_query_sql = ( + interpolate_calc_query_sql + + lag_value_query_sql + + lead_value_query_sql + + value_query_sql + + interpolate_project_query_sql + ) + + if sort == True: + interpolate_query_sql = " ".join( + [ + interpolate_query_sql, + f"ORDER BY `{tagname_column}`, `{timestamp_column}`", + ] + ) + + return interpolate_query_sql + ")" + + +def _build_summary_query( + sql_query_name, + timestamp_column, + tagname_column, + status_column, + value_column, + start_date, + end_date, + source=None, + business_unit=None, + asset=None, + data_security_level=None, + data_type=None, + tag_names=None, + include_status=None, + include_bad_data=None, + case_insensitivity_tag_search=None, +): + + # Select + summary_query_sql = f"{sql_query_name} AS (SELECT `{tagname_column}`, " + summary_query_sql = " ".join( + [ + summary_query_sql, + f"count(`{value_column}`) as Count,", + f"CAST(Avg(`{value_column}`) as decimal(10, 2)) as Avg,", + f"CAST(Min(`{value_column}`) as decimal(10, 2)) as Min,", + f"CAST(Max(`{value_column}`) as decimal(10, 2)) as 
Max,", + f"CAST(stddev(`{value_column}`) as decimal(10, 2)) as StDev,", + f"CAST(sum(`{value_column}`) as decimal(10, 2)) as Sum,", + f"CAST(variance(`{value_column}`) as decimal(10, 2)) as Var FROM", + ] + ) + + # From + if source is not None: + summary_query_sql = " ".join([summary_query_sql, f"`{source.lower()}`"]) + else: + summary_query_sql = " ".join( + [ + summary_query_sql, + f"`{business_unit.lower()}`.`sensors`.`{asset.lower()}_{data_security_level.lower()}_events_{data_type.lower()}`", + ] + ) + + # Where EventTime + summary_query_sql = " ".join( + [ + summary_query_sql, + f"WHERE `{timestamp_column}` BETWEEN to_timestamp('{start_date}') AND to_timestamp('{end_date}') AND", + ] + ) + + # TagName + if case_insensitivity_tag_search == True: + quoted_tag_names = "', '".join([tag.upper() for tag in tag_names]) + summary_query_sql = " ".join( + [ + summary_query_sql, + f"UPPER(`{tagname_column}`) IN ('{quoted_tag_names}')", + ] + ) + else: + quoted_tag_names = "', '".join(tag_names) + summary_query_sql = " ".join( + [summary_query_sql, f"`{tagname_column}` IN ('{quoted_tag_names}')"] + ) + + # Optional bad data filtering + if include_status == True and include_bad_data == False: + summary_query_sql = " ".join( + [summary_query_sql, f"AND `{status_column}` <> 'Bad'"] + ) + + # Group by + summary_query_sql = " ".join([summary_query_sql, f"GROUP BY `{tagname_column}`"]) + summary_query_sql += ")" + + return summary_query_sql + + +def _build_pivot_query( + sql_query_list, + sql_query_name, + tagname_column, + timestamp_column, + value_column, + tag_names, + is_case_insensitive_tag_search, + sort=True, +): + parent_sql_query_name = sql_query_list[-1]["query_name"] + + tag_names_string = ( + ", ".join([f"'{tag.upper()}' AS `{tag}`" for tag in tag_names]) + if is_case_insensitive_tag_search == True + else ", ".join([f"'{tag}' AS `{tag}`" for tag in tag_names]) + ) + + pivot_query_sql = f"{sql_query_name} AS (SELECT * FROM (SELECT `{timestamp_column}`, `{value_column}`," + + if is_case_insensitive_tag_search == True: + pivot_query_sql = " ".join( + [pivot_query_sql, f"UPPER(`{tagname_column}`) AS `{tagname_column}`"] + ) + else: + pivot_query_sql = " ".join([pivot_query_sql, f"`{tagname_column}`"]) + + pivot_query_sql = " ".join( + [ + pivot_query_sql, + f"FROM {parent_sql_query_name}) PIVOT (FIRST(`{value_column}`) FOR `{tagname_column}` IN ({tag_names_string}))", + ] + ) + + if sort == True: + pivot_query_sql = " ".join( + [ + pivot_query_sql, + f"ORDER BY `{timestamp_column}`", + ] + ) + + return pivot_query_sql + ")" + + +def _build_uom_query( + sql_query_list, + sql_query_name, + metadata_source, + business_unit, + asset, + data_security_level, + tagname_column, + metadata_tagname_column, + metadata_uom_column, +): + parent_sql_query_name = sql_query_list[-1]["query_name"] + + uom_sql_query = f"{sql_query_name} AS (SELECT {parent_sql_query_name}.*, metadata.`{metadata_uom_column}` FROM {parent_sql_query_name} LEFT OUTER JOIN" + + if metadata_source: + uom_sql_query = " ".join([uom_sql_query, f"{metadata_source}"]) + else: + uom_sql_query = " ".join( + [ + uom_sql_query, + f"`{business_unit.lower()}`.`sensors`.`{asset.lower()}_{data_security_level.lower()}_metadata`", + ] + ) + + uom_sql_query = " ".join( + [ + uom_sql_query, + f"AS metadata ON {parent_sql_query_name}.`{tagname_column}` = metadata.`{metadata_tagname_column}`", + ] ) + return uom_sql_query + ")" + + +def _build_output_query(sql_query_list, to_json, limit, offset): + parent_sql_query_name = sql_query_list[-1]["query_name"] 
+ + output_sql_query = f"SELECT" + + if to_json == True: + output_sql_query = " ".join( + [ + output_sql_query, + "to_json(struct(*), map('timestampFormat', " + "'yyyy-MM-dd\\'T\\'HH:mm:ss.SSSSSSSSSXXX'" + ")) AS Value", + ] + ) + else: + output_sql_query = " ".join([output_sql_query, "*"]) + + output_sql_query = " ".join([output_sql_query, f"FROM {parent_sql_query_name}"]) + + if limit is not None: + output_sql_query = " ".join([output_sql_query, f"LIMIT {limit}"]) + + if offset is not None: + output_sql_query = " ".join([output_sql_query, f"OFFSET {offset}"]) + + return output_sql_query + + +def _raw_query(parameters_dict: dict) -> str: + + sql_query_list = [] + raw_parameters = { "source": parameters_dict.get("source", None), "metadata_source": parameters_dict.get("metadata_source", None), @@ -76,6 +506,7 @@ def _raw_query(parameters_dict: dict) -> str: "end_date": parameters_dict["end_date"], "tag_names": list(dict.fromkeys(parameters_dict["tag_names"])), "include_bad_data": parameters_dict["include_bad_data"], + "sort": parameters_dict.get("sort", True), "limit": parameters_dict.get("limit", None), "offset": parameters_dict.get("offset", None), "display_uom": parameters_dict.get("display_uom", False), @@ -105,8 +536,56 @@ def _raw_query(parameters_dict: dict) -> str: "to_json": parameters_dict.get("to_json", False), } - sql_template = Template(raw_query) - return sql_template.render(raw_parameters) + raw_query = _build_raw_query( + sql_query_name="raw", + timestamp_column=raw_parameters["timestamp_column"], + tagname_column=raw_parameters["tagname_column"], + status_column=raw_parameters["status_column"], + value_column=raw_parameters["value_column"], + start_date=raw_parameters["start_date"], + end_date=raw_parameters["end_date"], + time_zone=raw_parameters["time_zone"], + deduplicate=True, + source=raw_parameters["source"], + business_unit=raw_parameters["business_unit"], + asset=raw_parameters["asset"], + data_security_level=raw_parameters["data_security_level"], + data_type=raw_parameters["data_type"], + tag_names=raw_parameters["tag_names"], + include_status=raw_parameters["include_status"], + case_insensitivity_tag_search=raw_parameters["case_insensitivity_tag_search"], + sort=raw_parameters["sort"], + ) + + sql_query_list.append({"query_name": "raw", "sql_query": raw_query}) + + if raw_parameters["display_uom"] == True: + uom_query = _build_uom_query( + sql_query_list=sql_query_list, + sql_query_name="uom", + metadata_source=raw_parameters["metadata_source"], + business_unit=raw_parameters["business_unit"], + asset=raw_parameters["asset"], + data_security_level=raw_parameters["data_security_level"], + tagname_column=raw_parameters["tagname_column"], + metadata_tagname_column=raw_parameters["metadata_tagname_column"], + metadata_uom_column=raw_parameters["metadata_uom_column"], + ) + + sql_query_list.append({"query_name": "uom", "sql_query": uom_query}) + + output_query = _build_output_query( + sql_query_list=sql_query_list, + to_json=raw_parameters["to_json"], + limit=raw_parameters["limit"], + offset=raw_parameters["offset"], + ) + + sql_query_list.append({"query_name": "output", "sql_query": output_query}) + + sql_query = _build_sql_cte_statement(sql_query_list) + + return sql_query def _sql_query(parameters_dict: dict) -> str: @@ -137,62 +616,7 @@ def _sql_query(parameters_dict: dict) -> str: return sql_template.render(sql_parameters) -def _sample_query(parameters_dict: dict) -> tuple: - sample_query = ( - 'WITH raw_events AS (SELECT DISTINCT 
from_utc_timestamp(date_trunc("millisecond",`{{ timestamp_column }}`), "{{ time_zone }}") AS `{{ timestamp_column }}`, `{{ tagname_column }}`, {% if include_status is defined and include_status == true %} `{{ status_column }}`, {% else %} \'Good\' AS `Status`, {% endif %} `{{ value_column }}` FROM ' - "{% if source is defined and source is not none %}" - "`{{ source|lower }}` " - "{% else %}" - "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_events_{{ data_type|lower }}` " - "{% endif %}" - "{% if case_insensitivity_tag_search is defined and case_insensitivity_tag_search == true %}" - "WHERE `{{ timestamp_column }}` BETWEEN to_timestamp(\"{{ start_date }}\") AND to_timestamp(\"{{ end_date }}\") AND UPPER(`{{ tagname_column }}`) IN ('{{ tag_names | join('\\', \\'') | upper }}') " - "{% else %}" - "WHERE `{{ timestamp_column }}` BETWEEN to_timestamp(\"{{ start_date }}\") AND to_timestamp(\"{{ end_date }}\") AND `{{ tagname_column }}` IN ('{{ tag_names | join('\\', \\'') }}') " - "{% endif %}" - "{% if include_status is defined and include_status == true and include_bad_data is defined and include_bad_data == false %} AND `{{ status_column }}` <> 'Bad' {% endif %}) " - ',date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("{{ start_date }}"), "{{ time_zone }}"), from_utc_timestamp(to_timestamp("{{ end_date }}"), "{{ time_zone }}"), INTERVAL \'{{ time_interval_rate + \' \' + time_interval_unit }}\')) AS timestamp_array) ' - ",window_buckets AS (SELECT timestamp_array AS window_start, timestampadd({{time_interval_unit }}, {{ time_interval_rate }}, timestamp_array) AS window_end FROM date_array) " - ",resample AS (SELECT /*+ RANGE_JOIN(d, {{ range_join_seconds }} ) */ d.window_start, d.window_end, e.`{{ tagname_column }}`, {{ agg_method }}(e.`{{ value_column }}`) OVER (PARTITION BY e.`{{ tagname_column }}`, d.window_start ORDER BY e.`{{ timestamp_column }}` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `{{ value_column }}` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`{{ timestamp_column }}` AND d.window_end > e.`{{ timestamp_column }}`) " - ",project AS (SELECT window_start AS `{{ timestamp_column }}`, `{{ tagname_column }}`, `{{ value_column }}` FROM resample GROUP BY window_start, `{{ tagname_column }}`, `{{ value_column }}` " - "{% if is_resample is defined and is_resample == true %}" - "ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` " - "{% endif %}" - ") " - "{% if is_resample is defined and is_resample == true and pivot is defined and pivot == true %}" - "{% if case_insensitivity_tag_search is defined and case_insensitivity_tag_search == true %}" - ",pivot AS (SELECT * FROM (SELECT `{{ timestamp_column }}`, `{{ value_column }}`, UPPER(`{{ tagname_column }}`) AS `{{ tagname_column }}` FROM project) PIVOT (FIRST(`{{ value_column }}`) FOR `{{ tagname_column }}` IN (" - "{% for i in range(tag_names | length) %}" - "'{{ tag_names[i] | upper }}' AS `{{ tag_names[i] }}`{% if not loop.last %}, {% endif %}" - "{% endfor %}" - "{% else %}" - ",pivot AS (SELECT * FROM (SELECT `{{ timestamp_column }}`, `{{ value_column }}`, `{{ tagname_column }}` AS `{{ tagname_column }}` FROM project) PIVOT (FIRST(`{{ value_column }}`) FOR `{{ tagname_column }}` IN (" - "{% for i in range(tag_names | length) %}" - "'{{ tag_names[i] }}' AS `{{ tag_names[i] }}`{% if not loop.last %}, {% endif %}" - "{% endfor %}" - "{% endif %}" - '))) SELECT {% if to_json_resample is defined and to_json_resample == true 
%}to_json(struct(*), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}*{% endif %} FROM pivot ORDER BY `{{ timestamp_column }}` ' - "{% else %}" - "{% if display_uom is defined and display_uom == true %}" - 'SELECT {% if to_json_resample is defined and to_json_resample == true %}to_json(struct(p.`EventTime`, p.`TagName`, p.`Value`, m.`UoM`), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}p.`EventTime`, p.`TagName`, p.`Value`, m.`UoM`{% endif %} FROM project p ' - "LEFT OUTER JOIN " - "{% if metadata_source is defined and metadata_source is not none %}" - "`{{ metadata_source|lower }}` m ON p.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` " - "{% else %}" - "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_metadata` m ON p.`{{ tagname_column }}` = m.`{{ tagname_column }}` " - "{% endif %}" - "{% else %}" - 'SELECT {% if to_json_resample is defined and to_json_resample == true %}to_json(struct(*), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}*{% endif %} FROM project ' - "{% endif %}" - "{% endif %}" - "{% if is_resample is defined and is_resample == true and limit is defined and limit is not none %}" - "LIMIT {{ limit }} " - "{% endif %}" - "{% if is_resample is defined and is_resample == true and offset is defined and offset is not none %}" - "OFFSET {{ offset }} " - "{% endif %}" - ) - +def _sample_query_parameters(parameters_dict: dict) -> dict: sample_parameters = { "source": parameters_dict.get("source", None), "metadata_source": parameters_dict.get("metadata_source", None), @@ -208,6 +632,7 @@ def _sample_query(parameters_dict: dict) -> tuple: "time_interval_rate": parameters_dict["time_interval_rate"], "time_interval_unit": parameters_dict["time_interval_unit"], "agg_method": parameters_dict["agg_method"], + "fill": parameters_dict.get("fill", False), "time_zone": parameters_dict["time_zone"], "pivot": parameters_dict.get("pivot", None), "limit": parameters_dict.get("limit", None), @@ -233,81 +658,205 @@ def _sample_query(parameters_dict: dict) -> tuple: "case_insensitivity_tag_search", False ), "display_uom": parameters_dict.get("display_uom", False), + "sort": parameters_dict.get("sort", True), "metadata_tagname_column": parameters_dict.get( "metadata_tagname_column", "TagName" ), "metadata_uom_column": parameters_dict.get("metadata_uom_column", "UoM"), "to_json_resample": parameters_dict.get("to_json", False), } + return sample_parameters + + +def _sample_query(parameters_dict: dict) -> str: + + sample_parameters = _sample_query_parameters(parameters_dict) + + sql_query_list = [] + + raw_query = _build_raw_query( + sql_query_name="raw", + timestamp_column=sample_parameters["timestamp_column"], + tagname_column=sample_parameters["tagname_column"], + status_column=sample_parameters["status_column"], + value_column=sample_parameters["value_column"], + start_date=sample_parameters["start_date"], + end_date=sample_parameters["end_date"], + time_interval_rate=sample_parameters["time_interval_rate"], + time_interval_unit=sample_parameters["time_interval_unit"], + agg_method=sample_parameters["agg_method"], + time_zone=sample_parameters["time_zone"], + source=sample_parameters["source"], + business_unit=sample_parameters["business_unit"], + asset=sample_parameters["asset"], + data_security_level=sample_parameters["data_security_level"], + data_type=sample_parameters["data_type"], + tag_names=sample_parameters["tag_names"], + 
include_status=sample_parameters["include_status"], + case_insensitivity_tag_search=sample_parameters[ + "case_insensitivity_tag_search" + ], + sort=False, + ) - sql_template = Template(sample_query) - sql_query = sql_template.render(sample_parameters) - return sql_query, sample_query, sample_parameters + sql_query_list.append({"query_name": "raw", "sql_query": raw_query}) + + resample_query = _build_resample_query( + sql_query_list=sql_query_list, + sql_query_name="resample", + timestamp_column=sample_parameters["timestamp_column"], + tagname_column=sample_parameters["tagname_column"], + value_column=sample_parameters["value_column"], + tag_names=sample_parameters["tag_names"], + start_date=sample_parameters["start_date"], + end_date=sample_parameters["end_date"], + time_zone=sample_parameters["time_zone"], + time_interval_rate=sample_parameters["time_interval_rate"], + time_interval_unit=sample_parameters["time_interval_unit"], + agg_method=sample_parameters["agg_method"], + case_insensitivity_tag_search=sample_parameters[ + "case_insensitivity_tag_search" + ], + fill=sample_parameters["fill"], + sort=( + sample_parameters["sort"] if sample_parameters["pivot"] == False else False + ), + ) + sql_query_list.append({"query_name": "resample", "sql_query": resample_query}) + + if sample_parameters["pivot"] == True: + pivot_query = _build_pivot_query( + sql_query_list=sql_query_list, + sql_query_name="pivot", + tagname_column=sample_parameters["tagname_column"], + timestamp_column=sample_parameters["timestamp_column"], + value_column=sample_parameters["value_column"], + tag_names=sample_parameters["tag_names"], + is_case_insensitive_tag_search=sample_parameters[ + "case_insensitivity_tag_search" + ], + sort=sample_parameters["sort"], + ) -def _plot_query(parameters_dict: dict) -> tuple: - plot_query = ( - 'WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`{{ timestamp_column }}`), "{{ time_zone }}") AS `{{ timestamp_column }}`, `{{ tagname_column }}`, {% if include_status is defined and include_status == true %} `{{ status_column }}`, {% else %} \'Good\' AS `Status`, {% endif %} `{{ value_column }}` FROM ' - "{% if source is defined and source is not none %}" - "`{{ source|lower }}` " - "{% else %}" - "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_events_{{ data_type|lower }}` " - "{% endif %}" - "{% if case_insensitivity_tag_search is defined and case_insensitivity_tag_search == true %}" - "WHERE `{{ timestamp_column }}` BETWEEN to_timestamp(\"{{ start_date }}\") AND to_timestamp(\"{{ end_date }}\") AND UPPER(`{{ tagname_column }}`) IN ('{{ tag_names | join('\\', \\'') | upper }}') " - "{% else %}" - "WHERE `{{ timestamp_column }}` BETWEEN to_timestamp(\"{{ start_date }}\") AND to_timestamp(\"{{ end_date }}\") AND `{{ tagname_column }}` IN ('{{ tag_names | join('\\', \\'') }}') " - "{% endif %}" - "{% if include_status is defined and include_status == true and include_bad_data is defined and include_bad_data == false %} AND `{{ status_column }}` <> 'Bad' {% endif %}) " - ',date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("{{ start_date }}"), "{{ time_zone }}"), from_utc_timestamp(to_timestamp("{{ end_date }}"), "{{ time_zone }}"), INTERVAL \'{{ time_interval_rate + \' \' + time_interval_unit }}\')) AS timestamp_array) ' - ",window_buckets AS (SELECT timestamp_array AS window_start, timestampadd({{time_interval_unit }}, {{ time_interval_rate }}, timestamp_array) AS window_end FROM date_array) " - 
",plot AS (SELECT /*+ RANGE_JOIN(d, {{ range_join_seconds }} ) */ d.window_start, d.window_end, e.`{{ tagname_column }}`" - ", min(CASE WHEN `{{ status_column }}` = 'Bad' THEN null ELSE struct(e.`{{ value_column }}`, e.`{{ timestamp_column }}`) END) OVER (PARTITION BY e.`{{ tagname_column }}`, d.window_start ORDER BY e.`{{ timestamp_column }}` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `min_{{ value_column }}`" - ", max(CASE WHEN `{{ status_column }}` = 'Bad' THEN null ELSE struct(e.`{{ value_column }}`, e.`{{ timestamp_column }}`) END) OVER (PARTITION BY e.`{{ tagname_column }}`, d.window_start ORDER BY e.`{{ timestamp_column }}` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `max_{{ value_column }}`" - ", first(CASE WHEN `{{ status_column }}` = 'Bad' THEN null ELSE struct(e.`{{ value_column }}`, e.`{{ timestamp_column }}`) END, True) OVER (PARTITION BY e.`{{ tagname_column }}`, d.window_start ORDER BY e.`{{ timestamp_column }}` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `first_{{ value_column }}`" - ", last(CASE WHEN `{{ status_column }}` = 'Bad' THEN null ELSE struct(e.`{{ value_column }}`, e.`{{ timestamp_column }}`) END, True) OVER (PARTITION BY e.`{{ tagname_column }}`, d.window_start ORDER BY e.`{{ timestamp_column }}` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `last_{{ value_column }}`" - ", first(CASE WHEN `{{ status_column }}` = 'Bad' THEN struct(e.`{{ value_column }}`, e.`{{ timestamp_column }}`) ELSE null END, True) OVER (PARTITION BY e.`{{ tagname_column }}`, d.window_start ORDER BY e.`{{ timestamp_column }}` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `excp_{{ value_column }}` " - "FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`{{ timestamp_column }}` AND d.window_end > e.`{{ timestamp_column }}`) " - ",deduplicate AS (SELECT window_start AS `{{ timestamp_column }}`, `{{ tagname_column }}`, `min_{{ value_column }}` as `Min`, `max_{{ value_column }}` as `Max`, `first_{{ value_column }}` as `First`, `last_{{ value_column }}` as `Last`, `excp_{{ value_column }}` as `Exception` FROM plot GROUP BY window_start, `{{ tagname_column }}`, `min_{{ value_column }}`, `max_{{ value_column }}`, `first_{{ value_column }}`, `last_{{ value_column }}`, `excp_{{ value_column }}`) " - ",project AS (SELECT distinct Values.{{ timestamp_column }}, `{{ tagname_column }}`, Values.{{ value_column }} FROM (SELECT * FROM deduplicate UNPIVOT (`Values` for `Aggregation` IN (`Min`, `Max`, `First`, `Last`, `Exception`))) " - "{% if is_resample is defined and is_resample == true %}" - "ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` " - "{% endif %}" - ") " - "{% if is_resample is defined and is_resample == true and pivot is defined and pivot == true %}" - "{% if case_insensitivity_tag_search is defined and case_insensitivity_tag_search == true %}" - ",pivot AS (SELECT * FROM (SELECT `{{ timestamp_column }}`, `{{ value_column }}`, UPPER(`{{ tagname_column }}`) AS `{{ tagname_column }}` FROM project) PIVOT (FIRST(`{{ value_column }}`) FOR `{{ tagname_column }}` IN (" - "{% for i in range(tag_names | length) %}" - "'{{ tag_names[i] | upper }}' AS `{{ tag_names[i] }}`{% if not loop.last %}, {% endif %}" - "{% endfor %}" - "{% else %}" - ",pivot AS (SELECT * FROM (SELECT `{{ timestamp_column }}`, `{{ value_column }}`, `{{ tagname_column }}` AS `{{ tagname_column }}` FROM project) PIVOT (FIRST(`{{ value_column }}`) FOR `{{ tagname_column }}` IN (" - "{% for i in range(tag_names | length) %}" - 
"'{{ tag_names[i] }}' AS `{{ tag_names[i] }}`{% if not loop.last %}, {% endif %}" - "{% endfor %}" - "{% endif %}" - '))) SELECT {% if to_json is defined and to_json == true %}to_json(struct(*), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}*{% endif %} FROM pivot ORDER BY `{{ timestamp_column }}` ' - "{% else %}" - "{% if display_uom is defined and display_uom == true %}" - 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(p.`EventTime`, p.`TagName`, p.`Value`, m.`UoM`), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}p.`EventTime`, p.`TagName`, p.`Value`, m.`UoM`{% endif %} FROM project p ' - "LEFT OUTER JOIN " - "{% if metadata_source is defined and metadata_source is not none %}" - "`{{ metadata_source|lower }}` m ON p.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` " - "{% else %}" - "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_metadata` m ON p.`{{ tagname_column }}` = m.`{{ tagname_column }}` " - "{% endif %}" - "{% else %}" - 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(*), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}*{% endif %} FROM project ' - "{% endif %}" - "{% endif %}" - "{% if is_resample is defined and is_resample == true and limit is defined and limit is not none %}" - "LIMIT {{ limit }} " - "{% endif %}" - "{% if is_resample is defined and is_resample == true and offset is defined and offset is not none %}" - "OFFSET {{ offset }} " - "{% endif %}" + sql_query_list.append({"query_name": "pivot", "sql_query": pivot_query}) + + if sample_parameters["display_uom"] == True: + uom_query = _build_uom_query( + sql_query_list=sql_query_list, + sql_query_name="uom", + metadata_source=sample_parameters["metadata_source"], + business_unit=sample_parameters["business_unit"], + asset=sample_parameters["asset"], + data_security_level=sample_parameters["data_security_level"], + tagname_column=sample_parameters["tagname_column"], + metadata_tagname_column=sample_parameters["metadata_tagname_column"], + metadata_uom_column=sample_parameters["metadata_uom_column"], + ) + + sql_query_list.append({"query_name": "uom", "sql_query": uom_query}) + + output_query = _build_output_query( + sql_query_list=sql_query_list, + to_json=sample_parameters["to_json_resample"], + limit=sample_parameters["limit"], + offset=sample_parameters["offset"], + ) + + sql_query_list.append({"query_name": "output", "sql_query": output_query}) + + sql_query = _build_sql_cte_statement(sql_query_list) + + return sql_query + + +def _build_time_interval_array( + sql_query_name, + timestamp_column, + start_date, + end_date, + time_zone, + time_interval_rate, + time_interval_unit, +): + """Build time interval array for windowing operations.""" + time_interval_array_query = f"{sql_query_name} AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp('{start_date}'), '{time_zone}'), from_utc_timestamp(to_timestamp('{end_date}'), '{time_zone}'), INTERVAL '{time_interval_rate} {time_interval_unit}')) AS timestamp_array)" + return time_interval_array_query + + +def _build_window_buckets( + sql_query_list, + sql_query_name, + timestamp_column, + time_interval_rate, + time_interval_unit, +): + """Build window buckets for time-based aggregations.""" + parent_sql_query_name = sql_query_list[-1]["query_name"] + window_buckets_query = f"{sql_query_name} AS (SELECT timestamp_array AS window_start, 
timestampadd({time_interval_unit}, {time_interval_rate}, timestamp_array) AS window_end FROM {parent_sql_query_name})" + return window_buckets_query + + +def _build_plot_aggregations( + sql_query_list, + sql_query_name, + timestamp_column, + tagname_column, + value_column, + status_column, + range_join_seconds, +): + """Build plot aggregations with OHLC (open, high, low, close) calculations.""" + parent_sql_query_name = sql_query_list[-1]["query_name"] + raw_events_name = next( + ( + query["query_name"] + for query in sql_query_list + if query["query_name"] == "raw_events" + ), + "raw_events", ) + plot_aggregations_query = f"{sql_query_name} AS (SELECT /*+ RANGE_JOIN(d, {range_join_seconds}) */ d.window_start, d.window_end, e.`{tagname_column}`, min(CASE WHEN `{status_column}` = 'Bad' THEN null ELSE struct(e.`{value_column}`, e.`{timestamp_column}`) END) OVER (PARTITION BY e.`{tagname_column}`, d.window_start ORDER BY e.`{timestamp_column}` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `min_{value_column}`, max(CASE WHEN `{status_column}` = 'Bad' THEN null ELSE struct(e.`{value_column}`, e.`{timestamp_column}`) END) OVER (PARTITION BY e.`{tagname_column}`, d.window_start ORDER BY e.`{timestamp_column}` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `max_{value_column}`, first(CASE WHEN `{status_column}` = 'Bad' THEN null ELSE struct(e.`{value_column}`, e.`{timestamp_column}`) END, True) OVER (PARTITION BY e.`{tagname_column}`, d.window_start ORDER BY e.`{timestamp_column}` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `first_{value_column}`, last(CASE WHEN `{status_column}` = 'Bad' THEN null ELSE struct(e.`{value_column}`, e.`{timestamp_column}`) END, True) OVER (PARTITION BY e.`{tagname_column}`, d.window_start ORDER BY e.`{timestamp_column}` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `last_{value_column}`, first(CASE WHEN `{status_column}` = 'Bad' THEN struct(e.`{value_column}`, e.`{timestamp_column}`) ELSE null END, True) OVER (PARTITION BY e.`{tagname_column}`, d.window_start ORDER BY e.`{timestamp_column}` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `excp_{value_column}` FROM {parent_sql_query_name} d INNER JOIN {raw_events_name} e ON d.window_start <= e.`{timestamp_column}` AND d.window_end > e.`{timestamp_column}`)" + return plot_aggregations_query + + +def _build_plot_deduplication( + sql_query_list, + sql_query_name, + timestamp_column, + tagname_column, + value_column, +): + """Build deduplication step for plot aggregations.""" + parent_sql_query_name = sql_query_list[-1]["query_name"] + deduplication_query = f"{sql_query_name} AS (SELECT window_start AS `{timestamp_column}`, `{tagname_column}`, `min_{value_column}` as `Min`, `max_{value_column}` as `Max`, `first_{value_column}` as `First`, `last_{value_column}` as `Last`, `excp_{value_column}` as `Exception` FROM {parent_sql_query_name} GROUP BY window_start, `{tagname_column}`, `min_{value_column}`, `max_{value_column}`, `first_{value_column}`, `last_{value_column}`, `excp_{value_column}`)" + return deduplication_query + + +def _build_unpivot_projection( + sql_query_list, + sql_query_name, + timestamp_column, + tagname_column, + value_column, + sort=True, +): + """Build unpivot projection to transform aggregated values into rows.""" + parent_sql_query_name = sql_query_list[-1]["query_name"] + + unpivot_query = f"{sql_query_name} AS (SELECT distinct Values.{timestamp_column}, `{tagname_column}`, Values.{value_column} FROM (SELECT * FROM 
{parent_sql_query_name} UNPIVOT (`Values` for `Aggregation` IN (`Min`, `Max`, `First`, `Last`, `Exception`)))" + + if sort: + unpivot_query = " ".join( + [unpivot_query, f"ORDER BY `{tagname_column}`, `{timestamp_column}`"] + ) + + return unpivot_query + ")" + + +def _plot_query_parameters(parameters_dict: dict) -> dict: + """Extract and validate parameters for plot query.""" plot_parameters = { "source": parameters_dict.get("source", None), "metadata_source": parameters_dict.get("metadata_source", None), @@ -327,7 +876,6 @@ def _plot_query(parameters_dict: dict) -> tuple: "display_uom": parameters_dict.get("display_uom", False), "limit": parameters_dict.get("limit", None), "offset": parameters_dict.get("offset", None), - "is_resample": True, "tagname_column": parameters_dict.get("tagname_column", "TagName"), "timestamp_column": parameters_dict.get("timestamp_column", "EventTime"), "include_status": ( @@ -352,94 +900,292 @@ def _plot_query(parameters_dict: dict) -> tuple: ), "metadata_uom_column": parameters_dict.get("metadata_uom_column", "UoM"), "to_json": parameters_dict.get("to_json", False), + "sort": parameters_dict.get("sort", True), } + return plot_parameters + + +def _interpolation_query(parameters_dict: dict) -> str: + + parameters_dict["agg_method"] = None + + interpolate_parameters = _sample_query_parameters(parameters_dict) + + sql_query_list = [] + + raw_query = _build_raw_query( + sql_query_name="raw", + timestamp_column=interpolate_parameters["timestamp_column"], + tagname_column=interpolate_parameters["tagname_column"], + status_column=interpolate_parameters["status_column"], + value_column=interpolate_parameters["value_column"], + start_date=interpolate_parameters["start_date"], + end_date=interpolate_parameters["end_date"], + time_interval_rate=interpolate_parameters["time_interval_rate"], + time_interval_unit=interpolate_parameters["time_interval_unit"], + agg_method=None, + time_zone=interpolate_parameters["time_zone"], + source=interpolate_parameters["source"], + business_unit=interpolate_parameters["business_unit"], + asset=interpolate_parameters["asset"], + data_security_level=interpolate_parameters["data_security_level"], + data_type=interpolate_parameters["data_type"], + tag_names=interpolate_parameters["tag_names"], + include_status=interpolate_parameters["include_status"], + case_insensitivity_tag_search=interpolate_parameters[ + "case_insensitivity_tag_search" + ], + sort=False, + ) - sql_template = Template(plot_query) - sql_query = sql_template.render(plot_parameters) - return sql_query, plot_query, plot_parameters + sql_query_list.append({"query_name": "raw", "sql_query": raw_query}) + + # resample_query = _build_resample_query( + # sql_query_list=sql_query_list, + # sql_query_name="resample", + # timestamp_column=interpolate_parameters["timestamp_column"], + # tagname_column=interpolate_parameters["tagname_column"], + # value_column=interpolate_parameters["value_column"], + # tag_names=interpolate_parameters["tag_names"], + # start_date=interpolate_parameters["start_date"], + # end_date=interpolate_parameters["end_date"], + # time_zone=interpolate_parameters["time_zone"], + # time_interval_rate=interpolate_parameters["time_interval_rate"], + # time_interval_unit=interpolate_parameters["time_interval_unit"], + # agg_method=interpolate_parameters["agg_method"], + # case_insensitivity_tag_search=interpolate_parameters[ + # "case_insensitivity_tag_search" + # ], + # fill=True, + # sort=False, + # ) + + # sql_query_list.append({"query_name": "resample", 
"sql_query": resample_query}) + fill_intervals_query = _build_fill_intervals_query( + sql_query_list=sql_query_list, + sql_query_name="fill_intervals", + timestamp_column=interpolate_parameters["timestamp_column"], + tagname_column=interpolate_parameters["tagname_column"], + value_column=interpolate_parameters["value_column"], + tag_names=interpolate_parameters["tag_names"], + start_date=interpolate_parameters["start_date"], + end_date=interpolate_parameters["end_date"], + time_zone=interpolate_parameters["time_zone"], + time_interval_rate=interpolate_parameters["time_interval_rate"], + time_interval_unit=interpolate_parameters["time_interval_unit"], + case_insensitivity_tag_search=interpolate_parameters[ + "case_insensitivity_tag_search" + ], + ) + sql_query_list.append( + {"query_name": "fill_intervals", "sql_query": fill_intervals_query} + ) -def _interpolation_query( - parameters_dict: dict, sample_query: str, sample_parameters: dict -) -> str: - if parameters_dict["interpolation_method"] == "forward_fill": - interpolation_methods = "last_value/UNBOUNDED PRECEDING/CURRENT ROW" + interpolate_query = _build_interpolate_query( + sql_query_list=sql_query_list, + sql_query_name="interpolate", + timestamp_column=interpolate_parameters["timestamp_column"], + tagname_column=interpolate_parameters["tagname_column"], + value_column=interpolate_parameters["value_column"], + sort=( + interpolate_parameters["sort"] + if interpolate_parameters["pivot"] == False + else False + ), + ) - if parameters_dict["interpolation_method"] == "backward_fill": - interpolation_methods = "first_value/CURRENT ROW/UNBOUNDED FOLLOWING" + sql_query_list.append({"query_name": "interpolate", "sql_query": interpolate_query}) + + if interpolate_parameters["pivot"] == True: + pivot_query = _build_pivot_query( + sql_query_list=sql_query_list, + sql_query_name="pivot", + tagname_column=interpolate_parameters["tagname_column"], + timestamp_column=interpolate_parameters["timestamp_column"], + value_column=interpolate_parameters["value_column"], + tag_names=interpolate_parameters["tag_names"], + is_case_insensitive_tag_search=interpolate_parameters[ + "case_insensitivity_tag_search" + ], + sort=interpolate_parameters["sort"], + ) - if ( - parameters_dict["interpolation_method"] == "forward_fill" - or parameters_dict["interpolation_method"] == "backward_fill" - ): - interpolation_options = interpolation_methods.split("/") + sql_query_list.append({"query_name": "pivot", "sql_query": pivot_query}) + + if interpolate_parameters["display_uom"] == True: + uom_query = _build_uom_query( + sql_query_list=sql_query_list, + sql_query_name="uom", + metadata_source=interpolate_parameters["metadata_source"], + business_unit=interpolate_parameters["business_unit"], + asset=interpolate_parameters["asset"], + data_security_level=interpolate_parameters["data_security_level"], + tagname_column=interpolate_parameters["tagname_column"], + metadata_tagname_column=interpolate_parameters["metadata_tagname_column"], + metadata_uom_column=interpolate_parameters["metadata_uom_column"], + ) - interpolate_query = ( - f"WITH resample AS ({sample_query})" - "{% if case_insensitivity_tag_search is defined and case_insensitivity_tag_search == true %}" - ',date_array AS (SELECT DISTINCT explode(sequence(from_utc_timestamp(to_timestamp("{{ start_date }}"), "{{ time_zone }}"), from_utc_timestamp(to_timestamp("{{ end_date }}"), "{{ time_zone }}"), INTERVAL \'{{ time_interval_rate + \' \' + time_interval_unit }}\')) AS `{{ timestamp_column }}`, explode(array(`{{ 
tagname_column }}`)) AS `{{ tagname_column }}` FROM resample) ' - "{% else %}" - ",date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp(\"{{ start_date }}\"), \"{{ time_zone }}\"), from_utc_timestamp(to_timestamp(\"{{ end_date }}\"), \"{{ time_zone }}\"), INTERVAL '{{ time_interval_rate + ' ' + time_interval_unit }}')) AS `{{ timestamp_column }}`, explode(array('{{ tag_names | join('\\', \\'') }}')) AS `{{ tagname_column }}`) " - "{% endif %}" - '{% if (interpolation_method is defined) and (interpolation_method == "forward_fill" or interpolation_method == "backward_fill") %}' - ",project AS (SELECT a.`{{ timestamp_column }}`, a.`{{ tagname_column }}`, {{ interpolation_options_0 }}(b.`{{ value_column }}`, true) OVER (PARTITION BY a.`{{ tagname_column }}` ORDER BY a.`{{ timestamp_column }}` ROWS BETWEEN {{ interpolation_options_1 }} AND {{ interpolation_options_2 }}) AS `{{ value_column }}` FROM date_array a LEFT OUTER JOIN resample b ON a.`{{ timestamp_column }}` = b.`{{ timestamp_column }}` AND a.`{{ tagname_column }}` = b.`{{ tagname_column }}`) " - '{% elif (interpolation_method is defined) and (interpolation_method == "linear") %}' - ",linear_interpolation_calculations AS (SELECT coalesce(a.`{{ tagname_column }}`, b.`{{ tagname_column }}`) AS `{{ tagname_column }}`, coalesce(a.`{{ timestamp_column }}`, b.`{{ timestamp_column }}`) AS `{{ timestamp_column }}`, a.`{{ timestamp_column }}` AS `Requested_{{ timestamp_column }}`, b.`{{ timestamp_column }}` AS `Found_{{ timestamp_column }}`, b.`{{ value_column }}`, " - "last_value(b.`{{ timestamp_column }}`, true) OVER (PARTITION BY a.`{{ tagname_column }}` ORDER BY a.`{{ timestamp_column }}` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Last_{{ timestamp_column }}`, last_value(b.`{{ value_column }}`, true) OVER (PARTITION BY a.`{{ tagname_column }}` ORDER BY a.`{{ timestamp_column }}` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Last_{{ value_column }}`, " - "first_value(b.`{{ timestamp_column }}`, true) OVER (PARTITION BY a.`{{ tagname_column }}` ORDER BY a.`{{ timestamp_column }}` ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS `Next_{{ timestamp_column }}`, first_value(b.`{{ value_column }}`, true) OVER (PARTITION BY a.`{{ tagname_column }}` ORDER BY a.`{{ timestamp_column }}` ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS `Next_{{ value_column }}`, " - "CASE WHEN b.`{{ value_column }}` is NULL THEN `Last_{{ value_column }}` + (unix_timestamp(a.`{{ timestamp_column }}`) - unix_timestamp(`Last_{{ timestamp_column }}`)) * ((`Next_{{ value_column }}` - `Last_{{ value_column }}`)) / ((unix_timestamp(`Next_{{ timestamp_column }}`) - unix_timestamp(`Last_{{ timestamp_column }}`))) ELSE b.`{{ value_column }}` END AS `linear_interpolated_{{ value_column }}` FROM date_array a FULL OUTER JOIN resample b ON a.`{{ timestamp_column }}` = b.`{{ timestamp_column }}` AND a.`{{ tagname_column }}` = b.`{{ tagname_column }}`) " - ",project AS (SELECT `{{ timestamp_column }}`, `{{ tagname_column }}`, `linear_interpolated_{{ value_column }}` AS `{{ value_column }}` FROM linear_interpolation_calculations) " - "{% else %}" - ",project AS (SELECT * FROM resample) " - "{% endif %}" - "{% if pivot is defined and pivot == true %}" - "{% if case_insensitivity_tag_search is defined and case_insensitivity_tag_search == true %}" - ",pivot AS (SELECT * FROM (SELECT `{{ timestamp_column }}`, `{{ value_column }}`, UPPER(`{{ tagname_column }}`) AS `{{ tagname_column }}` FROM project) PIVOT (FIRST(`{{ value_column }}`) FOR `{{ 
tagname_column }}` IN (" - "{% for i in range(tag_names | length) %}" - "'{{ tag_names[i] | upper }}' AS `{{ tag_names[i] }}`{% if not loop.last %}, {% endif %}" - "{% endfor %}" - "{% else %}" - ",pivot AS (SELECT * FROM (SELECT `{{ timestamp_column }}`, `{{ value_column }}`, `{{ tagname_column }}` AS `{{ tagname_column }}` FROM project) PIVOT (FIRST(`{{ value_column }}`) FOR `{{ tagname_column }}` IN (" - "{% for i in range(tag_names | length) %}" - "'{{ tag_names[i] }}' AS `{{ tag_names[i] }}`{% if not loop.last %}, {% endif %}" - "{% endfor %}" - "{% endif %}" - '))) SELECT {% if to_json is defined and to_json == true %}to_json(struct(*), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}*{% endif %} FROM pivot ORDER BY `{{ timestamp_column }}` ' - "{% else %}" - "{% if display_uom is defined and display_uom == true %}" - 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(p.`EventTime`, p.`TagName`, p.`Value`, m.`UoM`), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}p.`EventTime`, p.`TagName`, p.`Value`, m.`UoM`{% endif %} FROM project p ' - "LEFT OUTER JOIN " - "{% if metadata_source is defined and metadata_source is not none %}" - "`{{ metadata_source|lower }}` m ON p.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` " - "{% else %}" - "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_metadata` m ON p.`{{ tagname_column }}` = m.`{{ tagname_column }}` ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` " - "{% endif %}" - "{% else%}" - 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(*), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}*{% endif %} FROM project ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` ' - "{% endif %}" - "{% endif %}" - "{% if limit is defined and limit is not none %}" - "LIMIT {{ limit }} " - "{% endif %}" - "{% if offset is defined and offset is not none %}" - "OFFSET {{ offset }} " - "{% endif %}" + sql_query_list.append({"query_name": "uom", "sql_query": uom_query}) + + output_query = _build_output_query( + sql_query_list=sql_query_list, + to_json=interpolate_parameters["to_json_resample"], + limit=interpolate_parameters["limit"], + offset=interpolate_parameters["offset"], + ) + + sql_query_list.append({"query_name": "output", "sql_query": output_query}) + + sql_query = _build_sql_cte_statement(sql_query_list) + + return sql_query + + +def _plot_query(parameters_dict: dict) -> str: + + plot_parameters = _plot_query_parameters(parameters_dict) + + sql_query_list = [] + + # Build raw events query + raw_query = _build_raw_query( + sql_query_name="raw_events", + timestamp_column=plot_parameters["timestamp_column"], + tagname_column=plot_parameters["tagname_column"], + status_column=plot_parameters["status_column"], + value_column=plot_parameters["value_column"], + start_date=plot_parameters["start_date"], + end_date=plot_parameters["end_date"], + time_zone=plot_parameters["time_zone"], + deduplicate=True, + source=plot_parameters["source"], + business_unit=plot_parameters["business_unit"], + asset=plot_parameters["asset"], + data_security_level=plot_parameters["data_security_level"], + data_type=plot_parameters["data_type"], + tag_names=plot_parameters["tag_names"], + include_status=plot_parameters["include_status"], + include_bad_data=plot_parameters["include_bad_data"], + 
case_insensitivity_tag_search=plot_parameters["case_insensitivity_tag_search"], + sort=False, + ) + + sql_query_list.append({"query_name": "raw_events", "sql_query": raw_query}) + + # Build time interval array + time_interval_query = _build_time_interval_array( + sql_query_name="date_array", + timestamp_column=plot_parameters["timestamp_column"], + start_date=plot_parameters["start_date"], + end_date=plot_parameters["end_date"], + time_zone=plot_parameters["time_zone"], + time_interval_rate=plot_parameters["time_interval_rate"], + time_interval_unit=plot_parameters["time_interval_unit"], + ) + + sql_query_list.append( + {"query_name": "date_array", "sql_query": time_interval_query} + ) + + # Build window buckets + window_buckets_query = _build_window_buckets( + sql_query_list=sql_query_list, + sql_query_name="window_buckets", + timestamp_column=plot_parameters["timestamp_column"], + time_interval_rate=plot_parameters["time_interval_rate"], + time_interval_unit=plot_parameters["time_interval_unit"], + ) + + sql_query_list.append( + {"query_name": "window_buckets", "sql_query": window_buckets_query} + ) + + # Build plot aggregations + plot_aggregations_query = _build_plot_aggregations( + sql_query_list=sql_query_list, + sql_query_name="plot", + timestamp_column=plot_parameters["timestamp_column"], + tagname_column=plot_parameters["tagname_column"], + value_column=plot_parameters["value_column"], + status_column=plot_parameters["status_column"], + range_join_seconds=plot_parameters["range_join_seconds"], + ) + + sql_query_list.append({"query_name": "plot", "sql_query": plot_aggregations_query}) + + # Build deduplication + deduplication_query = _build_plot_deduplication( + sql_query_list=sql_query_list, + sql_query_name="deduplicate", + timestamp_column=plot_parameters["timestamp_column"], + tagname_column=plot_parameters["tagname_column"], + value_column=plot_parameters["value_column"], + ) + + sql_query_list.append( + {"query_name": "deduplicate", "sql_query": deduplication_query} + ) + + # Build unpivot projection + unpivot_query = _build_unpivot_projection( + sql_query_list=sql_query_list, + sql_query_name="project", + timestamp_column=plot_parameters["timestamp_column"], + tagname_column=plot_parameters["tagname_column"], + value_column=plot_parameters["value_column"], + sort=(plot_parameters["sort"] if plot_parameters["pivot"] == False else False), + ) + + sql_query_list.append({"query_name": "project", "sql_query": unpivot_query}) + + # Add pivot if requested + if plot_parameters["pivot"] == True: + pivot_query = _build_pivot_query( + sql_query_list=sql_query_list, + sql_query_name="pivot", + tagname_column=plot_parameters["tagname_column"], + timestamp_column=plot_parameters["timestamp_column"], + value_column=plot_parameters["value_column"], + tag_names=plot_parameters["tag_names"], + is_case_insensitive_tag_search=plot_parameters[ + "case_insensitivity_tag_search" + ], + sort=True, + ) + + sql_query_list.append({"query_name": "pivot", "sql_query": pivot_query}) + + # Add UOM if requested + if plot_parameters["display_uom"] == True: + uom_query = _build_uom_query( + sql_query_list=sql_query_list, + sql_query_name="uom", + metadata_source=plot_parameters["metadata_source"], + business_unit=plot_parameters["business_unit"], + asset=plot_parameters["asset"], + data_security_level=plot_parameters["data_security_level"], + tagname_column=plot_parameters["tagname_column"], + metadata_tagname_column=plot_parameters["metadata_tagname_column"], + 
metadata_uom_column=plot_parameters["metadata_uom_column"], + ) + + sql_query_list.append({"query_name": "uom", "sql_query": uom_query}) + + # Build output query + output_query = _build_output_query( + sql_query_list=sql_query_list, + to_json=plot_parameters["to_json"], + limit=plot_parameters["limit"], + offset=plot_parameters["offset"], ) - interpolate_parameters = sample_parameters.copy() - interpolate_parameters["interpolation_method"] = parameters_dict[ - "interpolation_method" - ] - if ( - parameters_dict["interpolation_method"] == "forward_fill" - or parameters_dict["interpolation_method"] == "backward_fill" - ): - interpolate_parameters["interpolation_options_0"] = interpolation_options[0] - interpolate_parameters["interpolation_options_1"] = interpolation_options[1] - interpolate_parameters["interpolation_options_2"] = interpolation_options[2] + sql_query_list.append({"query_name": "output", "sql_query": output_query}) + + # Build final SQL + sql_query = _build_sql_cte_statement(sql_query_list) - sql_template = Template(interpolate_query) - return sql_template.render(interpolate_parameters) + return sql_query def _interpolation_at_time(parameters_dict: dict) -> str: @@ -507,7 +1253,7 @@ def _interpolation_at_time(parameters_dict: dict) -> str: 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(p.`EventTime`, p.`TagName`, p.`Value`, m.`UoM`), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}p.`EventTime`, p.`TagName`, p.`Value`, m.`UoM`{% endif %} FROM project p ' "LEFT OUTER JOIN " "{% if metadata_source is defined and metadata_source is not none %}" - "`{{ metadata_source|lower }}` m ON p.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` " + "{{ metadata_source|lower }} m ON p.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` " "{% else %}" "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_metadata` m ON p.`{{ tagname_column }}` = m.`{{ tagname_column }}` ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` " "{% endif %}" @@ -634,7 +1380,7 @@ def _latest_query(parameters_dict: dict) -> str: 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(l.*, m.`UoM), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}l.*, m.`UoM`{% endif %} FROM latest l ' "LEFT OUTER JOIN " "{% if metadata_source is defined and metadata_source is not none %}" - "`{{ metadata_source|lower }}` m ON l.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` " + "{{ metadata_source|lower }} m ON l.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` " "{% else %}" "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_metadata` m ON l.`{{ tagname_column }}` = m.`{{ tagname_column }}` " "{% endif %}" @@ -707,10 +1453,10 @@ def _time_weighted_average_query(parameters_dict: dict) -> str: ',fill_status AS (SELECT *, last_value(`{{ status_column }}`, true) OVER (PARTITION BY `{{ tagname_column }}` ORDER BY `{{ timestamp_column }}` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Fill_{{ status_column }}`, CASE WHEN `Fill_{{ status_column }}` <> "Bad" THEN `{{ value_column }}` ELSE null END AS `Good_{{ value_column }}` FROM window_events) ' ",fill_value AS (SELECT *, last_value(`Good_{{ value_column }}`, true) OVER (PARTITION BY `{{ tagname_column }}` ORDER BY `{{ timestamp_column }}` ROWS 
BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Fill_{{ value_column }}` FROM fill_status) " '{% if step is defined and step == "metadata" %} ' - ",fill_step AS (SELECT *, IFNULL(Step, false) AS Step FROM fill_value f " + ",fill_step AS (SELECT f.*, IFNULL(m.Step, false) AS Step FROM fill_value f " "LEFT JOIN " "{% if metadata_source is defined and metadata_source is not none %}" - "`{{ metadata_source|lower }}` m ON f.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}`) " + "{{ metadata_source|lower }} m ON f.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}`) " "{% else %}" "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_metadata` m ON f.`{{ tagname_column }}` = m.`{{ tagname_column }}`) " "{% endif %}" @@ -868,7 +1614,7 @@ def _circular_stats_query(parameters_dict: dict) -> str: 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(p.*, m.`UoM`), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}p.*, m.`UoM`{% endif %} FROM project p ' "LEFT OUTER JOIN " "{% if metadata_source is defined and metadata_source is not none %}" - "`{{ metadata_source|lower }}` m ON p.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` " + "{{ metadata_source|lower }} m ON p.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` " "{% else %}" "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_metadata` m ON p.`{{ tagname_column }}` = m.`{{ tagname_column }}` ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` " "{% endif %}" @@ -906,7 +1652,7 @@ def _circular_stats_query(parameters_dict: dict) -> str: 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(p.*, m.`UoM`), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}p.*, m.`UoM`{% endif %} FROM project p ' "LEFT OUTER JOIN " "{% if metadata_source is defined and metadata_source is not none %}" - "`{{ metadata_source|lower }}` m ON p.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` " + "{{ metadata_source|lower }} m ON p.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` " "{% else %}" "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_metadata` m ON p.`{{ tagname_column }}` = m.`{{ tagname_column }}` ORDER BY `{{ tagname_column }}`, `{{ timestamp_column }}` " "{% endif %}" @@ -974,47 +1720,7 @@ def _circular_stats_query(parameters_dict: dict) -> str: def _summary_query(parameters_dict: dict) -> str: - summary_query = ( - "WITH summary AS (SELECT `{{ tagname_column }}`, " - "count(`{{ value_column }}`) as Count, " - "CAST(Avg(`{{ value_column }}`) as decimal(10, 2)) as Avg, " - "CAST(Min(`{{ value_column }}`) as decimal(10, 2)) as Min, " - "CAST(Max(`{{ value_column }}`) as decimal(10, 2)) as Max, " - "CAST(stddev(`{{ value_column }}`) as decimal(10, 2)) as StDev, " - "CAST(sum(`{{ value_column }}`) as decimal(10, 2)) as Sum, " - "CAST(variance(`{{ value_column }}`) as decimal(10, 2)) as Var FROM " - "{% if source is defined and source is not none %}" - "`{{ source|lower }}` " - "{% else %}" - "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_events_{{ data_type|lower }}` " - "{% endif %}" - "{% if case_insensitivity_tag_search 
is defined and case_insensitivity_tag_search == true %}" - "WHERE `{{ timestamp_column }}` BETWEEN to_timestamp(\"{{ start_date }}\") AND to_timestamp(\"{{ end_date }}\") AND UPPER(`{{ tagname_column }}`) IN ('{{ tag_names | join('\\', \\'') | upper }}') " - "{% else %}" - "WHERE `{{ timestamp_column }}` BETWEEN to_timestamp(\"{{ start_date }}\") AND to_timestamp(\"{{ end_date }}\") AND `{{ tagname_column }}` IN ('{{ tag_names | join('\\', \\'') }}') " - "{% endif %}" - "{% if include_status is defined and include_status == true and include_bad_data is defined and include_bad_data == false %}" - "AND `{{ status_column }}` <> 'Bad'" - "{% endif %}" - "GROUP BY `{{ tagname_column }}`) " - "{% if display_uom is defined and display_uom == true %}" - 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(s.*, m.`UoM`), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}s.*, m.`UoM`{% endif %} FROM summary s ' - "LEFT OUTER JOIN " - "{% if metadata_source is defined and metadata_source is not none %}" - "`{{ metadata_source|lower }}` m ON s.`{{ tagname_column }}` = m.`{{ metadata_tagname_column }}` " - "{% else %}" - "`{{ business_unit|lower }}`.`sensors`.`{{ asset|lower }}_{{ data_security_level|lower }}_metadata` m ON s.`{{ tagname_column }}` = m.`{{ tagname_column }}` " - "{% endif %}" - "{% else%}" - 'SELECT {% if to_json is defined and to_json == true %}to_json(struct(*), map("timestampFormat", "yyyy-MM-dd\'T\'HH:mm:ss.SSSSSSSSSXXX")) as Value{% else %}*{% endif %} FROM summary ' - "{% endif %}" - "{% if limit is defined and limit is not none %}" - "LIMIT {{ limit }} " - "{% endif %}" - "{% if offset is defined and offset is not none %}" - "OFFSET {{ offset }} " - "{% endif %}" - ) + sql_query_list = [] summary_parameters = { "source": parameters_dict.get("source", None), @@ -1057,8 +1763,55 @@ def _summary_query(parameters_dict: dict) -> str: "to_json": parameters_dict.get("to_json", False), } - sql_template = Template(summary_query) - return sql_template.render(summary_parameters) + summary_query = _build_summary_query( + sql_query_name="summary", + timestamp_column=summary_parameters["timestamp_column"], + tagname_column=summary_parameters["tagname_column"], + status_column=summary_parameters["status_column"], + value_column=summary_parameters["value_column"], + start_date=summary_parameters["start_date"], + end_date=summary_parameters["end_date"], + source=summary_parameters["source"], + business_unit=summary_parameters["business_unit"], + asset=summary_parameters["asset"], + data_security_level=summary_parameters["data_security_level"], + data_type=summary_parameters["data_type"], + tag_names=summary_parameters["tag_names"], + include_status=summary_parameters["include_status"], + include_bad_data=summary_parameters["include_bad_data"], + case_insensitivity_tag_search=summary_parameters[ + "case_insensitivity_tag_search" + ], + ) + + sql_query_list.append({"query_name": "summary", "sql_query": summary_query}) + + if summary_parameters["display_uom"] == True: + uom_query = _build_uom_query( + sql_query_list=sql_query_list, + sql_query_name="uom", + metadata_source=summary_parameters["metadata_source"], + business_unit=summary_parameters["business_unit"], + asset=summary_parameters["asset"], + data_security_level=summary_parameters["data_security_level"], + tagname_column=summary_parameters["tagname_column"], + metadata_tagname_column=summary_parameters["metadata_tagname_column"], + 
metadata_uom_column=summary_parameters["metadata_uom_column"], + ) + sql_query_list.append({"query_name": "uom", "sql_query": uom_query}) + + # Add output query + output_query = _build_output_query( + sql_query_list=sql_query_list, + to_json=summary_parameters["to_json"], + limit=summary_parameters["limit"], + offset=summary_parameters["offset"], + ) + sql_query_list.append({"query_name": "output", "sql_query": output_query}) + # Build final SQL using CTE statement builder + sql_query = _build_sql_cte_statement(sql_query_list) + + return sql_query def _query_builder(parameters_dict: dict, query_type: str) -> str: @@ -1097,34 +1850,26 @@ def _query_builder(parameters_dict: dict, query_type: str) -> str: + " " + parameters_dict["time_interval_unit"][0] ) - sample_prepared_query, sample_query, sample_parameters = _sample_query( - parameters_dict - ) + sample_prepared_query = _sample_query(parameters_dict) return sample_prepared_query - if query_type == "plot": + if query_type == "interpolate": parameters_dict["range_join_seconds"] = _convert_to_seconds( parameters_dict["time_interval_rate"] + " " + parameters_dict["time_interval_unit"][0] ) - plot_prepared_query, _, _ = _plot_query(parameters_dict) - return plot_prepared_query + interpolate_prepared_query = _interpolation_query(parameters_dict) + return interpolate_prepared_query - if query_type == "interpolate": + if query_type == "plot": parameters_dict["range_join_seconds"] = _convert_to_seconds( parameters_dict["time_interval_rate"] + " " + parameters_dict["time_interval_unit"][0] ) - to_json_flag = parameters_dict.get("to_json", False) - parameters_dict["to_json"] = False - sample_prepared_query, sample_query, sample_parameters = _sample_query( - parameters_dict - ) - sample_parameters["is_resample"] = False - sample_parameters["to_json"] = to_json_flag - return _interpolation_query(parameters_dict, sample_query, sample_parameters) + plot_prepared_query = _plot_query(parameters_dict) + return plot_prepared_query if query_type == "time_weighted_average": parameters_dict["range_join_seconds"] = _convert_to_seconds( diff --git a/src/sdk/python/rtdip_sdk/queries/time_series/interpolate.py b/src/sdk/python/rtdip_sdk/queries/time_series/interpolate.py index 354b74a31..eecf42dfc 100644 --- a/src/sdk/python/rtdip_sdk/queries/time_series/interpolate.py +++ b/src/sdk/python/rtdip_sdk/queries/time_series/interpolate.py @@ -38,17 +38,14 @@ def get(connection: object, parameters_dict: dict) -> pd.DataFrame: tag_names (list): List of tagname or tagnames ["tag_1", "tag_2"] start_date (str): Start date (Either a date in the format YY-MM-DD or a datetime in the format YYY-MM-DDTHH:MM:SS or specify the timezone offset in the format YYYY-MM-DDTHH:MM:SS+zz:zz) end_date (str): End date (Either a date in the format YY-MM-DD or a datetime in the format YYY-MM-DDTHH:MM:SS or specify the timezone offset in the format YYYY-MM-DDTHH:MM:SS+zz:zz) - sample_rate (int): (deprecated) Please use time_interval_rate instead. See below. - sample_unit (str): (deprecated) Please use time_interval_unit instead. See below. 
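Each rewritten query function (_sample_query, _interpolation_query, _plot_query, _summary_query) now collects named sub-queries in sql_query_list and hands the list to _build_sql_cte_statement. That helper is not shown in this diff, so the following is only an assumed sketch of how the list is likely stitched into a single WITH ... SELECT statement; the example entries are invented.

def _build_sql_cte_statement_sketch(sql_query_list):
    # Assumption: every entry except the last is already a named sub-query of the
    # form "name AS (...)", and the final "output" entry is a bare SELECT.
    ctes = ", ".join(item["sql_query"] for item in sql_query_list[:-1])
    return f"WITH {ctes} {sql_query_list[-1]['sql_query']}"

example_list = [
    {"query_name": "summary", "sql_query": "summary AS (SELECT 1 AS `Value`)"},
    {"query_name": "output", "sql_query": "SELECT * FROM summary"},
]
print(_build_sql_cte_statement_sketch(example_list))
# WITH summary AS (SELECT 1 AS `Value`) SELECT * FROM summary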
time_interval_rate (str): The time interval rate (numeric input) time_interval_unit (str): The time interval unit (second, minute, day, hour) - agg_method (str): Aggregation Method (first, last, avg, min, max) - interpolation_method (str): Interpolation method (forward_fill, backward_fill, linear) include_bad_data (bool): Include "Bad" data points with True or remove "Bad" data points with False pivot (bool): Pivot the data on timestamp column with True or do not pivot the data with False display_uom (optional bool): Display the unit of measure with True or False. Does not apply to pivoted tables. Defaults to False limit (optional int): The number of rows to be returned offset (optional int): The number of rows to skip before returning rows + sort (optional bool): Sort the data in ascending order by the TagName and Timestamp columns or, if pivot is True, by the Timestamp column case_insensitivity_tag_search (optional bool): Search for tags using case insensitivity with True or case sensitivity with False Returns: @@ -68,18 +65,6 @@ def get(connection: object, parameters_dict: dict) -> pd.DataFrame: if parameters_dict["pivot"] is True and parameters_dict["display_uom"] is True: raise ValueError("pivot True and display_uom True cannot be used together") - if "sample_rate" in parameters_dict: - logging.warning( - "Parameter sample_rate is deprecated and will be removed in v1.0.0. Please use time_interval_rate instead." - ) - parameters_dict["time_interval_rate"] = parameters_dict["sample_rate"] - - if "sample_unit" in parameters_dict: - logging.warning( - "Parameter sample_unit is deprecated and will be removed in v1.0.0. Please use time_interval_unit instead." - ) - parameters_dict["time_interval_unit"] = parameters_dict["sample_unit"] - try: query = _query_builder(parameters_dict, "interpolate") diff --git a/src/sdk/python/rtdip_sdk/queries/time_series/plot.py b/src/sdk/python/rtdip_sdk/queries/time_series/plot.py index e3c8c5b89..13a63b6ca 100644 --- a/src/sdk/python/rtdip_sdk/queries/time_series/plot.py +++ b/src/sdk/python/rtdip_sdk/queries/time_series/plot.py @@ -40,8 +40,6 @@ def get(connection: object, parameters_dict: dict) -> pd.DataFrame: tag_names (list): List of tagname or tagnames ["tag_1", "tag_2"] start_date (str): Start date (Either a date in the format YY-MM-DD or a datetime in the format YYY-MM-DDTHH:MM:SS or specify the timezone offset in the format YYYY-MM-DDTHH:MM:SS+zz:zz) end_date (str): End date (Either a date in the format YY-MM-DD or a datetime in the format YYY-MM-DDTHH:MM:SS or specify the timezone offset in the format YYYY-MM-DDTHH:MM:SS+zz:zz) - sample_rate (int): (deprecated) Please use time_interval_rate instead. See below. - sample_unit (str): (deprecated) Please use time_interval_unit instead. See below. time_interval_rate (str): The time interval rate (numeric input) time_interval_unit (str): The time interval unit (second, minute, day, hour) include_bad_data (bool): Include "Bad" data points with True or remove "Bad" data points with False @@ -67,18 +65,6 @@ def get(connection: object, parameters_dict: dict) -> pd.DataFrame: if parameters_dict["pivot"] is True and parameters_dict["display_uom"] is True: raise ValueError("pivot True and display_uom True cannot be used together") - if "sample_rate" in parameters_dict: - logging.warning( - "Parameter sample_rate is deprecated and will be removed in v1.0.0. Please use time_interval_rate instead." 
- ) - parameters_dict["time_interval_rate"] = parameters_dict["sample_rate"] - - if "sample_unit" in parameters_dict: - logging.warning( - "Parameter sample_unit is deprecated and will be removed in v1.0.0. Please use time_interval_unit instead." - ) - parameters_dict["time_interval_unit"] = parameters_dict["sample_unit"] - try: query = _query_builder(parameters_dict, "plot") diff --git a/src/sdk/python/rtdip_sdk/queries/time_series/raw.py b/src/sdk/python/rtdip_sdk/queries/time_series/raw.py index d29715c9f..7498050b3 100644 --- a/src/sdk/python/rtdip_sdk/queries/time_series/raw.py +++ b/src/sdk/python/rtdip_sdk/queries/time_series/raw.py @@ -44,6 +44,7 @@ def get(connection: object, parameters_dict: dict) -> pd.DataFrame: limit (optional int): The number of rows to be returned offset (optional int): The number of rows to skip before returning rows display_uom (optional bool): Display the unit of measure with True or False. Defaults to False + sort (optional bool): Sort the data in ascending order by the TagName and Timestamp columns case_insensitivity_tag_search (optional bool): Search for tags using case insensitivity with True or case sensitivity with False Returns: diff --git a/src/sdk/python/rtdip_sdk/queries/time_series/resample.py b/src/sdk/python/rtdip_sdk/queries/time_series/resample.py index 1f8f40f13..9010e774f 100644 --- a/src/sdk/python/rtdip_sdk/queries/time_series/resample.py +++ b/src/sdk/python/rtdip_sdk/queries/time_series/resample.py @@ -40,16 +40,16 @@ def get(connection: object, parameters_dict: dict) -> pd.DataFrame: tag_names (list): List of tagname or tagnames ["tag_1", "tag_2"] start_date (str): Start date (Either a date in the format YY-MM-DD or a datetime in the format YYY-MM-DDTHH:MM:SS or specify the timezone offset in the format YYYY-MM-DDTHH:MM:SS+zz:zz) end_date (str): End date (Either a date in the format YY-MM-DD or a datetime in the format YYY-MM-DDTHH:MM:SS or specify the timezone offset in the format YYYY-MM-DDTHH:MM:SS+zz:zz) - sample_rate (int): (deprecated) Please use time_interval_rate instead. See below. - sample_unit (str): (deprecated) Please use time_interval_unit instead. See below. time_interval_rate (str): The time interval rate (numeric input) time_interval_unit (str): The time interval unit (second, minute, day, hour) agg_method (str): Aggregation Method (first, last, avg, min, max) include_bad_data (bool): Include "Bad" data points with True or remove "Bad" data points with False + fill (optional bool): Fill the data with intervals where no data exists. The Value column will be filled with Null pivot (optional bool): Pivot the data on timestamp column with True or do not pivot the data with False display_uom (optional bool): Display the unit of measure with True or False. Does not apply to pivoted tables. 
Defaults to False limit (optional int): The number of rows to be returned offset (optional int): The number of rows to skip before returning rows + sort (optional bool): Sort the data in ascending order by the TagName and Timestamp columns or, if pivot is True, by the Timestamp column case_insensitivity_tag_search (optional bool): Search for tags using case insensitivity with True or case sensitivity with False @@ -70,18 +70,6 @@ def get(connection: object, parameters_dict: dict) -> pd.DataFrame: if parameters_dict["pivot"] is True and parameters_dict["display_uom"] is True: raise ValueError("pivot True and display_uom True cannot be used together") - if "sample_rate" in parameters_dict: - logging.warning( - "Parameter sample_rate is deprecated and will be removed in v1.0.0. Please use time_interval_rate instead." - ) - parameters_dict["time_interval_rate"] = parameters_dict["sample_rate"] - - if "sample_unit" in parameters_dict: - logging.warning( - "Parameter sample_unit is deprecated and will be removed in v1.0.0. Please use time_interval_unit instead." - ) - parameters_dict["time_interval_unit"] = parameters_dict["sample_unit"] - try: query = _query_builder(parameters_dict, "resample") diff --git a/src/sdk/python/rtdip_sdk/queries/time_series/time_series_query_builder.py b/src/sdk/python/rtdip_sdk/queries/time_series/time_series_query_builder.py index 9109ddaab..c2c655006 100644 --- a/src/sdk/python/rtdip_sdk/queries/time_series/time_series_query_builder.py +++ b/src/sdk/python/rtdip_sdk/queries/time_series/time_series_query_builder.py @@ -162,7 +162,7 @@ def m_source( metadata_tagname_column (optional str): The column name in the source that contains the tagnames or series metadata_uom_column (optional str): The column name in the source that contains the unit of measure """ - self.metadata_source = "`.`".join(metadata_source.split(".")) + self.metadata_source = f"`{'`.`'.join(metadata_source.split('.'))}`" self.metadata_tagname_column = metadata_tagname_column self.metadata_uom_column = metadata_uom_column return self @@ -174,6 +174,7 @@ def raw( end_date: str, include_bad_data: bool = False, display_uom: bool = False, + sort: bool = True, limit: int = None, offset: int = None, ) -> DataFrame: @@ -211,6 +212,7 @@ def raw( end_date (str): End date (Either a date in the format YY-MM-DD or a datetime in the format YYY-MM-DDTHH:MM:SS or specify the timezone offset in the format YYYY-MM-DDTHH:MM:SS+zz:zz) include_bad_data (optional bool): Include "Bad" data points with True or remove "Bad" data points with False display_uom (optional bool): Display the unit of measure with True or False. Defaults to False. 
If True, metadata_source must be populated + sort (optional bool): Sort the data in ascending order by the TagName and Timestamp columns limit (optional int): The number of rows to be returned offset (optional int): The number of rows to skip before returning rows @@ -225,6 +227,7 @@ def raw( "end_date": end_date, "include_bad_data": include_bad_data, "display_uom": display_uom, + "sort": sort, "limit": limit, "offset": offset, "tagname_column": self.tagname_column, @@ -253,8 +256,10 @@ def resample( time_interval_unit: str, agg_method: str, include_bad_data: bool = False, + fill: bool = False, pivot: bool = False, display_uom: bool = False, + sort: bool = True, limit: int = None, offset: int = None, ) -> DataFrame: @@ -297,8 +302,10 @@ def resample( time_interval_unit (str): The time interval unit (second, minute, day, hour) agg_method (str): Aggregation Method (first, last, avg, min, max) include_bad_data (optional bool): Include "Bad" data points with True or remove "Bad" data points with False + fill (bool): Fill the data with intervals where no data exists. The Value column will be filled with Null pivot (optional bool): Pivot the data on the timestamp column with True or do not pivot the data with False display_uom (optional bool): Display the unit of measure with True or False. Defaults to False. If True, metadata_source must be populated + sort (optional bool): Sort the data in ascending order by the TagName and Timestamp columns or, if pivot is True, by the Timestamp column limit (optional int): The number of rows to be returned offset (optional int): The number of rows to skip before returning rows @@ -316,8 +323,10 @@ def resample( "time_interval_rate": time_interval_rate, "time_interval_unit": time_interval_unit, "agg_method": agg_method, + "fill": fill, "pivot": pivot, "display_uom": display_uom, + "sort": sort, "limit": limit, "offset": offset, "tagname_column": self.tagname_column, @@ -350,6 +359,7 @@ def plot( include_bad_data: bool = False, pivot: bool = False, display_uom: bool = False, + sort: bool = True, limit: int = None, offset: int = None, ) -> DataFrame: @@ -392,6 +402,7 @@ def plot( include_bad_data (optional bool): Include "Bad" data points with True or remove "Bad" data points with False pivot (optional bool): Pivot the data on the timestamp column with True or do not pivot the data with False display_uom (optional bool): Display the unit of measure with True or False. Defaults to False. 
If True, metadata_source must be populated + sort (optional bool): Sort the data in ascending order by the TagName and Timestamp columns limit (optional int): The number of rows to be returned offset (optional int): The number of rows to skip before returning rows @@ -410,6 +421,7 @@ def plot( "include_bad_data": include_bad_data, "pivot": pivot, "display_uom": display_uom, + "sort": sort, "limit": limit, "offset": offset, "tagname_column": self.tagname_column, @@ -436,11 +448,10 @@ def interpolate( end_date: str, time_interval_rate: str, time_interval_unit: str, - agg_method: str, - interpolation_method: str, include_bad_data: bool = False, pivot: bool = False, display_uom: bool = False, + sort: bool = True, limit: int = None, offset: int = None, ) -> DataFrame: @@ -467,8 +478,6 @@ def interpolate( end_date="2023-01-31", time_interval_rate="15", time_interval_unit="minute", - agg_method="first", - interpolation_method="forward_fill", ) ) @@ -482,11 +491,10 @@ def interpolate( end_date (str): End date (Either a date in the format YY-MM-DD or a datetime in the format YYY-MM-DDTHH:MM:SS or specify the timezone offset in the format YYYY-MM-DDTHH:MM:SS+zz:zz) time_interval_rate (str): The time interval rate (numeric input) time_interval_unit (str): The time interval unit (second, minute, day, hour) - agg_method (str): Aggregation Method (first, last, avg, min, max) - interpolation_method (str): Interpolation method (forward_fill, backward_fill, linear) include_bad_data (optional bool): Include "Bad" data points with True or remove "Bad" data points with False pivot (optional bool): Pivot the data on the timestamp column with True or do not pivot the data with False display_uom (optional bool): Display the unit of measure with True or False. Defaults to False. If True, metadata_source must be populated + sort (optional bool): Sort the data in ascending order by the TagName and Timestamp columns or, if pivot is True, by the Timestamp column limit (optional int): The number of rows to be returned offset (optional int): The number of rows to skip before returning rows @@ -502,10 +510,9 @@ def interpolate( "include_bad_data": include_bad_data, "time_interval_rate": time_interval_rate, "time_interval_unit": time_interval_unit, - "agg_method": agg_method, - "interpolation_method": interpolation_method, "pivot": pivot, "display_uom": display_uom, + "sort": sort, "limit": limit, "offset": offset, "tagname_column": self.tagname_column, diff --git a/src/sdk/python/rtdip_sdk/queries/time_series/time_weighted_average.py b/src/sdk/python/rtdip_sdk/queries/time_series/time_weighted_average.py index fb9d644cd..be1083e8d 100644 --- a/src/sdk/python/rtdip_sdk/queries/time_series/time_weighted_average.py +++ b/src/sdk/python/rtdip_sdk/queries/time_series/time_weighted_average.py @@ -37,7 +37,6 @@ def get(connection: object, parameters_dict: dict) -> pd.DataFrame: tag_names (list): List of tagname or tagnames start_date (str): Start date (Either a utc date in the format YYYY-MM-DD or a utc datetime in the format YYYY-MM-DDTHH:MM:SS or specify the timezone offset in the format YYYY-MM-DDTHH:MM:SS+zz:zz) end_date (str): End date (Either a utc date in the format YYYY-MM-DD or a utc datetime in the format YYYY-MM-DDTHH:MM:SS or specify the timezone offset in the format YYYY-MM-DDTHH:MM:SS+zz:zz) - window_size_mins (int): (deprecated) Window size in minutes. Please use time_interval_rate and time_interval_unit below instead. 
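The m_source() change above pre-quotes each dot-separated part of metadata_source, which is why the replacement templates in this diff reference {{ metadata_source|lower }} without surrounding backticks. A minimal standalone check of the new expression (the catalog, schema and table names are made up):

metadata_source = "my_catalog.sensors.asset_metadata"  # hypothetical three-part name
quoted = f"`{'`.`'.join(metadata_source.split('.'))}`"
print(quoted)  # `my_catalog`.`sensors`.`asset_metadata`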
time_interval_rate (str): The time interval rate (numeric input) time_interval_unit (str): The time interval unit (second, minute, day, hour) window_length (int): Add longer window time in days for the start or end of specified date to cater for edge cases. @@ -65,12 +64,8 @@ def get(connection: object, parameters_dict: dict) -> pd.DataFrame: if parameters_dict["pivot"] is True and parameters_dict["display_uom"] is True: raise ValueError("pivot True and display_uom True cannot be used together") - if "window_size_mins" in parameters_dict: - logging.warning( - "Parameter window_size_mins is deprecated and will be removed in v1.0.0. Please use time_interval_rate and time_interval_unit instead." - ) - parameters_dict["time_interval_rate"] = str(parameters_dict["window_size_mins"]) - parameters_dict["time_interval_unit"] = "minute" + if "step" not in parameters_dict: # default step to metadata if not provided + parameters_dict["step"] = "metadata" try: query = _query_builder(parameters_dict, "time_weighted_average") diff --git a/tests/api/v1/api_test_objects.py b/tests/api/v1/api_test_objects.py index ce143e954..2599a298c 100644 --- a/tests/api/v1/api_test_objects.py +++ b/tests/api/v1/api_test_objects.py @@ -118,9 +118,6 @@ INTERPOLATE_MOCKED_PARAMETER_DICT = RESAMPLE_MOCKED_PARAMETER_DICT.copy() INTERPOLATE_MOCKED_PARAMETER_ERROR_DICT = RESAMPLE_MOCKED_PARAMETER_ERROR_DICT.copy() -INTERPOLATE_MOCKED_PARAMETER_DICT["interpolation_method"] = "forward_fill" -INTERPOLATE_MOCKED_PARAMETER_ERROR_DICT["interpolation_method"] = "forward_fill" - INTERPOLATE_POST_MOCKED_PARAMETER_DICT = INTERPOLATE_MOCKED_PARAMETER_DICT.copy() INTERPOLATE_POST_MOCKED_PARAMETER_DICT.pop("tag_name") @@ -155,12 +152,10 @@ RAW_MOCKED_PARAMETER_ERROR_DICT.copy() ) -TIME_WEIGHTED_AVERAGE_MOCKED_PARAMETER_DICT["window_size_mins"] = "15" TIME_WEIGHTED_AVERAGE_MOCKED_PARAMETER_DICT["time_interval_rate"] = "15" TIME_WEIGHTED_AVERAGE_MOCKED_PARAMETER_DICT["time_interval_unit"] = "minute" TIME_WEIGHTED_AVERAGE_MOCKED_PARAMETER_DICT["window_length"] = 10 TIME_WEIGHTED_AVERAGE_MOCKED_PARAMETER_DICT["step"] = "metadata" -TIME_WEIGHTED_AVERAGE_MOCKED_PARAMETER_ERROR_DICT["window_size_mins"] = "15" TIME_WEIGHTED_AVERAGE_MOCKED_PARAMETER_ERROR_DICT["time_interval_rate"] = "15" TIME_WEIGHTED_AVERAGE_MOCKED_PARAMETER_ERROR_DICT["time_interval_unit"] = "minute" TIME_WEIGHTED_AVERAGE_MOCKED_PARAMETER_ERROR_DICT["window_length"] = 10 diff --git a/tests/api/v1/test_api_batch.py b/tests/api/v1/test_api_batch.py index 3b6076396..3349afc77 100644 --- a/tests/api/v1/test_api_batch.py +++ b/tests/api/v1/test_api_batch.py @@ -38,7 +38,7 @@ RawResponse, ) from pandas.io.json import build_table_schema -from httpx import AsyncClient +from httpx import AsyncClient, ASGITransport from src.api.v1 import app from src.api.v1.common import json_response_batch from src.sdk.python.rtdip_sdk.queries.time_series import batch @@ -87,7 +87,7 @@ async def test_api_batch_single_get_success(mocker: MockerFixture): mock_lookup = "src.api.v1.batch.lookup_before_get" mocked_lookup_before_get = mocker.patch(mock_lookup, return_value=None) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -175,7 +175,7 @@ async def test_api_batch_single_get_success_with_lookup(mocker: MockerFixture): mock_lookup = "src.api.v1.batch.lookup_before_get" mocked_lookup_before_get = mocker.patch(mock_lookup, return_value=test_data) - 
async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -255,7 +255,7 @@ async def test_api_batch_single_post_success(mocker: MockerFixture): # Make a surveillance batch method reference to check if called and what args with surveillance_batch = mocker.patch(mock_method, return_value=mock_method_return_data) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -300,7 +300,7 @@ async def test_api_batch_single_get_unsupported_route_error(mocker: MockerFixtur os.environ, {"DATABRICKS_SERVING_ENDPOINT": MOCK_MAPPING_ENDPOINT_URL} ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -343,7 +343,7 @@ async def test_api_batch_single_post_missing_body_error(mocker: MockerFixture): os.environ, {"DATABRICKS_SERVING_ENDPOINT": MOCK_MAPPING_ENDPOINT_URL} ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -406,7 +406,7 @@ async def test_api_batch_multiple_success(mocker: MockerFixture): # Make a surveillance batch method reference to check if called and what args with surveillance_batch = mocker.patch(mock_method, side_effect=mock_patch_side_effect) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -464,7 +464,7 @@ async def test_api_batch_one_success_one_fail(mocker: MockerFixture): os.environ, {"DATABRICKS_SERVING_ENDPOINT": MOCK_MAPPING_ENDPOINT_URL} ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -515,7 +515,7 @@ async def test_api_batch_one_success_one_fail(mocker: MockerFixture): os.environ, {"DATABRICKS_SERVING_ENDPOINT": MOCK_MAPPING_ENDPOINT_URL} ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, diff --git a/tests/api/v1/test_api_circular_average.py b/tests/api/v1/test_api_circular_average.py index 99100c52f..a31b727e9 100644 --- a/tests/api/v1/test_api_circular_average.py +++ b/tests/api/v1/test_api_circular_average.py @@ -30,7 +30,7 @@ MOCK_TAG_MAPPING_EMPTY, MOCK_MAPPING_ENDPOINT_URL, ) -from httpx import AsyncClient +from httpx import AsyncClient, ASGITransport from src.api.v1 import app MOCK_METHOD = "src.sdk.python.rtdip_sdk.queries.time_series.circular_average.get" @@ -42,7 +42,7 @@ async def test_api_circular_average_get_success(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -59,7 +59,7 @@ async def 
test_api_circular_average_get_validation_error( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -82,7 +82,7 @@ async def test_api_circular_average_get_error(mocker: MockerFixture, api_test_da Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -97,7 +97,7 @@ async def test_api_circular_average_get_error(mocker: MockerFixture, api_test_da async def test_api_circular_average_post_success(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -115,7 +115,7 @@ async def test_api_circular_average_post_validation_error( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -139,7 +139,7 @@ async def test_api_circular_average_post_error(mocker: MockerFixture, api_test_d Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -182,7 +182,7 @@ async def test_api_circular_average_get_lookup_success(mocker: MockerFixture): modified_param_dict = CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT.copy() del modified_param_dict["business_unit"] - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=modified_param_dict ) @@ -228,7 +228,7 @@ async def test_api_circular_average_post_lookup_success(mocker: MockerFixture): modified_param_dict = CIRCULAR_AVERAGE_MOCKED_PARAMETER_DICT.copy() del modified_param_dict["business_unit"] - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -278,7 +278,7 @@ async def test_api_circular_average_get_lookup_no_tag_map_error(mocker: MockerFi modified_param_dict["tagname"] = ["NonExistentTag"] del modified_param_dict["business_unit"] - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=modified_param_dict ) diff --git a/tests/api/v1/test_api_circular_standard_deviation.py b/tests/api/v1/test_api_circular_standard_deviation.py index 8db08ce19..1c5b92ca3 100644 --- a/tests/api/v1/test_api_circular_standard_deviation.py +++ b/tests/api/v1/test_api_circular_standard_deviation.py @@ -25,7 +25,7 @@ TEST_HEADERS, BASE_URL, ) -from httpx import AsyncClient +from httpx import AsyncClient, ASGITransport 
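The repeated test change swaps the app= shortcut on httpx.AsyncClient for an explicit ASGITransport, which is what newer httpx releases expect for in-process ASGI testing. A minimal self-contained version of the pattern, assuming httpx and fastapi are installed (the /ping route is invented for illustration):

import asyncio
from fastapi import FastAPI
from httpx import ASGITransport, AsyncClient

app = FastAPI()

@app.get("/ping")
async def ping():
    return {"ok": True}

async def main():
    # Route requests to the ASGI app directly instead of over the network.
    async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as ac:
        response = await ac.get("/ping")
        print(response.status_code, response.json())  # 200 {'ok': True}

asyncio.run(main())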
from src.api.v1 import app MOCK_METHOD = ( @@ -41,7 +41,7 @@ async def test_api_circular_standard_deviation_get_success( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -58,7 +58,7 @@ async def test_api_circular_standard_deviation_get_validation_error( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -83,7 +83,7 @@ async def test_api_circular_standard_deviation_get_error( Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -100,7 +100,7 @@ async def test_api_circular_standard_deviation_post_success( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -118,7 +118,7 @@ async def test_api_circular_standard_deviation_post_validation_error( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -144,7 +144,7 @@ async def test_api_circular_standard_deviation_post_error( Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, diff --git a/tests/api/v1/test_api_interpolate.py b/tests/api/v1/test_api_interpolate.py index 80f67ba6b..4b6e118e4 100644 --- a/tests/api/v1/test_api_interpolate.py +++ b/tests/api/v1/test_api_interpolate.py @@ -25,7 +25,7 @@ TEST_HEADERS, BASE_URL, ) -from httpx import AsyncClient +from httpx import AsyncClient, ASGITransport from src.api.v1 import app MOCK_METHOD = "src.sdk.python.rtdip_sdk.queries.time_series.interpolate.get" @@ -37,7 +37,7 @@ async def test_api_interpolate_get_success(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -54,7 +54,7 @@ async def test_api_interpolate_get_validation_error( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -77,7 +77,7 @@ async def test_api_interpolate_get_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as 
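The success/validation/error triplets in these API tests follow one pytest-mock pattern: the shared `mocker_setup` helper patches the SDK query function with canned data for the happy path, while the error tests patch it with `side_effect=Exception(...)` so the endpoint's 400 branch is exercised. A rough, self-contained illustration of that mechanism, with placeholder names rather than the project's actual handler or helper:

    # Illustrative only: return_value vs side_effect when patching a query function.
    import pandas as pd
    from pytest_mock import MockerFixture


    class FakeQuery:
        """Stand-in for the SDK query module that the real tests patch by import path."""

        @staticmethod
        def get(connection, parameters):
            raise RuntimeError("would hit a real database")


    def run_endpoint(query) -> dict:
        """Toy handler: success returns data, failure returns an error payload."""
        try:
            df = query.get(None, {})
            return {"status": 200, "rows": len(df)}
        except Exception as exc:  # mirrors the API's error branch
            return {"status": 400, "detail": str(exc)}


    def test_success_path(mocker: MockerFixture):
        mocker.patch.object(
            FakeQuery, "get", return_value=pd.DataFrame({"Value": [1.0, 2.0]})
        )
        assert run_endpoint(FakeQuery) == {"status": 200, "rows": 2}


    def test_error_path(mocker: MockerFixture):
        mocker.patch.object(
            FakeQuery, "get", side_effect=Exception("Error Connecting to Database")
        )
        assert run_endpoint(FakeQuery)["status"] == 400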
ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -92,7 +92,7 @@ async def test_api_interpolate_get_error(mocker: MockerFixture, api_test_data): async def test_api_interpolate_post_success(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -110,7 +110,7 @@ async def test_api_interpolate_post_validation_error( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -134,7 +134,7 @@ async def test_api_interpolate_post_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, diff --git a/tests/api/v1/test_api_interpolation_at_time.py b/tests/api/v1/test_api_interpolation_at_time.py index 23218d5f4..879973225 100644 --- a/tests/api/v1/test_api_interpolation_at_time.py +++ b/tests/api/v1/test_api_interpolation_at_time.py @@ -24,7 +24,7 @@ TEST_HEADERS, BASE_URL, ) -from httpx import AsyncClient +from httpx import AsyncClient, ASGITransport from src.api.v1 import app MOCK_METHOD = "src.sdk.python.rtdip_sdk.queries.time_series.interpolation_at_time.get" @@ -38,7 +38,7 @@ async def test_api_interpolation_at_time_get_success( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -57,7 +57,7 @@ async def test_api_interpolation_at_time_get_success( # ) # mocker = mocker_setup(mocker, MOCK_METHOD, test_data) -# async with AsyncClient(app=app, base_url=BASE_URL) as ac: +# async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: # response = await ac.get( # MOCK_API_NAME, # headers=TEST_HEADERS, @@ -82,7 +82,7 @@ async def test_api_interpolation_at_time_get_error( Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -99,7 +99,7 @@ async def test_api_interpolation_at_time_post_success( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -119,7 +119,7 @@ async def test_api_interpolation_at_time_post_success( # ) # mocker = mocker_setup(mocker, MOCK_METHOD, test_data) -# async with AsyncClient(app=app, base_url=BASE_URL) as ac: +# async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: # response = await ac.post( # MOCK_API_NAME, # headers=TEST_HEADERS, @@ 
-145,7 +145,7 @@ async def test_api_interpolation_at_time_post_error( Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, diff --git a/tests/api/v1/test_api_latest.py b/tests/api/v1/test_api_latest.py index 1bc3b29a3..7c2e941db 100644 --- a/tests/api/v1/test_api_latest.py +++ b/tests/api/v1/test_api_latest.py @@ -29,7 +29,7 @@ MOCK_TAG_MAPPING_EMPTY, MOCK_MAPPING_ENDPOINT_URL, ) -from httpx import AsyncClient +from httpx import AsyncClient, ASGITransport from src.api.v1 import app MOCK_METHOD = "src.sdk.python.rtdip_sdk.queries.time_series.latest.get" @@ -43,7 +43,7 @@ async def test_api_latest_get_tags_provided_success( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_latest"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=METADATA_MOCKED_PARAMETER_DICT ) @@ -58,7 +58,7 @@ async def test_api_latest_get_no_good_values_tags_provided_success( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_latest"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=METADATA_MOCKED_PARAMETER_DICT ) @@ -75,7 +75,7 @@ async def test_api_latest_get_no_tags_provided_success( METADATA_MOCKED_PARAMETER_NO_TAG_DICT = METADATA_MOCKED_PARAMETER_DICT.copy() METADATA_MOCKED_PARAMETER_NO_TAG_DICT.pop("tag_name") - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -90,7 +90,7 @@ async def test_api_latest_get_no_tags_provided_success( async def test_api_latest_get_validation_error(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_latest"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -113,7 +113,7 @@ async def test_api_latest_get_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=METADATA_MOCKED_PARAMETER_DICT ) @@ -128,7 +128,7 @@ async def test_api_latest_post_tags_provided_success( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_latest"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -148,7 +148,7 @@ async def test_api_latest_post_no_tags_provided_error( METADATA_MOCKED_PARAMETER_NO_TAG_DICT = METADATA_MOCKED_PARAMETER_DICT.copy() METADATA_MOCKED_PARAMETER_NO_TAG_DICT.pop("tag_name") - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await 
ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -166,7 +166,7 @@ async def test_api_latest_post_no_tags_provided_error( async def test_api_latest_post_validation_error(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_latest"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -190,7 +190,7 @@ async def test_api_raw_post_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -238,7 +238,7 @@ async def test_api_latest_get_lookup_success(mocker: MockerFixture): modified_param_dict = METADATA_MOCKED_PARAMETER_DICT.copy() del modified_param_dict["business_unit"] - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=modified_param_dict ) @@ -288,7 +288,7 @@ async def test_api_latest_post_lookup_success(mocker: MockerFixture): modified_param_dict = METADATA_MOCKED_PARAMETER_DICT.copy() del modified_param_dict["business_unit"] - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -342,7 +342,7 @@ async def test_api_latest_get_lookup_no_tag_map_error(mocker: MockerFixture): modified_param_dict["tagname"] = ["NonExistentTag"] del modified_param_dict["business_unit"] - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=modified_param_dict ) diff --git a/tests/api/v1/test_api_metadata.py b/tests/api/v1/test_api_metadata.py index 966014ecb..585b41267 100644 --- a/tests/api/v1/test_api_metadata.py +++ b/tests/api/v1/test_api_metadata.py @@ -28,7 +28,7 @@ MOCK_TAG_MAPPING_EMPTY, MOCK_MAPPING_ENDPOINT_URL, ) -from httpx import AsyncClient +from httpx import AsyncClient, ASGITransport from src.api.v1 import app MOCK_METHOD = "src.sdk.python.rtdip_sdk.queries.metadata.get" @@ -45,7 +45,7 @@ async def test_api_metadata_get_tags_provided_success( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_metadata"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=METADATA_MOCKED_PARAMETER_DICT ) @@ -62,7 +62,7 @@ async def test_api_metadata_get_no_tags_provided_success( METADATA_MOCKED_PARAMETER_NO_TAG_DICT = METADATA_MOCKED_PARAMETER_DICT.copy() METADATA_MOCKED_PARAMETER_NO_TAG_DICT.pop("tag_name") - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -77,7 +77,7 @@ async def test_api_metadata_get_no_tags_provided_success( async def test_api_metadata_get_validation_error(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, 
api_test_data["mock_data_metadata"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -100,7 +100,7 @@ async def test_api_metadata_get_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=METADATA_MOCKED_PARAMETER_DICT ) @@ -115,7 +115,7 @@ async def test_api_metadata_post_tags_provided_success( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_metadata"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -135,7 +135,7 @@ async def test_api_metadata_post_no_tags_provided_error( METADATA_MOCKED_PARAMETER_NO_TAG_DICT = METADATA_MOCKED_PARAMETER_DICT.copy() METADATA_MOCKED_PARAMETER_NO_TAG_DICT.pop("tag_name") - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -153,7 +153,7 @@ async def test_api_metadata_post_no_tags_provided_error( async def test_api_metadata_post_validation_error(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_metadata"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -177,7 +177,7 @@ async def test_api_metadata_post_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -212,7 +212,7 @@ async def test_api_metadata_get_lookup_success(mocker: MockerFixture): modified_param_dict = METADATA_MOCKED_PARAMETER_DICT.copy() del modified_param_dict["business_unit"] - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=modified_param_dict ) @@ -249,7 +249,7 @@ async def test_api_metadata_post_lookup_success(mocker: MockerFixture): modified_param_dict = METADATA_MOCKED_PARAMETER_DICT.copy() del modified_param_dict["business_unit"] - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -290,7 +290,7 @@ async def test_api_metadata_get_lookup_no_tag_map_error(mocker: MockerFixture): modified_param_dict["tagname"] = ["NonExistentTag"] del modified_param_dict["business_unit"] - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=modified_param_dict ) diff --git a/tests/api/v1/test_api_plot.py b/tests/api/v1/test_api_plot.py index 
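The `*_lookup_*` tests above drop `business_unit` from the request and instead point the API at a tag-mapping endpoint through the `DATABRICKS_SERVING_ENDPOINT` environment variable, patched with `mocker.patch.dict`. A small sketch of that env-patching technique, with a placeholder reader function and URL (only the variable name is taken from the diff):

    # Illustrative only: patch.dict restores os.environ when the test finishes.
    import os

    from pytest_mock import MockerFixture


    def resolve_mapping_endpoint() -> str:
        """Toy stand-in for code that reads the serving endpoint from the environment."""
        return os.environ.get("DATABRICKS_SERVING_ENDPOINT", "")


    def test_lookup_uses_mapping_endpoint(mocker: MockerFixture):
        mocker.patch.dict(
            os.environ, {"DATABRICKS_SERVING_ENDPOINT": "https://example.test/mapping"}
        )
        assert resolve_mapping_endpoint() == "https://example.test/mapping"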
52560e293..df01aee95 100644 --- a/tests/api/v1/test_api_plot.py +++ b/tests/api/v1/test_api_plot.py @@ -25,7 +25,7 @@ TEST_HEADERS, BASE_URL, ) -from httpx import AsyncClient +from httpx import AsyncClient, ASGITransport from src.api.v1 import app MOCK_METHOD = "src.sdk.python.rtdip_sdk.queries.time_series.plot.get" @@ -37,7 +37,7 @@ async def test_api_plot_get_success(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_plot"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=PLOT_MOCKED_PARAMETER_DICT ) @@ -50,7 +50,7 @@ async def test_api_plot_get_success(mocker: MockerFixture, api_test_data): async def test_api_plot_get_validation_error(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_plot"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -73,7 +73,7 @@ async def test_api_pot_get_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=PLOT_MOCKED_PARAMETER_DICT ) @@ -86,7 +86,7 @@ async def test_api_pot_get_error(mocker: MockerFixture, api_test_data): async def test_api_plot_post_success(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_plot"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -102,7 +102,7 @@ async def test_api_plot_post_success(mocker: MockerFixture, api_test_data): async def test_api_plot_post_validation_error(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_plot"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -126,7 +126,7 @@ async def test_api_plot_post_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, diff --git a/tests/api/v1/test_api_raw.py b/tests/api/v1/test_api_raw.py index afde6d60b..93d681b66 100644 --- a/tests/api/v1/test_api_raw.py +++ b/tests/api/v1/test_api_raw.py @@ -29,7 +29,7 @@ ) from pandas.io.json import build_table_schema import pandas as pd -from httpx import AsyncClient +from httpx import AsyncClient, ASGITransport from src.api.v1 import app MOCK_METHOD = "src.sdk.python.rtdip_sdk.queries.time_series.raw.get" @@ -41,7 +41,7 @@ async def test_api_raw_get_success(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_raw"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), 
base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=RAW_MOCKED_PARAMETER_DICT ) @@ -54,7 +54,7 @@ async def test_api_raw_get_success(mocker: MockerFixture, api_test_data): async def test_api_raw_get_validation_error(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_raw"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=RAW_MOCKED_PARAMETER_ERROR_DICT ) @@ -75,7 +75,7 @@ async def test_api_raw_get_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=RAW_MOCKED_PARAMETER_DICT ) @@ -88,7 +88,7 @@ async def test_api_raw_get_error(mocker: MockerFixture, api_test_data): async def test_api_raw_post_success(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_raw"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -104,7 +104,7 @@ async def test_api_raw_post_success(mocker: MockerFixture, api_test_data): async def test_api_raw_post_validation_error(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_raw"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -128,7 +128,7 @@ async def test_api_raw_post_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -172,7 +172,7 @@ async def test_api_raw_get_lookup_success(mocker: MockerFixture, api_test_data): modified_param_dict = RAW_MOCKED_PARAMETER_DICT.copy() del modified_param_dict["business_unit"] - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=modified_param_dict ) @@ -217,7 +217,7 @@ async def test_api_raw_post_lookup_success(mocker: MockerFixture): modified_param_dict = RAW_POST_MOCKED_PARAMETER_DICT.copy() del modified_param_dict["business_unit"] - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -267,7 +267,7 @@ async def test_api_raw_get_lookup_no_tag_map_error(mocker: MockerFixture): modified_param_dict["tagname"] = ["NonExistentTag"] del modified_param_dict["business_unit"] - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: actual = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=modified_param_dict ) diff --git a/tests/api/v1/test_api_resample.py 
b/tests/api/v1/test_api_resample.py index 2baa5a6fc..6ec815ca1 100644 --- a/tests/api/v1/test_api_resample.py +++ b/tests/api/v1/test_api_resample.py @@ -25,7 +25,7 @@ TEST_HEADERS, BASE_URL, ) -from httpx import AsyncClient +from httpx import AsyncClient, ASGITransport from src.api.v1 import app MOCK_METHOD = "src.sdk.python.rtdip_sdk.queries.time_series.resample.get" @@ -37,7 +37,7 @@ async def test_api_resample_get_success(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=RESAMPLE_MOCKED_PARAMETER_DICT ) @@ -50,7 +50,7 @@ async def test_api_resample_get_success(mocker: MockerFixture, api_test_data): async def test_api_resample_get_validation_error(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -73,7 +73,7 @@ async def test_api_resample_get_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=RESAMPLE_MOCKED_PARAMETER_DICT ) @@ -86,7 +86,7 @@ async def test_api_resample_get_error(mocker: MockerFixture, api_test_data): async def test_api_resample_post_success(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -102,7 +102,7 @@ async def test_api_resample_post_success(mocker: MockerFixture, api_test_data): async def test_api_resample_post_validation_error(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -126,7 +126,7 @@ async def test_api_resample_post_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, diff --git a/tests/api/v1/test_api_sql.py b/tests/api/v1/test_api_sql.py index 1ea88482e..064124d31 100644 --- a/tests/api/v1/test_api_sql.py +++ b/tests/api/v1/test_api_sql.py @@ -25,7 +25,7 @@ BASE_URL, ) from pandas.io.json import build_table_schema -from httpx import AsyncClient +from httpx import AsyncClient, ASGITransport from src.api.v1 import app MOCK_METHOD = "src.sdk.python.rtdip_sdk.queries.sql.sql_query.SQLQueryBuilder.get" @@ -37,7 +37,7 @@ async def test_api_sql_post_success(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_raw"]) - async with 
AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -53,7 +53,7 @@ async def test_api_sql_post_success(mocker: MockerFixture, api_test_data): async def test_api_sql_post_validation_error(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_raw"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -77,7 +77,7 @@ async def test_api_sql_post_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, diff --git a/tests/api/v1/test_api_summary.py b/tests/api/v1/test_api_summary.py index 88c2f1cbe..42319aafd 100644 --- a/tests/api/v1/test_api_summary.py +++ b/tests/api/v1/test_api_summary.py @@ -24,7 +24,7 @@ TEST_HEADERS, BASE_URL, ) -from httpx import AsyncClient +from httpx import AsyncClient, ASGITransport from src.api.v1 import app import json @@ -37,7 +37,7 @@ async def test_api_summary_get_success(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_summary"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=SUMMARY_MOCKED_PARAMETER_DICT ) @@ -50,7 +50,7 @@ async def test_api_summary_get_success(mocker: MockerFixture, api_test_data): async def test_api_summary_get_validation_error(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_summary"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -73,7 +73,7 @@ async def test_api_summary_get_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, params=SUMMARY_MOCKED_PARAMETER_DICT ) @@ -86,7 +86,7 @@ async def test_api_summary_get_error(mocker: MockerFixture, api_test_data): async def test_api_summary_post_success(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_summary"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -102,7 +102,7 @@ async def test_api_summary_post_success(mocker: MockerFixture, api_test_data): async def test_api_summary_post_validation_error(mocker: MockerFixture, api_test_data): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_summary"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, 
headers=TEST_HEADERS, @@ -126,7 +126,7 @@ async def test_api_summary_post_error(mocker: MockerFixture, api_test_data): Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, diff --git a/tests/api/v1/test_api_time_weighted_average.py b/tests/api/v1/test_api_time_weighted_average.py index 99eb19c8d..5ba0591b3 100644 --- a/tests/api/v1/test_api_time_weighted_average.py +++ b/tests/api/v1/test_api_time_weighted_average.py @@ -25,7 +25,7 @@ TEST_HEADERS, BASE_URL, ) -from httpx import AsyncClient +from httpx import AsyncClient, ASGITransport from src.api.v1 import app MOCK_METHOD = "src.sdk.python.rtdip_sdk.queries.time_series.time_weighted_average.get" @@ -39,7 +39,7 @@ async def test_api_time_weighted_average_get_success( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -56,7 +56,7 @@ async def test_api_time_weighted_average_get_validation_error( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -81,7 +81,7 @@ async def test_api_time_weighted_average_get_error( Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get( MOCK_API_NAME, headers=TEST_HEADERS, @@ -98,7 +98,7 @@ async def test_api_time_weighted_average_post_success( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -116,7 +116,7 @@ async def test_api_time_weighted_average_post_validation_error( ): mocker = mocker_setup(mocker, MOCK_METHOD, api_test_data["mock_data_agg"]) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, @@ -142,7 +142,7 @@ async def test_api_time_weighted_average_post_error( Exception("Error Connecting to Database"), ) - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.post( MOCK_API_NAME, headers=TEST_HEADERS, diff --git a/tests/api/v1/test_api_utilities.py b/tests/api/v1/test_api_utilities.py index fb3042a6b..718a419dd 100644 --- a/tests/api/v1/test_api_utilities.py +++ b/tests/api/v1/test_api_utilities.py @@ -14,7 +14,7 @@ import pytest from pytest_mock import MockerFixture -from httpx import AsyncClient +from httpx import AsyncClient, ASGITransport from tests.api.v1.api_test_objects import BASE_URL from src.api.v1 import app @@ -22,7 +22,7 @@ async def test_api_home(mocker: MockerFixture): - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with 
AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get("/") assert response.status_code == 307 @@ -30,14 +30,14 @@ async def test_api_home(mocker: MockerFixture): async def test_api_docs(mocker: MockerFixture): - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get("/docs") assert response.status_code == 200 async def test_api_redoc(mocker: MockerFixture): - async with AsyncClient(app=app, base_url=BASE_URL) as ac: + async with AsyncClient(transport=ASGITransport(app=app), base_url=BASE_URL) as ac: response = await ac.get("/redoc") assert response.status_code == 200 diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/__init__.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py rename to tests/sdk/python/rtdip_sdk/pipelines/__init__.py diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py new file mode 100644 index 000000000..1832b01ae --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py new file mode 100644 index 000000000..1832b01ae --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py new file mode 100644 index 000000000..1832b01ae --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_dimensionality_reduction.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_dimensionality_reduction.py new file mode 100644 index 000000000..0e19edd6e --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_dimensionality_reduction.py @@ -0,0 +1,119 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from pyspark.sql import SparkSession + +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.dimensionality_reduction import ( + DimensionalityReduction, +) + + +@pytest.fixture(scope="session") +def spark_session(): + return SparkSession.builder.master("local[2]").appName("test").getOrCreate() + + +@pytest.fixture +def test_data(spark_session): + normal_distribution = [ + 0.30832997, + 0.22166579, + -1.68713693, + 1.41243689, + 1.25282623, + -0.70494665, + 0.52186887, + -0.34352648, + -1.38233527, + -0.76870644, + 1.72735928, + -0.14838714, + -0.76086769, + 1.81330706, + -1.84541331, + -1.05816002, + 0.86864253, + -2.47756826, + 0.19112086, + -0.72390124, + ] + + noise = [ + 2.39757601, + 0.40913959, + 0.40281196, + 0.43624341, + 0.57281305, + 0.15978893, + 0.09098515, + 0.18199072, + 2.9758837, + 1.38059478, + 1.55032586, + 0.88507288, + 2.13327, + 2.21896827, + 0.61288938, + 0.17535961, + 1.83386377, + 1.08476656, + 1.86311249, + 0.44964528, + ] + + data_with_noise = [ + (normal_distribution[i], normal_distribution[i] + noise[i]) + for i in range(len(normal_distribution)) + ] + + identical_data = [ + (normal_distribution[i], normal_distribution[i]) + for i in range(len(normal_distribution)) + ] + + return [ + spark_session.createDataFrame(data_with_noise, ["Value1", "Value2"]), + spark_session.createDataFrame(identical_data, ["Value1", "Value2"]), + ] + + +def test_with_correlated_data(spark_session, test_data): + identical_data = test_data[1] + + dimensionality_reduction = DimensionalityReduction( + identical_data, ["Value1", "Value2"] + ) + result_df = dimensionality_reduction.filter_data() + + assert ( + result_df.count() == identical_data.count() + ), "Row count does not match expected result" + assert "Value1" in result_df.columns, "Value1 should be in the DataFrame" + assert "Value2" not in result_df.columns, "Value2 should have been removed" + + +def test_with_uncorrelated_data(spark_session, test_data): + uncorrelated_data = test_data[0] + + dimensionality_reduction = DimensionalityReduction( + uncorrelated_data, ["Value1", "Value2"] + ) + result_df 
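The fixture above pairs a normal-distribution sample with a noisy copy (weakly correlated) and with an identical copy (perfectly correlated); the tests then expect `DimensionalityReduction.filter_data()` to drop `Value2` only in the correlated case. As a rough illustration of that idea (assumed technique and threshold, not the component's actual code), a correlation check in PySpark could look like:

    # Illustrative sketch: drop the second of two columns when they are (almost)
    # perfectly correlated. The 0.95 threshold is an assumption for the example.
    from pyspark.sql import DataFrame, SparkSession


    def drop_if_correlated(
        df: DataFrame, col_a: str, col_b: str, threshold: float = 0.95
    ) -> DataFrame:
        correlation = df.stat.corr(col_a, col_b)  # Pearson correlation
        if correlation is not None and abs(correlation) >= threshold:
            return df.drop(col_b)
        return df


    if __name__ == "__main__":
        spark = SparkSession.builder.master("local[2]").appName("corr-demo").getOrCreate()
        data = [(float(i), float(i) * 2.0) for i in range(10)]  # perfectly correlated pair
        df = spark.createDataFrame(data, ["Value1", "Value2"])
        print(drop_if_correlated(df, "Value1", "Value2").columns)  # ['Value1']
        spark.stop()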
= dimensionality_reduction.filter_data() + + assert ( + result_df.count() == uncorrelated_data.count() + ), "Row count does not match expected result" + assert "Value1" in result_df.columns, "Value1 should be in the DataFrame" + assert "Value2" in result_df.columns, "Value2 should be in the DataFrame" diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_duplicate_detection.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_duplicate_detection.py new file mode 100644 index 000000000..270f2c36e --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_duplicate_detection.py @@ -0,0 +1,163 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import os +from pyspark.sql import SparkSession +from pyspark.sql.dataframe import DataFrame +from pyspark.sql.types import ( + StructType, + StructField, + StringType, + TimestampType, + FloatType, +) + +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.duplicate_detection import ( + DuplicateDetection, +) + + +@pytest.fixture(scope="session") +def spark_session(): + return SparkSession.builder.master("local[2]").appName("test").getOrCreate() + + +@pytest.fixture +def test_data(spark_session): + data = [ + ("key1", "time1", "value1"), + ("key2", "time2", "value2"), + ("key2", "time3", "value2"), + ("key1", "time1", "value3"), + ("key4", "time4", "value4"), + ("key5", "time4", "value5"), + ] + columns = ["TagName", "EventTime", "Value"] + return spark_session.createDataFrame(data, columns) + + +def test_duplicate_detection_two_columns(spark_session, test_data): + expected_data = [ + ("key1", "time1", "value1"), + ("key2", "time2", "value2"), + ("key2", "time3", "value2"), + ("key4", "time4", "value4"), + ("key5", "time4", "value5"), + ] + columns = ["TagName", "EventTime", "Value"] + expected_df = spark_session.createDataFrame(expected_data, columns) + + duplicate_detection = DuplicateDetection( + test_data, primary_key_columns=["TagName", "EventTime"] + ) + result_df = duplicate_detection.filter_data() + result_df.show() + + assert ( + result_df.count() == expected_df.count() + ), "Row count does not match expected result" + assert sorted(result_df.collect()) == sorted( + expected_df.collect() + ), "Data does not match expected result" + + +def test_duplicate_detection_one_column(spark_session, test_data): + expected_data = [ + ("key1", "time1", "value1"), + ("key2", "time2", "value2"), + ("key4", "time4", "value4"), + ("key5", "time4", "value5"), + ] + columns = ["TagName", "EventTime", "Value"] + expected_df = spark_session.createDataFrame(expected_data, columns) + + duplicate_detection = DuplicateDetection(test_data, primary_key_columns=["TagName"]) + result_df = duplicate_detection.filter_data() + result_df.show() + + assert ( + result_df.count() == expected_df.count() + ), "Row count does not match expected result" + assert sorted(result_df.collect()) == sorted( + 
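The `DuplicateDetection` tests above expect one surviving row per combination of the configured primary-key columns. The underlying operation is essentially Spark's `dropDuplicates` over those columns; note that which row survives is an assumption here, since plain `dropDuplicates` keeps an arbitrary row per key unless an ordering is imposed first:

    # Illustrative sketch of de-duplication on a set of key columns.
    from pyspark.sql import SparkSession

    if __name__ == "__main__":
        spark = SparkSession.builder.master("local[2]").appName("dedup-demo").getOrCreate()
        df = spark.createDataFrame(
            [
                ("key1", "time1", "value1"),
                ("key1", "time1", "value3"),  # duplicate on (TagName, EventTime)
                ("key2", "time2", "value2"),
            ],
            ["TagName", "EventTime", "Value"],
        )
        deduped = df.dropDuplicates(["TagName", "EventTime"])
        deduped.show()  # two rows remain: one per (TagName, EventTime) pair
        spark.stop()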
expected_df.collect() + ), "Data does not match expected result" + + +def test_duplicate_detection_large_data_set(spark_session: SparkSession): + test_path = os.path.dirname(__file__) + data_path = os.path.join(test_path, "../../test_data.csv") + + actual_df = spark_session.read.option("header", "true").csv(data_path) + + expected_schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] + ) + + duplicate_detection_component = DuplicateDetection( + actual_df, primary_key_columns=["TagName", "EventTime"] + ) + result_df = DataFrame + + try: + if duplicate_detection_component.validate(expected_schema): + result_df = duplicate_detection_component.filter_data() + except Exception as e: + print(repr(e)) + + assert isinstance(actual_df, DataFrame) + + assert result_df.schema == expected_schema + assert result_df.count() < actual_df.count() + assert result_df.count() == (actual_df.count() - 4) + + +def test_duplicate_detection_wrong_datatype(spark_session: SparkSession): + + expected_schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] + ) + + test_df = spark_session.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "1.0"), + ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "2.0"), + ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "3.0"), + ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "4.0"), + ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "5.0"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + duplicate_detection_component = DuplicateDetection( + test_df, primary_key_columns=["TagName", "EventTime"] + ) + + with pytest.raises(ValueError) as exc_info: + duplicate_detection_component.validate(expected_schema) + + assert ( + "Error during casting column 'EventTime' to TimestampType(): Column 'EventTime' cannot be cast to TimestampType()." + in str(exc_info.value) + ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_flatline_filter.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_flatline_filter.py new file mode 100644 index 000000000..6e5086b9a --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_flatline_filter.py @@ -0,0 +1,131 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import pytest +import os +from pyspark.sql import SparkSession +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.flatline_filter import ( + FlatlineFilter, +) + + +@pytest.fixture(scope="session") +def spark(): + spark = ( + SparkSession.builder.master("local[2]") + .appName("FlatlineDetectionTest") + .getOrCreate() + ) + yield spark + spark.stop() + + +def test_flatline_filter_no_flatlining(spark): + df = spark.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "0.150000006"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + detector = FlatlineFilter(df, watch_columns=["Value"], tolerance_timespan=2) + result = detector.filter_data() + + assert sorted(result.collect()) == sorted(df.collect()) + + +def test_flatline_detection_with_flatlining(spark): + df = spark.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "0.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "Null"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + detector = FlatlineFilter(df, watch_columns=["Value"], tolerance_timespan=2) + result = detector.filter_data() + + rows_to_remove = [ + { + "TagName": "A2PS64V0J.:ZUX09R", + "EventTime": "2024-01-02 07:53:11.000", + "Status": "Good", + "Value": "0.0", + }, + { + "TagName": "A2PS64V0J.:ZUX09R", + "EventTime": "2024-01-02 07:53:11.000", + "Status": "Good", + "Value": "0.0", + }, + { + "TagName": "A2PS64V0J.:ZUX09R", + "EventTime": "2024-01-02 11:56:42.000", + "Status": "Good", + "Value": "0.0", + }, + { + "TagName": "A2PS64V0J.:ZUX09R", + "EventTime": "2024-01-02 16:00:12.000", + "Status": "Good", + "Value": "None", + }, + ] + rows_to_remove_df = spark.createDataFrame(rows_to_remove) + expected_df = df.subtract(rows_to_remove_df) + assert sorted(result.collect()) == sorted(expected_df.collect()) + + +def test_large_dataset(spark): + base_path = os.path.dirname(__file__) + file_path = os.path.join(base_path, "../../test_data.csv") + df = spark.read.option("header", "true").csv(file_path) + + assert df.count() > 0, "Dataframe was not loaded correctly" + + detector = FlatlineFilter(df, watch_columns=["Value"], tolerance_timespan=2) + result = detector.filter_data() + + rows_to_remove = [ + { + "TagName": "FLATLINE_TEST", + "EventTime": "2024-01-02 02:35:10.511000", + "Status": "Good", + "Value": "0.0", + }, + { + "TagName": "FLATLINE_TEST", + "EventTime": "2024-01-02 02:49:10.408000", + "Status": "Good", + "Value": "0.0", + }, + { + "TagName": "FLATLINE_TEST", + "EventTime": "2024-01-02 14:57:10.372000", + "Status": "Good", + "Value": "0.0", + }, + ] + rows_to_remove_df = spark.createDataFrame(rows_to_remove) + + expected_df = df.subtract(rows_to_remove_df) + + assert sorted(result.collect()) == sorted(expected_df.collect()) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_gaussian_smoothing.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_gaussian_smoothing.py new file 
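In the `FlatlineFilter` tests above, runs of zero or null values longer than `tolerance_timespan` consecutive records are removed while isolated values survive. One conventional way to express that in Spark is a gaps-and-islands grouping per tag; the sketch below is an assumed illustration of the technique, not the component's implementation:

    # Illustrative sketch: drop "flatlined" runs (consecutive zero/null values
    # longer than a tolerance) per tag, using a gaps-and-islands grouping.
    from pyspark.sql import SparkSession, Window
    from pyspark.sql import functions as F

    if __name__ == "__main__":
        spark = SparkSession.builder.master("local[2]").appName("flatline-demo").getOrCreate()
        df = spark.createDataFrame(
            [
                ("TAG", "2024-01-02 03:49:45", "0.13"),
                ("TAG", "2024-01-02 07:53:11", "0.0"),
                ("TAG", "2024-01-02 11:56:42", "0.0"),
                ("TAG", "2024-01-02 16:00:12", "0.0"),
                ("TAG", "2024-01-02 20:03:46", "0.34"),
            ],
            ["TagName", "EventTime", "Value"],
        )

        tolerance = 2  # longest run of flat records that is still acceptable
        value = F.col("Value").cast("double")
        flagged = df.withColumn("is_flat", (value.isNull() | (value == 0.0)).cast("int"))

        w = Window.partitionBy("TagName").orderBy("EventTime")
        # Rows in the same flat run share a group id: row_number minus the running
        # count of flat rows stays constant inside a run and changes outside it.
        flagged = flagged.withColumn("grp", F.row_number().over(w) - F.sum("is_flat").over(w))
        run_sizes = flagged.groupBy("TagName", "grp").agg(F.sum("is_flat").alias("run_len"))
        result = (
            flagged.join(run_sizes, ["TagName", "grp"])
            .where(~((F.col("is_flat") == 1) & (F.col("run_len") > tolerance)))
            .select("TagName", "EventTime", "Value")
        )
        result.show()  # the three consecutive 0.0 rows are removed
        spark.stop()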
mode 100644 index 000000000..1c8131903 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_gaussian_smoothing.py @@ -0,0 +1,142 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import pytest +from pyspark.sql import SparkSession + +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.gaussian_smoothing import ( + GaussianSmoothing, +) + + +@pytest.fixture(scope="session") +def spark_session(): + spark = ( + SparkSession.builder.master("local[2]") + .appName("GaussianSmoothingTest") + .getOrCreate() + ) + yield spark + spark.stop() + + +def test_gaussian_smoothing_temporal(spark_session: SparkSession): + df = spark_session.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "0.150000006"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + smoother = GaussianSmoothing( + df=df, + sigma=2.0, + id_col="TagName", + mode="temporal", + timestamp_col="EventTime", + value_col="Value", + ) + result_df = smoother.filter_data() + + original_values = df.select("Value").collect() + smoothed_values = result_df.select("Value").collect() + + assert ( + original_values != smoothed_values + ), "Values should be smoothed and not identical" + + assert result_df.count() == df.count(), "Result should have same number of rows" + + +def test_gaussian_smoothing_spatial(spark_session: SparkSession): + df = spark_session.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "0.150000006"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + # Apply smoothing + smoother = GaussianSmoothing( + df=df, + sigma=3.0, + id_col="TagName", + mode="spatial", + timestamp_col="EventTime", + value_col="Value", + ) + result_df = smoother.filter_data() + + original_values = df.select("Value").collect() + smoothed_values = result_df.select("Value").collect() + + assert ( + original_values != smoothed_values + ), "Values should be smoothed and not identical" + assert result_df.count() == df.count(), "Result should have same number of rows" + + +def test_interval_detection_large_data_set(spark_session: SparkSession): + # Should not timeout + base_path = os.path.dirname(__file__) + file_path = os.path.join(base_path, "../../test_data.csv") + + df = spark_session.read.option("header", "true").csv(file_path) + + smoother = GaussianSmoothing( + df=df, + sigma=1, + 
id_col="TagName", + mode="temporal", + timestamp_col="EventTime", + value_col="Value", + ) + + actual_df = smoother.filter_data() + assert ( + actual_df.count() == df.count() + ), "Output should have same number of rows as input" + + +def test_gaussian_smoothing_invalid_mode(spark_session: SparkSession): + # Create test data + df = spark_session.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "0.150000006"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + # Attempt to initialize with an invalid mode + with pytest.raises(ValueError, match="mode must be either 'temporal' or 'spatial'"): + GaussianSmoothing( + df=df, + sigma=2.0, + id_col="TagName", + mode="invalid_mode", # Invalid mode + timestamp_col="EventTime", + value_col="Value", + ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_interval_filtering.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_interval_filtering.py new file mode 100644 index 000000000..a8fa04f32 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_interval_filtering.py @@ -0,0 +1,377 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +from datetime import datetime + +import pytest + + +from pyspark.sql import SparkSession +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.interval_filtering import ( + IntervalFiltering, +) +from tests.sdk.python.rtdip_sdk.pipelines.logging.test_log_collection import spark + + +@pytest.fixture(scope="session") +def spark_session(): + spark = ( + SparkSession.builder.master("local[2]") + .appName("CheckValueRangesTest") + .getOrCreate() + ) + yield spark + spark.stop() + + +def convert_to_datetime(date_time: str): + return datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S.%f") + + +def test_interval_detection_easy(spark_session: SparkSession): + expected_df = spark_session.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "0.150000006"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + df = spark_session.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "0.150000006"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + interval_filtering_wrangler = IntervalFiltering( + spark_session, df, 1, "seconds", "EventTime" + ) + actual_df = interval_filtering_wrangler.filter_data() + + assert expected_df.columns == actual_df.columns + assert expected_df.schema == actual_df.schema + assert expected_df.collect() == actual_df.collect() + + +def test_interval_detection_easy_unordered(spark_session: SparkSession): + expected_df = spark_session.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "0.150000006"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + df = spark_session.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "0.150000006"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + interval_filtering_wrangler = IntervalFiltering( + spark_session, df, 1, "seconds", "EventTime" + ) + actual_df = interval_filtering_wrangler.filter_data() + + assert expected_df.columns == actual_df.columns + assert expected_df.schema == actual_df.schema + assert expected_df.collect() == actual_df.collect() + + +def test_interval_detection_milliseconds(spark_session: SparkSession): + expected_df = spark_session.createDataFrame( + [ + ("A2PS64V0JR", "2024-01-02 20:03:46.000"), + ("A2PS64asd.:ZUX09R", "2024-01-02 20:03:46.020"), + 
("A2PS64asd.:ZUX09R", "2024-01-02 20:03:46.030"), + ], + ["TagName", "Time"], + ) + + df = spark_session.createDataFrame( + [ + ("A2PS64V0JR", "2024-01-02 20:03:46.000"), + ("A2PS64asd.:ZUX09R", "2024-01-02 20:03:46.020"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.025"), + ("A2PS64asd.:ZUX09R", "2024-01-02 20:03:46.030"), + ("A2PS64V0J.:ZUasdX09R", "2024-01-02 20:03:46.035"), + ], + ["TagName", "Time"], + ) + + interval_filtering_wrangler = IntervalFiltering( + spark_session, df, 10, "milliseconds", "Time" + ) + actual_df = interval_filtering_wrangler.filter_data() + + assert expected_df.columns == actual_df.columns + assert expected_df.schema == actual_df.schema + assert expected_df.collect() == actual_df.collect() + + +def test_interval_detection_minutes(spark_session: SparkSession): + expected_df = spark_session.createDataFrame( + [ + ("A2PS64V0JR", "2024-01-02 20:03:46.000"), + ("A2PS64asd.:ZUX09R", "2024-01-02 20:06:46.000"), + ("A2PS64asd.:ZUX09R", "2024-01-02 20:12:46.030"), + ], + ["TagName", "Time"], + ) + + df = spark_session.createDataFrame( + [ + ("A2PS64V0JR", "2024-01-02 20:03:46.000"), + ("A2PS64asd.:ZUX09R", "2024-01-02 20:06:46.000"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:09:45.999"), + ("A2PS64asd.:ZUX09R", "2024-01-02 20:12:46.030"), + ("A2PS64V0J.:ZUasdX09R", "2024-01-02 20:03:46.035"), + ], + ["TagName", "Time"], + ) + + interval_filtering_wrangler = IntervalFiltering( + spark_session, df, 3, "minutes", "Time" + ) + actual_df = interval_filtering_wrangler.filter_data() + + assert expected_df.columns == actual_df.columns + assert expected_df.schema == actual_df.schema + assert expected_df.collect() == actual_df.collect() + + +def test_interval_detection_hours(spark_session: SparkSession): + expected_df = spark_session.createDataFrame( + [ + ("A2PS64V0JR", "2024-01-02 20:03:46.000"), + ("A2PS64asd.:ZUX09R", "2024-01-02 21:06:46.000"), + ("A2PS64V0J.:ZUasdX09R", "2024-01-02 23:03:46.035"), + ], + ["TagName", "EventTime"], + ) + + df = spark_session.createDataFrame( + [ + ("A2PS64V0JR", "2024-01-02 20:03:46.000"), + ("A2PS64asd.:ZUX09R", "2024-01-02 21:06:46.000"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 21:09:45.999"), + ("A2PS64asd.:ZUX09R", "2024-01-02 21:12:46.030"), + ("A2PS64V0J.:ZUasdX09R", "2024-01-02 23:03:46.035"), + ], + ["TagName", "EventTime"], + ) + + interval_filtering_wrangler = IntervalFiltering(spark_session, df, 1, "hours") + actual_df = interval_filtering_wrangler.filter_data() + + assert expected_df.columns == actual_df.columns + assert expected_df.schema == actual_df.schema + assert expected_df.collect() == actual_df.collect() + + +def test_interval_detection_days(spark_session: SparkSession): + expected_df = spark_session.createDataFrame( + [ + ("A2PS64V0JR", "2024-01-02 20:03:46.000"), + ("A2PS64asd.:ZUX09R", "2024-01-03 21:03:46.000"), + ("A2PS64asd.:ZUX09R", "2024-01-04 21:12:46.030"), + ("A2PS64V0J.:ZUasdX09R", "2028-01-01 23:03:46.035"), + ], + ["TagName", "EventTime"], + ) + + df = spark_session.createDataFrame( + [ + ("A2PS64V0JR", "2024-01-02 20:03:46.000"), + ("A2PS64asd.:ZUX09R", "2024-01-03 21:03:46.000"), + ("A2PS64V0J.:ZUX09R", "2024-01-04 21:03:45.999"), + ("A2PS64asd.:ZUX09R", "2024-01-04 21:12:46.030"), + ("A2PS64V0J.:ZUasdX09R", "2028-01-01 23:03:46.035"), + ], + ["TagName", "EventTime"], + ) + + interval_filtering_wrangler = IntervalFiltering(spark_session, df, 1, "days") + actual_df = interval_filtering_wrangler.filter_data() + + assert expected_df.columns == actual_df.columns + assert expected_df.schema == actual_df.schema + 
assert expected_df.collect() == actual_df.collect() + + +def test_interval_detection_wrong_time_stamp_column_name(spark_session: SparkSession): + df = spark_session.createDataFrame( + [ + ("A2PS64V0JR", "2024-01-02 20:03:46.000"), + ("A2PS64asd.:ZUX09R", "2024-01-02 21:06:46.000"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 21:09:45.999"), + ("A2PS64asd.:ZUX09R", "2024-01-02 21:12:46.030"), + ("A2PS64V0J.:ZUasdX09R", "2024-01-02 23:03:46.035"), + ], + ["TagName", "EventTime"], + ) + + interval_filtering_wrangler = IntervalFiltering( + spark_session, df, 1, "hours", "Time" + ) + + with pytest.raises(ValueError): + interval_filtering_wrangler.filter_data() + + +def test_interval_detection_wrong_interval_unit_pass(spark_session: SparkSession): + df = spark_session.createDataFrame( + [ + ("A2PS64V0JR", "2024-01-02 20:03:46.000"), + ("A2PS64asd.:ZUX09R", "2024-01-02 21:06:46.000"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 21:09:45.999"), + ("A2PS64asd.:ZUX09R", "2024-01-02 21:12:46.030"), + ("A2PS64V0J.:ZUasdX09R", "2024-01-02 23:03:46.035"), + ], + ["TagName", "EventTime"], + ) + + interval_filtering_wrangler = IntervalFiltering( + spark_session, df, 1, "years", "EventTime" + ) + + with pytest.raises(ValueError): + interval_filtering_wrangler.filter_data() + + +def test_interval_detection_faulty_time_stamp(spark_session: SparkSession): + df = spark_session.createDataFrame( + [ + ("A2PS64V0JR", "2024-01-09-02 20:03:46.000"), + ("A2PS64asd.:ZUX09R", "2024-01-02 21:06:46.000"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 21:09:45.999"), + ("A2PS64asd.:ZUX09R", "2024-01-02 21:12:46.030"), + ("A2PS64V0J.:ZUasdX09R", "2024-01-02 23:03:46.035"), + ], + ["TagName", "EventTime"], + ) + + interval_filtering_wrangler = IntervalFiltering( + spark_session, df, 1, "minutes", "EventTime" + ) + + with pytest.raises(ValueError): + interval_filtering_wrangler.filter_data() + + +def test_interval_tolerance(spark_session: SparkSession): + expected_df = spark_session.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:47.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:50.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:52.000", "Good", "0.129999995"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + df = spark_session.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:46.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:47.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:50.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:51.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:52.000", "Good", "0.129999995"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + interval_filtering_wrangler = IntervalFiltering( + spark_session, df, 3, "seconds", "EventTime", 1 + ) + actual_df = interval_filtering_wrangler.filter_data() + + assert expected_df.columns == actual_df.columns + assert expected_df.schema == actual_df.schema + assert expected_df.collect() == actual_df.collect() + + +def test_interval_detection_date_time_columns(spark_session: SparkSession): + expected_df = spark_session.createDataFrame( + [ + ("A2PS64V0JR", convert_to_datetime("2024-01-02 20:03:46.000")), + ("A2PS64asd.:ZUX09R", convert_to_datetime("2024-01-02 21:06:46.000")), + ("A2PS64V0J.:ZUasdX09R", convert_to_datetime("2024-01-02 23:03:46.035")), + ], + 
["TagName", "EventTime"], + ) + df = spark_session.createDataFrame( + [ + ("A2PS64V0JR", convert_to_datetime("2024-01-02 20:03:46.000")), + ("A2PS64asd.:ZUX09R", convert_to_datetime("2024-01-02 21:06:46.000")), + ("A2PS64V0J.:ZUX09R", convert_to_datetime("2024-01-02 21:09:45.999")), + ("A2PS64asd.:ZUX09R", convert_to_datetime("2024-01-02 21:12:46.030")), + ("A2PS64V0J.:ZUasdX09R", convert_to_datetime("2024-01-02 23:03:46.035")), + ], + ["TagName", "EventTime"], + ) + + interval_filtering_wrangler = IntervalFiltering(spark_session, df, 1, "hours") + actual_df = interval_filtering_wrangler.filter_data() + + assert expected_df.columns == actual_df.columns + assert expected_df.schema == actual_df.schema + assert expected_df.collect() == actual_df.collect() + + +def test_interval_detection_large_data_set(spark_session: SparkSession): + base_path = os.path.dirname(__file__) + file_path = os.path.join(base_path, "../../test_data.csv") + + df = spark_session.read.option("header", "true").csv(file_path) + + interval_filtering_wrangler = IntervalFiltering(spark_session, df, 1, "hours") + + actual_df = interval_filtering_wrangler.filter_data() + assert actual_df.count() == 25 + + +def test_interval_detection_wrong_datatype(spark_session: SparkSession): + df = spark_session.createDataFrame( + [ + ("A2PS64V0JR", "invalid_data_type"), + ("A2PS64asd.:ZUX09R", "invalid_data_type"), + ("A2PS64V0J.:ZUX09R", "invalid_data_type"), + ("A2PS64asd.:ZUX09R", "invalid_data_type"), + ("A2PS64V0J.:ZUasdX09R", "invalid_data_type"), + ], + ["TagName", "EventTime"], + ) + + interval_filtering_wrangler = IntervalFiltering(spark_session, df, 1, "hours") + + with pytest.raises(ValueError): + interval_filtering_wrangler.filter_data() diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_k_sigma_anomaly_detection.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_k_sigma_anomaly_detection.py new file mode 100644 index 000000000..bee9b0678 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_k_sigma_anomaly_detection.py @@ -0,0 +1,140 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from pyspark.sql import SparkSession +import pytest +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.k_sigma_anomaly_detection import ( + KSigmaAnomalyDetection, +) +import os + +# Normal data mean=10 stddev=5 + 3 anomalies +# fmt: off +normal_input_values = [ 5.19811497, 8.34437927, 3.62104032, 10.02819525, 6.1183447 , + 20.10067378, 10.32313075, 14.090119 , 21.43078927, 2.76624332, + 10.84089416, 1.90722629, 11.19750641, 13.70925639, 5.61011921, + 4.50072694, 13.79440311, 13.30173747, 7.07183589, 12.79853139, 100] + +normal_expected_values = [ 5.19811497, 8.34437927, 3.62104032, 10.02819525, 6.1183447 , + 20.10067378, 10.32313075, 14.090119 , 21.43078927, 2.76624332, + 10.84089416, 1.90722629, 11.19750641, 13.70925639, 5.61011921, + 4.50072694, 13.79440311, 13.30173747, 7.07183589, 12.79853139] +# fmt: on + +# These values are tricky for the mean method, as the anomaly has a big effect on the mean +input_values = [1, 2, 3, 4, 20] +expected_values = [1, 2, 3, 4] + + +def test_filter_with_mean(spark_session: SparkSession): + # Test with normal data + normal_input_df = spark_session.createDataFrame( + [(float(num),) for num in normal_input_values], schema=["value"] + ) + normal_expected_df = spark_session.createDataFrame( + [(float(num),) for num in normal_expected_values], schema=["value"] + ) + + normal_filtered_df = KSigmaAnomalyDetection( + spark_session, + normal_input_df, + column_names=["value"], + k_value=3, + use_median=False, + ).filter_data() + + assert normal_expected_df.collect() == normal_filtered_df.collect() + + # Test with data that has an anomaly that shifts the mean significantly + input_df = spark_session.createDataFrame( + [(float(num),) for num in input_values], schema=["value"] + ) + expected_df = spark_session.createDataFrame( + [(float(num),) for num in expected_values], schema=["value"] + ) + + filtered_df = KSigmaAnomalyDetection( + spark_session, input_df, column_names=["value"], k_value=3, use_median=False + ).filter_data() + + assert expected_df.collect() != filtered_df.collect() + + +def test_filter_with_median(spark_session: SparkSession): + # Test with normal data + normal_input_df = spark_session.createDataFrame( + [(float(num),) for num in normal_input_values], schema=["value"] + ) + normal_expected_df = spark_session.createDataFrame( + [(float(num),) for num in normal_expected_values], schema=["value"] + ) + + normal_filtered_df = KSigmaAnomalyDetection( + spark_session, + normal_input_df, + column_names=["value"], + k_value=3, + use_median=True, + ).filter_data() + + assert normal_expected_df.collect() == normal_filtered_df.collect() + + # Test with data that has an anomaly that shifts the mean significantly + input_df = spark_session.createDataFrame( + [(float(num),) for num in input_values], schema=["value"] + ) + expected_df = spark_session.createDataFrame( + [(float(num),) for num in expected_values], schema=["value"] + ) + + filtered_df = KSigmaAnomalyDetection( + spark_session, input_df, column_names=["value"], k_value=3, use_median=True + ).filter_data() + + assert expected_df.collect() == filtered_df.collect() + + +def test_filter_with_wrong_types(spark_session: SparkSession): + wrong_column_type_df = spark_session.createDataFrame( + [(f"string {i}",) for i in range(10)], schema=["value"] + ) + + # wrong value type + with pytest.raises(ValueError): + KSigmaAnomalyDetection( + spark_session, + wrong_column_type_df, + column_names=["value"], + k_value=3, + use_median=True, + ).filter_data() + + # missing column + with 
pytest.raises(ValueError): + KSigmaAnomalyDetection( + spark_session, + wrong_column_type_df, + column_names=["$value"], + k_value=3, + use_median=True, + ).filter_data() + + +def test_large_dataset(spark_session): + base_path = os.path.dirname(__file__) + file_path = os.path.join(base_path, "../../test_data.csv") + df = spark_session.read.option("header", "true").csv(file_path) + + assert df.count() > 0, "Dataframe was not loaded correct" + + KSigmaAnomalyDetection(spark_session, df, column_names=["Value"]).filter_data() diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_missing_value_imputation.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_missing_value_imputation.py new file mode 100644 index 000000000..242581571 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_missing_value_imputation.py @@ -0,0 +1,403 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import os + +from pyspark.sql import SparkSession +from pyspark.sql.dataframe import DataFrame +from pyspark.sql.functions import col, unix_timestamp, abs as A +from pyspark.sql.types import ( + StructType, + StructField, + StringType, + TimestampType, + FloatType, +) + +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.missing_value_imputation import ( + MissingValueImputation, +) + + +@pytest.fixture(scope="session") +def spark_session(): + return SparkSession.builder.master("local[2]").appName("test").getOrCreate() + + +def test_missing_value_imputation(spark_session: SparkSession): + + schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", StringType(), True), + StructField("Status", StringType(), True), + StructField("Value", StringType(), True), + ] + ) + + expected_schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] + ) + + test_data = [ + ("A2PS64V0J.:ZUX09R", "2024-01-01 03:29:21.000", "Good", "1.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 07:32:55.000", "Good", "2.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 11:36:29.000", "Good", "3.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 15:39:03.000", "Good", "4.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 19:42:37.000", "Good", "5.0"), + # ("A2PS64V0J.:ZUX09R", "2024-01-01 23:46:11.000", "Good", "6.0"), # Test values + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "7.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "8.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "9.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "10.0"), + ( + "A2PS64V0J.:ZUX09R", + "2024-01-02 20:13:46.000", + "Good", + "11.0", + ), # Tolerance Test + ("A2PS64V0J.:ZUX09R", "2024-01-03 00:07:20.000", "Good", "12.0"), + # ("A2PS64V0J.:ZUX09R", "2024-01-03 04:10:54.000", "Good", 
"13.0"), + # ("A2PS64V0J.:ZUX09R", "2024-01-03 08:14:28.000", "Good", "14.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-03 12:18:02.000", "Good", "15.0"), + # ("A2PS64V0J.:ZUX09R", "2024-01-03 16:21:36.000", "Good", "16.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-03 20:25:10.000", "Good", "17.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-04 00:28:44.000", "Good", "18.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-04 04:32:18.000", "Good", "19.0"), + # Real missing values + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:01:43", "Good", "4686.259766"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:02:44", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:04:44", "Good", "4686.259766"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:05:44", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:11:46", "Good", "4686.259766"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:13:46", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:16:47", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:19:48", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:20:48", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:25:50", "Good", "4681.35791"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:26:50", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:27:50", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:28:50", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:31:51", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:32:52", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:42:52", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:42:54", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:43:54", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:44:54", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:45:54", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:46:55", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:47:55", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:51:56", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:52:56", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:55:57", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:56:58", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:57:58", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:59:59", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:00:59", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:05:01", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:10:02", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:11:03", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:13:06", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:17:07", "Good", "4691.161621"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:18:07", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:20:07", "Good", "4686.259766"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:21:07", "Good", "4700.96582"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", 
"2023-12-31 01:25:09", "Good", "4676.456055"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:26:09", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:30:09", "Good", "4700.96582"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:35:10", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:36:10", "Good", "4700.96582"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:40:11", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:42:11", "Good", "4700.96582"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:43:11", "Good", "4705.867676"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:44:11", "Good", "4700.96582"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:46:11", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:47:11", "Good", "4700.96582"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:53:13", "Good", "4696.063477"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:54:13", "Good", "4700.96582"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:55:13", "Good", "4686.259766"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:56:13", "Good", "4700.96582"), + ] + + expected_data = [ + ("A2PS64V0J.:ZUX09R", "2024-01-01 03:29:21", "Good", "1.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 07:32:55", "Good", "2.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 11:36:29", "Good", "3.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 15:39:03", "Good", "4.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 19:42:37", "Good", "5.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 23:46:10", "Good", "6.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45", "Good", "7.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11", "Good", "8.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42", "Good", "9.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12", "Good", "10.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:13:46", "Good", "11.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-03 00:07:20", "Good", "12.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-03 04:10:50", "Good", "13.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-03 08:14:20", "Good", "14.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-03 12:18:02", "Good", "15.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-03 16:21:30", "Good", "16.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-03 20:25:10", "Good", "17.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-04 00:28:44", "Good", "18.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-04 04:32:18", "Good", "19.0"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:01:43", "Good", "4686.26"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:02:44", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:03:44", "Good", "4688.019"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:04:44", "Good", "4686.26"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:05:44", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:06:44", "Good", "4694.203"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:07:44", "Good", "4693.92"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:08:44", "Good", "4691.6475"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:09:44", "Good", "4688.722"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:10:44", "Good", "4686.481"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:11:46", "Good", "4686.26"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:12:46", "Good", "4688.637"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:13:46", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:14:46", "Good", 
"4691.4985"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:15:46", "Good", "4690.817"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:16:47", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:17:47", "Good", "4693.7354"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:18:47", "Good", "4696.372"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:19:48", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:20:48", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:21:48", "Good", "4684.8516"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:22:48", "Good", "4679.2305"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:23:48", "Good", "4675.784"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:24:48", "Good", "4675.998"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:25:50", "Good", "4681.358"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:26:50", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:27:50", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:28:50", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:29:50", "Good", "4691.056"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:30:50", "Good", "4694.813"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:31:51", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:32:52", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:33:52", "Good", "4685.6963"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:34:52", "Good", "4681.356"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:35:52", "Good", "4678.175"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:36:52", "Good", "4676.186"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:37:52", "Good", "4675.423"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:38:52", "Good", "4675.9185"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:39:52", "Good", "4677.707"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:40:52", "Good", "4680.8213"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:41:52", "Good", "4685.295"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:42:52", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:42:54", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:43:52", "Good", "4692.863"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:43:54", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:44:54", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:45:54", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:46:55", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:47:55", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:48:55", "Good", "4689.178"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:49:55", "Good", "4692.111"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:50:55", "Good", "4695.794"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:51:56", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:52:56", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:53:56", "Good", "4687.381"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:54:56", "Good", "4687.1104"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:55:57", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:56:58", "Good", 
"4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:57:58", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:58:58", "Good", "4693.161"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:59:59", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:00:59", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:01:59", "Good", "4688.2207"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:02:59", "Good", "4689.07"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:03:59", "Good", "4692.1904"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:05:01", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:06:01", "Good", "4699.3506"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:07:01", "Good", "4701.433"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:08:01", "Good", "4701.872"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:09:01", "Good", "4700.228"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:10:02", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:11:03", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:12:03", "Good", "4692.6973"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:13:06", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:14:06", "Good", "4695.113"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:15:06", "Good", "4691.5415"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:16:06", "Good", "4689.0054"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:17:07", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:18:07", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:19:07", "Good", "4688.7515"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:20:07", "Good", "4686.26"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:21:07", "Good", "4700.966"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:22:07", "Good", "4700.935"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:23:07", "Good", "4687.808"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:24:07", "Good", "4675.1323"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:25:09", "Good", "4676.456"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:26:09", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:27:09", "Good", "4708.868"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:28:09", "Good", "4711.2476"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:29:09", "Good", "4707.2603"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:30:09", "Good", "4700.966"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:31:09", "Good", "4695.7764"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:32:09", "Good", "4692.5146"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:33:09", "Good", "4691.358"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:34:09", "Good", "4692.482"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:35:10", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:36:10", "Good", "4700.966"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:37:10", "Good", "4702.4126"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:38:10", "Good", "4700.763"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:39:10", "Good", "4697.9897"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:40:11", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:41:11", "Good", 
"4696.747"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:42:11", "Good", "4700.966"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:43:11", "Good", "4705.8677"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:44:11", "Good", "4700.966"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:45:11", "Good", "4695.9624"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:46:11", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:47:11", "Good", "4700.966"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:48:11", "Good", "4702.187"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:49:11", "Good", "4699.401"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:50:11", "Good", "4695.0015"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:51:11", "Good", "4691.3823"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:52:11", "Good", "4690.9385"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:53:13", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:54:13", "Good", "4700.966"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:55:13", "Good", "4686.26"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:56:13", "Good", "4700.966"), + ] + + test_df = spark_session.createDataFrame(test_data, schema=schema) + expected_df = spark_session.createDataFrame(expected_data, schema=schema) + + missing_value_imputation = MissingValueImputation(spark_session, test_df) + actual_df = DataFrame + + try: + if missing_value_imputation.validate(expected_schema): + actual_df = missing_value_imputation.filter_data() + except Exception as e: + print(repr(e)) + + assert isinstance(actual_df, DataFrame) + + assert expected_df.columns == actual_df.columns + assert expected_schema == actual_df.schema + + def assert_dataframe_similar( + expected_df, actual_df, tolerance=1e-4, time_tolerance_seconds=5 + ): + + expected_df = expected_df.orderBy(["TagName", "EventTime"]) + actual_df = actual_df.orderBy(["TagName", "EventTime"]) + + expected_df = expected_df.withColumn("Value", col("Value").cast("float")) + actual_df = actual_df.withColumn("Value", col("Value").cast("float")) + + for expected_row, actual_row in zip(expected_df.collect(), actual_df.collect()): + for expected_val, actual_val, column_name in zip( + expected_row, actual_row, expected_df.columns + ): + if column_name == "Value": + assert ( + abs(expected_val - actual_val) < tolerance + ), f"Value mismatch: {expected_val} != {actual_val}" + elif column_name == "EventTime": + expected_event_time = unix_timestamp(col("EventTime")).cast( + "timestamp" + ) + actual_event_time = unix_timestamp(col("EventTime")).cast( + "timestamp" + ) + + time_diff = A( + expected_event_time.cast("long") + - actual_event_time.cast("long") + ) + condition = time_diff <= time_tolerance_seconds + + mismatched_rows = expected_df.join( + actual_df, on=["TagName", "EventTime"], how="inner" + ).filter(~condition) + + assert ( + mismatched_rows.count() == 0 + ), f"EventTime mismatch: {expected_val} != {actual_val} (tolerance: {time_tolerance_seconds}s)" + else: + assert ( + expected_val == actual_val + ), f"Mismatch in column '{column_name}': {expected_val} != {actual_val}" + + assert_dataframe_similar(expected_df, actual_df, tolerance=1e-4) + + +def test_missing_value_imputation_large_data_set(spark_session: SparkSession): + test_path = os.path.dirname(__file__) + data_path = os.path.join(test_path, "../../test_data.csv") + + actual_df = spark_session.read.option("header", "true").csv(data_path) + + expected_schema = 
StructType(
+        [
+            StructField("TagName", StringType(), True),
+            StructField("EventTime", TimestampType(), True),
+            StructField("Status", StringType(), True),
+            StructField("Value", FloatType(), True),
+        ]
+    )
+
+    missing_value_imputation_component = MissingValueImputation(
+        spark_session, actual_df
+    )
+    result_df = DataFrame
+
+    try:
+        if missing_value_imputation_component.validate(expected_schema):
+            result_df = missing_value_imputation_component.filter_data()
+    except Exception as e:
+        print(repr(e))
+
+    assert isinstance(result_df, DataFrame)
+
+    assert result_df.schema == expected_schema
+    assert result_df.count() > actual_df.count()
+
+
+def test_missing_value_imputation_wrong_datatype(spark_session: SparkSession):
+
+    expected_schema = StructType(
+        [
+            StructField("TagName", StringType(), True),
+            StructField("EventTime", TimestampType(), True),
+            StructField("Status", StringType(), True),
+            StructField("Value", FloatType(), True),
+        ]
+    )
+
+    test_df = spark_session.createDataFrame(
+        [
+            ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "1.0"),
+            ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "2.0"),
+            ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "3.0"),
+            ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "4.0"),
+            ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "5.0"),
+        ],
+        ["TagName", "EventTime", "Status", "Value"],
+    )
+
+    missing_value_imputation_component = MissingValueImputation(spark_session, test_df)
+
+    with pytest.raises(ValueError) as exc_info:
+        missing_value_imputation_component.validate(expected_schema)
+
+    assert (
+        "Error during casting column 'EventTime' to TimestampType(): Column 'EventTime' cannot be cast to TimestampType()."
+        in str(exc_info.value)
+    )
diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_normalization.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_normalization.py
new file mode 100644
index 000000000..128ee14c5
--- /dev/null
+++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_normalization.py
@@ -0,0 +1,184 @@
+# Copyright 2025 RTDIP
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
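+# The tests below cover NormalizationMean/NormalizationMinMax together with
+# Denormalization: invalid columns, a non-in-place round trip, and idempotence
+# (normalise then denormalise) for every NormalizationBaseClass subclass.
+# Typical call pattern, as used in these tests:
+#   normaliser = NormalizationMinMax(df, column_names=["Value"], in_place=False)
+#   normalised_df = normaliser.filter_data()
+#   restored_df = Denormalization(normalised_df, normaliser).filter_data()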
+ +from pandas.io.formats.format import math +import pytest +import os + +from pyspark.sql import SparkSession +from pyspark.sql.dataframe import DataFrame + +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.denormalization import ( + Denormalization, +) +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization import ( + NormalizationBaseClass, +) +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization_mean import ( + NormalizationMean, +) +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization_minmax import ( + NormalizationMinMax, +) + + +@pytest.fixture(scope="session") +def spark_session(): + return SparkSession.builder.master("local[2]").appName("test").getOrCreate() + + +def test_nonexistent_column_normalization(spark_session: SparkSession): + input_df = spark_session.createDataFrame( + [ + (1.0,), + (2.0,), + ], + ["Value"], + ) + + with pytest.raises(ValueError): + NormalizationMean(input_df, column_names=["NonexistingColumn"], in_place=True) + + +def test_wrong_column_type_normalization(spark_session: SparkSession): + input_df = spark_session.createDataFrame( + [ + ("a",), + ("b",), + ], + ["Value"], + ) + + with pytest.raises(ValueError): + NormalizationMean(input_df, column_names=["Value"]) + + +def test_non_inplace_normalization(spark_session: SparkSession): + input_df = spark_session.createDataFrame( + [ + (1.0,), + (2.0,), + ], + ["Value"], + ) + + expected_normalised_df = spark_session.createDataFrame( + [ + (1.0, 0.0), + (2.0, 1.0), + ], + ["Value", "Value_minmax_normalization"], + ) + + normalization_component = NormalizationMinMax( + input_df, column_names=["Value"], in_place=False + ) + normalised_df = normalization_component.filter_data() + + assert isinstance(normalised_df, DataFrame) + + assert expected_normalised_df.columns == normalised_df.columns + assert expected_normalised_df.schema == normalised_df.schema + assert expected_normalised_df.collect() == normalised_df.collect() + + denormalization_component = Denormalization(normalised_df, normalization_component) + reverted_df = denormalization_component.filter_data() + + assert isinstance(reverted_df, DataFrame) + + assert input_df.columns == reverted_df.columns + assert input_df.schema == reverted_df.schema + assert input_df.collect() == reverted_df.collect() + + +@pytest.mark.parametrize("class_to_test", NormalizationBaseClass.__subclasses__()) +def test_idempotence_with_positive_values( + spark_session: SparkSession, class_to_test: NormalizationBaseClass +): + input_df = spark_session.createDataFrame( + [ + (1.0,), + (2.0,), + (3.0,), + (4.0,), + (5.0,), + ], + ["Value"], + ) + + expected_df = input_df.alias("input_df") + helper_assert_idempotence(class_to_test, input_df, expected_df) + + +@pytest.mark.parametrize("class_to_test", NormalizationBaseClass.__subclasses__()) +def test_idempotence_with_zero_values( + spark_session: SparkSession, class_to_test: NormalizationBaseClass +): + input_df = spark_session.createDataFrame( + [ + (0.0,), + (0.0,), + (0.0,), + (0.0,), + (0.0,), + ], + ["Value"], + ) + + expected_df = input_df.alias("input_df") + helper_assert_idempotence(class_to_test, input_df, expected_df) + + +@pytest.mark.parametrize("class_to_test", NormalizationBaseClass.__subclasses__()) +def test_idempotence_with_large_data_set( + spark_session: SparkSession, class_to_test: NormalizationBaseClass +): + base_path = 
os.path.dirname(__file__) + file_path = os.path.join(base_path, "../../test_data.csv") + input_df = spark_session.read.option("header", "true").csv(file_path) + input_df = input_df.withColumn("Value", input_df["Value"].cast("double")) + assert input_df.count() > 0, "Dataframe was not loaded correct" + input_df.show() + + expected_df = input_df.alias("input_df") + helper_assert_idempotence(class_to_test, input_df, expected_df) + + +def helper_assert_idempotence( + class_to_test: NormalizationBaseClass, + input_df: DataFrame, + expected_df: DataFrame, +): + try: + normalization_component = class_to_test( + input_df, column_names=["Value"], in_place=True + ) + actual_df = normalization_component.filter_data() + + denormalization_component = Denormalization(actual_df, normalization_component) + actual_df = denormalization_component.filter_data() + + assert isinstance(actual_df, DataFrame) + + assert expected_df.columns == actual_df.columns + assert expected_df.schema == actual_df.schema + + for row1, row2 in zip(expected_df.collect(), actual_df.collect()): + for col1, col2 in zip(row1, row2): + if isinstance(col1, float) and isinstance(col2, float): + assert math.isclose(col1, col2, rel_tol=1e-9) + else: + assert col1 == col2 + except ZeroDivisionError: + pass diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_one_hot_encoding.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_one_hot_encoding.py new file mode 100644 index 000000000..9664bb0e8 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_one_hot_encoding.py @@ -0,0 +1,195 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
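+# The tests below cover OneHotEncoding edge cases: an empty DataFrame, a single
+# unique value, null values, a large number of distinct values and special
+# characters in the encoded column.
+# Typical call pattern, as used in these tests:
+#   result_df = OneHotEncoding(df, "TagName").transform()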
+import pytest +import math + +from pyspark.sql import SparkSession +from pyspark.sql.types import StructType, StructField, StringType, FloatType +from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.machine_learning.one_hot_encoding import ( + OneHotEncoding, +) + +# Define the schema outside the test functions +SCHEMA = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", StringType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] +) + + +@pytest.fixture(scope="session") +def spark_session(): + return SparkSession.builder.master("local[2]").appName("test").getOrCreate() + + +def test_empty_df(spark_session): + """Empty DataFrame""" + empty_df = spark_session.createDataFrame([], SCHEMA) + encoder = OneHotEncoding(empty_df, "TagName") + + with pytest.raises(ValueError, match="The DataFrame is empty."): + encoder = OneHotEncoding(empty_df, "TagName") + encoder.transform() + + +def test_single_unique_value(spark_session): + """Single Unique Value""" + data = [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46", "Good", 0.34), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12", "Good", 0.15), + ] + df = spark_session.createDataFrame(data, SCHEMA) + encoder = OneHotEncoding(df, "TagName") + result_df = encoder.transform() + + expected_columns = [ + "TagName", + "EventTime", + "Status", + "Value", + "TagName_A2PS64V0J.:ZUX09R", + ] + assert ( + result_df.columns == expected_columns + ), "Columns do not match for single unique value." + for row in result_df.collect(): + assert ( + row["TagName_A2PS64V0J.:ZUX09R"] == 1 + ), "Expected 1 for the one-hot encoded column." + + +def test_null_values(spark_session): + """Column with Null Values""" + data = [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46", "Good", 0.34), + (None, "2024-01-02 16:00:12", "Good", 0.15), + ] + df = spark_session.createDataFrame(data, SCHEMA) + encoder = OneHotEncoding(df, "TagName") + result_df = encoder.transform() + + expected_columns = [ + "TagName", + "EventTime", + "Status", + "Value", + "TagName_A2PS64V0J.:ZUX09R", + "TagName_None", + ] + assert ( + result_df.columns == expected_columns + ), f"Columns do not match for null value case. Expected {expected_columns}, but got {result_df.columns}" + for row in result_df.collect(): + if row["TagName"] == "A2PS64V0J.:ZUX09R": + assert ( + row["TagName_A2PS64V0J.:ZUX09R"] == 1 + ), "Expected 1 for valid TagName." + assert ( + row["TagName_None"] == 0 + ), "Expected 0 for TagName_None for valid TagName." + elif row["TagName"] is None: + assert ( + row["TagName_A2PS64V0J.:ZUX09R"] == 0 + ), "Expected 0 for TagName_A2PS64V0J.:ZUX09R for None TagName." + assert ( + row["TagName_None"] == 0 + ), "Expected 0 for TagName_None for None TagName." + + +def test_large_unique_values(spark_session): + """Large Number of Unique Values""" + data = [ + (f"Tag_{i}", f"2024-01-02 20:03:{i:02d}", "Good", i * 1.0) for i in range(1000) + ] + df = spark_session.createDataFrame(data, SCHEMA) + encoder = OneHotEncoding(df, "TagName") + result_df = encoder.transform() + + assert ( + len(result_df.columns) == len(SCHEMA.fields) + 1000 + ), "Expected 1000 additional columns for one-hot encoding." 
+ + +def test_special_characters(spark_session): + """Special Characters in Column Values""" + data = [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46", "Good", 0.34), + ("@Special#Tag!", "2024-01-02 16:00:12", "Good", 0.15), + ] + df = spark_session.createDataFrame(data, SCHEMA) + encoder = OneHotEncoding(df, "TagName") + result_df = encoder.transform() + + expected_columns = [ + "TagName", + "EventTime", + "Status", + "Value", + "TagName_A2PS64V0J.:ZUX09R", + "TagName_@Special#Tag!", + ] + assert ( + result_df.columns == expected_columns + ), "Columns do not match for special characters." + for row in result_df.collect(): + for tag in ["A2PS64V0J.:ZUX09R", "@Special#Tag!"]: + expected_value = 1 if row["TagName"] == tag else 0 + column_name = f"TagName_{tag}" + assert ( + row[column_name] == expected_value + ), f"Expected {expected_value} for {column_name}." + + +# removed because of test performance +# def test_distinct_value(spark_session): +# """Dataset with Multiple TagName Values""" + +# data = [ +# ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46", "Good", 0.3400000035762787), +# ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12", "Good", 0.15000000596046448), +# ( +# "-4O7LSSAM_3EA02:2GT7E02I_R_MP", +# "2024-01-02 20:09:58", +# "Good", +# 7107.82080078125, +# ), +# ("_LT2EPL-9PM0.OROTENV3:", "2024-01-02 12:27:10", "Good", 19407.0), +# ("1N325T3MTOR-P0L29:9.T0", "2024-01-02 23:41:10", "Good", 19376.0), +# ] + +# df = spark_session.createDataFrame(data, SCHEMA) + +# encoder = OneHotEncoding(df, "TagName") +# result_df = encoder.transform() + +# result = result_df.collect() + +# expected_columns = df.columns + [ +# f"TagName_{row['TagName']}" for row in df.select("TagName").distinct().collect() +# ] + +# assert set(result_df.columns) == set(expected_columns) + +# tag_names = df.select("TagName").distinct().collect() +# for row in result: +# tag_name = row["TagName"] +# for tag in tag_names: +# column_name = f"TagName_{tag['TagName']}" +# if tag["TagName"] == tag_name: +# assert math.isclose(row[column_name], 1.0, rel_tol=1e-09, abs_tol=1e-09) +# else: +# assert math.isclose(row[column_name], 0.0, rel_tol=1e-09, abs_tol=1e-09) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_out_of_range_value_filter.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_out_of_range_value_filter.py new file mode 100644 index 000000000..913ae9ffa --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_out_of_range_value_filter.py @@ -0,0 +1,111 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
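+# The tests below check OutOfRangeValueFilter with per-tag min/max ranges and
+# inclusive/exclusive bounds, on both a small in-memory DataFrame and the shared
+# test_data.csv file.
+# Typical call pattern, as used in these tests:
+#   tag_ranges = {"Tag2": {"min": 1, "max": 5, "inclusive_bounds": False}}
+#   result_df = OutOfRangeValueFilter(df, tag_ranges).filter_data()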
+import pytest +from pyspark.sql import SparkSession +import os + + +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.out_of_range_value_filter import ( + OutOfRangeValueFilter, +) + + +@pytest.fixture(scope="session") +def spark(): + spark = ( + SparkSession.builder.master("local[2]") + .appName("DeleteOutOfRangeValuesTest") + .getOrCreate() + ) + yield spark + spark.stop() + + +@pytest.fixture +def test_data(spark): + data = [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "1"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "2"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "3"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "4"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "5"), + ("Tag2", "2024-01-02 03:49:45.000", "Good", "1"), + ("Tag2", "2024-01-02 07:53:11.000", "Good", "2"), + ("Tag2", "2024-01-02 11:56:42.000", "Good", "3"), + ("Tag2", "2024-01-02 16:00:12.000", "Good", "4"), + ("Tag2", "2024-01-02 20:03:46.000", "Good", "5"), + ] + return spark.createDataFrame(data, ["TagName", "EventTime", "Status", "Value"]) + + +def test_basic(spark, test_data): + tag_ranges = { + "A2PS64V0J.:ZUX09R": {"min": 2, "max": 4, "inclusive_bounds": True}, + "Tag2": {"min": 1, "max": 5, "inclusive_bounds": False}, + } + manipulator = OutOfRangeValueFilter(test_data, tag_ranges) + + rows_to_remove = [ + { + "TagName": "A2PS64V0J.:ZUX09R", + "EventTime": "2024-01-02 07:53:11.000", + "Status": "Good", + "Value": "2", + }, + { + "TagName": "Tag2", + "EventTime": "2024-01-02 11:56:42.000", + "Status": "Good", + "Value": "3", + }, + ] + rows_to_remove_df = spark.createDataFrame(rows_to_remove) + expected = test_data.subtract(rows_to_remove_df) + + result = manipulator.filter_data() + + assert sorted(result.collect()) == sorted(expected.collect()) + + +def test_large_dataset(spark): + base_path = os.path.dirname(__file__) + file_path = os.path.join(base_path, "../../test_data.csv") + df = spark.read.option("header", "true").csv(file_path) + assert df.count() > 0, "Dataframe was not loaded correct" + + tag_ranges = { + "value_range": {"min": 2, "max": 4, "inclusive_bounds": True}, + } + manipulator = OutOfRangeValueFilter(df, tag_ranges) + + rows_to_remove = [ + { + "TagName": "value_range", + "EventTime": "2024-01-02 03:49:45", + "Status": "Good", + "Value": "1.0", + }, + { + "TagName": "value_range", + "EventTime": "2024-01-02 20:03:46", + "Status": "Good", + "Value": "5.0", + }, + ] + rows_to_remove_df = spark.createDataFrame(rows_to_remove) + expected = df.subtract(rows_to_remove_df) + + result = manipulator.filter_data() + + assert sorted(result.collect()) == sorted(expected.collect()) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/__init__ .py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__ .py similarity index 95% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/__init__ .py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__ .py index 5305a429e..1832b01ae 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/__init__ .py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__ .py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2025 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py new file mode 100644 index 000000000..1832b01ae --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_check_value_ranges.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_check_value_ranges.py new file mode 100644 index 000000000..9e036666b --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_check_value_ranges.py @@ -0,0 +1,140 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
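+# The tests below verify the logging behaviour of CheckValueRanges: out-of-range
+# rows are reported per tag via the "CheckValueRanges" logger, and invalid tag
+# names or missing min/max bounds raise ValueError.
+# Typical call pattern, as used in these tests:
+#   monitor = CheckValueRanges(df, {"Tag2": {"min": 1, "max": 5}})
+#   monitor.check()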
+import pytest +from pyspark.sql import SparkSession +from io import StringIO +import logging +import os + + +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.check_value_ranges import ( + CheckValueRanges, +) + + +@pytest.fixture(scope="session") +def spark(): + spark = ( + SparkSession.builder.master("local[2]") + .appName("CheckValueRangesTest") + .getOrCreate() + ) + yield spark + spark.stop() + + +@pytest.fixture +def log_capture(): + log_stream = StringIO() + logger = logging.getLogger("CheckValueRanges") + logger.setLevel(logging.INFO) + handler = logging.StreamHandler(log_stream) + formatter = logging.Formatter("%(message)s") + handler.setFormatter(formatter) + logger.addHandler(handler) + yield log_stream + logger.removeHandler(handler) + handler.close() + + +@pytest.fixture +def test_data(spark): + data = [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "1"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "2"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "3"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "4"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "5"), + ("Tag2", "2024-01-02 03:49:45.000", "Good", "1"), + ("Tag2", "2024-01-02 07:53:11.000", "Good", "2"), + ("Tag2", "2024-01-02 11:56:42.000", "Good", "3"), + ("Tag2", "2024-01-02 16:00:12.000", "Good", "4"), + ("Tag2", "2024-01-02 20:03:46.000", "Good", "5"), + ] + return spark.createDataFrame(data, ["TagName", "EventTime", "Status", "Value"]) + + +def test_basic(test_data, log_capture): + tag_ranges = { + "A2PS64V0J.:ZUX09R": {"min": 2, "max": 4, "inclusive_bounds": True}, + "Tag2": {"min": 1, "max": 5, "inclusive_bounds": False}, + } + monitor = CheckValueRanges(test_data, tag_ranges) + monitor.check() + expected_logs = [ + # For temperature with inclusive_bounds='both' + "Found 2 rows in 'Value' column for TagName 'A2PS64V0J.:ZUX09R' out of range.", + f"Out of range row for TagName 'A2PS64V0J.:ZUX09R': Row(TagName='A2PS64V0J.:ZUX09R', EventTime=datetime.datetime(2024, 1, 2, 3, 49, 45), Status='Good', Value=1.0)", + f"Out of range row for TagName 'A2PS64V0J.:ZUX09R': Row(TagName='A2PS64V0J.:ZUX09R', EventTime=datetime.datetime(2024, 1, 2, 20, 3, 46), Status='Good', Value=5.0)", + f"Found 2 rows in 'Value' column for TagName 'Tag2' out of range.", + f"Out of range row for TagName 'Tag2': Row(TagName='Tag2', EventTime=datetime.datetime(2024, 1, 2, 3, 49, 45), Status='Good', Value=1.0)", + f"Out of range row for TagName 'Tag2': Row(TagName='Tag2', EventTime=datetime.datetime(2024, 1, 2, 20, 3, 46), Status='Good', Value=5.0)", + ] + log_contents = log_capture.getvalue() + actual_logs = log_contents.strip().split("\n") + assert len(expected_logs) == len( + actual_logs + ), f"Expected {len(expected_logs)} logs, got {len(actual_logs)}" + for expected, actual in zip(expected_logs, actual_logs): + assert expected == actual, f"Expected: '{expected}', got: '{actual}'" + + +def test_invalid_tag_name(test_data): + tag_ranges = { + "InvalidTagName": {"min": 0, "max": 100}, + } + with pytest.raises(ValueError) as excinfo: + monitor = CheckValueRanges(df=test_data, tag_ranges=tag_ranges) + monitor.check() + + assert "TagName 'InvalidTagName' not found in DataFrame." 
in str(excinfo.value)
+
+
+def test_no_min_or_max(test_data):
+    tag_ranges = {
+        "A2PS64V0J.:ZUX09R": {},  # Neither 'min' nor 'max' specified
+    }
+    with pytest.raises(ValueError) as excinfo:
+        monitor = CheckValueRanges(df=test_data, tag_ranges=tag_ranges)
+        monitor.check()
+    assert (
+        "TagName 'A2PS64V0J.:ZUX09R' must have at least 'min' or 'max' specified."
+        in str(excinfo.value)
+    )
+
+
+def test_large_dataset(spark, log_capture):
+    base_path = os.path.dirname(__file__)
+    file_path = os.path.join(base_path, "../../test_data.csv")
+    df = spark.read.option("header", "true").csv(file_path)
+    assert df.count() > 0, "Dataframe was not loaded correctly"
+
+    tag_ranges = {
+        "value_range": {"min": 2, "max": 4, "inclusive_bounds": True},
+    }
+    monitor = CheckValueRanges(df, tag_ranges)
+    monitor.check()
+
+    expected_logs = [
+        "Found 2 rows in 'Value' column for TagName 'value_range' out of range.",
+        f"Out of range row for TagName 'value_range': Row(TagName='value_range', EventTime=datetime.datetime(2024, 1, 2, 3, 49, 45), Status=' Good', Value=1.0)",
+        f"Out of range row for TagName 'value_range': Row(TagName='value_range', EventTime=datetime.datetime(2024, 1, 2, 20, 3, 46), Status=' Good', Value=5.0)",
+    ]
+    actual_logs = log_capture.getvalue().strip().split("\n")
+
+    assert len(expected_logs) == len(
+        actual_logs
+    ), f"Expected {len(expected_logs)} logs, got {len(actual_logs)}"
+    for expected, actual in zip(expected_logs, actual_logs):
+        assert expected in actual, f"Expected: '{expected}', got: '{actual}'"
diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_flatline_detection.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_flatline_detection.py
new file mode 100644
index 000000000..64aac49b2
--- /dev/null
+++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_flatline_detection.py
@@ -0,0 +1,155 @@
+# Copyright 2025 RTDIP
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
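+# Usage sketch (mirrors the tests below; the log wording comes from the
+# assertions, anything beyond that is an assumption):
+#
+#     detector = FlatlineDetection(df, watch_columns=["Value"], tolerance_timespan=2)
+#     detector.check()  # logs runs of 0.0/None values per tag that exceed the tolerance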
+import pytest +import os +from pyspark.sql import SparkSession +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.flatline_detection import ( + FlatlineDetection, +) + +import logging +from io import StringIO + + +@pytest.fixture(scope="session") +def spark(): + spark = ( + SparkSession.builder.master("local[2]") + .appName("FlatlineDetectionTest") + .getOrCreate() + ) + yield spark + spark.stop() + + +@pytest.fixture +def log_capture(): + log_stream = StringIO() + logger = logging.getLogger("FlatlineDetection") + logger.setLevel(logging.INFO) + handler = logging.StreamHandler(log_stream) + formatter = logging.Formatter("%(message)s") + handler.setFormatter(formatter) + logger.addHandler(handler) + yield log_stream + logger.removeHandler(handler) + handler.close() + + +def test_flatline_detection_no_flatlining(spark, log_capture): + df = spark.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "0.150000006"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + detector = FlatlineDetection(df, watch_columns=["Value"], tolerance_timespan=2) + detector.check() + + expected_logs = [ + "No flatlining detected.", + ] + actual_logs = log_capture.getvalue().strip().split("\n") + + assert len(expected_logs) == len( + actual_logs + ), f"Expected {len(expected_logs)} logs, got {len(actual_logs)}" + for expected, actual in zip(expected_logs, actual_logs): + assert expected == actual, f"Expected: '{expected}', got: '{actual}'" + + +def test_flatline_detection_with_flatlining(spark, log_capture): + df = spark.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "0.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "Null"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + detector = FlatlineDetection(df, watch_columns=["Value"], tolerance_timespan=2) + detector.check() + + expected_logs = [ + "Flatlining detected in column 'Value' at row: Row(TagName='A2PS64V0J.:ZUX09R', EventTime=datetime.datetime(2024, 1, 2, 7, 53, 11), Status='Good', Value=0.0, Value_flatline_flag=1, Value_group=1).", + "Flatlining detected in column 'Value' at row: Row(TagName='A2PS64V0J.:ZUX09R', EventTime=datetime.datetime(2024, 1, 2, 11, 56, 42), Status='Good', Value=0.0, Value_flatline_flag=1, Value_group=1).", + "Flatlining detected in column 'Value' at row: Row(TagName='A2PS64V0J.:ZUX09R', EventTime=datetime.datetime(2024, 1, 2, 16, 0, 12), Status='Good', Value=None, Value_flatline_flag=1, Value_group=1).", + ] + actual_logs = log_capture.getvalue().strip().split("\n") + + assert len(expected_logs) == len( + actual_logs + ), f"Expected {len(expected_logs)} logs, got {len(actual_logs)}" + for expected, actual in zip(expected_logs, actual_logs): + assert expected in actual, f"Expected: '{expected}', got: '{actual}'" + + +def test_flatline_detection_with_tolerance(spark, log_capture): + df = spark.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", 
"2024-01-02 07:53:11.000", "Good", "0.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "0.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "Null"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + detector = FlatlineDetection(df, watch_columns=["Value"], tolerance_timespan=3) + detector.check() + + expected_logs = [ + "No flatlining detected.", + ] + actual_logs = log_capture.getvalue().strip().split("\n") + + assert len(expected_logs) == len( + actual_logs + ), f"Expected {len(expected_logs)} logs, got {len(actual_logs)}" + for expected, actual in zip(expected_logs, actual_logs): + assert expected in actual, f"Expected: '{expected}', got: '{actual}'" + + +def test_large_dataset(spark, log_capture): + base_path = os.path.dirname(__file__) + file_path = os.path.join(base_path, "../../test_data.csv") + df = spark.read.option("header", "true").csv(file_path) + + print(df.count) + assert df.count() > 0, "Dataframe was not loaded correct" + + detector = FlatlineDetection(df, watch_columns=["Value"], tolerance_timespan=2) + detector.check() + + expected_logs = [ + "Flatlining detected in column 'Value' at row: Row(TagName='FLATLINE_TEST', EventTime=datetime.datetime(2024, 1, 2, 2, 35, 10, 511000), Status='Good', Value=0.0, Value_flatline_flag=1, Value_group=1).", + "Flatlining detected in column 'Value' at row: Row(TagName='FLATLINE_TEST', EventTime=datetime.datetime(2024, 1, 2, 2, 49, 10, 408000), Status='Good', Value=0.0, Value_flatline_flag=1, Value_group=1).", + "Flatlining detected in column 'Value' at row: Row(TagName='FLATLINE_TEST', EventTime=datetime.datetime(2024, 1, 2, 14, 57, 10, 372000), Status='Good', Value=0.0, Value_flatline_flag=1, Value_group=1).", + ] + actual_logs = log_capture.getvalue().strip().split("\n") + + assert len(expected_logs) == len( + actual_logs + ), f"Expected {len(expected_logs)} logs, got {len(actual_logs)}" + for expected, actual in zip(expected_logs, actual_logs): + assert expected in actual, f"Expected: '{expected}', got: '{actual}'" diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_great_expectations_data_quality.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_great_expectations_data_quality.py similarity index 85% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_great_expectations_data_quality.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_great_expectations_data_quality.py index 00bb57902..23ee3f970 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_great_expectations_data_quality.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_great_expectations_data_quality.py @@ -1,8 +1,20 @@ -import pytest +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from pytest_mock import MockerFixture -from pyspark.sql import SparkSession, DataFrame +from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.great_expectations_data_quality import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.great_expectations_data_quality import ( GreatExpectationsDataQuality, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_interval.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_interval.py new file mode 100644 index 000000000..2f3fc9482 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_interval.py @@ -0,0 +1,247 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import os +from pyspark.sql import SparkSession + +from src.sdk.python.rtdip_sdk.pipelines.logging.logger_manager import LoggerManager +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_interval import ( + IdentifyMissingDataInterval, +) + +import logging +from io import StringIO + + +@pytest.fixture(scope="session") +def spark(): + spark = ( + SparkSession.builder.master("local[2]") + .appName("IdentifyMissingDataIntervalTest") + .getOrCreate() + ) + yield spark + spark.stop() + + +@pytest.fixture +def log_capture(): + log_stream = StringIO() + logger_manager = LoggerManager() + logger = logger_manager.create_logger("IdentifyMissingDataInterval") + + handler = logging.StreamHandler(log_stream) + formatter = logging.Formatter("%(message)s") + handler.setFormatter(formatter) + logger.addHandler(handler) + yield log_stream + logger.removeHandler(handler) + handler.close() + + +def test_missing_intervals_with_given_interval_multiple_tags(spark, caplog): + df = spark.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:00.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:10.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:20.000", "Good", "0.129999995"), + ( + "A2PS64V0J.:ZUX09R", + "2024-01-02 00:00:36.000", + "Good", + "0.150000006", + ), # Missing interval (20s to 36s) + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:45.000", "Good", "0.340000004"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:55.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:01:05.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:01:15.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:01:25.000", "Good", "0.150000006"), + ( + "A2PS64V0J.:ZUX09R", + "2024-01-02 00:01:41.000", + "Good", + "0.340000004", + ), # Missing interval (25s to 41s) + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + monitor = IdentifyMissingDataInterval( + df=df, + interval="10s", + tolerance="500ms", + ) + + with caplog.at_level(logging.INFO, logger="IdentifyMissingDataInterval"): + monitor.check() + expected_logs = [ + "Using 
provided expected interval: 10000.0 ms", + "Using provided tolerance: 500.0 ms", + "Maximum acceptable interval with tolerance: 10500.0 ms", + "Detected Missing Intervals:", + "Tag: A2PS64V0J.:ZUX09R Missing Interval from 2024-01-02 00:00:20 to 2024-01-02 00:00:36 Duration: 0h 0m 16s", + "Tag: A2PS64V0J.:ZUX09R Missing Interval from 2024-01-02 00:01:25 to 2024-01-02 00:01:41 Duration: 0h 0m 16s", + ] + actual_logs = [ + record.message + for record in caplog.records + if record.levelname == "INFO" and record.name == "IdentifyMissingDataInterval" + ] + + assert len(expected_logs) == len( + actual_logs + ), f"Expected {len(expected_logs)} logs, got {len(actual_logs)} " + for expected, actual in zip(expected_logs, actual_logs): + assert expected == actual, f"Expected: '{expected}', got: '{actual}'" + + +def test_missing_intervals_with_calculated_interval(spark, caplog): + + df = spark.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:00.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:10.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:20.000", "Good", "0.129999995"), + ( + "A2PS64V0J.:ZUX09R", + "2024-01-02 00:00:36.000", + "Good", + "0.150000006", + ), # Missing interval (20s to 36s) + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:45.000", "Good", "0.340000004"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:55.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:01:05.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:01:15.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:01:25.000", "Good", "0.150000006"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:01:30.000", "Good", "0.340000004"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + monitor = IdentifyMissingDataInterval( + df=df, + ) + + with caplog.at_level(logging.INFO, logger="IdentifyMissingDataInterval"): + monitor.check() + expected_logs = [ + "Using median of time differences as expected interval: 10000.0 ms", + "Calculated tolerance: 10.0 ms (MAD-based)", + "Maximum acceptable interval with tolerance: 10010.0 ms", + "Detected Missing Intervals:", + "Tag: A2PS64V0J.:ZUX09R Missing Interval from 2024-01-02 00:00:20 to 2024-01-02 00:00:36 Duration: 0h 0m 16s", + ] + actual_logs = [ + record.message + for record in caplog.records + if record.levelname == "INFO" and record.name == "IdentifyMissingDataInterval" + ] + + assert len(expected_logs) == len( + actual_logs + ), f"Expected {len(expected_logs)} logs, got {len(actual_logs)} " + for expected, actual in zip(expected_logs, actual_logs): + assert expected == actual, f"Expected: '{expected}', got: '{actual}'" + + +def test_no_missing_intervals(spark, caplog): + + df = spark.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:00.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:10.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:20.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:30.000", "Good", "0.150000006"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:40.000", "Good", "0.340000004"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:50.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:01:00.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:01:10.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:01:20.000", "Good", "0.150000006"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:01:30.000", "Good", "0.340000004"), + ], + ["TagName", "EventTime", "Status", "Value"], 
+    )
+    monitor = IdentifyMissingDataInterval(
+        df=df,
+        interval="10s",
+        tolerance="5s",
+    )
+
+    with caplog.at_level(logging.INFO, logger="IdentifyMissingDataInterval"):
+        monitor.check()
+    expected_logs = [
+        "Using provided expected interval: 10000.0 ms",
+        "Using provided tolerance: 5000.0 ms",
+        "Maximum acceptable interval with tolerance: 15000.0 ms",
+        "No missing intervals detected.",
+    ]
+    actual_logs = [
+        record.message
+        for record in caplog.records
+        if record.levelname == "INFO" and record.name == "IdentifyMissingDataInterval"
+    ]
+
+    assert len(expected_logs) == len(
+        actual_logs
+    ), f"Expected {len(expected_logs)} logs, got {len(actual_logs)} "
+    for expected, actual in zip(expected_logs, actual_logs):
+        assert expected == actual, f"Expected: '{expected}', got: '{actual}'"
+
+
+def test_invalid_timedelta_format(spark, caplog):
+    df = spark.createDataFrame(
+        [
+            ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "0.150000006"),
+            ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"),
+        ],
+        ["TagName", "EventTime", "Status", "Value"],
+    )
+    monitor = IdentifyMissingDataInterval(
+        df=df,
+        interval="10seconds",  # should be '10s'
+    )
+
+    with pytest.raises(ValueError) as exc_info:
+        with caplog.at_level(logging.ERROR, logger="IdentifyMissingDataInterval"):
+            monitor.check()
+
+    assert "Invalid time format: 10seconds" in str(exc_info.value)
+    assert "Invalid time format: 10seconds" in caplog.text
+
+
+def test_large_data_set(spark, caplog):
+    base_path = os.path.dirname(__file__)
+    file_path = os.path.join(base_path, "../../test_data.csv")
+    df = spark.read.option("header", "true").csv(file_path)
+    assert df.count() > 0, "Dataframe was not loaded correctly"
+    monitor = IdentifyMissingDataInterval(
+        df=df,
+        interval="1s",
+        tolerance="10ms",
+    )
+    with caplog.at_level(logging.INFO, logger="IdentifyMissingDataInterval"):
+        monitor.check()
+    expected_logs = [
+        "Tag: MISSING_DATA Missing Interval from 2024-01-02 00:08:11 to 2024-01-02 00:08:13 Duration: 0h 0m 2s"
+    ]
+    actual_logs = [
+        record.message
+        for record in caplog.records
+        if record.levelname == "INFO"
+        and record.name == "IdentifyMissingDataInterval"
+        and "MISSING_DATA" in record.message
+    ]
+
+    assert any(
+        expected in actual for expected in expected_logs for actual in actual_logs
+    ), "Expected logs not found in actual logs"
diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_pattern.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_pattern.py
new file mode 100644
index 000000000..52fb27799
--- /dev/null
+++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_pattern.py
@@ -0,0 +1,244 @@
+# Copyright 2025 RTDIP
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
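+# Usage sketch (as exercised by the tests below; the patterns shown here are
+# illustrative only):
+#
+#     monitor = IdentifyMissingDataPattern(
+#         df=df, patterns=[{"second": 0}, {"second": 13}], frequency="minutely", tolerance="1s"
+#     )
+#     monitor.check()  # logs each expected timestamp with no matching row within the tolerance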
+
+import pytest
+import logging
+import os
+
+from pyspark.sql import SparkSession
+
+from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_pattern import (
+    IdentifyMissingDataPattern,
+)
+
+
+@pytest.fixture(scope="session")
+def spark():
+    spark = (
+        SparkSession.builder.master("local[2]")
+        .appName("IdentifyMissingDataPatternTest")
+        .getOrCreate()
+    )
+    spark.sparkContext.setLogLevel("ERROR")  # Suppress WARN messages
+    yield spark
+    spark.stop()
+
+
+def test_no_missing_patterns(spark, caplog):
+    df = spark.createDataFrame(
+        [
+            ("A2PS64V0J.:ZUX09R", "2024-02-11 00:00:00", "Good", "0.129999995"),
+            ("A2PS64V0J.:ZUX09R", "2024-02-11 00:00:13", "Good", "0.119999997"),
+            ("A2PS64V0J.:ZUX09R", "2024-02-11 00:00:49", "Good", "0.129999995"),
+            ("A2PS64V0J.:ZUX09R", "2024-02-11 00:01:00", "Good", "0.129999995"),
+            ("A2PS64V0J.:ZUX09R", "2024-02-11 00:01:13", "Good", "0.119999997"),
+            ("A2PS64V0J.:ZUX09R", "2024-02-11 00:01:49", "Good", "0.129999995"),
+        ],
+        ["TagName", "EventTime", "Status", "Value"],
+    )
+    patterns = [{"second": 0}, {"second": 13}, {"second": 49}]
+    monitor = IdentifyMissingDataPattern(
+        df=df, patterns=patterns, frequency="minutely", tolerance="1s"
+    )
+
+    with caplog.at_level(logging.INFO, logger="IdentifyMissingDataPattern"):
+        monitor.check()
+
+    actual_logs = [
+        record.message
+        for record in caplog.records
+        if record.levelname == "INFO" and record.name == "IdentifyMissingDataPattern"
+    ]
+    assert "Using tolerance: 1000.0 ms (1.0 seconds)" in actual_logs
+    assert "Identified 0 missing patterns." in actual_logs
+    assert "No missing patterns detected." in actual_logs
+
+
+def test_some_missing_patterns(spark, caplog):
+    df = spark.createDataFrame(
+        [
+            ("A2PS64V0J.:ZUX09R", "2024-02-11 00:00:00", "Good", "0.129999995"),
+            ("A2PS64V0J.:ZUX09R", "2024-02-11 00:00:13", "Good", "0.119999997"),
+            ("A2PS64V0J.:ZUX09R", "2024-02-11 00:00:49", "Good", "0.129999995"),
+            (
+                "A2PS64V0J.:ZUX09R",
+                "2024-02-11 00:01:05",
+                "Good",
+                "0.129999995",
+            ),  # Nothing matches in minute 1
+            ("A2PS64V0J.:ZUX09R", "2024-02-11 00:01:17", "Good", "0.119999997"),
+        ],
+        ["TagName", "EventTime", "Status", "Value"],
+    )
+    patterns = [{"second": 0}, {"second": 13}, {"second": 49}]
+    monitor = IdentifyMissingDataPattern(
+        df=df, patterns=patterns, frequency="minutely", tolerance="1s"
+    )
+
+    with caplog.at_level(logging.INFO, logger="IdentifyMissingDataPattern"):
+        monitor.check()
+
+    actual_logs = [
+        record.message
+        for record in caplog.records
+        if record.levelname == "INFO" and record.name == "IdentifyMissingDataPattern"
+    ]
+    assert "Using tolerance: 1000.0 ms (1.0 seconds)" in actual_logs
+    assert "Identified 2 missing patterns."
in actual_logs + assert "Detected Missing Patterns:" in actual_logs + assert "Missing Pattern at 2024-02-11 00:01:00.000" in actual_logs + assert "Missing Pattern at 2024-02-11 00:01:13.000" in actual_logs + + +def test_all_missing_patterns(spark, caplog): + df = spark.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-02-11 00:00:05", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-02-11 00:00:17", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-02-11 00:00:29", "Good", "0.129999995"), + ( + "A2PS64V0J.:ZUX09R", + "2024-02-11 00:01:05", + "Good", + "0.129999995", + ), + ("A2PS64V0J.:ZUX09R", "2024-02-11 00:01:17", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-02-11 00:01:29", "Good", "0.129999995"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + patterns = [{"second": 0}, {"second": 13}, {"second": 49}] + monitor = IdentifyMissingDataPattern( + df=df, patterns=patterns, frequency="minutely", tolerance="1s" + ) + + with caplog.at_level(logging.INFO, logger="IdentifyMissingDataPattern"): + monitor.check() + + actual_logs = [ + record.message + for record in caplog.records + if record.levelname == "INFO" and record.name == "IdentifyMissingDataPattern" + ] + assert "Using tolerance: 1000.0 ms (1.0 seconds)" in actual_logs + assert "Identified 5 missing patterns." in actual_logs + assert "Detected Missing Patterns:" in actual_logs + missing_patterns = [ + "Missing Pattern at 2024-02-11 00:00:00.000", + "Missing Pattern at 2024-02-11 00:00:13.000", + "Missing Pattern at 2024-02-11 00:00:49.000", + "Missing Pattern at 2024-02-11 00:01:00.000", + "Missing Pattern at 2024-02-11 00:01:13.000", + ] + for pattern in missing_patterns: + assert pattern in actual_logs + + +def test_invalid_patterns(spark, caplog): + df = spark.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-02-11 00:01:49", "Good", "0.129999995"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + patterns = [ + {"minute": 0}, # Invalid for 'minutely' frequency + {"second": 13}, + {"second": 49}, + ] + monitor = IdentifyMissingDataPattern( + df=df, patterns=patterns, frequency="minutely", tolerance="1s" + ) + + with pytest.raises(ValueError) as exc_info, caplog.at_level( + logging.ERROR, logger="IdentifyMissingDataPattern" + ): + monitor.check() + + assert "Each pattern must have a 'second' key for 'minutely' frequency." 
in str(
+        exc_info.value
+    )
+
+
+def test_invalid_tolerance_format(spark, caplog):
+    df = spark.createDataFrame(
+        [
+            ("A2PS64V0J.:ZUX09R", "2024-02-11 00:01:49", "Good", "0.129999995"),
+        ],
+        ["TagName", "EventTime", "Status", "Value"],
+    )
+    patterns = [{"second": 0}, {"second": 13}, {"second": 49}]
+    monitor = IdentifyMissingDataPattern(
+        df=df, patterns=patterns, frequency="minutely", tolerance="1minute"
+    )
+
+    with pytest.raises(ValueError) as exc_info, caplog.at_level(
+        logging.ERROR, logger="IdentifyMissingDataPattern"
+    ):
+        monitor.check()
+
+    assert "Invalid tolerance format: 1minute" in str(exc_info.value)
+    actual_logs = [
+        record.message
+        for record in caplog.records
+        if record.levelname == "ERROR" and record.name == "IdentifyMissingDataPattern"
+    ]
+    assert "Invalid tolerance format: 1minute" in actual_logs
+
+
+def test_hourly_patterns_with_microseconds(spark, caplog):
+    df = spark.createDataFrame(
+        [
+            ("A2PS64V0J.:ZUX09R", "2024-02-11 00:00:00.200", "Good", "0.129999995"),
+            ("A2PS64V0J.:ZUX09R", "2024-02-11 00:59:59.800", "Good", "0.129999995"),
+            ("A2PS64V0J.:ZUX09R", "2024-02-11 01:00:30.500", "Good", "0.129999995"),
+        ],
+        ["TagName", "EventTime", "Status", "Value"],
+    )
+
+    patterns = [
+        {"minute": 0, "second": 0, "millisecond": 0},
+        {"minute": 30, "second": 30, "millisecond": 500},
+    ]
+    monitor = IdentifyMissingDataPattern(
+        df=df, patterns=patterns, frequency="hourly", tolerance="500ms"
+    )
+
+    with caplog.at_level(logging.INFO, logger="IdentifyMissingDataPattern"):
+        monitor.check()
+
+    actual_logs = [
+        record.message
+        for record in caplog.records
+        if record.levelname == "INFO" and record.name == "IdentifyMissingDataPattern"
+    ]
+    assert "Using tolerance: 500.0 ms (0.5 seconds)" in actual_logs
+    assert "Identified 1 missing patterns." in actual_logs
+    assert "Detected Missing Patterns:" in actual_logs
+    assert "Missing Pattern at 2024-02-11 00:30:30.500" in actual_logs
+
+
+def test_large_data_set(spark):
+    base_path = os.path.dirname(__file__)
+    file_path = os.path.join(base_path, "../../test_data.csv")
+    df = spark.read.option("header", "true").csv(file_path)
+    assert df.count() > 0, "Dataframe was not loaded correctly"
+    patterns = [{"second": 0}, {"second": 13}, {"second": 49}]
+    monitor = IdentifyMissingDataPattern(
+        df=df, patterns=patterns, frequency="minutely", tolerance="1s"
+    )
+    monitor.check()
diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_moving_average.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_moving_average.py
new file mode 100644
index 000000000..46b7396f9
--- /dev/null
+++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_moving_average.py
@@ -0,0 +1,104 @@
+# Copyright 2025 RTDIP
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
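+# Usage sketch (as exercised by the tests below):
+#
+#     detector = MovingAverage(df, window_size=3)
+#     detector.check()  # logs a trailing moving average of 'Value' per tag, ordered by EventTime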
+import pytest
+import os
+from pyspark.sql import SparkSession
+from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.moving_average import (
+    MovingAverage,
+)
+import logging
+from io import StringIO
+
+
+@pytest.fixture(scope="session")
+def spark():
+    spark = (
+        SparkSession.builder.master("local[2]")
+        .appName("MovingAverageTest")
+        .getOrCreate()
+    )
+    yield spark
+    spark.stop()
+
+
+@pytest.fixture
+def log_capture():
+    log_stream = StringIO()
+    logger = logging.getLogger("MovingAverage")
+    logger.setLevel(logging.INFO)
+    handler = logging.StreamHandler(log_stream)
+    formatter = logging.Formatter("%(message)s")
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    yield log_stream
+    logger.removeHandler(handler)
+    handler.close()
+
+
+def test_moving_average_basic(spark, log_capture):
+    df = spark.createDataFrame(
+        [
+            ("Tag1", "2024-01-02 03:49:45.000", "Good", 1.0),
+            ("Tag1", "2024-01-02 07:53:11.000", "Good", 2.0),
+            ("Tag1", "2024-01-02 11:56:42.000", "Good", 3.0),
+            ("Tag1", "2024-01-02 16:00:12.000", "Good", 4.0),
+            ("Tag1", "2024-01-02 20:03:46.000", "Good", 5.0),
+        ],
+        ["TagName", "EventTime", "Status", "Value"],
+    )
+
+    detector = MovingAverage(df, window_size=3)
+    detector.check()
+
+    expected_logs = [
+        "Computing moving averages:",
+        "Tag: Tag1, Time: 2024-01-02 03:49:45, Value: 1.0, Moving Avg: 1.0",
+        "Tag: Tag1, Time: 2024-01-02 07:53:11, Value: 2.0, Moving Avg: 1.5",
+        "Tag: Tag1, Time: 2024-01-02 11:56:42, Value: 3.0, Moving Avg: 2.0",
+        "Tag: Tag1, Time: 2024-01-02 16:00:12, Value: 4.0, Moving Avg: 3.0",
+        "Tag: Tag1, Time: 2024-01-02 20:03:46, Value: 5.0, Moving Avg: 4.0",
+    ]
+
+    actual_logs = log_capture.getvalue().strip().split("\n")
+
+    assert len(expected_logs) == len(
+        actual_logs
+    ), f"Expected {len(expected_logs)} logs, got {len(actual_logs)}"
+
+    for expected, actual in zip(expected_logs, actual_logs):
+        assert expected in actual, f"Expected: '{expected}', got: '{actual}'"
+
+
+def test_moving_average_invalid_window_size(spark):
+    df = spark.createDataFrame(
+        [
+            ("Tag1", "2024-01-02 03:49:45.000", "Good", 1.0),
+            ("Tag1", "2024-01-02 07:53:11.000", "Good", 2.0),
+        ],
+        ["TagName", "EventTime", "Status", "Value"],
+    )
+
+    with pytest.raises(ValueError, match="window_size must be a positive integer."):
+        MovingAverage(df, window_size=-2)
+
+
+def test_large_dataset(spark):
+    base_path = os.path.dirname(__file__)
+    file_path = os.path.join(base_path, "../../test_data.csv")
+    df = spark.read.option("header", "true").csv(file_path)
+
+    assert df.count() > 0, "DataFrame was not loaded correctly."
+ + detector = MovingAverage(df, window_size=5) + detector.check() diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/test_data.csv b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/test_data.csv new file mode 100644 index 000000000..71e1e0895 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/test_data.csv @@ -0,0 +1,1019 @@ +TagName,EventTime,Status,Value +A2PS64V0J.:ZUX09R,2024-01-02 20:03:46.000,Good,0.3400000035762787 +A2PS64V0J.:ZUX09R,2024-01-02 16:00:12.000,Good,0.1500000059604644 +A2PS64V0J.:ZUX09R,2024-01-02 11:56:42.000,Good,0.1299999952316284 +A2PS64V0J.:ZUX09R,2024-01-02 07:53:11.000,Good,0.1199999973177909 +A2PS64V0J.:ZUX09R,2024-01-02 03:49:45.000,Good,0.1299999952316284 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 20:09:58.053,Good,7107.82080078125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 12:27:10.518,Good,19407.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 05:23:10.143,Good,19403.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 01:31:10.086,Good,19399.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 23:41:10.358,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 18:09:10.488,Good,19375.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 16:15:10.492,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 06:51:10.077,Good,19403.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 07:42:24.227,Good,6.55859375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 06:08:23.777,Good,5921.5498046875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 05:14:10.896,Good,5838.216796875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 01:37:10.967,Good,5607.82568359375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 00:26:53.449,Good,5563.7080078125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 01:11:10.361,Good,19396.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 14:01:10.150,Good,19409.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 10:22:10.018,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:58:10.496,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 06:50:10.483,Good,19402.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 07:26:20.495,Good,6.55126953125 +R0:Z24WVP.0S10L,2024-01-02 21:26:00.001,Good,2266.861083984375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 21:16:08.988,Good,7205.85986328125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 14:25:10.252,Good,19410.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 07:18:10.275,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 16:12:10.288,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:04:10.256,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 03:16:10.178,Good,19401.0 +R0:Z24WVP.0S10L,2024-01-02 16:21:00.001,Good,2267.4541015625 +R0:Z24WVP.0S10L,2024-01-02 10:28:01.001,Good,2344.558349609375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 07:23:40.514,Good,6132.33349609375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 04:34:57.886,Good,5818.609375 +1N325T3MTOR-P0L29:9.T0,2024-01-02 19:45:10.416,Good,19371.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 16:35:10.108,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 11:22:10.381,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 01:08:10.214,Good,19396.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 00:57:10.083,Good,19397.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 23:44:10.054,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 21:57:10.201,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 19:38:10.450,Good,19375.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 15:13:10.477,Good,19385.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 09:12:10.466,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 08:22:10.145,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 06:42:10.099,Good,19404.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 17:12:09.997,Good,6867.62548828125 
+-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 08:54:59.922,Good,6249.98046875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 06:45:10.238,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 08:52:10.381,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 06:37:10.213,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 10:13:10.226,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 07:43:10.096,Good,19404.0 +R0:Z24WVP.0S10L,2024-01-02 21:08:00.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 04:44:01.001,Good,2307.78564453125 +R0:Z24WVP.0S10L,2024-01-02 03:38:00.001,Good,2306.006103515625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 05:30:10.341,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 19:06:10.475,Good,19375.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 14:36:10.389,Good,19410.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 20:01:10.231,Good,19374.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 03:20:10.309,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 02:52:10.136,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 00:08:10.000,Good,19395.0 +R0:Z24WVP.0S10L,2024-01-02 22:40:00.001,Good,2300.074951171875 +R0:Z24WVP.0S10L,2024-01-02 10:22:00.001,Good,2346.9306640625 +PM20:PCO4SLU_000R4.3D0_T-23,2024-01-02 23:39:20.058,Good,5.300000190734863 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 14:35:31.661,Good,6514.685546875 +1N325T3MTOR-P0L29:9.T0,2024-01-02 17:34:10.228,Good,19375.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 16:39:10.043,Good,19375.0 +R0:Z24WVP.0S10L,2024-01-02 20:02:00.000,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 01:45:01.001,Good,2304.81982421875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 12:38:10.472,Good,19406.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 07:19:10.316,Good,19403.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 01:28:10.208,Good,19399.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 00:12:10.481,Good,19395.0 +R0:Z24WVP.0S10L,2024-01-02 18:54:00.001,Good,2266.26806640625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 19:48:56.048,Good,7073.50732421875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 23:38:10.214,Good,19377.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 12:06:10.336,Good,19405.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 01:19:10.497,Good,19399.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 23:35:10.480,Good,19378.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 22:44:10.247,Good,19380.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 15:42:10.046,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 00:40:10.497,Good,19397.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 09:47:55.430,Good,6.615234375 +R0:Z24WVP.0S10L,2024-01-02 12:36:00.001,Good,2264.488525390625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 21:41:15.646,Good,7240.17333984375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 19:23:42.152,Good,7034.29150390625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 06:31:30.460,Good,5975.47119140625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 17:48:10.347,Good,19373.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 01:32:10.261,Good,19399.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 21:14:10.435,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 23:30:10.228,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 06:54:10.356,Good,19403.0 +R0:Z24WVP.0S10L,2024-01-02 23:47:00.001,Good,2258.5576171875 +R0:Z24WVP.0S10L,2024-01-02 23:05:00.001,Good,2298.88916015625 +R0:Z24WVP.0S10L,2024-01-02 18:39:00.001,Good,2266.26806640625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 07:03:36.141,Good,6068.6083984375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 03:33:10.113,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 18:40:10.232,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 09:47:10.467,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 05:50:10.087,Good,19403.0 
+TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 21:59:10.357,Good,19379.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 20:04:10.452,Good,19374.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 15:05:10.307,Good,19394.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 15:03:10.279,Good,19395.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 11:11:10.407,Good,19403.0 +R0:Z24WVP.0S10L,2024-01-02 14:25:00.001,Good,2265.081787109375 +R0:Z24WVP.0S10L,2024-01-02 01:17:00.001,Good,2306.006103515625 +1N325T3MTOR-P0L29:9.T0,2024-01-02 13:23:10.098,Good,19409.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 14:31:10.337,Good,19411.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 00:05:10.479,Good,19396.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 04:22:36.151,Good,6.43603515625 +R0:Z24WVP.0S10L,2024-01-02 19:30:00.014,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 07:22:00.001,Good,2310.158203125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 22:43:28.441,Good,7284.291015625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 17:33:10.245,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 16:24:10.199,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 08:54:10.428,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 07:34:10.156,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 05:13:10.270,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 00:33:10.295,Good,19397.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 18:40:10.232,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 09:39:10.294,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 08:36:10.294,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 07:18:10.275,Good,19404.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 16:47:04.123,Good,6848.017578125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 06:05:22.981,Good,5906.84423828125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 04:22:10.076,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 13:34:10.499,Good,19408.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 16:46:10.139,Good,19377.0 +R0:Z24WVP.0S10L,2024-01-02 12:53:00.001,Good,2265.6748046875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 01:25:06.919,Good,5588.2177734375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 20:02:10.354,Good,19373.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 06:28:10.325,Good,19403.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 00:48:10.122,Good,19396.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 11:53:10.049,Good,19405.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 06:34:10.389,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 16:19:10.174,Good,19376.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 04:35:39.227,Good,6.4423828125 +R0:Z24WVP.0S10L,2024-01-02 14:45:00.001,Good,2266.26806640625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 22:42:10.034,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 22:07:10.035,Good,19380.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 21:15:10.449,Good,19379.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 17:48:10.347,Good,19373.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 17:11:10.376,Good,19375.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 13:46:10.091,Good,19409.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 11:55:10.339,Good,19404.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 03:38:44.198,Good,5705.8642578125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 22:21:10.452,Good,19379.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 19:20:10.382,Good,19379.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 16:10:10.095,Good,19377.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 14:35:10.297,Good,19410.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 08:42:10.486,Good,19404.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 00:32:10.169,Good,19395.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 08:04:10.068,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 04:32:10.413,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 
10:14:10.274,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 01:54:10.132,Good,19399.0 +R0:Z24WVP.0S10L,2024-01-02 20:54:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 02:02:00.001,Good,2304.81982421875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 14:48:34.105,Good,6534.29345703125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 02:57:10.117,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 14:15:10.393,Good,19410.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:35:10.215,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 22:16:10.070,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 19:01:10.497,Good,19375.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 16:38:10.380,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 15:25:10.428,Good,19375.0 +R0:Z24WVP.0S10L,2024-01-02 14:54:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 12:15:00.001,Good,2264.488525390625 +R0:Z24WVP.0S10L,2024-01-02 09:36:00.001,Good,2312.53076171875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 06:24:27.269,Good,5960.765625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 04:30:56.563,Good,5818.609375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 22:17:10.113,Good,19377.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 06:19:10.348,Good,19403.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 05:39:10.120,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 09:35:10.483,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 22:17:10.113,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 21:52:10.264,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 19:58:10.031,Good,19375.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 11:21:10.383,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 10:55:10.264,Good,19403.0 +R0:Z24WVP.0S10L,2024-01-02 19:10:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 10:38:00.001,Good,2347.52392578125 +R0:Z24WVP.0S10L,2024-01-02 01:16:01.001,Good,2305.413330078125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 00:25:10.042,Good,19396.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 22:11:10.233,Good,19379.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:36:10.463,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 20:51:10.216,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 14:25:10.252,Good,19410.0 +R0:Z24WVP.0S10L,2024-01-02 18:04:00.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 14:48:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 02:26:01.001,Good,2304.81982421875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 23:45:10.147,Good,19377.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 14:37:10.404,Good,19411.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 20:50:10.027,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 19:08:10.248,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 18:53:10.249,Good,19372.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 12:46:10.520,Good,19409.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 13:57:10.389,Good,19409.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 10:57:10.430,Good,19403.0 +R0:Z24WVP.0S10L,2024-01-02 20:12:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 14:46:01.001,Good,2266.861083984375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 15:46:50.909,Good,6700.95947265625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 14:40:32.055,Good,6519.58740234375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 12:12:52.261,Good,6362.72509765625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 05:04:07.396,Good,5828.4130859375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 12:02:10.417,Good,19405.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 11:48:10.231,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 23:10:10.055,Good,19378.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 21:22:10.379,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 22:05:10.279,Good,19376.0 
+O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 05:18:49.267,Good,6.4658203125 +R0:Z24WVP.0S10L,2024-01-02 01:43:00.001,Good,2304.81982421875 +R0:Z24WVP.0S10L,2024-01-02 01:03:00.001,Good,2304.81982421875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 21:30:10.122,Good,19380.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 15:16:10.297,Good,19383.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 02:24:10.132,Good,19401.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 00:21:10.191,Good,19396.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 17:00:10.325,Good,19378.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:26:10.116,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 17:16:10.199,Good,19374.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 13:54:10.106,Good,19409.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 19:15:12.284,Good,6.810546875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 23:51:10.379,Good,19377.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 06:41:10.504,Good,19403.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 04:24:10.265,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 16:50:10.432,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 15:33:10.389,Good,19375.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 10:09:00.796,Good,6.625 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 05:15:48.607,Good,6.46435546875 +R0:Z24WVP.0S10L,2024-01-02 21:47:00.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 12:44:00.001,Good,2264.488525390625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 08:17:51.642,Good,6205.86279296875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 21:57:10.201,Good,19377.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 18:25:10.157,Good,19376.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 08:39:10.378,Good,19402.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 01:18:10.423,Good,19398.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 22:25:10.262,Good,19380.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 07:22:10.465,Good,19403.0 +R0:Z24WVP.0S10L,2024-01-02 23:00:00.001,Good,2296.5166015625 +R0:Z24WVP.0S10L,2024-01-02 05:50:00.001,Good,2308.378662109375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 04:20:10.029,Good,19404.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 02:56:10.024,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 18:31:10.152,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:13:10.406,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 12:35:10.110,Good,19406.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 03:47:10.341,Good,19403.0 +R0:Z24WVP.0S10L,2024-01-02 10:45:00.001,Good,2263.8955078125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 08:38:10.281,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 14:53:10.052,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 13:10:10.491,Good,19408.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 07:51:10.090,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:05:10.291,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 09:54:10.181,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 03:59:10.079,Good,19402.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 06:19:03.191,Good,6.515625 +R0:Z24WVP.0S10L,2024-01-02 18:52:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 17:57:00.001,Good,2267.4541015625 +R0:Z24WVP.0S10L,2024-01-02 14:43:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 03:31:01.001,Good,2306.006103515625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 21:49:17.685,Good,7249.97705078125 +1N325T3MTOR-P0L29:9.T0,2024-01-02 22:57:10.292,Good,19378.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 15:36:10.106,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 02:38:10.212,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 22:25:10.262,Good,19380.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 19:39:10.032,Good,19373.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 00:50:10.168,Good,19396.0 
+O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 05:11:47.514,Good,6.46142578125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 15:25:45.091,Good,6656.841796875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 12:40:10.199,Good,19406.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 14:30:10.243,Good,19410.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 13:24:10.225,Good,19408.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 21:45:10.330,Good,19379.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 02:05:10.348,Good,19399.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 06:44:09.960,Good,6.53076171875 +R0:Z24WVP.0S10L,2024-01-02 19:43:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 11:43:01.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 05:52:00.001,Good,2308.378662109375 +R0:Z24WVP.0S10L,2024-01-02 00:53:00.001,Good,2305.413330078125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 21:55:19.247,Good,7254.87939453125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 18:37:10.382,Good,19373.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 13:13:10.228,Good,19410.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 18:56:10.434,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 08:58:10.254,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 20:45:10.464,Good,19376.0 +R0:Z24WVP.0S10L,2024-01-02 13:07:00.001,Good,2264.488525390625 +R0:Z24WVP.0S10L,2024-01-02 12:38:00.001,Good,2265.081787109375 +R0:Z24WVP.0S10L,2024-01-02 10:32:00.001,Good,2346.9306640625 +R0:Z24WVP.0S10L,2024-01-02 07:45:00.001,Good,2310.158203125 +R0:Z24WVP.0S10L,2024-01-02 02:42:00.001,Good,2304.81982421875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 08:38:57.109,Good,6220.56884765625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 18:22:10.184,Good,19374.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 18:08:10.394,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 15:24:10.385,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 10:56:10.343,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 01:21:10.136,Good,19398.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 00:41:43.646,Good,6.39013671875 +R0:Z24WVP.0S10L,2024-01-02 03:55:00.001,Good,2306.006103515625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 09:10:04.230,Good,6245.07861328125 +1N325T3MTOR-P0L29:9.T0,2024-01-02 17:36:10.430,Good,19375.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 17:28:10.059,Good,19375.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 12:21:10.044,Good,19406.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 00:18:10.500,Good,19396.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 09:18:10.258,Good,19403.0 +R0:Z24WVP.0S10L,2024-01-02 08:38:00.002,Good,2311.344482421875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 17:32:14.792,Good,6892.13525390625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 05:30:14.921,Good,5843.119140625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 22:11:10.233,Good,19379.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 22:06:10.388,Good,19378.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 20:10:10.302,Good,19375.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 08:25:10.032,Good,19402.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 04:45:10.419,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 05:17:10.151,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 10:22:10.018,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 05:09:10.247,Good,19403.0 +R0:Z24WVP.0S10L,2024-01-02 23:40:00.001,Good,2301.8544921875 +R0:Z24WVP.0S10L,2024-01-02 13:45:00.001,Good,2265.081787109375 +R0:Z24WVP.0S10L,2024-01-02 07:19:00.001,Good,2310.158203125 +R0:Z24WVP.0S10L,2024-01-02 02:41:00.001,Good,2305.413330078125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 15:29:46.609,Good,6676.44970703125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 04:33:57.828,Good,5823.51123046875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 
10:21:10.464,Good,19402.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 09:49:10.165,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 22:04:10.313,Good,19379.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 15:22:10.304,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 14:36:10.389,Good,19410.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 04:05:10.365,Good,19403.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 05:56:57.891,Good,6.5009765625 +R0:Z24WVP.0S10L,2024-01-02 16:49:00.001,Good,2267.4541015625 +R0:Z24WVP.0S10L,2024-01-02 15:38:00.001,Good,2266.861083984375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 21:51:18.376,Good,7245.0751953125 +1N325T3MTOR-P0L29:9.T0,2024-01-02 22:23:10.093,Good,19379.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 14:22:10.398,Good,19410.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 14:05:10.327,Good,19409.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 18:53:10.249,Good,19372.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 12:07:10.458,Good,19406.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 07:35:10.184,Good,19404.0 +R0:Z24WVP.0S10L,2024-01-02 21:43:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 14:42:01.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 00:44:00.001,Good,2304.81982421875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 10:09:19.567,Good,6274.490234375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 03:41:10.441,Good,19404.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 03:37:09.997,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 23:11:10.120,Good,19375.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 02:33:10.374,Good,19402.0 +R0:Z24WVP.0S10L,2024-01-02 23:45:00.001,Good,2275.7578125 +R0:Z24WVP.0S10L,2024-01-02 05:58:00.001,Good,2309.56494140625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 10:37:10.172,Good,19402.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 07:02:10.081,Good,19402.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 07:02:10.081,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 22:42:10.034,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 12:50:10.139,Good,19408.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 11:17:10.123,Good,19403.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 05:55:57.659,Good,6.49951171875 +R0:Z24WVP.0S10L,2024-01-02 23:37:00.001,Good,2300.074951171875 +R0:Z24WVP.0S10L,2024-01-02 02:54:00.001,Good,2306.006103515625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 09:18:05.695,Good,6259.7841796875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 15:21:10.276,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 23:32:10.219,Good,19378.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 17:37:10.431,Good,19375.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 09:41:10.450,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 08:42:10.486,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 02:51:10.029,Good,19403.0 +R0:Z24WVP.0S10L,2024-01-02 18:02:00.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 10:17:00.001,Good,2344.558349609375 +R0:Z24WVP.0S10L,2024-01-02 06:03:00.001,Good,2309.56494140625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 08:53:59.739,Good,6245.07861328125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 06:19:26.112,Good,5941.15771484375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 13:27:10.473,Good,19409.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 09:50:10.257,Good,19402.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 09:05:10.021,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 23:37:10.214,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 23:20:10.142,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 14:17:10.062,Good,19410.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 04:11:10.500,Good,19404.0 +R0:Z24WVP.0S10L,2024-01-02 17:24:00.001,Good,2267.4541015625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 06:29:30.287,Good,5970.5693359375 
+_LT2EPL-9PM0.OROTENV3:,2024-01-02 13:44:10.013,Good,19409.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 09:29:10.029,Good,19401.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 03:08:10.053,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 23:47:10.271,Good,19375.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 14:23:10.068,Good,19411.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 10:45:10.004,Good,19403.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 20:26:59.616,Good,7122.52685546875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 10:34:10.422,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 02:20:10.225,Good,19401.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 17:51:10.236,Good,19375.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 07:59:10.286,Good,19403.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 21:33:46.754,Good,6.81005859375 +R0:Z24WVP.0S10L,2024-01-02 18:51:00.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 14:06:01.001,Good,2266.26806640625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 04:18:54.746,Good,5794.099609375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 05:25:10.303,Good,19404.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 04:37:10.348,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 09:30:10.125,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:21:10.432,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 02:07:10.491,Good,19400.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 10:52:10.285,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 04:13:10.194,Good,19403.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 09:39:52.993,Good,6.61083984375 +R0:Z24WVP.0S10L,2024-01-02 14:36:00.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 10:24:00.001,Good,2342.779052734375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 04:27:56.333,Good,5818.609375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 22:40:10.365,Good,19379.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 06:29:10.405,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 13:54:10.106,Good,19409.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 07:36:10.230,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 21:08:10.070,Good,19374.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 14:46:10.068,Good,19411.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 06:19:10.348,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 00:38:10.325,Good,19396.0 +R0:Z24WVP.0S10L,2024-01-02 13:15:00.001,Good,2264.488525390625 +R0:Z24WVP.0S10L,2024-01-02 09:49:00.001,Good,2345.151611328125 +R0:Z24WVP.0S10L,2024-01-02 06:30:00.001,Good,2308.971923828125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 18:50:34.408,Good,6990.17431640625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 01:34:09.551,Good,5607.82568359375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 19:53:10.261,Good,19373.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 00:41:10.106,Good,19396.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 12:17:53.443,Good,6377.43115234375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 18:43:10.331,Good,19375.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 15:42:10.046,Good,19376.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 09:28:10.514,Good,19402.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 02:47:10.305,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 19:24:10.180,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 12:42:10.399,Good,19408.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 11:09:10.224,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 15:30:10.074,Good,19375.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 05:53:10.081,Good,19404.0 +R0:Z24WVP.0S10L,2024-01-02 21:40:01.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 02:05:00.001,Good,2304.81982421875 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 23:38:10.214,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 18:46:10.111,Good,19375.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 
09:58:10.127,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 07:51:10.090,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 05:28:10.082,Good,19405.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 22:49:04.742,Good,6.79833984375 +R0:Z24WVP.0S10L,2024-01-02 07:17:00.001,Good,2310.158203125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 06:40:31.323,Good,6009.78515625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 04:00:49.864,Good,5759.78564453125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 21:55:10.104,Good,19377.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 19:49:10.315,Good,19375.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 09:07:10.167,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 21:47:10.469,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 02:53:10.240,Good,19402.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 17:55:53.258,Good,6.7783203125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 20:05:57.805,Good,7098.01708984375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 13:47:18.272,Good,6455.8623046875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 04:17:54.710,Good,5808.80517578125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 18:30:10.082,Good,19375.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 18:11:10.145,Good,19375.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 14:10:10.123,Good,19410.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 08:46:10.198,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 02:34:10.017,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 00:44:10.292,Good,19396.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 18:04:55.376,Good,6.78125 +R0:Z24WVP.0S10L,2024-01-02 16:13:00.001,Good,2267.4541015625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 14:18:26.463,Good,6490.17578125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 04:22:55.937,Good,5818.609375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 13:34:10.499,Good,19408.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 23:14:10.381,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 08:07:10.462,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 23:53:10.115,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 20:33:10.229,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 19:13:10.494,Good,19369.0 +R0:Z24WVP.0S10L,2024-01-02 19:03:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 15:36:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 05:36:00.001,Good,2307.78564453125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 01:36:10.902,Good,5602.92333984375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 21:42:10.070,Good,19380.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 04:41:10.146,Good,19407.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 15:29:09.995,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 11:01:10.012,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 08:44:10.161,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 00:43:10.184,Good,19396.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 17:04:10.483,Good,19374.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 06:29:10.405,Good,19404.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 05:13:48.079,Good,6.462890625 +R0:Z24WVP.0S10L,2024-01-02 19:25:01.005,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 13:26:00.001,Good,2264.488525390625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 11:37:43.098,Good,6323.509765625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 04:40:59.941,Good,5823.51123046875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 12:46:10.520,Good,19409.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 04:47:10.175,Good,19404.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 04:15:10.369,Good,19404.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 03:55:10.308,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 22:06:10.388,Good,19378.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 16:06:10.090,Good,19375.0 
+TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 22:54:10.133,Good,19379.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 07:34:10.156,Good,19403.0 +R0:Z24WVP.0S10L,2024-01-02 08:20:00.001,Good,2310.751220703125 +1N325T3MTOR-P0L29:9.T0,2024-01-02 17:38:10.521,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 11:31:10.184,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 23:33:10.264,Good,19378.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 07:39:23.513,Good,6.55712890625 +R0:Z24WVP.0S10L,2024-01-02 22:10:00.004,Good,2266.26806640625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 23:39:42.178,Good,7338.21240234375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 07:41:10.491,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 09:26:10.264,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 08:20:10.015,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 07:48:10.322,Good,19405.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 04:48:42.149,Good,6.4482421875 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 02:48:10.555,Good,6.40771484375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 20:35:00.708,Good,7147.03662109375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 19:19:41.408,Good,7039.19384765625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 04:04:50.985,Good,5774.49169921875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 23:13:10.358,Good,19376.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 04:10:10.359,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 23:52:10.501,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 14:26:10.299,Good,19411.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 12:18:10.305,Good,19406.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:03:10.098,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 23:55:10.207,Good,19380.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 02:32:10.250,Good,19402.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 23:38:41.723,Good,7333.310546875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 17:38:10.521,Good,19376.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 15:30:10.074,Good,19375.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 06:48:10.312,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 15:38:10.185,Good,19374.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 07:33:10.109,Good,19405.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 08:56:41.384,Good,6.59033203125 +R0:Z24WVP.0S10L,2024-01-02 06:20:00.000,Good,2308.971923828125 +R0:Z24WVP.0S10L,2024-01-02 01:26:00.001,Good,2305.413330078125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 05:36:16.042,Good,5857.82470703125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 04:53:03.575,Good,5828.4130859375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 22:20:10.430,Good,19378.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 11:53:10.049,Good,19405.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 10:17:10.042,Good,19403.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 08:15:10.160,Good,19403.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 02:08:10.042,Good,19400.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 22:46:10.373,Good,19379.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 06:57:10.475,Good,19405.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 18:42:10.287,Good,19375.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 10:01:59.016,Good,6.62158203125 +R0:Z24WVP.0S10L,2024-01-02 15:28:00.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 15:17:01.001,Good,2266.861083984375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 17:34:10.228,Good,19375.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 15:17:10.443,Good,19382.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 13:58:10.441,Good,19410.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 07:14:10.029,Good,19403.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 06:01:21.345,Good,5906.84423828125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 19:54:10.321,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 
18:50:10.468,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 21:13:10.367,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 19:14:10.095,Good,19374.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 10:15:10.427,Good,19403.0 +R0:Z24WVP.0S10L,2024-01-02 16:11:00.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 10:42:00.001,Good,2317.86865234375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 12:34:57.359,Good,6401.94091796875 +1N325T3MTOR-P0L29:9.T0,2024-01-02 01:17:10.377,Good,19399.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 05:26:10.455,Good,19404.0 +R0:Z24WVP.0S10L,2024-01-02 17:03:00.001,Good,2267.4541015625 +R0:Z24WVP.0S10L,2024-01-02 08:50:00.001,Good,2311.344482421875 +R0:Z24WVP.0S10L,2024-01-02 02:23:00.001,Good,2305.413330078125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 17:48:17.999,Good,6916.64501953125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 16:37:01.662,Good,6818.60595703125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 20:43:10.355,Good,19377.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 18:49:10.361,Good,19373.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 07:50:10.001,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 20:14:10.115,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 12:46:10.520,Good,19409.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 03:44:10.087,Good,19402.0 +R0:Z24WVP.0S10L,2024-01-02 15:37:00.001,Good,2266.861083984375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 15:44:49.831,Good,6700.95947265625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 10:08:10.397,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 09:44:10.249,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 05:06:10.086,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 05:17:10.151,Good,19402.0 +R0:Z24WVP.0S10L,2024-01-02 22:48:00.001,Good,2294.14404296875 +R0:Z24WVP.0S10L,2024-01-02 10:34:01.001,Good,2346.337646484375 +R0:Z24WVP.0S10L,2024-01-02 08:11:00.001,Good,2311.344482421875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 16:57:05.956,Good,6852.919921875 +1N325T3MTOR-P0L29:9.T0,2024-01-02 21:11:10.231,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 11:43:10.450,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 11:43:10.450,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 06:16:10.119,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 02:23:10.486,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 21:19:10.217,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 17:05:10.481,Good,19375.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 15:57:25.699,Good,6.73828125 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 05:59:58.650,Good,6.50244140625 +R0:Z24WVP.0S10L,2024-01-02 00:43:00.001,Good,2305.413330078125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 21:04:05.340,Good,7191.15380859375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 15:43:49.817,Good,6696.0576171875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 05:53:10.081,Good,19404.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 00:12:10.481,Good,19395.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 13:28:10.073,Good,19407.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 11:56:10.430,Good,19405.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 04:04:10.272,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 01:06:10.039,Good,19397.0 +R0:Z24WVP.0S10L,2024-01-02 12:34:00.001,Good,2264.488525390625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 03:28:40.945,Good,5715.66845703125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 14:39:10.051,Good,19411.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 01:45:10.038,Good,19400.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 19:56:10.387,Good,19375.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 19:54:10.321,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 18:15:10.412,Good,19375.0 
+1N325T3MTOR-P0L29:9.T0,2024-01-02 15:59:10.328,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 10:26:10.316,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 23:12:10.230,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 16:10:10.095,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 15:58:10.047,Good,19377.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 15:20:17.109,Good,6.72802734375 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 09:13:45.909,Good,6.59765625 +R0:Z24WVP.0S10L,2024-01-02 13:29:00.001,Good,2264.488525390625 +R0:Z24WVP.0S10L,2024-01-02 12:27:00.001,Good,2264.488525390625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 17:49:18.753,Good,6921.546875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 11:09:10.224,Good,19403.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 02:01:10.092,Good,19401.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 21:34:10.447,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 08:15:10.160,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 08:00:10.322,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 22:33:10.275,Good,19379.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 05:19:10.287,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 01:39:10.074,Good,19398.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 11:41:24.009,Good,6.6650390625 +R0:Z24WVP.0S10L,2024-01-02 17:56:00.001,Good,2267.4541015625 +R0:Z24WVP.0S10L,2024-01-02 03:56:00.001,Good,2307.1923828125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 04:52:10.012,Good,19405.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 02:11:10.205,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 18:23:10.425,Good,19375.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 12:40:10.199,Good,19406.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:53:10.250,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 05:49:10.055,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 03:39:10.192,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 02:20:10.225,Good,19401.0 +R0:Z24WVP.0S10L,2024-01-02 00:58:00.001,Good,2304.81982421875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 15:33:47.029,Good,6661.74365234375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 10:23:22.774,Good,6284.2939453125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 12:43:10.451,Good,19407.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 10:44:10.395,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 15:38:10.185,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 08:53:10.368,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 07:37:10.314,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 00:51:10.285,Good,19397.0 +R0:Z24WVP.0S10L,2024-01-02 23:49:00.001,Good,2216.44677734375 +R0:Z24WVP.0S10L,2024-01-02 19:29:00.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 04:20:00.000,Good,2307.78564453125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 16:25:10.277,Good,19375.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 04:43:10.246,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 11:55:10.339,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 16:16:10.039,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 14:22:10.398,Good,19410.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 14:01:10.150,Good,19409.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 10:45:10.004,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 08:14:10.047,Good,19405.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 20:51:37.349,Good,6.81982421875 +R0:Z24WVP.0S10L,2024-01-02 21:02:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 17:33:00.001,Good,2266.861083984375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 14:51:34.876,Good,6544.09716796875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 06:03:22.788,Good,5911.74609375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 03:02:10.022,Good,19402.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 
02:59:10.274,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 23:45:10.147,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 17:49:10.439,Good,19375.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 14:37:10.404,Good,19411.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 06:44:10.164,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 11:37:10.417,Good,19404.0 +R0:Z24WVP.0S10L,2024-01-02 21:01:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 10:09:00.001,Good,2347.52392578125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 23:58:44.589,Good,7348.01611328125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 21:05:10.440,Good,19378.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 20:18:10.365,Good,19378.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 06:50:10.483,Good,19402.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 03:21:10.432,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 20:58:10.014,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 19:36:10.327,Good,19373.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 09:28:10.514,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 03:58:10.496,Good,19403.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 19:20:41.454,Good,7044.095703125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 10:00:17.477,Good,6269.58837890625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 16:55:10.391,Good,19378.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 16:24:10.199,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 17:26:10.395,Good,19374.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 22:50:10.417,Good,19379.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 19:23:10.080,Good,19375.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 13:24:10.225,Good,19408.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 09:07:10.167,Good,19403.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 23:12:34.643,Good,7313.70263671875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 00:33:10.295,Good,19397.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 20:41:10.113,Good,19378.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 17:33:10.245,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 06:46:10.279,Good,19402.0 +R0:Z24WVP.0S10L,2024-01-02 21:18:00.001,Good,2267.4541015625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 16:08:56.169,Good,6764.6845703125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 20:08:10.139,Good,19377.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 14:31:10.337,Good,19411.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 20:44:10.404,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 16:08:10.286,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 17:30:10.069,Good,19375.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 12:43:10.451,Good,19407.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 11:16:10.111,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 10:08:10.397,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 08:42:10.486,Good,19404.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 03:43:26.560,Good,6.42138671875 +R0:Z24WVP.0S10L,2024-01-02 16:50:00.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 05:20:00.001,Good,2307.78564453125 +R0:Z24WVP.0S10L,2024-01-02 01:10:01.001,Good,2304.81982421875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 20:07:57.901,Good,7102.9189453125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 22:28:09.999,Good,19379.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 21:41:10.018,Good,19377.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 09:46:10.423,Good,19403.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 06:55:10.327,Good,19404.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 02:18:10.088,Good,19401.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 09:36:10.495,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:14:10.453,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 23:51:10.379,Good,19377.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 05:35:52.723,Good,6.48291015625 
+-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 13:07:08.204,Good,6416.646484375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 23:19:10.031,Good,19381.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 17:02:10.357,Good,19373.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 20:45:10.464,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 10:57:10.430,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 17:18:10.315,Good,19373.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 11:25:10.137,Good,19404.0 +R0:Z24WVP.0S10L,2024-01-02 22:30:00.001,Good,2297.702880859375 +R0:Z24WVP.0S10L,2024-01-02 19:20:01.005,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 14:30:00.001,Good,2265.081787109375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 08:19:52.152,Good,6205.86279296875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 20:05:10.480,Good,19374.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 15:38:10.185,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 09:16:10.203,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 02:16:10.055,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 00:21:10.191,Good,19396.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 16:05:55.853,Good,6759.78271484375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 08:44:58.008,Good,6235.2744140625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 23:28:10.451,Good,19378.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 08:32:10.509,Good,19404.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 08:21:10.109,Good,19402.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 03:53:10.250,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 20:03:10.398,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 00:36:10.169,Good,19397.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 00:47:44.880,Good,6.3916015625 +R0:Z24WVP.0S10L,2024-01-02 21:58:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 11:41:00.001,Good,2264.488525390625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 12:06:51.126,Good,6348.01953125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 23:14:10.381,Good,19377.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 14:33:10.350,Good,19411.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 22:54:10.133,Good,19379.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 22:12:10.357,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 14:44:10.477,Good,19411.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 13:12:10.139,Good,19408.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 12:57:10.333,Good,19408.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 06:31:10.082,Good,19404.0 +R0:Z24WVP.0S10L,2024-01-02 15:12:00.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 15:09:00.001,Good,2266.26806640625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 11:36:42.912,Good,6318.607421875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 03:21:39.440,Good,5710.7666015625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 17:07:10.148,Good,19377.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 02:36:10.137,Good,19401.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 01:40:10.234,Good,19400.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 17:01:10.327,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 08:56:10.144,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 02:52:10.136,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 19:42:10.236,Good,19375.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 09:54:57.274,Good,6.6181640625 +R0:Z24WVP.0S10L,2024-01-02 19:28:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 14:24:00.001,Good,2265.6748046875 +R0:Z24WVP.0S10L,2024-01-02 12:13:01.001,Good,2264.488525390625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 03:56:48.624,Good,5749.98193359375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 08:52:10.381,Good,19402.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 05:38:10.055,Good,19405.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 04:42:10.228,Good,19403.0 
+1N325T3MTOR-P0L29:9.T0,2024-01-02 22:49:10.479,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 22:48:10.463,Good,19381.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 12:06:10.336,Good,19405.0 +R0:Z24WVP.0S10L,2024-01-02 17:14:00.010,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 08:22:00.001,Good,2310.751220703125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 18:41:32.154,Good,6975.46826171875 +1N325T3MTOR-P0L29:9.T0,2024-01-02 08:28:10.177,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 23:23:10.401,Good,19379.0 +R0:Z24WVP.0S10L,2024-01-02 08:28:00.001,Good,2310.751220703125 +R0:Z24WVP.0S10L,2024-01-02 06:09:00.001,Good,2309.56494140625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 22:20:23.671,Good,7264.68310546875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 04:57:10.259,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 06:00:10.076,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:43:10.057,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 21:28:10.414,Good,19381.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 14:32:10.320,Good,19410.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 18:16:58.170,Good,6.78564453125 +R0:Z24WVP.0S10L,2024-01-02 19:05:00.002,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 04:30:00.001,Good,2307.1923828125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 11:25:36.517,Good,6313.70556640625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 08:56:00.471,Good,6245.07861328125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 19:42:10.236,Good,19375.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 17:06:10.054,Good,19376.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 01:06:10.039,Good,19397.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:45:10.154,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 20:36:10.390,Good,19378.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 20:18:28.932,Good,6.830078125 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 04:25:37.035,Good,6.43798828125 +R0:Z24WVP.0S10L,2024-01-02 21:44:01.001,Good,2266.861083984375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 21:32:10.391,Good,19378.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 17:36:10.430,Good,19375.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 06:27:10.202,Good,19403.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 00:38:10.325,Good,19396.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 21:53:10.356,Good,19382.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 19:21:10.429,Good,19375.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 02:58:10.178,Good,19403.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 20:49:36.771,Good,6.8212890625 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 16:20:31.311,Good,6.74560546875 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 02:38:10.555,Good,6.40771484375 +R0:Z24WVP.0S10L,2024-01-02 22:38:00.001,Good,2301.8544921875 +R0:Z24WVP.0S10L,2024-01-02 21:00:00.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 18:33:00.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 08:08:01.001,Good,2311.344482421875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 16:17:57.885,Good,6789.1943359375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 15:58:54.526,Good,6730.37109375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 05:31:15.014,Good,5848.02099609375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 03:03:35.090,Good,5691.15869140625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 02:04:10.335,Good,19400.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 18:47:10.185,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 18:43:10.331,Good,19375.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 05:05:10.007,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 07:42:10.511,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 01:35:10.830,Good,19399.0 +R0:Z24WVP.0S10L,2024-01-02 14:31:00.001,Good,2266.26806640625 
+R0:Z24WVP.0S10L,2024-01-02 13:51:00.001,Good,2264.488525390625 +R0:Z24WVP.0S10L,2024-01-02 03:03:00.001,Good,2306.006103515625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 09:40:10.354,Good,19401.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 08:07:10.462,Good,19402.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 04:33:10.486,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 14:12:10.158,Good,19409.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 08:09:10.185,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 21:50:10.172,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 15:28:10.247,Good,19378.0 +R0:Z24WVP.0S10L,2024-01-02 17:31:00.000,Good,2267.4541015625 +R0:Z24WVP.0S10L,2024-01-02 08:29:00.001,Good,2311.344482421875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 21:11:07.186,Good,7200.9580078125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 06:44:10.164,Good,19402.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 00:26:10.188,Good,19396.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 23:34:10.340,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 19:21:10.429,Good,19375.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 05:22:10.029,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 00:46:10.365,Good,19397.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 22:06:10.388,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 21:27:10.117,Good,19379.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 19:57:10.479,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 17:43:10.430,Good,19374.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 14:04:10.204,Good,19409.0 +R0:Z24WVP.0S10L,2024-01-02 22:43:00.001,Good,2299.48193359375 +R0:Z24WVP.0S10L,2024-01-02 21:11:00.001,Good,2266.26806640625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 13:46:17.703,Good,6450.9599609375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 21:28:10.414,Good,19380.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 15:14:10.016,Good,19384.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 13:30:10.275,Good,19409.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 10:59:10.057,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 23:36:10.072,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 22:18:10.195,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 18:12:10.208,Good,19375.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 18:12:10.208,Good,19375.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 11:15:10.041,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 07:06:10.473,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:41:10.441,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 18:17:10.032,Good,19374.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 15:53:10.203,Good,19376.0 +R0:Z24WVP.0S10L,2024-01-02 21:32:00.001,Good,2266.26806640625 +R0:Z24WVP.0S10L,2024-01-02 18:32:00.001,Good,2266.26806640625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 22:22:23.814,Good,7259.78125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 05:49:19.748,Good,5867.62841796875 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 22:54:10.133,Good,19379.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 16:48:10.312,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 09:14:10.055,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 02:57:10.117,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 22:02:10.069,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 12:42:10.399,Good,19408.0 +R0:Z24WVP.0S10L,2024-01-02 20:10:00.001,Good,2266.861083984375 +R0:Z24WVP.0S10L,2024-01-02 14:51:00.001,Good,2266.26806640625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 10:45:28.351,Good,6299.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 05:03:06.771,Good,5833.31494140625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 23:08:10.425,Good,19378.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 18:15:10.412,Good,19375.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 16:11:10.167,Good,19376.0 
+_LT2EPL-9PM0.OROTENV3:,2024-01-02 15:49:10.462,Good,19373.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 10:14:10.274,Good,19401.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 17:04:10.483,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 07:35:10.184,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 23:32:10.219,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 20:40:10.207,Good,19375.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 14:06:10.401,Good,19410.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 11:14:17.402,Good,6.65478515625 +R0:Z24WVP.0S10L,2024-01-02 09:37:00.001,Good,2311.937255859375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 23:56:10.312,Good,19376.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 23:47:10.271,Good,19375.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 20:53:10.157,Good,19378.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 22:56:10.250,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 19:19:10.376,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 19:02:10.026,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 06:01:10.129,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 03:23:10.045,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 00:16:10.242,Good,19394.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 08:39:10.378,Good,19402.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 21:14:08.404,Good,7200.9580078125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 08:34:56.025,Good,6220.56884765625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 06:33:30.708,Good,5985.275390625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 23:43:10.001,Good,19377.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 22:44:10.247,Good,19380.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 19:16:10.265,Good,19374.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 18:47:10.185,Good,19374.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 00:40:10.497,Good,19397.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 20:21:10.333,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 23:39:10.184,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 15:21:10.276,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 15:10:10.346,Good,19388.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 14:30:10.243,Good,19410.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 10:24:10.145,Good,19403.0 +R0:Z24WVP.0S10L,2024-01-02 16:22:00.001,Good,2266.861083984375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 22:37:27.473,Good,7274.48681640625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 12:43:00.288,Good,6406.8427734375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 20:42:10.247,Good,19378.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 11:47:10.138,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 21:26:10.073,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 16:00:10.422,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 14:44:10.477,Good,19411.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 05:02:10.169,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 20:35:10.368,Good,19374.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 19:46:10.011,Good,19375.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 19:36:10.327,Good,19373.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 18:33:10.253,Good,19373.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 18:33:10.253,Good,19373.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 17:06:10.054,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 11:00:10.012,Good,19402.0 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 18:41:04.113,Good,6.7958984375 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 10:44:09.973,Good,6.642578125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 04:59:05.845,Good,5828.4130859375 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 23:59:10.033,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 21:58:10.317,Good,19379.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 21:02:10.333,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 
03:30:10.500,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 02:44:10.167,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 05:56:10.285,Good,19405.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 02:04:10.335,Good,19400.0 +R0:Z24WVP.0S10L,2024-01-02 12:09:00.001,Good,2264.488525390625 +1N325T3MTOR-P0L29:9.T0,2024-01-02 23:38:10.214,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 07:41:10.491,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 05:45:10.083,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 05:08:10.152,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 01:09:10.227,Good,19397.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 11:35:10.227,Good,19403.0 +R0:Z24WVP.0S10L,2024-01-02 23:43:00.001,Good,2295.330322265625 +R0:Z24WVP.0S10L,2024-01-02 23:35:00.001,Good,2297.702880859375 +R0:Z24WVP.0S10L,2024-01-02 12:12:00.001,Good,2265.081787109375 +R0:Z24WVP.0S10L,2024-01-02 05:33:00.001,Good,2308.378662109375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 15:07:39.464,Good,6583.3125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 15:39:10.372,Good,19375.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 09:23:10.497,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 16:29:10.143,Good,19378.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 05:15:10.019,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 00:43:10.184,Good,19396.0 +R0:Z24WVP.0S10L,2024-01-02 13:14:00.001,Good,2265.081787109375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 06:51:34.506,Good,6044.0986328125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 23:25:10.443,Good,19377.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 21:43:10.195,Good,19378.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 15:12:10.505,Good,19386.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 12:42:10.399,Good,19408.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 03:38:10.129,Good,19403.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 01:52:10.488,Good,19401.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 06:03:10.114,Good,19404.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 05:29:10.186,Good,19403.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 01:25:10.483,Good,19398.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 01:12:10.421,Good,19398.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 11:41:10.301,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 05:57:10.423,Good,19402.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 04:12:10.078,Good,19403.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 07:29:41.930,Good,6147.03955078125 +1N325T3MTOR-P0L29:9.T0,2024-01-02 22:12:10.357,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 16:28:10.469,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 10:34:10.422,Good,19403.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 17:01:10.327,Good,19376.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 11:54:10.205,Good,19404.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 08:47:10.339,Good,19404.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 16:06:55.961,Good,6754.880859375 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 14:24:27.995,Good,6490.17578125 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 07:33:43.150,Good,6156.84326171875 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 02:47:30.694,Good,5671.55078125 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 22:55:10.198,Good,19378.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 20:30:10.442,Good,19376.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 20:04:10.452,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 15:37:10.090,Good,19376.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 13:12:10.139,Good,19408.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 04:20:10.029,Good,19404.0 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 11:57:48.785,Good,6333.3134765625 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 20:46:10.070,Good,19375.0 +_LT2EPL-9PM0.OROTENV3:,2024-01-02 11:18:10.090,Good,19404.0 
+1N325T3MTOR-P0L29:9.T0,2024-01-02 18:17:10.032,Good,19374.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 16:38:10.380,Good,19377.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 14:34:10.348,Good,19412.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 00:22:10.264,Good,19395.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 23:14:10.381,Good,19377.0 +TT33-01M9Z2L9:P20.AIRO5N,2024-01-02 21:10:10.203,Good,19376.0 +value_range, 2024-01-02 03:49:45.000, Good, 1 +value_range, 2024-01-02 07:53:11.000, Good, 2 +value_range, 2024-01-02 11:56:42.000, Good, 3 +value_range, 2024-01-02 16:00:12.000, Good, 4 +value_range, 2024-01-02 20:03:46.000, Good, 5 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 17:29:47.361,Good,6.7666015625 +O:05RI0.2T2M6STN6_PP-I165AT,2024-01-02 11:08:16.131,Good,6.6533203125 +R0:Z24WVP.0S10L,2024-01-02 10:54:00.001,Good,2264.488525390625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 11:15:36.517,Good,6313.70556640625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 03:52:47.354,Good,5740.17822265625 +-4O7LSSAM_3EA02:2GT7E02I_R_MP,2024-01-02 01:56:15.905,Good,5627.43310546875 +FLATLINE_TEST,2024-01-02 22:50:10.417,Good,19379.0 +FLATLINE_TEST,2024-01-02 14:57:10.372,Good,0 +FLATLINE_TEST,2024-01-02 02:49:10.408,Good,0 +FLATLINE_TEST,2024-01-02 02:35:10.511,Good,0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 21:51:10.219,Good,19402.0 +1N325T3MTOR-P0L29:9.T0,2024-01-02 17:08:10.242,Good,19402.0 +MISSING_DATA,2024-01-02 00:08:10.000,Good,19379.0 +MISSING_DATA,2024-01-02 00:08:11.000,Good,1 +MISSING_DATA,2024-01-02 00:08:13.000,Good,1 +MISSING_DATA,2024-01-02 00:08:14.000,Good,1 +MISSING_DATA_PATTERN,2024-01-05 00:02:10.000,Good,19379.0 +MISSING_DATA_PATTERN,2024-01-05 00:02:11.000,Good,1 +MISSING_DATA_PATTERN,2024-01-05 00:02:13.000,Good,1 +MISSING_DATA_PATTERN,2024-01-05 00:02:14.000,Good,1 + diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/test_input_validator.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/test_input_validator.py new file mode 100644 index 000000000..69eeba3fa --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/test_input_validator.py @@ -0,0 +1,160 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
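+#
+# Note (editorial, assumed summary of this new test module): the tests below exercise the
+# shared input validation behaviour through MissingValueImputation.validate(): detection of
+# missing expected columns, rejection of non-PySpark data types, casting failures for
+# non-numeric strings, and conversion of null-like strings to None.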
+ +import pytest + +from pyspark.sql import SparkSession +from pyspark.sql.types import ( + StructType, + StructField, + StringType, + TimestampType, + FloatType, +) + +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.missing_value_imputation import ( + MissingValueImputation, +) + + +@pytest.fixture(scope="session") +def spark_session(): + return SparkSession.builder.master("local[2]").appName("test").getOrCreate() + + +def test_input_validator_basic(spark_session: SparkSession): + test_schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", StringType(), True), + StructField("Status", StringType(), True), + StructField("Value", StringType(), True), + ] + ) + + expected_schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] + ) + + column_expected_schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + StructField("Tolerance", FloatType(), True), + ] + ) + + pyspark_type_schema = { + "TagName": StringType(), + "EventTime": TimestampType(), + "Status": StringType(), + "Value": float, + } + + test_data = [ + ("A2PS64V0J.:ZUX09R", "2024-01-01 03:29:21.000", "Good", "1.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 07:32:55.000", "Good", "2.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 11:36:29.000", "Good", "3.0"), + ] + + dirty_data = [ + ("A2PS64V0J.:ZUX09R", "2024-01-01 03:29:21.000", "Good", "abc"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 07:32:55.000", "Good", "rtdip"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 11:36:29.000", "Good", "def"), + ] + + test_df = spark_session.createDataFrame(test_data, schema=test_schema) + dirty_df = spark_session.createDataFrame(dirty_data, schema=test_schema) + + test_component = MissingValueImputation(spark_session, test_df) + dirty_component = MissingValueImputation(spark_session, dirty_df) + + # Check if the column exists + with pytest.raises(ValueError) as e: + test_component.validate(column_expected_schema) + assert "Column 'Tolerance' is missing in the DataFrame." in str(e.value) + + # Check for pyspark Datatypes + with pytest.raises(TypeError) as e: + test_component.validate(pyspark_type_schema) + assert ( + "Expected and actual types must be instances of pyspark.sql.types.DataType." + in str(e.value) + ) + + # Check for casting failures + with pytest.raises(ValueError) as e: + dirty_component.validate(expected_schema) + assert ( + "Error during casting column 'Value' to FloatType(): Column 'Value' cannot be cast to FloatType()." 
+ in str(e.value) + ) + + # Check for success + assert test_component.validate(expected_schema) == True + assert test_component.df.schema == expected_schema + + +def test_input_validator_with_null_strings(spark_session: SparkSession): + # Schema and test data + test_schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", StringType(), True), + StructField("Status", StringType(), True), + StructField("Value", StringType(), True), + ] + ) + + expected_schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] + ) + + test_data_with_null_strings = [ + ("A2PS64V0J.:ZUX09R", "2024-01-01 03:29:21.000", "Good", "None"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 07:32:55.000", "Good", "none"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 11:36:29.000", "Good", "Null"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 15:40:00.000", "Good", "null"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 19:50:00.000", "Good", ""), + ] + + test_df = spark_session.createDataFrame( + test_data_with_null_strings, schema=test_schema + ) + + test_component = MissingValueImputation(spark_session, test_df) + + # Validate the DataFrame + assert test_component.validate(expected_schema) == True + processed_df = test_component.df + + # Check that every value in "Value" was converted to None + value_column = processed_df.select("Value").collect() + + for row in value_column: + assert ( + row["Value"] is None + ), f"Value {row['Value']} was not correctly converted to None." diff --git a/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py b/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py index d20dc35f8..2802b0430 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/deploy/test_databricks_deploy.py @@ -19,8 +19,8 @@ import pytest from src.sdk.python.rtdip_sdk.pipelines.deploy import ( - DatabricksSDKDeploy, CreateJob, + DatabricksSDKDeploy, JobCluster, ClusterSpec, Task, diff --git a/tests/sdk/python/rtdip_sdk/pipelines/forecasting/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/forecasting/__init__.py new file mode 100644 index 000000000..1832b01ae --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/forecasting/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/__init__.py new file mode 100644 index 000000000..1832b01ae --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/test_arima.py b/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/test_arima.py new file mode 100644 index 000000000..7c6891cc1 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/test_arima.py @@ -0,0 +1,520 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest +import os + +from pyspark.sql import SparkSession +from pyspark.sql.dataframe import DataFrame +from pyspark.sql.types import ( + StructType, + StructField, + StringType, + TimestampType, + FloatType, +) + +from src.sdk.python.rtdip_sdk._sdk_utils.pandas import ( + _prepare_pandas_to_convert_to_spark, +) +from src.sdk.python.rtdip_sdk.pipelines.forecasting.spark.arima import ( + ArimaPrediction, +) +from src.sdk.python.rtdip_sdk.pipelines.forecasting.spark.auto_arima import ( + ArimaAutoPrediction, +) + +# Testcases to add: + +# = TEST COLUMN NAME FINDER = +# Non-existing columns +# Wrong columns given +# correct columns given + +# = COLUMN-BASED = + +# = SOURCE-BASED = +# Pass additional future data -> should not be discarded + +# = PMD-Arima = +# Column-based +# Source-based + + +@pytest.fixture(scope="session") +def spark_session(): + # Additional config needed since older PySpark <3.5 have troubles converting data with timestamps to pandas Dataframes + return ( + SparkSession.builder.master("local[2]") + .appName("test") + .config("spark.sql.execution.arrow.pyspark.enabled", "true") + .getOrCreate() + ) + + +@pytest.fixture(scope="session") +def historic_data(): + hist_data = [ + ("A2PS64V0J.:ZUX09R", "2024-01-01 03:29:21", "Good", "1.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 07:32:55", "Good", "2.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 11:36:29", "Good", "3.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 15:39:03", "Good", "4.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 19:42:37", "Good", "5.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-01 23:46:10", "Good", "6.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45", "Good", "7.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11", "Good", "8.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42", "Good", "9.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12", "Good", "10.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:13:46", "Good", "11.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-03 00:07:20", "Good", "12.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-03 04:10:50", "Good", "13.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-03 08:14:20", "Good", "14.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-03 12:18:02", "Good", "15.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-03 
16:21:30", "Good", "16.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-03 20:25:10", "Good", "17.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-04 00:28:44", "Good", "18.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-04 04:32:18", "Good", "19.0"), + ("A2PS64V0J.:ZUX09R", "2024-01-04 08:35:52", "Good", "20.0"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:01:43", "Good", "4686.26"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:02:44", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:03:44", "Good", "4688.019"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:04:44", "Good", "4686.26"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:05:44", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:06:44", "Good", "4694.203"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:07:44", "Good", "4693.92"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:08:44", "Good", "4691.6475"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:09:44", "Good", "4688.722"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:10:44", "Good", "4686.481"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:11:46", "Good", "4686.26"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:12:46", "Good", "4688.637"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:13:46", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:14:46", "Good", "4691.4985"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:15:46", "Good", "4690.817"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:16:47", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:17:47", "Good", "4693.7354"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:18:47", "Good", "4696.372"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:19:48", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:20:48", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:21:48", "Good", "4684.8516"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:22:48", "Good", "4679.2305"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:23:48", "Good", "4675.784"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:24:48", "Good", "4675.998"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:25:50", "Good", "4681.358"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:26:50", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:27:50", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:28:50", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:29:50", "Good", "4691.056"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:30:50", "Good", "4694.813"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:31:51", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:32:52", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:33:52", "Good", "4685.6963"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:34:52", "Good", "4681.356"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:35:52", "Good", "4678.175"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:36:52", "Good", "4676.186"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:37:52", "Good", "4675.423"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:38:52", "Good", "4675.9185"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:39:52", "Good", "4677.707"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:40:52", "Good", "4680.8213"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 
00:41:52", "Good", "4685.295"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:42:52", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:42:54", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:43:52", "Good", "4692.863"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:43:54", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:44:54", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:45:54", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:46:55", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:47:55", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:48:55", "Good", "4689.178"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:49:55", "Good", "4692.111"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:50:55", "Good", "4695.794"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:51:56", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:52:56", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:53:56", "Good", "4687.381"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:54:56", "Good", "4687.1104"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:55:57", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:56:58", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:57:58", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:58:58", "Good", "4693.161"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 00:59:59", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:00:59", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:01:59", "Good", "4688.2207"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:02:59", "Good", "4689.07"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:03:59", "Good", "4692.1904"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:05:01", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:06:01", "Good", "4699.3506"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:07:01", "Good", "4701.433"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:08:01", "Good", "4701.872"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:09:01", "Good", "4700.228"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:10:02", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:11:03", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:12:03", "Good", "4692.6973"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:13:06", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:14:06", "Good", "4695.113"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:15:06", "Good", "4691.5415"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:16:06", "Good", "4689.0054"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:17:07", "Good", "4691.1616"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:18:07", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:19:07", "Good", "4688.7515"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:20:07", "Good", "4686.26"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:21:07", "Good", "4700.966"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:22:07", "Good", "4700.935"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:23:07", "Good", "4687.808"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 
01:24:07", "Good", "4675.1323"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:25:09", "Good", "4676.456"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:26:09", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:27:09", "Good", "4708.868"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:28:09", "Good", "4711.2476"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:29:09", "Good", "4707.2603"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:30:09", "Good", "4700.966"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:31:09", "Good", "4695.7764"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:32:09", "Good", "4692.5146"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:33:09", "Good", "4691.358"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:34:09", "Good", "4692.482"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:35:10", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:36:10", "Good", "4700.966"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:37:10", "Good", "4702.4126"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:38:10", "Good", "4700.763"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:39:10", "Good", "4697.9897"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:40:11", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:41:11", "Good", "4696.747"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:42:11", "Good", "4700.966"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:43:11", "Good", "4705.8677"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:44:11", "Good", "4700.966"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:45:11", "Good", "4695.9624"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:46:11", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:47:11", "Good", "4700.966"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:48:11", "Good", "4702.187"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:49:11", "Good", "4699.401"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:50:11", "Good", "4695.0015"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:51:11", "Good", "4691.3823"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:52:11", "Good", "4690.9385"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:53:13", "Good", "4696.0635"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:54:13", "Good", "4700.966"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:55:13", "Good", "4686.26"), + ("-4O7LSSAM_3EA02:2GT7E02I_R_MP", "2023-12-31 01:56:13", "Good", "4700.966"), + ] + return hist_data + + +@pytest.fixture(scope="session") +def source_based_synthetic_data(): + output_object = {} + + df1 = pd.DataFrame() + df2 = pd.DataFrame() + np.random.seed(0) + + arr_len = 100 + h_a_l = int(arr_len / 2) + df1["Value"] = np.random.rand(arr_len) + np.sin( + np.linspace(0, arr_len / 2, num=arr_len) + ) + df2["Value"] = ( + df1["Value"] * 2 + np.cos(np.linspace(0, arr_len / 2, num=arr_len)) + 5 + ) + df1["index"] = np.asarray( + pd.date_range(start="1/1/2024", end="2/1/2024", periods=arr_len) + ).astype(str) + df2["index"] = np.asarray( + pd.date_range(start="1/1/2024", end="2/1/2024", periods=arr_len) + ).astype(str) + df1["TagName"] = "PrimarySensor" + df2["TagName"] = "SecondarySensor" + df1["Status"] = "Good" + df2["Status"] = "Good" + + output_object["df1"] = df1 + output_object["df2"] = df2 + output_object["arr_len"] = arr_len + output_object["h_a_l"] = h_a_l + output_object["half_df1_full_df2"] = 
_prepare_pandas_to_convert_to_spark( + pd.concat([df1.head(h_a_l), df2]) + ) + output_object["full_df1_full_df2"] = _prepare_pandas_to_convert_to_spark( + pd.concat([df1, df2]) + ) + output_object["full_df1_half_df2"] = _prepare_pandas_to_convert_to_spark( + pd.concat([df1, df2.head(h_a_l)]) + ) + output_object["half_df1_half_df2"] = _prepare_pandas_to_convert_to_spark( + pd.concat([df1.head(h_a_l), df2.head(h_a_l)]) + ) + return output_object + + +@pytest.fixture(scope="session") +def column_based_synthetic_data(): + output_object = {} + + df1 = pd.DataFrame() + np.random.seed(0) + + arr_len = 100 + h_a_l = int(arr_len / 2) + idx_start = "1/1/2024" + idx_end = "2/1/2024" + + df1["PrimarySensor"] = np.random.rand(arr_len) + np.sin( + np.linspace(0, arr_len / 2, num=arr_len) + ) + df1["SecondarySensor"] = ( + df1["PrimarySensor"] * 2 + np.cos(np.linspace(0, arr_len / 2, num=arr_len)) + 5 + ) + df1["index"] = np.asarray( + pd.date_range(start=idx_start, end=idx_end, periods=arr_len) + ).astype(str) + + output_object["df"] = df1 + output_object["arr_len"] = arr_len + output_object["h_a_l"] = h_a_l + output_object["half_df1_full_df2"] = _prepare_pandas_to_convert_to_spark(df1.copy()) + output_object["half_df1_full_df2"].loc[h_a_l:, "PrimarySensor"] = None + output_object["full_df1_full_df2"] = _prepare_pandas_to_convert_to_spark(df1.copy()) + output_object["full_df1_half_df2"] = _prepare_pandas_to_convert_to_spark(df1.copy()) + output_object["full_df1_half_df2"].loc[h_a_l:, "SecondarySensor"] = None + output_object["half_df1_half_df2"] = _prepare_pandas_to_convert_to_spark( + df1.copy().head(h_a_l) + ) + return output_object + + +def test_nonexistent_column_arima(spark_session: SparkSession): + input_df = spark_session.createDataFrame( + [ + (1.0,), + (2.0,), + ], + ["Value"], + ) + + with pytest.raises(ValueError): + ArimaPrediction(input_df, to_extend_name="NonexistingColumn") + + +def test_invalid_size_arima(spark_session: SparkSession): + input_df = spark_session.createDataFrame( + [ + (1.0,), + (2.0,), + ], + ["Value"], + ) + + with pytest.raises(ValueError): + ArimaPrediction( + input_df, + to_extend_name="Value", + order=(3, 0, 0), + seasonal_order=(3, 0, 0, 62), + number_of_data_points_to_analyze=62, + ) + + +def test_single_column_prediction_arima(spark_session: SparkSession, historic_data): + schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", StringType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] + ) + + # convert last column to float + for idx, item in enumerate(historic_data): + historic_data[idx] = item[0:3] + (float(item[3]),) + + input_df = spark_session.createDataFrame(historic_data, schema=schema) + + h_a_l = int(input_df.count() / 2) + + arima_comp = ArimaPrediction( + input_df, + value_name="Value", + past_data_style=ArimaPrediction.InputStyle.SOURCE_BASED, + to_extend_name="-4O7LSSAM_3EA02:2GT7E02I_R_MP", + number_of_data_points_to_analyze=input_df.count(), + number_of_data_points_to_predict=h_a_l, + order=(3, 0, 0), + seasonal_order=(3, 0, 0, 62), + timestamp_name="EventTime", + source_name="TagName", + status_name="Status", + ) + forecasted_df = arima_comp.filter_data() + # print(forecasted_df.show(forecasted_df.count(), False)) + + assert isinstance(forecasted_df, DataFrame) + + assert input_df.columns == forecasted_df.columns + assert forecasted_df.count() == (input_df.count() + h_a_l) + + +def test_single_column_prediction_auto_arima( + spark_session: SparkSession, 
historic_data +): + + schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", StringType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] + ) + + # convert last column to float + for idx, item in enumerate(historic_data): + historic_data[idx] = item[0:3] + (float(item[3]),) + + input_df = spark_session.createDataFrame(historic_data, schema=schema) + + h_a_l = int(input_df.count() / 2) + + arima_comp = ArimaAutoPrediction( + past_data=input_df, + # past_data_style=ArimaPrediction.InputStyle.SOURCE_BASED, + # value_name="Value", + to_extend_name="-4O7LSSAM_3EA02:2GT7E02I_R_MP", + number_of_data_points_to_analyze=input_df.count(), + number_of_data_points_to_predict=h_a_l, + # timestamp_name="EventTime", + # source_name="TagName", + # status_name="Status", + seasonal=True, + ) + forecasted_df = arima_comp.filter_data() + # print(forecasted_df.show(forecasted_df.count(), False)) + + assert isinstance(forecasted_df, DataFrame) + + assert input_df.columns == forecasted_df.columns + assert forecasted_df.count() == (input_df.count() + h_a_l) + assert arima_comp.value_name == "Value" + assert arima_comp.past_data_style == ArimaPrediction.InputStyle.SOURCE_BASED + assert arima_comp.timestamp_name == "EventTime" + assert arima_comp.source_name == "TagName" + assert arima_comp.status_name == "Status" + + +def test_column_based_prediction_arima( + spark_session: SparkSession, column_based_synthetic_data +): + + schema = StructType( + [ + StructField("PrimarySource", StringType(), True), + StructField("SecondarySource", StringType(), True), + StructField("EventTime", StringType(), True), + ] + ) + + data = column_based_synthetic_data["half_df1_half_df2"] + + input_df = spark_session.createDataFrame(data, schema=schema) + + arima_comp = ArimaAutoPrediction( + past_data=input_df, + to_extend_name="PrimarySource", + number_of_data_points_to_analyze=input_df.count(), + number_of_data_points_to_predict=input_df.count(), + seasonal=True, + ) + forecasted_df = arima_comp.filter_data() + + # forecasted_df.show() + + assert isinstance(forecasted_df, DataFrame) + + assert input_df.columns == forecasted_df.columns + assert forecasted_df.count() == (input_df.count() + input_df.count()) + assert arima_comp.value_name == None + assert arima_comp.past_data_style == ArimaPrediction.InputStyle.COLUMN_BASED + assert arima_comp.timestamp_name == "EventTime" + assert arima_comp.source_name is None + assert arima_comp.status_name is None + + +def test_arima_large_data_set(spark_session: SparkSession): + test_path = os.path.dirname(__file__) + data_path = os.path.join(test_path, "../../data_quality/test_data.csv") + + input_df = spark_session.read.option("header", "true").csv(data_path) + + expected_schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] + ) + + print((input_df.count(), len(input_df.columns))) + + count_signal = input_df.filter('TagName = "R0:Z24WVP.0S10L"').count() + h_a_l = int(count_signal / 2) + + arima_comp = ArimaAutoPrediction( + input_df, + to_extend_name="R0:Z24WVP.0S10L", + number_of_data_points_to_analyze=count_signal, + number_of_data_points_to_predict=h_a_l, + ) + + result_df = arima_comp.filter_data() + + tolerance = 0.01 + + assert isinstance(result_df, DataFrame) + + assert result_df.count() == pytest.approx((input_df.count() + h_a_l), 
rel=tolerance) + + +def test_arima_wrong_datatype(spark_session: SparkSession): + + expected_schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] + ) + + test_df = spark_session.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "1.0"), + ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "2.0"), + ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "3.0"), + ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "4.0"), + ("A2PS64V0J.:ZUX09R", "invalid_data_type", "Good", "5.0"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + count_signal = 5 + h_a_l = int(count_signal / 2) + + with pytest.raises(ValueError) as exc_info: + arima_comp = ArimaAutoPrediction( + test_df, + to_extend_name="A2PS64V0J.:ZUX09R", + number_of_data_points_to_analyze=count_signal, + number_of_data_points_to_predict=h_a_l, + ) + + arima_comp.validate(expected_schema) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/test_data_binning.py b/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/test_data_binning.py new file mode 100644 index 000000000..f4f8fafee --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/test_data_binning.py @@ -0,0 +1,71 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
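A minimal, non-test sketch of the forecasting call pattern that the ARIMA tests above pin down. The constructor arguments are taken from those tests; the import path for ArimaAutoPrediction is an assumption modelled on the other forecasting.spark modules added in this pull request and is not taken from the diff.

from pyspark.sql import DataFrame

# Assumed module path -- adjust to the actual location of the ARIMA classes.
from src.sdk.python.rtdip_sdk.pipelines.forecasting.spark.arima import ArimaAutoPrediction


def extend_tag_history(source_df: DataFrame, tag_name: str) -> DataFrame:
    """Forecast extra points for one tag in a TagName/EventTime/Status/Value DataFrame."""
    history_points = source_df.filter(f"TagName = '{tag_name}'").count()
    arima = ArimaAutoPrediction(
        past_data=source_df,
        to_extend_name=tag_name,  # the series to extend
        number_of_data_points_to_analyze=history_points,  # fit on the full history
        number_of_data_points_to_predict=history_points // 2,
        seasonal=True,
    )
    # filter_data() returns the input rows plus the forecasted rows, with the same columns.
    return arima.filter_data()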
+import pytest +from pyspark.sql import SparkSession +from pyspark.ml.linalg import Vectors +from src.sdk.python.rtdip_sdk.pipelines.forecasting.spark.data_binning import ( + DataBinning, +) + + +@pytest.fixture(scope="session") +def spark(): + return ( + SparkSession.builder.master("local[*]") + .appName("Linear Regression Unit Test") + .getOrCreate() + ) + + +@pytest.fixture(scope="function") +def sample_data(spark): + data = [ + (Vectors.dense([1.0]),), + (Vectors.dense([1.2]),), + (Vectors.dense([1.5]),), + (Vectors.dense([5.0]),), + (Vectors.dense([5.2]),), + (Vectors.dense([9.8]),), + (Vectors.dense([10.0]),), + (Vectors.dense([10.2]),), + ] + + return spark.createDataFrame(data, ["features"]) + + +def test_data_binning_kmeans(sample_data): + binning = DataBinning(column_name="features", bins=3, output_column_name="bin") + + result_df = binning.train(sample_data).predict(sample_data) + + assert "bin" in result_df.columns + assert result_df.count() == sample_data.count() + + bin_values = result_df.select("bin").distinct().collect() + bin_numbers = [row.bin for row in bin_values] + assert all(0 <= bin_num < 3 for bin_num in bin_numbers) + + for row in result_df.collect(): + if row["features"] in [1.0, 1.2, 1.5]: + assert row["bin"] == 2 + elif row["features"] in [5.0, 5.2]: + assert row["bin"] == 1 + elif row["features"] in [9.8, 10.0, 10.2]: + assert row["bin"] == 0 + + +def test_data_binning_invalid_method(sample_data): + with pytest.raises(Exception) as exc_info: + DataBinning(column_name="features", bins=3, method="invalid_method") + assert "Unknown method" in str(exc_info.value) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/test_k_nearest_neighbors.py b/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/test_k_nearest_neighbors.py new file mode 100644 index 000000000..95d91c4bf --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/test_k_nearest_neighbors.py @@ -0,0 +1,300 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
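A minimal, non-test sketch of the DataBinning flow the tests above verify: vectorised values in, an integer bin label per row out. It assumes the default binning method is the k-means one those tests exercise.

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from src.sdk.python.rtdip_sdk.pipelines.forecasting.spark.data_binning import DataBinning

spark = SparkSession.builder.master("local[*]").appName("DataBinningSketch").getOrCreate()

# One-dimensional feature vectors, mirroring the fixture data used in the tests.
values_df = spark.createDataFrame(
    [(Vectors.dense([v]),) for v in (1.0, 1.2, 1.5, 5.0, 5.2, 9.8, 10.0, 10.2)],
    ["features"],
)

# Three bins over the "features" column; the bin label is written to a "bin" column.
binning = DataBinning(column_name="features", bins=3, output_column_name="bin")
binned_df = binning.train(values_df).predict(values_df)
binned_df.show()  # every bin label falls in the range [0, 3)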
+import os +import pytest +from pyspark.sql import SparkSession +from pyspark.sql.types import ( + StructType, + StructField, + StringType, + TimestampType, + FloatType, +) +from datetime import datetime +from src.sdk.python.rtdip_sdk.pipelines.forecasting.spark.k_nearest_neighbors import ( + KNearestNeighbors, +) +from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer +from pyspark.sql.functions import col + +# Schema definition (same as template) +SCHEMA = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] +) + + +@pytest.fixture(scope="session") +def spark(): + return ( + SparkSession.builder.master("local[*]").appName("KNN Unit Test").getOrCreate() + ) + + +@pytest.fixture(scope="function") +def sample_data(spark): + # Using similar data structure as template but with more varied values + data = [ + ( + "TAG1", + datetime.strptime("2024-01-02 20:03:46.000", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 0.34, + ), + ( + "TAG1", + datetime.strptime("2024-01-02 20:04:46.000", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 0.35, + ), + ( + "TAG2", + datetime.strptime("2024-01-02 20:05:46.000", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 0.45, + ), + ( + "TAG2", + datetime.strptime("2024-01-02 20:06:46.000", "%Y-%m-%d %H:%M:%S.%f"), + "Bad", + 0.55, + ), + ] + return spark.createDataFrame(data, schema=SCHEMA) + + +@pytest.fixture(scope="function") +def prepared_data(sample_data): + # Convert categorical variables, Index TagName and Status + tag_indexer = StringIndexer(inputCol="TagName", outputCol="TagIndex") + status_indexer = StringIndexer(inputCol="Status", outputCol="StatusIndex") + + df = tag_indexer.fit(sample_data).transform(sample_data) + df = status_indexer.fit(df).transform(df) + + assembler = VectorAssembler( + inputCols=["TagIndex", "StatusIndex", "Value"], outputCol="raw_features" + ) + df = assembler.transform(df) + + scaler = StandardScaler( + inputCol="raw_features", outputCol="features", withStd=True, withMean=True + ) + return scaler.fit(df).transform(df) + + +def test_knn_initialization(prepared_data): + """Test KNN initialization with various parameters""" + # Test valid initialization + knn = KNearestNeighbors( + features_col="features", + label_col="Value", + timestamp_col="EventTime", + k=3, + weighted=True, + distance_metric="combined", + ) + assert knn.k == 3 + assert knn.weighted is True + + # Test invalid distance metric + with pytest.raises(ValueError): + KNearestNeighbors( + features_col="features", + label_col="Value", + distance_metric="invalid_metric", + ) + + # Test missing timestamp column for temporal distance + with pytest.raises(ValueError): + KNearestNeighbors( + features_col="features", + label_col="Value", + # timestamp_col is compulsory for temporal distance + distance_metric="temporal", + ) + + +def test_data_splitting(prepared_data): + """Test the data splitting functionality""" + knn = KNearestNeighbors( + features_col="features", + label_col="Value", + timestamp_col="EventTime", + ) + + train_df, test_df = prepared_data.randomSplit([0.8, 0.2], seed=42) + + assert train_df.count() + test_df.count() == prepared_data.count() + assert train_df.count() > 0 + assert test_df.count() > 0 + + +def test_model_training(prepared_data): + """Test model training functionality""" + knn = KNearestNeighbors( + features_col="features", + label_col="Value", + timestamp_col="EventTime", + ) + + train_df, _ = 
prepared_data.randomSplit([0.8, 0.2], seed=42) + trained_model = knn.train(train_df) + + assert trained_model is not None + assert trained_model.train_features is not None + assert trained_model.train_labels is not None + + +def test_predictions(prepared_data): + """Test prediction functionality""" + knn = KNearestNeighbors( + features_col="features", + label_col="Value", + timestamp_col="EventTime", + weighted=True, + ) + + train_df, test_df = prepared_data.randomSplit([0.8, 0.2], seed=42) + knn.train(train_df) + predictions = knn.predict(test_df) + + assert "prediction" in predictions.columns + assert predictions.count() > 0 + assert all(pred is not None for pred in predictions.select("prediction").collect()) + + +def test_temporal_distance(prepared_data): + """Test temporal distance calculation""" + knn = KNearestNeighbors( + features_col="features", + label_col="Value", + timestamp_col="EventTime", + distance_metric="temporal", + ) + + train_df, test_df = prepared_data.randomSplit([0.8, 0.2], seed=42) + knn.train(train_df) + predictions = knn.predict(test_df) + + assert predictions.count() > 0 + assert "prediction" in predictions.columns + + +def test_combined_distance(prepared_data): + """Test combined distance calculation""" + knn = KNearestNeighbors( + features_col="features", + label_col="Value", + timestamp_col="EventTime", + distance_metric="combined", + temporal_weight=0.5, + ) + + train_df, test_df = prepared_data.randomSplit([0.8, 0.2], seed=42) + knn.train(train_df) + predictions = knn.predict(test_df) + + assert predictions.count() > 0 + assert "prediction" in predictions.columns + + +def test_invalid_data_handling(spark): + """Test handling of invalid data""" + invalid_data = [ + ("TAG1", "invalid_date", "Good", "invalid_value"), + ("TAG1", "2024-01-02 20:03:46.000", "Good", "NaN"), + ("TAG2", "2024-01-02 20:03:46.000", None, 123.45), + ] + + schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", StringType(), True), + StructField("Status", StringType(), True), + StructField("Value", StringType(), True), + ] + ) + + df = spark.createDataFrame(invalid_data, schema=schema) + + try: + df = df.withColumn("Value", col("Value").cast(FloatType())) + invalid_rows = df.filter(col("Value").isNull()) + valid_rows = df.filter(col("Value").isNotNull()) + + assert invalid_rows.count() > 0 + assert valid_rows.count() > 0 + except Exception as e: + pytest.fail(f"Unexpected error during invalid data handling: {e}") + + +def test_large_dataset(spark): + """Test KNN on a larger dataset""" + base_path = os.path.dirname(__file__) + file_path = os.path.join(base_path, "../../data_quality/test_data.csv") + + try: + df = spark.read.option("header", "true").csv(file_path) + df = df.withColumn("Value", col("Value").cast(FloatType())) + df = df.withColumn("EventTime", col("EventTime").cast(TimestampType())) + + prepared_df = prepare_data_for_knn(df) + + knn = KNearestNeighbors( + features_col="features", + label_col="Value", + timestamp_col="EventTime", + ) + + train_df, test_df = prepared_df.randomSplit([0.8, 0.2], seed=42) + knn.train(train_df) + predictions = knn.predict(test_df) + + assert predictions.count() > 0 + assert "prediction" in predictions.columns + except Exception as e: + pytest.fail(f"Failed to process large dataset: {e}") + + +def prepare_data_for_knn(df): + """Helper function to prepare data for KNN""" + + # Convert categorical variables + indexers = [ + StringIndexer(inputCol=col, outputCol=f"{col}Index") + for col in ["TagName", 
"Status"] + if col in df.columns + ] + + for indexer in indexers: + df = indexer.fit(df).transform(df) + + # Create feature vector + numeric_cols = [col for col in df.columns if df.schema[col].dataType == FloatType()] + index_cols = [col for col in df.columns if col.endswith("Index")] + feature_cols = numeric_cols + index_cols + + assembler = VectorAssembler(inputCols=feature_cols, outputCol="raw_features") + df = assembler.transform(df) + + # Scale features + scaler = StandardScaler( + inputCol="raw_features", outputCol="features", withStd=True, withMean=True + ) + return scaler.fit(df).transform(df) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/test_linear_regression.py b/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/test_linear_regression.py new file mode 100644 index 000000000..aa43830fc --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/forecasting/spark/test_linear_regression.py @@ -0,0 +1,321 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import pytest +from pyspark.sql import SparkSession +from pyspark.sql import Row +from pyspark.sql.types import ( + StructType, + StructField, + StringType, + TimestampType, + FloatType, +) +from datetime import datetime +from src.sdk.python.rtdip_sdk.pipelines.forecasting.spark.linear_regression import ( + LinearRegression, +) +from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.machine_learning.columns_to_vector import ( + ColumnsToVector, +) +from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.machine_learning.polynomial_features import ( + PolynomialFeatures, +) + +SCHEMA = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", TimestampType(), True), + StructField("Status", StringType(), True), + StructField("Value", FloatType(), True), + ] +) + + +@pytest.fixture(scope="session") +def spark(): + return ( + SparkSession.builder.master("local[*]") + .appName("Linear Regression Unit Test") + .getOrCreate() + ) + + +@pytest.fixture(scope="function") +def sample_data(spark): + data = [ + ( + "A2PS64V0J.:ZUX09R", + datetime.strptime("2024-01-02 20:03:46.000", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 0.3400000035762787, + ), + ( + "A2PS64V0J.:ZUX09R", + datetime.strptime("2024-01-02 16:00:12.000", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 0.15000000596046448, + ), + ( + "A2PS64V0J.:ZUX09R", + datetime.strptime("2024-01-02 11:56:42.000", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 0.12999999523162842, + ), + ( + "A2PS64V0J.:ZUX09R", + datetime.strptime("2024-01-02 07:53:11.000", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 0.11999999731779099, + ), + ( + "A2PS64V0J.:ZUX09R", + datetime.strptime("2024-01-02 03:49:45.000", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 0.12999999523162842, + ), + ( + "-4O7LSSAM_3EA02:2GT7E02I_R_MP", + datetime.strptime("2024-01-02 20:09:58.053", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 7107.82080078125, + ), + ( + "_LT2EPL-9PM0.OROTENV3:", + datetime.strptime("2024-01-02 12:27:10.518", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 19407.0, + ), + 
( + "_LT2EPL-9PM0.OROTENV3:", + datetime.strptime("2024-01-02 05:23:10.143", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 19403.0, + ), + ( + "_LT2EPL-9PM0.OROTENV3:", + datetime.strptime("2024-01-02 01:31:10.086", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 19399.0, + ), + ( + "1N325T3MTOR-P0L29:9.T0", + datetime.strptime("2024-01-02 23:41:10.358", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 19376.0, + ), + ( + "TT33-01M9Z2L9:P20.AIRO5N", + datetime.strptime("2024-01-02 18:09:10.488", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 19375.0, + ), + ( + "TT33-01M9Z2L9:P20.AIRO5N", + datetime.strptime("2024-01-02 16:15:10.492", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 19376.0, + ), + ( + "TT33-01M9Z2L9:P20.AIRO5N", + datetime.strptime("2024-01-02 06:51:10.077", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 19403.0, + ), + ( + "O:05RI0.2T2M6STN6_PP-I165AT", + datetime.strptime("2024-01-02 07:42:24.227", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 6.55859375, + ), + ( + "-4O7LSSAM_3EA02:2GT7E02I_R_MP", + datetime.strptime("2024-01-02 06:08:23.777", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 5921.5498046875, + ), + ( + "-4O7LSSAM_3EA02:2GT7E02I_R_MP", + datetime.strptime("2024-01-02 05:14:10.896", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 5838.216796875, + ), + ( + "-4O7LSSAM_3EA02:2GT7E02I_R_MP", + datetime.strptime("2024-01-02 01:37:10.967", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 5607.82568359375, + ), + ( + "-4O7LSSAM_3EA02:2GT7E02I_R_MP", + datetime.strptime("2024-01-02 00:26:53.449", "%Y-%m-%d %H:%M:%S.%f"), + "Good", + 5563.7080078125, + ), + ] + + return spark.createDataFrame(data, schema=SCHEMA) + + +def test_columns_to_vector(sample_data): + df = sample_data + columns_to_vector = ColumnsToVector( + df=df, input_cols=["Value"], output_col="features" + ) + transformed_df = columns_to_vector.transform() + + assert "features" in transformed_df.columns + transformed_df.show() + + +def test_polynomial_features(sample_data): + df = sample_data + # Convert 'Value' to a vector using ColumnsToVector + columns_to_vector = ColumnsToVector( + df=df, input_cols=["Value"], output_col="features" + ) + vectorized_df = columns_to_vector.transform() + + polynomial_features = PolynomialFeatures( + df=vectorized_df, + input_col="features", + output_col="poly_features", + poly_degree=2, + ) + transformed_df = polynomial_features.transform() + assert ( + "poly_features" in transformed_df.columns + ), "Polynomial features column not created" + assert transformed_df.count() > 0, "Transformed DataFrame is empty" + + transformed_df.show() + + +def test_dataframe_validation(sample_data): + df = sample_data + + required_columns = ["TagName", "EventTime", "Status", "Value"] + for column in required_columns: + if column not in df.columns: + raise ValueError(f"Missing required column: {column}") + + try: + df.withColumn("Value", df["Value"].cast(FloatType())) + except Exception as e: + raise ValueError("Column 'Value' could not be converted to FloatType.") from e + + +def test_invalid_data_handling(spark): + + data = [ + ("A2PS64V0J.:ZUX09R", "invalid_date", "Good", "invalid_value"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "NaN"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", None, 123.45), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", 123.45), + ] + + schema = StructType( + [ + StructField("TagName", StringType(), True), + StructField("EventTime", StringType(), True), + StructField("Status", StringType(), True), + StructField("Value", StringType(), True), + ] + ) + + df = spark.createDataFrame(data, schema=schema) + + try: + df = 
df.withColumn("Value", df["Value"].cast(FloatType())) + except Exception as e: + pytest.fail(f"Unexpected error during casting: {e}") + + invalid_rows = df.filter(df["Value"].isNull()) + valid_rows = df.filter(df["Value"].isNotNull()) + + assert invalid_rows.count() > 0, "No invalid rows detected when expected" + assert valid_rows.count() > 0, "All rows were invalid, which is unexpected" + + if valid_rows.count() > 0: + vectorized_df = ColumnsToVector( + df=valid_rows, input_cols=["Value"], output_col="features" + ).transform() + assert ( + "features" in vectorized_df.columns + ), "Vectorized column 'features' not created" + + +def test_invalid_prediction_without_training(sample_data): + df = sample_data + + vectorized_df = ColumnsToVector( + df=df, input_cols=["Value"], output_col="features" + ).transform() + + linear_regression = LinearRegression( + features_col="features", + label_col="Value", + prediction_col="prediction", + ) + + # Attempt prediction without training + with pytest.raises( + AttributeError, match="'LinearRegression' object has no attribute 'model'" + ): + linear_regression.predict(vectorized_df) + + +def test_prediction_on_large_dataset(spark): + base_path = os.path.dirname(__file__) + file_path = os.path.join(base_path, "../../data_quality/test_data.csv") + df = spark.read.option("header", "true").csv(file_path) + assert df.count() > 0, "Dataframe was not loaded correctly" + + assert df.count() > 0, "Dataframe was not loaded correctly" + assert "EventTime" in df.columns, "Missing 'EventTime' column in dataframe" + assert "Value" in df.columns, "Missing 'Value' column in dataframe" + + df = df.withColumn("Value", df["Value"].cast("float")) + assert ( + df.select("Value").schema[0].dataType == FloatType() + ), "Value column was not cast to FloatType" + + vectorized_df = ColumnsToVector( + df=df, input_cols=["Value"], output_col="features" + ).transform() + + assert ( + "features" in vectorized_df.columns + ), "Vectorized column 'features' not created" + + linear_regression = LinearRegression( + features_col="features", + label_col="Value", + prediction_col="prediction", + ) + + train_df, test_df = linear_regression.split_data(vectorized_df, train_ratio=0.8) + assert train_df.count() > 0, "Training dataset is empty" + assert test_df.count() > 0, "Testing dataset is empty" + + model = linear_regression.train(train_df) + assert model is not None, "Model training failed" + + predictions = model.predict(test_df) + + assert predictions is not None, "Predictions dataframe is empty" + assert predictions.count() > 0, "No predictions were generated" + assert ( + "prediction" in predictions.columns + ), "Missing 'prediction' column in predictions dataframe" diff --git a/tests/sdk/python/rtdip_sdk/pipelines/logging/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/logging/__init__.py new file mode 100644 index 000000000..1832b01ae --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/logging/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/logging/test_log_collection.py b/tests/sdk/python/rtdip_sdk/pipelines/logging/test_log_collection.py new file mode 100644 index 000000000..103f09f01 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/logging/test_log_collection.py @@ -0,0 +1,149 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import pytest + +from pandas import DataFrame +from pyspark.sql import SparkSession + +from src.sdk.python.rtdip_sdk.pipelines.logging.logger_manager import LoggerManager +from src.sdk.python.rtdip_sdk.pipelines.logging.spark.runtime_log_collector import ( + RuntimeLogCollector, +) +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_interval import ( + IdentifyMissingDataInterval, +) + +import logging + + +@pytest.fixture(scope="session") +def spark(): + spark = ( + SparkSession.builder.master("local[2]") + .appName("LogCollectionTest") + .getOrCreate() + ) + yield spark + spark.stop() + + +def test_logger_manager_basic_function(spark): + df = spark.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:00.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:01:25.000", "Good", "0.150000006"), + ( + "A2PS64V0J.:ZUX09R", + "2024-01-02 00:01:41.000", + "Good", + "0.340000004", + ), # Missing interval (25s to 41s) + ], + ["TagName", "EventTime", "Status", "Value"], + ) + monitor = IdentifyMissingDataInterval( + df=df, + interval="10s", + tolerance="500ms", + ) + log_collector = RuntimeLogCollector(spark) + + assert monitor.logger_manager is log_collector.logger_manager + + +def test_df_output(spark, caplog): + log_collector = RuntimeLogCollector(spark) + df = spark.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:00.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:10.000", "Good", "0.119999997"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + monitor = IdentifyMissingDataInterval( + df=df, + interval="10s", + tolerance="500ms", + ) + log_handler = log_collector._attach_dataframe_handler_to_logger( + "IdentifyMissingDataInterval" + ) + + with caplog.at_level(logging.INFO, logger="IdentifyMissingDataInterval"): + monitor.check() + + result_df = log_handler.get_logs_as_df() + + assert result_df.count() == 4 + + +def test_unique_dataframes(spark, caplog): + log_collector = RuntimeLogCollector(spark) + df = spark.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:00.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:10.000", "Good", "0.119999997"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + logger = LoggerManager().create_logger("Test_Logger") + monitor = IdentifyMissingDataInterval( + df=df, + interval="10s", + tolerance="500ms", + ) + log_handler_identify_missing_data_interval = ( + 
log_collector._attach_dataframe_handler_to_logger("IdentifyMissingDataInterval") + ) + + log_handler_test = log_collector._attach_dataframe_handler_to_logger("Test_Logger") + + with caplog.at_level(logging.INFO, logger="IdentifyMissingDataInterval"): + monitor.check() + + result_df = log_handler_identify_missing_data_interval.get_logs_as_df() + result_df_test = log_handler_test.get_logs_as_df() + + assert result_df.count() != result_df_test.count() + + +def test_file_logging(spark, caplog): + + log_collector = RuntimeLogCollector(spark) + df = spark.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:00.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:10.000", "Good", "0.119999997"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + monitor = IdentifyMissingDataInterval( + df=df, + interval="10s", + tolerance="500ms", + ) + log_collector._attach_file_handler_to_loggers("logs.log", ".") + + with caplog.at_level(logging.INFO, logger="IdentifyMissingDataInterval"): + monitor.check() + + with open("./logs.log", "r") as f: + logs = f.readlines() + + assert len(logs) == 4 + if os.path.exists("./logs.log"): + os.remove("./logs.log") diff --git a/tests/sdk/python/rtdip_sdk/pipelines/logging/test_logger_manager.py b/tests/sdk/python/rtdip_sdk/pipelines/logging/test_logger_manager.py new file mode 100644 index 000000000..0b2e4e6cc --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/logging/test_logger_manager.py @@ -0,0 +1,31 @@ +# Copyright 2025 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
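A minimal, non-test sketch of the runtime log collection covered by the tests above: run a monitoring check, then read the log records it emitted back as a DataFrame. The underscore-prefixed attach helper is used here exactly as the tests call it.

from pyspark.sql import SparkSession
from src.sdk.python.rtdip_sdk.pipelines.logging.spark.runtime_log_collector import (
    RuntimeLogCollector,
)
from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_interval import (
    IdentifyMissingDataInterval,
)

spark = SparkSession.builder.master("local[2]").appName("LogCollectionSketch").getOrCreate()

events_df = spark.createDataFrame(
    [
        ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:00.000", "Good", "0.129999995"),
        ("A2PS64V0J.:ZUX09R", "2024-01-02 00:00:10.000", "Good", "0.119999997"),
    ],
    ["TagName", "EventTime", "Status", "Value"],
)

collector = RuntimeLogCollector(spark)
# Attach a DataFrame-backed handler to the monitor's named logger before running the check.
handler = collector._attach_dataframe_handler_to_logger("IdentifyMissingDataInterval")

IdentifyMissingDataInterval(df=events_df, interval="10s", tolerance="500ms").check()
handler.get_logs_as_df().show()  # one row per log record emitted by the check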
+ +import pytest +from src.sdk.python.rtdip_sdk.pipelines.logging.logger_manager import LoggerManager + + +def test_logger_manager_basic_function(): + logger_manager = LoggerManager() + logger1 = logger_manager.create_logger("logger1") + assert logger1 is logger_manager.get_logger("logger1") + + assert logger_manager.get_logger("logger2") is None + + +def test_singleton_functionality(): + logger_manager = LoggerManager() + logger_manager2 = LoggerManager() + + assert logger_manager is logger_manager2 diff --git a/tests/sdk/python/rtdip_sdk/queries/_test_utils/sdk_test_objects.py b/tests/sdk/python/rtdip_sdk/queries/_test_utils/sdk_test_objects.py index daaae4cd2..6459c2e16 100644 --- a/tests/sdk/python/rtdip_sdk/queries/_test_utils/sdk_test_objects.py +++ b/tests/sdk/python/rtdip_sdk/queries/_test_utils/sdk_test_objects.py @@ -33,34 +33,33 @@ MOCKED_QUERY_OFFSET_LIMIT = "LIMIT 10 OFFSET 10 " # Raw -RAW_MOCKED_QUERY = 'WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') ORDER BY `TagName`, `EventTime` ) SELECT * FROM raw_events ' -RAW_MOCKED_QUERY_CHECK_TAGS = 'WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND UPPER(`TagName`) IN (\'MOCKED-TAGNAME\') ORDER BY `TagName`, `EventTime` ) SELECT * FROM raw_events ' -RAW_MOCKED_QUERY_DISPLAY_UOM = 'WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') ORDER BY `TagName`, `EventTime` ) SELECT e.`EventTime`, e.`TagName`, e.`Status`, e.`Value`, m.`UOM` FROM raw_events e LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` m ON e.`TagName` = m.`TagName` ' +RAW_MOCKED_QUERY = "WITH raw AS (SELECT DISTINCT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND to_timestamp('2011-01-02T23:59:59+00:00') AND `TagName` IN ('mocked-TAGNAME') ORDER BY `TagName`, `EventTime`) SELECT * FROM raw" +RAW_MOCKED_QUERY_CHECK_TAGS = "WITH raw AS (SELECT DISTINCT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND to_timestamp('2011-01-02T23:59:59+00:00') AND UPPER(`TagName`) IN ('MOCKED-TAGNAME') ORDER BY `TagName`, `EventTime`) SELECT * FROM raw" 
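For readability only, the single-line RAW_MOCKED_QUERY literal above is reflowed below as an equivalent multi-line Python string; the one-line constant in sdk_test_objects.py remains the canonical value.

# Editorial reflow of RAW_MOCKED_QUERY (same string, split for readability).
RAW_MOCKED_QUERY_REFLOWED = (
    "WITH raw AS ("
    "SELECT DISTINCT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, "
    "`TagName`, `Status`, `Value` "
    "FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` "
    "WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') "
    "AND to_timestamp('2011-01-02T23:59:59+00:00') "
    "AND `TagName` IN ('mocked-TAGNAME') "
    "ORDER BY `TagName`, `EventTime`"
    ") SELECT * FROM raw"
)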
+RAW_MOCKED_QUERY_DISPLAY_UOM = "WITH raw AS (SELECT DISTINCT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND to_timestamp('2011-01-02T23:59:59+00:00') AND `TagName` IN ('mocked-TAGNAME') ORDER BY `TagName`, `EventTime`), uom AS (SELECT raw.*, metadata.`UoM` FROM raw LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` AS metadata ON raw.`TagName` = metadata.`TagName`) SELECT * FROM uom" # Resample -RESAMPLE_MOCKED_QUERY = 'WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS timestamp_array) ,window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array) ,resample AS (SELECT /*+ RANGE_JOIN(d, 900 ) */ d.window_start, d.window_end, e.`TagName`, avg(e.`Value`) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`) ,project AS (SELECT window_start AS `EventTime`, `TagName`, `Value` FROM resample GROUP BY window_start, `TagName`, `Value` ORDER BY `TagName`, `EventTime` ) SELECT * FROM project ' -RESAMPLE_MOCKED_QUERY_CHECK_TAGS = 'WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND UPPER(`TagName`) IN (\'MOCKED-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS timestamp_array) ,window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array) ,resample AS (SELECT /*+ RANGE_JOIN(d, 900 ) */ d.window_start, d.window_end, e.`TagName`, avg(e.`Value`) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`) ,project AS (SELECT window_start AS `EventTime`, `TagName`, `Value` FROM resample GROUP BY window_start, `TagName`, `Value` ORDER BY `TagName`, `EventTime` ) SELECT * FROM project ' -RESAMPLE_MOCKED_QUERY_PIVOT = 'WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` 
FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS timestamp_array) ,window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array) ,resample AS (SELECT /*+ RANGE_JOIN(d, 900 ) */ d.window_start, d.window_end, e.`TagName`, avg(e.`Value`) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`) ,project AS (SELECT window_start AS `EventTime`, `TagName`, `Value` FROM resample GROUP BY window_start, `TagName`, `Value` ORDER BY `TagName`, `EventTime` ) ,pivot AS (SELECT * FROM (SELECT `EventTime`, `Value`, `TagName` AS `TagName` FROM project) PIVOT (FIRST(`Value`) FOR `TagName` IN (\'mocked-TAGNAME\' AS `mocked-TAGNAME`))) SELECT * FROM pivot ORDER BY `EventTime` ' -RESAMPLE_MOCKED_QUERY_UOM = 'WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS timestamp_array) ,window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array) ,resample AS (SELECT /*+ RANGE_JOIN(d, 900 ) */ d.window_start, d.window_end, e.`TagName`, avg(e.`Value`) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`) ,project AS (SELECT window_start AS `EventTime`, `TagName`, `Value` FROM resample GROUP BY window_start, `TagName`, `Value` ORDER BY `TagName`, `EventTime` ) SELECT p.`EventTime`, p.`TagName`, p.`Value`, m.`UoM` FROM project p LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` m ON p.`TagName` = m.`TagName` ' +RESAMPLE_MOCKED_QUERY = "WITH raw AS (SELECT DISTINCT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, window(from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000'), '15 minute', '15 minute', '0 second') AS `window`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND timestampadd(minute, 15, to_timestamp('2011-01-02T23:59:59+00:00')) AND `TagName` IN ('mocked-TAGNAME')), resample AS (SELECT raw.`TagName`, raw.`window`.start AS `EventTime`, avg(raw.`Value`) AS `Value` FROM raw 
GROUP BY raw.`TagName`, raw.`window`.start ORDER BY `TagName`, `EventTime`) SELECT * FROM resample" +RESAMPLE_MOCKED_QUERY_CHECK_TAGS = "WITH raw AS (SELECT DISTINCT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, window(from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000'), '15 minute', '15 minute', '0 second') AS `window`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND timestampadd(minute, 15, to_timestamp('2011-01-02T23:59:59+00:00')) AND UPPER(`TagName`) IN ('MOCKED-TAGNAME')), resample AS (SELECT raw.`TagName`, raw.`window`.start AS `EventTime`, avg(raw.`Value`) AS `Value` FROM raw GROUP BY raw.`TagName`, raw.`window`.start ORDER BY `TagName`, `EventTime`) SELECT * FROM resample" +RESAMPLE_MOCKED_QUERY_PIVOT = "WITH raw AS (SELECT DISTINCT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, window(from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000'), '15 minute', '15 minute', '0 second') AS `window`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND timestampadd(minute, 15, to_timestamp('2011-01-02T23:59:59+00:00')) AND `TagName` IN ('mocked-TAGNAME')), resample AS (SELECT raw.`TagName`, raw.`window`.start AS `EventTime`, avg(raw.`Value`) AS `Value` FROM raw GROUP BY raw.`TagName`, raw.`window`.start), pivot AS (SELECT * FROM (SELECT `EventTime`, `Value`, `TagName` FROM resample) PIVOT (FIRST(`Value`) FOR `TagName` IN ('mocked-TAGNAME' AS `mocked-TAGNAME`)) ORDER BY `EventTime`) SELECT * FROM pivot" +RESAMPLE_MOCKED_QUERY_UOM = "WITH raw AS (SELECT DISTINCT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, window(from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000'), '15 minute', '15 minute', '0 second') AS `window`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND timestampadd(minute, 15, to_timestamp('2011-01-02T23:59:59+00:00')) AND `TagName` IN ('mocked-TAGNAME')), resample AS (SELECT raw.`TagName`, raw.`window`.start AS `EventTime`, avg(raw.`Value`) AS `Value` FROM raw GROUP BY raw.`TagName`, raw.`window`.start ORDER BY `TagName`, `EventTime`), uom AS (SELECT resample.*, metadata.`UoM` FROM resample LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` AS metadata ON resample.`TagName` = metadata.`TagName`) SELECT * FROM uom" # Plot -PLOT_MOCKED_QUERY = 'WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS timestamp_array) ,window_buckets AS (SELECT timestamp_array AS window_start, 
timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array) ,plot AS (SELECT /*+ RANGE_JOIN(d, 900 ) */ d.window_start, d.window_end, e.`TagName`, min(CASE WHEN `Status` = \'Bad\' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `min_Value`, max(CASE WHEN `Status` = \'Bad\' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `max_Value`, first(CASE WHEN `Status` = \'Bad\' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `first_Value`, last(CASE WHEN `Status` = \'Bad\' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `last_Value`, first(CASE WHEN `Status` = \'Bad\' THEN struct(e.`Value`, e.`EventTime`) ELSE null END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `excp_Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`) ,deduplicate AS (SELECT window_start AS `EventTime`, `TagName`, `min_Value` as `Min`, `max_Value` as `Max`, `first_Value` as `First`, `last_Value` as `Last`, `excp_Value` as `Exception` FROM plot GROUP BY window_start, `TagName`, `min_Value`, `max_Value`, `first_Value`, `last_Value`, `excp_Value`) ,project AS (SELECT distinct Values.EventTime, `TagName`, Values.Value FROM (SELECT * FROM deduplicate UNPIVOT (`Values` for `Aggregation` IN (`Min`, `Max`, `First`, `Last`, `Exception`))) ORDER BY `TagName`, `EventTime` ) SELECT * FROM project ' -PLOT_MOCKED_QUERY_CHECK_TAGS = 'WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND UPPER(`TagName`) IN (\'MOCKED-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS timestamp_array) ,window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array) ,plot AS (SELECT /*+ RANGE_JOIN(d, 900 ) */ d.window_start, d.window_end, e.`TagName`, min(CASE WHEN `Status` = \'Bad\' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `min_Value`, max(CASE WHEN `Status` = \'Bad\' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `max_Value`, first(CASE WHEN `Status` = \'Bad\' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND 
UNBOUNDED FOLLOWING) AS `first_Value`, last(CASE WHEN `Status` = \'Bad\' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `last_Value`, first(CASE WHEN `Status` = \'Bad\' THEN struct(e.`Value`, e.`EventTime`) ELSE null END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `excp_Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`) ,deduplicate AS (SELECT window_start AS `EventTime`, `TagName`, `min_Value` as `Min`, `max_Value` as `Max`, `first_Value` as `First`, `last_Value` as `Last`, `excp_Value` as `Exception` FROM plot GROUP BY window_start, `TagName`, `min_Value`, `max_Value`, `first_Value`, `last_Value`, `excp_Value`) ,project AS (SELECT distinct Values.EventTime, `TagName`, Values.Value FROM (SELECT * FROM deduplicate UNPIVOT (`Values` for `Aggregation` IN (`Min`, `Max`, `First`, `Last`, `Exception`))) ORDER BY `TagName`, `EventTime` ) SELECT * FROM project ' -PLOT_MOCKED_QUERY_PIVOT = "WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc(\"millisecond\",`EventTime`), \"+0000\") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp(\"2011-01-01T00:00:00+00:00\") AND to_timestamp(\"2011-01-02T23:59:59+00:00\") AND `TagName` IN ('mocked-TAGNAME') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp(\"2011-01-01T00:00:00+00:00\"), \"+0000\"), from_utc_timestamp(to_timestamp(\"2011-01-02T23:59:59+00:00\"), \"+0000\"), INTERVAL '15 minute')) AS timestamp_array) ,window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array) ,plot AS (SELECT /*+ RANGE_JOIN(d, 900 ) */ d.window_start, d.window_end, e.`TagName`, min(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `min_Value`, max(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `max_Value`, first(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `first_Value`, last(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `last_Value`, first(CASE WHEN `Status` = 'Bad' THEN struct(e.`Value`, e.`EventTime`) ELSE null END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `excp_Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`) ,deduplicate AS (SELECT window_start AS `EventTime`, `TagName`, `min_Value` as `Min`, `max_Value` as `Max`, `first_Value` as `First`, `last_Value` as `Last`, `excp_Value` as `Exception` FROM plot GROUP 
BY window_start, `TagName`, `min_Value`, `max_Value`, `first_Value`, `last_Value`, `excp_Value`) ,project AS (SELECT distinct Values.EventTime, `TagName`, Values.Value FROM (SELECT * FROM deduplicate UNPIVOT (`Values` for `Aggregation` IN (`Min`, `Max`, `First`, `Last`, `Exception`))) ORDER BY `TagName`, `EventTime` ) ,pivot AS (SELECT * FROM (SELECT `EventTime`, `Value`, `TagName` AS `TagName` FROM project) PIVOT (FIRST(`Value`) FOR `TagName` IN ('mocked-TAGNAME' AS `mocked-TAGNAME`))) SELECT * FROM pivot ORDER BY `EventTime` " -PLOT_MOCKED_QUERY_UOM = 'WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS timestamp_array) ,window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array) ,plot AS (SELECT /*+ RANGE_JOIN(d, 900 ) */ d.window_start, d.window_end, e.`TagName`, min(CASE WHEN `Status` = \'Bad\' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `min_Value`, max(CASE WHEN `Status` = \'Bad\' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `max_Value`, first(CASE WHEN `Status` = \'Bad\' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `first_Value`, last(CASE WHEN `Status` = \'Bad\' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `last_Value`, first(CASE WHEN `Status` = \'Bad\' THEN struct(e.`Value`, e.`EventTime`) ELSE null END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `excp_Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`) ,deduplicate AS (SELECT window_start AS `EventTime`, `TagName`, `min_Value` as `Min`, `max_Value` as `Max`, `first_Value` as `First`, `last_Value` as `Last`, `excp_Value` as `Exception` FROM plot GROUP BY window_start, `TagName`, `min_Value`, `max_Value`, `first_Value`, `last_Value`, `excp_Value`) ,project AS (SELECT distinct Values.EventTime, `TagName`, Values.Value FROM (SELECT * FROM deduplicate UNPIVOT (`Values` for `Aggregation` IN (`Min`, `Max`, `First`, `Last`, `Exception`))) ORDER BY `TagName`, `EventTime` ) SELECT p.`EventTime`, p.`TagName`, p.`Value`, m.`UoM` FROM project p LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` m ON p.`TagName` = m.`TagName` ' +PLOT_MOCKED_QUERY = "WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') 
AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND to_timestamp('2011-01-02T23:59:59+00:00') AND `TagName` IN ('mocked-TAGNAME')), date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp('2011-01-01T00:00:00+00:00'), '+0000'), from_utc_timestamp(to_timestamp('2011-01-02T23:59:59+00:00'), '+0000'), INTERVAL '15 minute')) AS timestamp_array), window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array), plot AS (SELECT /*+ RANGE_JOIN(d, 900) */ d.window_start, d.window_end, e.`TagName`, min(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `min_Value`, max(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `max_Value`, first(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `first_Value`, last(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `last_Value`, first(CASE WHEN `Status` = 'Bad' THEN struct(e.`Value`, e.`EventTime`) ELSE null END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `excp_Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`), deduplicate AS (SELECT window_start AS `EventTime`, `TagName`, `min_Value` as `Min`, `max_Value` as `Max`, `first_Value` as `First`, `last_Value` as `Last`, `excp_Value` as `Exception` FROM plot GROUP BY window_start, `TagName`, `min_Value`, `max_Value`, `first_Value`, `last_Value`, `excp_Value`), project AS (SELECT distinct Values.EventTime, `TagName`, Values.Value FROM (SELECT * FROM deduplicate UNPIVOT (`Values` for `Aggregation` IN (`Min`, `Max`, `First`, `Last`, `Exception`))) ORDER BY `TagName`, `EventTime`) SELECT * FROM project" +PLOT_MOCKED_QUERY_CHECK_TAGS = "WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND to_timestamp('2011-01-02T23:59:59+00:00') AND UPPER(`TagName`) IN ('MOCKED-TAGNAME')), date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp('2011-01-01T00:00:00+00:00'), '+0000'), from_utc_timestamp(to_timestamp('2011-01-02T23:59:59+00:00'), '+0000'), INTERVAL '15 minute')) AS timestamp_array), window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array), plot AS (SELECT /*+ RANGE_JOIN(d, 900) */ d.window_start, d.window_end, e.`TagName`, min(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER 
(PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `min_Value`, max(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `max_Value`, first(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `first_Value`, last(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `last_Value`, first(CASE WHEN `Status` = 'Bad' THEN struct(e.`Value`, e.`EventTime`) ELSE null END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `excp_Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`), deduplicate AS (SELECT window_start AS `EventTime`, `TagName`, `min_Value` as `Min`, `max_Value` as `Max`, `first_Value` as `First`, `last_Value` as `Last`, `excp_Value` as `Exception` FROM plot GROUP BY window_start, `TagName`, `min_Value`, `max_Value`, `first_Value`, `last_Value`, `excp_Value`), project AS (SELECT distinct Values.EventTime, `TagName`, Values.Value FROM (SELECT * FROM deduplicate UNPIVOT (`Values` for `Aggregation` IN (`Min`, `Max`, `First`, `Last`, `Exception`))) ORDER BY `TagName`, `EventTime`) SELECT * FROM project" +PLOT_MOCKED_QUERY_PIVOT = "WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND to_timestamp('2011-01-02T23:59:59+00:00') AND `TagName` IN ('mocked-TAGNAME')), date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp('2011-01-01T00:00:00+00:00'), '+0000'), from_utc_timestamp(to_timestamp('2011-01-02T23:59:59+00:00'), '+0000'), INTERVAL '15 minute')) AS timestamp_array), window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array), plot AS (SELECT /*+ RANGE_JOIN(d, 900) */ d.window_start, d.window_end, e.`TagName`, min(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `min_Value`, max(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `max_Value`, first(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `first_Value`, last(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `last_Value`, first(CASE 
WHEN `Status` = 'Bad' THEN struct(e.`Value`, e.`EventTime`) ELSE null END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `excp_Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`), deduplicate AS (SELECT window_start AS `EventTime`, `TagName`, `min_Value` as `Min`, `max_Value` as `Max`, `first_Value` as `First`, `last_Value` as `Last`, `excp_Value` as `Exception` FROM plot GROUP BY window_start, `TagName`, `min_Value`, `max_Value`, `first_Value`, `last_Value`, `excp_Value`), project AS (SELECT distinct Values.EventTime, `TagName`, Values.Value FROM (SELECT * FROM deduplicate UNPIVOT (`Values` for `Aggregation` IN (`Min`, `Max`, `First`, `Last`, `Exception`)))), pivot AS (SELECT * FROM (SELECT `EventTime`, `Value`, `TagName` FROM project) PIVOT (FIRST(`Value`) FOR `TagName` IN ('mocked-TAGNAME' AS `mocked-TAGNAME`)) ORDER BY `EventTime`) SELECT * FROM pivot" +PLOT_MOCKED_QUERY_UOM = "WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND to_timestamp('2011-01-02T23:59:59+00:00') AND `TagName` IN ('mocked-TAGNAME')), date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp('2011-01-01T00:00:00+00:00'), '+0000'), from_utc_timestamp(to_timestamp('2011-01-02T23:59:59+00:00'), '+0000'), INTERVAL '15 minute')) AS timestamp_array), window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array), plot AS (SELECT /*+ RANGE_JOIN(d, 900) */ d.window_start, d.window_end, e.`TagName`, min(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `min_Value`, max(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `max_Value`, first(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `first_Value`, last(CASE WHEN `Status` = 'Bad' THEN null ELSE struct(e.`Value`, e.`EventTime`) END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `last_Value`, first(CASE WHEN `Status` = 'Bad' THEN struct(e.`Value`, e.`EventTime`) ELSE null END, True) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `excp_Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`), deduplicate AS (SELECT window_start AS `EventTime`, `TagName`, `min_Value` as `Min`, `max_Value` as `Max`, `first_Value` as `First`, `last_Value` as `Last`, `excp_Value` as `Exception` FROM plot GROUP BY window_start, `TagName`, `min_Value`, `max_Value`, `first_Value`, `last_Value`, `excp_Value`), project AS (SELECT distinct Values.EventTime, `TagName`, Values.Value 
FROM (SELECT * FROM deduplicate UNPIVOT (`Values` for `Aggregation` IN (`Min`, `Max`, `First`, `Last`, `Exception`))) ORDER BY `TagName`, `EventTime`), uom AS (SELECT project.*, metadata.`UoM` FROM project LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` AS metadata ON project.`TagName` = metadata.`TagName`) SELECT * FROM uom" # Interpolate -INTERPOLATE_MOCKED_QUERY = 'WITH resample AS (WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS timestamp_array) ,window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array) ,resample AS (SELECT /*+ RANGE_JOIN(d, 900 ) */ d.window_start, d.window_end, e.`TagName`, avg(e.`Value`) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`) ,project AS (SELECT window_start AS `EventTime`, `TagName`, `Value` FROM resample GROUP BY window_start, `TagName`, `Value` ) SELECT * FROM project ),date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS `EventTime`, explode(array(\'mocked-TAGNAME\')) AS `TagName`) ,project AS (SELECT a.`EventTime`, a.`TagName`, last_value(b.`Value`, true) OVER (PARTITION BY a.`TagName` ORDER BY a.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Value` FROM date_array a LEFT OUTER JOIN resample b ON a.`EventTime` = b.`EventTime` AND a.`TagName` = b.`TagName`) SELECT * FROM project ORDER BY `TagName`, `EventTime` ' -INTERPOLATE_MOCKED_QUERY_BACKWARD_FILL = 'WITH resample AS (WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS timestamp_array) ,window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array) ,resample AS (SELECT /*+ RANGE_JOIN(d, 900 ) */ d.window_start, d.window_end, e.`TagName`, avg(e.`Value`) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`) ,project AS (SELECT 
window_start AS `EventTime`, `TagName`, `Value` FROM resample GROUP BY window_start, `TagName`, `Value` ) SELECT * FROM project ),date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS `EventTime`, explode(array(\'mocked-TAGNAME\')) AS `TagName`) ,project AS (SELECT a.`EventTime`, a.`TagName`, first_value(b.`Value`, true) OVER (PARTITION BY a.`TagName` ORDER BY a.`EventTime` ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS `Value` FROM date_array a LEFT OUTER JOIN resample b ON a.`EventTime` = b.`EventTime` AND a.`TagName` = b.`TagName`) SELECT * FROM project ORDER BY `TagName`, `EventTime` ' -INTERPOLATE_MOCKED_QUERY_CHECK_TAGS = 'WITH resample AS (WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND UPPER(`TagName`) IN (\'MOCKED-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS timestamp_array) ,window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array) ,resample AS (SELECT /*+ RANGE_JOIN(d, 900 ) */ d.window_start, d.window_end, e.`TagName`, avg(e.`Value`) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`) ,project AS (SELECT window_start AS `EventTime`, `TagName`, `Value` FROM resample GROUP BY window_start, `TagName`, `Value` ) SELECT * FROM project ),date_array AS (SELECT DISTINCT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS `EventTime`, explode(array(`TagName`)) AS `TagName` FROM resample) ,project AS (SELECT a.`EventTime`, a.`TagName`, first_value(b.`Value`, true) OVER (PARTITION BY a.`TagName` ORDER BY a.`EventTime` ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS `Value` FROM date_array a LEFT OUTER JOIN resample b ON a.`EventTime` = b.`EventTime` AND a.`TagName` = b.`TagName`) SELECT * FROM project ORDER BY `TagName`, `EventTime` ' -INTERPOLATE_MOCKED_QUERY_PIVOT = 'WITH resample AS (WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS timestamp_array) ,window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM 
date_array) ,resample AS (SELECT /*+ RANGE_JOIN(d, 900 ) */ d.window_start, d.window_end, e.`TagName`, avg(e.`Value`) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`) ,project AS (SELECT window_start AS `EventTime`, `TagName`, `Value` FROM resample GROUP BY window_start, `TagName`, `Value` ) SELECT * FROM project ),date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS `EventTime`, explode(array(\'mocked-TAGNAME\')) AS `TagName`) ,project AS (SELECT a.`EventTime`, a.`TagName`, first_value(b.`Value`, true) OVER (PARTITION BY a.`TagName` ORDER BY a.`EventTime` ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS `Value` FROM date_array a LEFT OUTER JOIN resample b ON a.`EventTime` = b.`EventTime` AND a.`TagName` = b.`TagName`) ,pivot AS (SELECT * FROM (SELECT `EventTime`, `Value`, `TagName` AS `TagName` FROM project) PIVOT (FIRST(`Value`) FOR `TagName` IN (\'mocked-TAGNAME\' AS `mocked-TAGNAME`))) SELECT * FROM pivot ORDER BY `EventTime` ' -INTERPOLATE_MOCKED_QUERY_UOM = 'WITH resample AS (WITH raw_events AS (SELECT DISTINCT from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS timestamp_array) ,window_buckets AS (SELECT timestamp_array AS window_start, timestampadd(minute, 15, timestamp_array) AS window_end FROM date_array) ,resample AS (SELECT /*+ RANGE_JOIN(d, 900 ) */ d.window_start, d.window_end, e.`TagName`, avg(e.`Value`) OVER (PARTITION BY e.`TagName`, d.window_start ORDER BY e.`EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `Value` FROM window_buckets d INNER JOIN raw_events e ON d.window_start <= e.`EventTime` AND d.window_end > e.`EventTime`) ,project AS (SELECT window_start AS `EventTime`, `TagName`, `Value` FROM resample GROUP BY window_start, `TagName`, `Value` ) SELECT p.`EventTime`, p.`TagName`, p.`Value`, m.`UoM` FROM project p LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` m ON p.`TagName` = m.`TagName` ),date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS `EventTime`, explode(array(\'mocked-TAGNAME\')) AS `TagName`) ,project AS (SELECT a.`EventTime`, a.`TagName`, first_value(b.`Value`, true) OVER (PARTITION BY a.`TagName` ORDER BY a.`EventTime` ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS `Value` FROM date_array a LEFT OUTER JOIN resample b ON a.`EventTime` = b.`EventTime` AND a.`TagName` = b.`TagName`) SELECT p.`EventTime`, p.`TagName`, p.`Value`, m.`UoM` FROM project p LEFT OUTER JOIN 
`mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` m ON p.`TagName` = m.`TagName` ORDER BY `TagName`, `EventTime` ' +INTERPOLATE_MOCKED_QUERY = "WITH raw AS (SELECT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, window(from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000'), '15 minute', '15 minute', '0 second') AS `window`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND timestampadd(minute, 15, to_timestamp('2011-01-02T23:59:59+00:00')) AND `TagName` IN ('mocked-TAGNAME')), intervals AS (SELECT DISTINCT explode(sequence(from_utc_timestamp(to_timestamp('2011-01-01T00:00:00+00:00'), '+0000'), from_utc_timestamp(to_timestamp('2011-01-02T23:59:59+00:00'), '+0000'), INTERVAL '15 minute')) AS `EventTime`, explode(array('mocked-TAGNAME')) AS `TagName`), fill_intervals as (SELECT intervals.`TagName`, intervals.`EventTime` as `EventTime`, raw. `EventTime` as `OriginalEventTime`, raw.`Value`, CASE WHEN raw.`Value` IS NULL THEN NULL ELSE struct(raw.`EventTime`, raw.`Value`) END AS `EventTime_Value` FROM intervals LEFT OUTER JOIN raw ON intervals.`EventTime` = raw.`window`.start AND intervals.`TagName` = raw.`TagName`), interpolate_calculate AS (SELECT `OriginalEventTime`, `EventTime`, `TagName`, CASE WHEN `Value` IS NOT NULL THEN NULL ELSE LAG(`EventTime_Value`) IGNORE NULLS OVER (PARTITION BY `TagName` ORDER BY `EventTime`) END AS PrevEventTimeValue, CASE WHEN `Value` IS NOT NULL THEN NULL ELSE LEAD(`EventTime_Value`) IGNORE NULLS OVER (PARTITION BY `TagName` ORDER BY `EventTime`) END AS NextEventTimeValue, CASE WHEN `OriginalEventTime` = `EventTime` THEN `Value` WHEN `PrevEventTimeValue` IS NOT NULL AND `NextEventTimeValue` IS NOT NULL THEN `PrevEventTimeValue`.`Value` + ((`NextEventTimeValue`.`Value` - `PrevEventTimeValue`.`Value`) * (unix_timestamp(`EventTime`) - unix_timestamp(`PrevEventTimeValue`.`EventTime`)) / (unix_timestamp(`NextEventTimeValue`.`EventTime`) - unix_timestamp(`PrevEventTimeValue`.`EventTime`))) WHEN `PrevEventTimeValue` IS NOT NULL THEN `PrevEventTimeValue`.`Value` ELSE NULL END as `Value` FROM fill_intervals ), interpolate AS (SELECT `EventTime`, `TagName`, `Value` FROM interpolate_calculate WHERE `OriginalEventTime` IS NULL OR `OriginalEventTime` = `EventTime` ORDER BY `TagName`, `EventTime`) SELECT * FROM interpolate" +INTERPOLATE_MOCKED_QUERY_CHECK_TAGS = "WITH raw AS (SELECT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, window(from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000'), '15 minute', '15 minute', '0 second') AS `window`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND timestampadd(minute, 15, to_timestamp('2011-01-02T23:59:59+00:00')) AND UPPER(`TagName`) IN ('MOCKED-TAGNAME')), intervals AS (SELECT DISTINCT explode(sequence(from_utc_timestamp(to_timestamp('2011-01-01T00:00:00+00:00'), '+0000'), from_utc_timestamp(to_timestamp('2011-01-02T23:59:59+00:00'), '+0000'), INTERVAL '15 minute')) AS `EventTime`, explode(array('MOCKED-TAGNAME')) AS `TagName`), fill_intervals as (SELECT intervals.`TagName`, intervals.`EventTime` as `EventTime`, raw. 
`EventTime` as `OriginalEventTime`, raw.`Value`, CASE WHEN raw.`Value` IS NULL THEN NULL ELSE struct(raw.`EventTime`, raw.`Value`) END AS `EventTime_Value` FROM intervals LEFT OUTER JOIN raw ON intervals.`EventTime` = raw.`window`.start AND intervals.`TagName` = raw.`TagName`), interpolate_calculate AS (SELECT `OriginalEventTime`, `EventTime`, `TagName`, CASE WHEN `Value` IS NOT NULL THEN NULL ELSE LAG(`EventTime_Value`) IGNORE NULLS OVER (PARTITION BY `TagName` ORDER BY `EventTime`) END AS PrevEventTimeValue, CASE WHEN `Value` IS NOT NULL THEN NULL ELSE LEAD(`EventTime_Value`) IGNORE NULLS OVER (PARTITION BY `TagName` ORDER BY `EventTime`) END AS NextEventTimeValue, CASE WHEN `OriginalEventTime` = `EventTime` THEN `Value` WHEN `PrevEventTimeValue` IS NOT NULL AND `NextEventTimeValue` IS NOT NULL THEN `PrevEventTimeValue`.`Value` + ((`NextEventTimeValue`.`Value` - `PrevEventTimeValue`.`Value`) * (unix_timestamp(`EventTime`) - unix_timestamp(`PrevEventTimeValue`.`EventTime`)) / (unix_timestamp(`NextEventTimeValue`.`EventTime`) - unix_timestamp(`PrevEventTimeValue`.`EventTime`))) WHEN `PrevEventTimeValue` IS NOT NULL THEN `PrevEventTimeValue`.`Value` ELSE NULL END as `Value` FROM fill_intervals ), interpolate AS (SELECT `EventTime`, `TagName`, `Value` FROM interpolate_calculate WHERE `OriginalEventTime` IS NULL OR `OriginalEventTime` = `EventTime` ORDER BY `TagName`, `EventTime`) SELECT * FROM interpolate" +INTERPOLATE_MOCKED_QUERY_PIVOT = "WITH raw AS (SELECT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, window(from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000'), '15 minute', '15 minute', '0 second') AS `window`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND timestampadd(minute, 15, to_timestamp('2011-01-02T23:59:59+00:00')) AND `TagName` IN ('mocked-TAGNAME')), intervals AS (SELECT DISTINCT explode(sequence(from_utc_timestamp(to_timestamp('2011-01-01T00:00:00+00:00'), '+0000'), from_utc_timestamp(to_timestamp('2011-01-02T23:59:59+00:00'), '+0000'), INTERVAL '15 minute')) AS `EventTime`, explode(array('mocked-TAGNAME')) AS `TagName`), fill_intervals as (SELECT intervals.`TagName`, intervals.`EventTime` as `EventTime`, raw. 
`EventTime` as `OriginalEventTime`, raw.`Value`, CASE WHEN raw.`Value` IS NULL THEN NULL ELSE struct(raw.`EventTime`, raw.`Value`) END AS `EventTime_Value` FROM intervals LEFT OUTER JOIN raw ON intervals.`EventTime` = raw.`window`.start AND intervals.`TagName` = raw.`TagName`), interpolate_calculate AS (SELECT `OriginalEventTime`, `EventTime`, `TagName`, CASE WHEN `Value` IS NOT NULL THEN NULL ELSE LAG(`EventTime_Value`) IGNORE NULLS OVER (PARTITION BY `TagName` ORDER BY `EventTime`) END AS PrevEventTimeValue, CASE WHEN `Value` IS NOT NULL THEN NULL ELSE LEAD(`EventTime_Value`) IGNORE NULLS OVER (PARTITION BY `TagName` ORDER BY `EventTime`) END AS NextEventTimeValue, CASE WHEN `OriginalEventTime` = `EventTime` THEN `Value` WHEN `PrevEventTimeValue` IS NOT NULL AND `NextEventTimeValue` IS NOT NULL THEN `PrevEventTimeValue`.`Value` + ((`NextEventTimeValue`.`Value` - `PrevEventTimeValue`.`Value`) * (unix_timestamp(`EventTime`) - unix_timestamp(`PrevEventTimeValue`.`EventTime`)) / (unix_timestamp(`NextEventTimeValue`.`EventTime`) - unix_timestamp(`PrevEventTimeValue`.`EventTime`))) WHEN `PrevEventTimeValue` IS NOT NULL THEN `PrevEventTimeValue`.`Value` ELSE NULL END as `Value` FROM fill_intervals ), interpolate AS (SELECT `EventTime`, `TagName`, `Value` FROM interpolate_calculate WHERE `OriginalEventTime` IS NULL OR `OriginalEventTime` = `EventTime` ), pivot AS (SELECT * FROM (SELECT `EventTime`, `Value`, `TagName` FROM interpolate) PIVOT (FIRST(`Value`) FOR `TagName` IN ('mocked-TAGNAME' AS `mocked-TAGNAME`)) ORDER BY `EventTime`) SELECT * FROM pivot" +INTERPOLATE_MOCKED_QUERY_UOM = "WITH raw AS (SELECT from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000') AS `EventTime`, window(from_utc_timestamp(date_trunc('millisecond',`EventTime`), '+0000'), '15 minute', '15 minute', '0 second') AS `window`, `TagName`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND timestampadd(minute, 15, to_timestamp('2011-01-02T23:59:59+00:00')) AND `TagName` IN ('mocked-TAGNAME')), intervals AS (SELECT DISTINCT explode(sequence(from_utc_timestamp(to_timestamp('2011-01-01T00:00:00+00:00'), '+0000'), from_utc_timestamp(to_timestamp('2011-01-02T23:59:59+00:00'), '+0000'), INTERVAL '15 minute')) AS `EventTime`, explode(array('mocked-TAGNAME')) AS `TagName`), fill_intervals as (SELECT intervals.`TagName`, intervals.`EventTime` as `EventTime`, raw. 
`EventTime` as `OriginalEventTime`, raw.`Value`, CASE WHEN raw.`Value` IS NULL THEN NULL ELSE struct(raw.`EventTime`, raw.`Value`) END AS `EventTime_Value` FROM intervals LEFT OUTER JOIN raw ON intervals.`EventTime` = raw.`window`.start AND intervals.`TagName` = raw.`TagName`), interpolate_calculate AS (SELECT `OriginalEventTime`, `EventTime`, `TagName`, CASE WHEN `Value` IS NOT NULL THEN NULL ELSE LAG(`EventTime_Value`) IGNORE NULLS OVER (PARTITION BY `TagName` ORDER BY `EventTime`) END AS PrevEventTimeValue, CASE WHEN `Value` IS NOT NULL THEN NULL ELSE LEAD(`EventTime_Value`) IGNORE NULLS OVER (PARTITION BY `TagName` ORDER BY `EventTime`) END AS NextEventTimeValue, CASE WHEN `OriginalEventTime` = `EventTime` THEN `Value` WHEN `PrevEventTimeValue` IS NOT NULL AND `NextEventTimeValue` IS NOT NULL THEN `PrevEventTimeValue`.`Value` + ((`NextEventTimeValue`.`Value` - `PrevEventTimeValue`.`Value`) * (unix_timestamp(`EventTime`) - unix_timestamp(`PrevEventTimeValue`.`EventTime`)) / (unix_timestamp(`NextEventTimeValue`.`EventTime`) - unix_timestamp(`PrevEventTimeValue`.`EventTime`))) WHEN `PrevEventTimeValue` IS NOT NULL THEN `PrevEventTimeValue`.`Value` ELSE NULL END as `Value` FROM fill_intervals ), interpolate AS (SELECT `EventTime`, `TagName`, `Value` FROM interpolate_calculate WHERE `OriginalEventTime` IS NULL OR `OriginalEventTime` = `EventTime` ORDER BY `TagName`, `EventTime`), uom AS (SELECT interpolate.*, metadata.`UoM` FROM interpolate LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` AS metadata ON interpolate.`TagName` = metadata.`TagName`) SELECT * FROM uom" # Time Weighted Average TWA_MOCKED_QUERY = 'WITH raw_events AS (SELECT DISTINCT `TagName`, from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE to_date(`EventTime`) BETWEEN date_sub(to_date(to_timestamp("2011-01-01T00:00:00+00:00")), 1) AND date_add(to_date(to_timestamp("2011-01-02T23:59:59+00:00")), 1) AND `TagName` IN (\'mocked-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS `EventTime`, explode(array(\'mocked-TAGNAME\')) AS `TagName`) ,boundary_events AS (SELECT coalesce(a.`TagName`, b.`TagName`) AS `TagName`, coalesce(a.`EventTime`, b.`EventTime`) AS `EventTime`, b.`Status`, b.`Value` FROM date_array a FULL OUTER JOIN raw_events b ON a.`EventTime` = b.`EventTime` AND a.`TagName` = b.`TagName`) ,window_buckets AS (SELECT `EventTime` AS window_start, LEAD(`EventTime`) OVER (ORDER BY `EventTime`) AS window_end FROM (SELECT distinct `EventTime` FROM date_array) ) ,window_events AS (SELECT /*+ RANGE_JOIN(b, 900 ) */ b.`TagName`, b.`EventTime`, a.window_start AS `WindowEventTime`, b.`Status`, b.`Value` FROM boundary_events b LEFT OUTER JOIN window_buckets a ON a.window_start <= b.`EventTime` AND a.window_end > b.`EventTime`) ,fill_status AS (SELECT *, last_value(`Status`, true) OVER (PARTITION BY `TagName` ORDER BY `EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Fill_Status`, CASE WHEN `Fill_Status` <> "Bad" THEN `Value` ELSE null END AS `Good_Value` FROM window_events) ,fill_value AS (SELECT *, last_value(`Good_Value`, true) OVER (PARTITION BY `TagName` ORDER BY `EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS 
`Fill_Value` FROM fill_status) ,fill_step AS (SELECT *, false AS Step FROM fill_value) ,interpolate AS (SELECT *, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lag(`EventTime`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Previous_EventTime`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lag(`Fill_Value`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Previous_Fill_Value`, lead(`EventTime`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) AS `Next_EventTime`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lead(`Fill_Value`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Next_Fill_Value`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN `Previous_Fill_Value` + ( (`Next_Fill_Value` - `Previous_Fill_Value`) * ( ( unix_timestamp(`EventTime`) - unix_timestamp(`Previous_EventTime`) ) / ( unix_timestamp(`Next_EventTime`) - unix_timestamp(`Previous_EventTime`) ) ) ) ELSE NULL END AS `Interpolated_Value`, coalesce(`Interpolated_Value`, `Fill_Value`) as `Event_Value` FROM fill_step ),twa_calculations AS (SELECT `TagName`, `EventTime`, `WindowEventTime`, `Step`, `Status`, `Value`, `Previous_EventTime`, `Previous_Fill_Value`, `Next_EventTime`, `Next_Fill_Value`, `Interpolated_Value`, `Fill_Status`, `Fill_Value`, `Event_Value`, lead(`Fill_Status`) OVER (PARTITION BY `TagName` ORDER BY `EventTime`) AS `Next_Status` , CASE WHEN `Next_Status` <> "Bad" OR (`Fill_Status` <> "Bad" AND `Next_Status` = "Bad") THEN lead(`Event_Value`) OVER (PARTITION BY `TagName` ORDER BY `EventTime`) ELSE `Value` END AS `Next_Value_For_Status` , CASE WHEN `Fill_Status` <> "Bad" THEN `Next_Value_For_Status` ELSE 0 END AS `Next_Value` , CASE WHEN `Fill_Status` <> "Bad" AND `Next_Status` <> "Bad" THEN ((cast(`Next_EventTime` AS double) - cast(`EventTime` AS double)) / 60) WHEN `Fill_Status` <> "Bad" AND `Next_Status` = "Bad" THEN ((cast(`Next_EventTime` AS integer) - cast(`EventTime` AS double)) / 60) ELSE 0 END AS good_minutes , CASE WHEN Step == false THEN ((`Event_Value` + `Next_Value`) * 0.5) * good_minutes ELSE (`Event_Value` * good_minutes) END AS twa_value FROM interpolate) ,twa AS (SELECT `TagName`, `WindowEventTime` AS `EventTime`, sum(twa_value) / sum(good_minutes) AS `Value` from twa_calculations GROUP BY `TagName`, `WindowEventTime`) ,project AS (SELECT * FROM twa WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00") AND to_timestamp("2011-01-02T23:59:59")) SELECT * FROM project ORDER BY `TagName`, `EventTime` ' TWA_MOCKED_QUERY_CHECK_TAGS = 'WITH raw_events AS (SELECT DISTINCT `TagName`, from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE to_date(`EventTime`) BETWEEN date_sub(to_date(to_timestamp("2011-01-01T00:00:00+00:00")), 1) AND date_add(to_date(to_timestamp("2011-01-02T23:59:59+00:00")), 1) AND UPPER(`TagName`) IN (\'MOCKED-TAGNAME\') ) ,date_array AS (SELECT DISTINCT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS `EventTime`, explode(array(`TagName`)) AS `TagName` FROM raw_events) ,boundary_events AS (SELECT coalesce(a.`TagName`, b.`TagName`) AS `TagName`, coalesce(a.`EventTime`, b.`EventTime`) AS `EventTime`, b.`Status`, b.`Value` FROM 
date_array a FULL OUTER JOIN raw_events b ON a.`EventTime` = b.`EventTime` AND a.`TagName` = b.`TagName`) ,window_buckets AS (SELECT `EventTime` AS window_start, LEAD(`EventTime`) OVER (ORDER BY `EventTime`) AS window_end FROM (SELECT distinct `EventTime` FROM date_array) ) ,window_events AS (SELECT /*+ RANGE_JOIN(b, 900 ) */ b.`TagName`, b.`EventTime`, a.window_start AS `WindowEventTime`, b.`Status`, b.`Value` FROM boundary_events b LEFT OUTER JOIN window_buckets a ON a.window_start <= b.`EventTime` AND a.window_end > b.`EventTime`) ,fill_status AS (SELECT *, last_value(`Status`, true) OVER (PARTITION BY `TagName` ORDER BY `EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Fill_Status`, CASE WHEN `Fill_Status` <> "Bad" THEN `Value` ELSE null END AS `Good_Value` FROM window_events) ,fill_value AS (SELECT *, last_value(`Good_Value`, true) OVER (PARTITION BY `TagName` ORDER BY `EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Fill_Value` FROM fill_status) ,fill_step AS (SELECT *, false AS Step FROM fill_value) ,interpolate AS (SELECT *, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lag(`EventTime`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Previous_EventTime`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lag(`Fill_Value`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Previous_Fill_Value`, lead(`EventTime`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) AS `Next_EventTime`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lead(`Fill_Value`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Next_Fill_Value`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN `Previous_Fill_Value` + ( (`Next_Fill_Value` - `Previous_Fill_Value`) * ( ( unix_timestamp(`EventTime`) - unix_timestamp(`Previous_EventTime`) ) / ( unix_timestamp(`Next_EventTime`) - unix_timestamp(`Previous_EventTime`) ) ) ) ELSE NULL END AS `Interpolated_Value`, coalesce(`Interpolated_Value`, `Fill_Value`) as `Event_Value` FROM fill_step ),twa_calculations AS (SELECT `TagName`, `EventTime`, `WindowEventTime`, `Step`, `Status`, `Value`, `Previous_EventTime`, `Previous_Fill_Value`, `Next_EventTime`, `Next_Fill_Value`, `Interpolated_Value`, `Fill_Status`, `Fill_Value`, `Event_Value`, lead(`Fill_Status`) OVER (PARTITION BY `TagName` ORDER BY `EventTime`) AS `Next_Status` , CASE WHEN `Next_Status` <> "Bad" OR (`Fill_Status` <> "Bad" AND `Next_Status` = "Bad") THEN lead(`Event_Value`) OVER (PARTITION BY `TagName` ORDER BY `EventTime`) ELSE `Value` END AS `Next_Value_For_Status` , CASE WHEN `Fill_Status` <> "Bad" THEN `Next_Value_For_Status` ELSE 0 END AS `Next_Value` , CASE WHEN `Fill_Status` <> "Bad" AND `Next_Status` <> "Bad" THEN ((cast(`Next_EventTime` AS double) - cast(`EventTime` AS double)) / 60) WHEN `Fill_Status` <> "Bad" AND `Next_Status` = "Bad" THEN ((cast(`Next_EventTime` AS integer) - cast(`EventTime` AS double)) / 60) ELSE 0 END AS good_minutes , CASE WHEN Step == false THEN ((`Event_Value` + `Next_Value`) * 0.5) * good_minutes ELSE (`Event_Value` * good_minutes) END AS twa_value FROM interpolate) ,twa AS (SELECT `TagName`, `WindowEventTime` AS `EventTime`, sum(twa_value) / sum(good_minutes) AS `Value` from twa_calculations GROUP BY `TagName`, `WindowEventTime`) ,project AS (SELECT * FROM twa WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00") AND to_timestamp("2011-01-02T23:59:59")) SELECT * FROM project ORDER BY 
`TagName`, `EventTime` ' TWA_MOCKED_QUERY_PIVOT = 'WITH raw_events AS (SELECT DISTINCT `TagName`, from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE to_date(`EventTime`) BETWEEN date_sub(to_date(to_timestamp("2011-01-01T00:00:00+00:00")), 1) AND date_add(to_date(to_timestamp("2011-01-02T23:59:59+00:00")), 1) AND `TagName` IN (\'mocked-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS `EventTime`, explode(array(\'mocked-TAGNAME\')) AS `TagName`) ,boundary_events AS (SELECT coalesce(a.`TagName`, b.`TagName`) AS `TagName`, coalesce(a.`EventTime`, b.`EventTime`) AS `EventTime`, b.`Status`, b.`Value` FROM date_array a FULL OUTER JOIN raw_events b ON a.`EventTime` = b.`EventTime` AND a.`TagName` = b.`TagName`) ,window_buckets AS (SELECT `EventTime` AS window_start, LEAD(`EventTime`) OVER (ORDER BY `EventTime`) AS window_end FROM (SELECT distinct `EventTime` FROM date_array) ) ,window_events AS (SELECT /*+ RANGE_JOIN(b, 900 ) */ b.`TagName`, b.`EventTime`, a.window_start AS `WindowEventTime`, b.`Status`, b.`Value` FROM boundary_events b LEFT OUTER JOIN window_buckets a ON a.window_start <= b.`EventTime` AND a.window_end > b.`EventTime`) ,fill_status AS (SELECT *, last_value(`Status`, true) OVER (PARTITION BY `TagName` ORDER BY `EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Fill_Status`, CASE WHEN `Fill_Status` <> "Bad" THEN `Value` ELSE null END AS `Good_Value` FROM window_events) ,fill_value AS (SELECT *, last_value(`Good_Value`, true) OVER (PARTITION BY `TagName` ORDER BY `EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Fill_Value` FROM fill_status) ,fill_step AS (SELECT *, false AS Step FROM fill_value) ,interpolate AS (SELECT *, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lag(`EventTime`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Previous_EventTime`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lag(`Fill_Value`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Previous_Fill_Value`, lead(`EventTime`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) AS `Next_EventTime`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lead(`Fill_Value`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Next_Fill_Value`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN `Previous_Fill_Value` + ( (`Next_Fill_Value` - `Previous_Fill_Value`) * ( ( unix_timestamp(`EventTime`) - unix_timestamp(`Previous_EventTime`) ) / ( unix_timestamp(`Next_EventTime`) - unix_timestamp(`Previous_EventTime`) ) ) ) ELSE NULL END AS `Interpolated_Value`, coalesce(`Interpolated_Value`, `Fill_Value`) as `Event_Value` FROM fill_step ),twa_calculations AS (SELECT `TagName`, `EventTime`, `WindowEventTime`, `Step`, `Status`, `Value`, `Previous_EventTime`, `Previous_Fill_Value`, `Next_EventTime`, `Next_Fill_Value`, `Interpolated_Value`, `Fill_Status`, `Fill_Value`, `Event_Value`, lead(`Fill_Status`) OVER (PARTITION BY `TagName` ORDER BY `EventTime`) AS `Next_Status` , CASE WHEN `Next_Status` <> "Bad" OR (`Fill_Status` <> "Bad" AND `Next_Status` = "Bad") THEN lead(`Event_Value`) OVER (PARTITION BY `TagName` 
ORDER BY `EventTime`) ELSE `Value` END AS `Next_Value_For_Status` , CASE WHEN `Fill_Status` <> "Bad" THEN `Next_Value_For_Status` ELSE 0 END AS `Next_Value` , CASE WHEN `Fill_Status` <> "Bad" AND `Next_Status` <> "Bad" THEN ((cast(`Next_EventTime` AS double) - cast(`EventTime` AS double)) / 60) WHEN `Fill_Status` <> "Bad" AND `Next_Status` = "Bad" THEN ((cast(`Next_EventTime` AS integer) - cast(`EventTime` AS double)) / 60) ELSE 0 END AS good_minutes , CASE WHEN Step == false THEN ((`Event_Value` + `Next_Value`) * 0.5) * good_minutes ELSE (`Event_Value` * good_minutes) END AS twa_value FROM interpolate) ,twa AS (SELECT `TagName`, `WindowEventTime` AS `EventTime`, sum(twa_value) / sum(good_minutes) AS `Value` from twa_calculations GROUP BY `TagName`, `WindowEventTime`) ,project AS (SELECT * FROM twa WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00") AND to_timestamp("2011-01-02T23:59:59")) ,pivot AS (SELECT * FROM (SELECT `EventTime`, `Value`, `TagName` AS `TagName` FROM project) PIVOT (FIRST(`Value`) FOR `TagName` IN (\'mocked-TAGNAME\' AS `mocked-TAGNAME`))) SELECT * FROM pivot ORDER BY `EventTime` ' -TWA_MOCKED_QUERY_METADATA = 'WITH raw_events AS (SELECT DISTINCT `TagName`, from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE to_date(`EventTime`) BETWEEN date_sub(to_date(to_timestamp("2011-01-01T00:00:00+00:00")), 1) AND date_add(to_date(to_timestamp("2011-01-02T23:59:59+00:00")), 1) AND `TagName` IN (\'mocked-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS `EventTime`, explode(array(\'mocked-TAGNAME\')) AS `TagName`) ,boundary_events AS (SELECT coalesce(a.`TagName`, b.`TagName`) AS `TagName`, coalesce(a.`EventTime`, b.`EventTime`) AS `EventTime`, b.`Status`, b.`Value` FROM date_array a FULL OUTER JOIN raw_events b ON a.`EventTime` = b.`EventTime` AND a.`TagName` = b.`TagName`) ,window_buckets AS (SELECT `EventTime` AS window_start, LEAD(`EventTime`) OVER (ORDER BY `EventTime`) AS window_end FROM (SELECT distinct `EventTime` FROM date_array) ) ,window_events AS (SELECT /*+ RANGE_JOIN(b, 900 ) */ b.`TagName`, b.`EventTime`, a.window_start AS `WindowEventTime`, b.`Status`, b.`Value` FROM boundary_events b LEFT OUTER JOIN window_buckets a ON a.window_start <= b.`EventTime` AND a.window_end > b.`EventTime`) ,fill_status AS (SELECT *, last_value(`Status`, true) OVER (PARTITION BY `TagName` ORDER BY `EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Fill_Status`, CASE WHEN `Fill_Status` <> "Bad" THEN `Value` ELSE null END AS `Good_Value` FROM window_events) ,fill_value AS (SELECT *, last_value(`Good_Value`, true) OVER (PARTITION BY `TagName` ORDER BY `EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Fill_Value` FROM fill_status) ,fill_step AS (SELECT *, IFNULL(Step, false) AS Step FROM fill_value f LEFT JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` m ON f.`TagName` = m.`TagName`) ,interpolate AS (SELECT *, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lag(`EventTime`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Previous_EventTime`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lag(`Fill_Value`) OVER ( 
PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Previous_Fill_Value`, lead(`EventTime`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) AS `Next_EventTime`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lead(`Fill_Value`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Next_Fill_Value`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN `Previous_Fill_Value` + ( (`Next_Fill_Value` - `Previous_Fill_Value`) * ( ( unix_timestamp(`EventTime`) - unix_timestamp(`Previous_EventTime`) ) / ( unix_timestamp(`Next_EventTime`) - unix_timestamp(`Previous_EventTime`) ) ) ) ELSE NULL END AS `Interpolated_Value`, coalesce(`Interpolated_Value`, `Fill_Value`) as `Event_Value` FROM fill_step ),twa_calculations AS (SELECT `TagName`, `EventTime`, `WindowEventTime`, `Step`, `Status`, `Value`, `Previous_EventTime`, `Previous_Fill_Value`, `Next_EventTime`, `Next_Fill_Value`, `Interpolated_Value`, `Fill_Status`, `Fill_Value`, `Event_Value`, lead(`Fill_Status`) OVER (PARTITION BY `TagName` ORDER BY `EventTime`) AS `Next_Status` , CASE WHEN `Next_Status` <> "Bad" OR (`Fill_Status` <> "Bad" AND `Next_Status` = "Bad") THEN lead(`Event_Value`) OVER (PARTITION BY `TagName` ORDER BY `EventTime`) ELSE `Value` END AS `Next_Value_For_Status` , CASE WHEN `Fill_Status` <> "Bad" THEN `Next_Value_For_Status` ELSE 0 END AS `Next_Value` , CASE WHEN `Fill_Status` <> "Bad" AND `Next_Status` <> "Bad" THEN ((cast(`Next_EventTime` AS double) - cast(`EventTime` AS double)) / 60) WHEN `Fill_Status` <> "Bad" AND `Next_Status` = "Bad" THEN ((cast(`Next_EventTime` AS integer) - cast(`EventTime` AS double)) / 60) ELSE 0 END AS good_minutes , CASE WHEN Step == false THEN ((`Event_Value` + `Next_Value`) * 0.5) * good_minutes ELSE (`Event_Value` * good_minutes) END AS twa_value FROM interpolate) ,twa AS (SELECT `TagName`, `WindowEventTime` AS `EventTime`, sum(twa_value) / sum(good_minutes) AS `Value` from twa_calculations GROUP BY `TagName`, `WindowEventTime`) ,project AS (SELECT * FROM twa WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00") AND to_timestamp("2011-01-02T23:59:59")) SELECT * FROM project ORDER BY `TagName`, `EventTime` ' +TWA_MOCKED_QUERY_METADATA = 'WITH raw_events AS (SELECT DISTINCT `TagName`, from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `Status`, `Value` FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE to_date(`EventTime`) BETWEEN date_sub(to_date(to_timestamp("2011-01-01T00:00:00+00:00")), 1) AND date_add(to_date(to_timestamp("2011-01-02T23:59:59+00:00")), 1) AND `TagName` IN (\'mocked-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS `EventTime`, explode(array(\'mocked-TAGNAME\')) AS `TagName`) ,boundary_events AS (SELECT coalesce(a.`TagName`, b.`TagName`) AS `TagName`, coalesce(a.`EventTime`, b.`EventTime`) AS `EventTime`, b.`Status`, b.`Value` FROM date_array a FULL OUTER JOIN raw_events b ON a.`EventTime` = b.`EventTime` AND a.`TagName` = b.`TagName`) ,window_buckets AS (SELECT `EventTime` AS window_start, LEAD(`EventTime`) OVER (ORDER BY `EventTime`) AS window_end FROM (SELECT distinct `EventTime` FROM date_array) ) ,window_events AS (SELECT /*+ RANGE_JOIN(b, 900 ) */ b.`TagName`, b.`EventTime`, a.window_start AS `WindowEventTime`, b.`Status`, b.`Value` 
FROM boundary_events b LEFT OUTER JOIN window_buckets a ON a.window_start <= b.`EventTime` AND a.window_end > b.`EventTime`) ,fill_status AS (SELECT *, last_value(`Status`, true) OVER (PARTITION BY `TagName` ORDER BY `EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Fill_Status`, CASE WHEN `Fill_Status` <> "Bad" THEN `Value` ELSE null END AS `Good_Value` FROM window_events) ,fill_value AS (SELECT *, last_value(`Good_Value`, true) OVER (PARTITION BY `TagName` ORDER BY `EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Fill_Value` FROM fill_status) ,fill_step AS (SELECT f.*, IFNULL(m.Step, false) AS Step FROM fill_value f LEFT JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` m ON f.`TagName` = m.`TagName`) ,interpolate AS (SELECT *, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lag(`EventTime`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Previous_EventTime`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lag(`Fill_Value`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Previous_Fill_Value`, lead(`EventTime`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) AS `Next_EventTime`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lead(`Fill_Value`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Next_Fill_Value`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN `Previous_Fill_Value` + ( (`Next_Fill_Value` - `Previous_Fill_Value`) * ( ( unix_timestamp(`EventTime`) - unix_timestamp(`Previous_EventTime`) ) / ( unix_timestamp(`Next_EventTime`) - unix_timestamp(`Previous_EventTime`) ) ) ) ELSE NULL END AS `Interpolated_Value`, coalesce(`Interpolated_Value`, `Fill_Value`) as `Event_Value` FROM fill_step ),twa_calculations AS (SELECT `TagName`, `EventTime`, `WindowEventTime`, `Step`, `Status`, `Value`, `Previous_EventTime`, `Previous_Fill_Value`, `Next_EventTime`, `Next_Fill_Value`, `Interpolated_Value`, `Fill_Status`, `Fill_Value`, `Event_Value`, lead(`Fill_Status`) OVER (PARTITION BY `TagName` ORDER BY `EventTime`) AS `Next_Status` , CASE WHEN `Next_Status` <> "Bad" OR (`Fill_Status` <> "Bad" AND `Next_Status` = "Bad") THEN lead(`Event_Value`) OVER (PARTITION BY `TagName` ORDER BY `EventTime`) ELSE `Value` END AS `Next_Value_For_Status` , CASE WHEN `Fill_Status` <> "Bad" THEN `Next_Value_For_Status` ELSE 0 END AS `Next_Value` , CASE WHEN `Fill_Status` <> "Bad" AND `Next_Status` <> "Bad" THEN ((cast(`Next_EventTime` AS double) - cast(`EventTime` AS double)) / 60) WHEN `Fill_Status` <> "Bad" AND `Next_Status` = "Bad" THEN ((cast(`Next_EventTime` AS integer) - cast(`EventTime` AS double)) / 60) ELSE 0 END AS good_minutes , CASE WHEN Step == false THEN ((`Event_Value` + `Next_Value`) * 0.5) * good_minutes ELSE (`Event_Value` * good_minutes) END AS twa_value FROM interpolate) ,twa AS (SELECT `TagName`, `WindowEventTime` AS `EventTime`, sum(twa_value) / sum(good_minutes) AS `Value` from twa_calculations GROUP BY `TagName`, `WindowEventTime`) ,project AS (SELECT * FROM twa WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00") AND to_timestamp("2011-01-02T23:59:59")) SELECT * FROM project ORDER BY `TagName`, `EventTime` ' TWA_MOCKED_QUERY_UOM = 'WITH raw_events AS (SELECT DISTINCT `TagName`, from_utc_timestamp(date_trunc("millisecond",`EventTime`), "+0000") AS `EventTime`, `Status`, `Value` FROM 
`mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE to_date(`EventTime`) BETWEEN date_sub(to_date(to_timestamp("2011-01-01T00:00:00+00:00")), 1) AND date_add(to_date(to_timestamp("2011-01-02T23:59:59+00:00")), 1) AND `TagName` IN (\'mocked-TAGNAME\') ) ,date_array AS (SELECT explode(sequence(from_utc_timestamp(to_timestamp("2011-01-01T00:00:00+00:00"), "+0000"), from_utc_timestamp(to_timestamp("2011-01-02T23:59:59+00:00"), "+0000"), INTERVAL \'15 minute\')) AS `EventTime`, explode(array(\'mocked-TAGNAME\')) AS `TagName`) ,boundary_events AS (SELECT coalesce(a.`TagName`, b.`TagName`) AS `TagName`, coalesce(a.`EventTime`, b.`EventTime`) AS `EventTime`, b.`Status`, b.`Value` FROM date_array a FULL OUTER JOIN raw_events b ON a.`EventTime` = b.`EventTime` AND a.`TagName` = b.`TagName`) ,window_buckets AS (SELECT `EventTime` AS window_start, LEAD(`EventTime`) OVER (ORDER BY `EventTime`) AS window_end FROM (SELECT distinct `EventTime` FROM date_array) ) ,window_events AS (SELECT /*+ RANGE_JOIN(b, 900 ) */ b.`TagName`, b.`EventTime`, a.window_start AS `WindowEventTime`, b.`Status`, b.`Value` FROM boundary_events b LEFT OUTER JOIN window_buckets a ON a.window_start <= b.`EventTime` AND a.window_end > b.`EventTime`) ,fill_status AS (SELECT *, last_value(`Status`, true) OVER (PARTITION BY `TagName` ORDER BY `EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Fill_Status`, CASE WHEN `Fill_Status` <> "Bad" THEN `Value` ELSE null END AS `Good_Value` FROM window_events) ,fill_value AS (SELECT *, last_value(`Good_Value`, true) OVER (PARTITION BY `TagName` ORDER BY `EventTime` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `Fill_Value` FROM fill_status) ,fill_step AS (SELECT *, false AS Step FROM fill_value) ,interpolate AS (SELECT *, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lag(`EventTime`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Previous_EventTime`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lag(`Fill_Value`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Previous_Fill_Value`, lead(`EventTime`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) AS `Next_EventTime`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN lead(`Fill_Value`) OVER ( PARTITION BY `TagName` ORDER BY `EventTime` ) ELSE NULL END AS `Next_Fill_Value`, CASE WHEN `Step` = false AND `Status` IS NULL AND `Value` IS NULL THEN `Previous_Fill_Value` + ( (`Next_Fill_Value` - `Previous_Fill_Value`) * ( ( unix_timestamp(`EventTime`) - unix_timestamp(`Previous_EventTime`) ) / ( unix_timestamp(`Next_EventTime`) - unix_timestamp(`Previous_EventTime`) ) ) ) ELSE NULL END AS `Interpolated_Value`, coalesce(`Interpolated_Value`, `Fill_Value`) as `Event_Value` FROM fill_step ),twa_calculations AS (SELECT `TagName`, `EventTime`, `WindowEventTime`, `Step`, `Status`, `Value`, `Previous_EventTime`, `Previous_Fill_Value`, `Next_EventTime`, `Next_Fill_Value`, `Interpolated_Value`, `Fill_Status`, `Fill_Value`, `Event_Value`, lead(`Fill_Status`) OVER (PARTITION BY `TagName` ORDER BY `EventTime`) AS `Next_Status` , CASE WHEN `Next_Status` <> "Bad" OR (`Fill_Status` <> "Bad" AND `Next_Status` = "Bad") THEN lead(`Event_Value`) OVER (PARTITION BY `TagName` ORDER BY `EventTime`) ELSE `Value` END AS `Next_Value_For_Status` , CASE WHEN `Fill_Status` <> "Bad" THEN `Next_Value_For_Status` ELSE 0 END AS `Next_Value` , CASE WHEN `Fill_Status` <> "Bad" AND 
`Next_Status` <> "Bad" THEN ((cast(`Next_EventTime` AS double) - cast(`EventTime` AS double)) / 60) WHEN `Fill_Status` <> "Bad" AND `Next_Status` = "Bad" THEN ((cast(`Next_EventTime` AS integer) - cast(`EventTime` AS double)) / 60) ELSE 0 END AS good_minutes , CASE WHEN Step == false THEN ((`Event_Value` + `Next_Value`) * 0.5) * good_minutes ELSE (`Event_Value` * good_minutes) END AS twa_value FROM interpolate) ,twa AS (SELECT `TagName`, `WindowEventTime` AS `EventTime`, sum(twa_value) / sum(good_minutes) AS `Value` from twa_calculations GROUP BY `TagName`, `WindowEventTime`) ,project AS (SELECT * FROM twa WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00") AND to_timestamp("2011-01-02T23:59:59")) SELECT p.`EventTime`, p.`TagName`, p.`Value`, m.`UoM` FROM project p LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` m ON p.`TagName` = m.`TagName` ORDER BY `TagName`, `EventTime` ' # Interpolation at Time @@ -93,6 +92,6 @@ LATEST_MOCKED_QUERY_UOM = "WITH latest AS (SELECT * FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_latest` WHERE `TagName` IN ('mocked-TAGNAME') ORDER BY `TagName` ) SELECT l.*, m.`UoM` FROM latest l LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` m ON l.`TagName` = m.`TagName` " # Summary -SUMMARY_MOCKED_QUERY = 'WITH summary AS (SELECT `TagName`, count(`Value`) as Count, CAST(Avg(`Value`) as decimal(10, 2)) as Avg, CAST(Min(`Value`) as decimal(10, 2)) as Min, CAST(Max(`Value`) as decimal(10, 2)) as Max, CAST(stddev(`Value`) as decimal(10, 2)) as StDev, CAST(sum(`Value`) as decimal(10, 2)) as Sum, CAST(variance(`Value`) as decimal(10, 2)) as Var FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') GROUP BY `TagName`) SELECT * FROM summary ' -SUMMARY_MOCKED_QUERY_CHECK_TAGS = 'WITH summary AS (SELECT `TagName`, count(`Value`) as Count, CAST(Avg(`Value`) as decimal(10, 2)) as Avg, CAST(Min(`Value`) as decimal(10, 2)) as Min, CAST(Max(`Value`) as decimal(10, 2)) as Max, CAST(stddev(`Value`) as decimal(10, 2)) as StDev, CAST(sum(`Value`) as decimal(10, 2)) as Sum, CAST(variance(`Value`) as decimal(10, 2)) as Var FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND UPPER(`TagName`) IN (\'MOCKED-TAGNAME\') GROUP BY `TagName`) SELECT * FROM summary ' -SUMMARY_MOCKED_QUERY_UOM = 'WITH summary AS (SELECT `TagName`, count(`Value`) as Count, CAST(Avg(`Value`) as decimal(10, 2)) as Avg, CAST(Min(`Value`) as decimal(10, 2)) as Min, CAST(Max(`Value`) as decimal(10, 2)) as Max, CAST(stddev(`Value`) as decimal(10, 2)) as StDev, CAST(sum(`Value`) as decimal(10, 2)) as Sum, CAST(variance(`Value`) as decimal(10, 2)) as Var FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp("2011-01-01T00:00:00+00:00") AND to_timestamp("2011-01-02T23:59:59+00:00") AND `TagName` IN (\'mocked-TAGNAME\') GROUP BY `TagName`) SELECT s.*, m.`UoM` FROM summary s LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` m ON s.`TagName` = m.`TagName` ' +SUMMARY_MOCKED_QUERY = 
"WITH summary AS (SELECT `TagName`, count(`Value`) as Count, CAST(Avg(`Value`) as decimal(10, 2)) as Avg, CAST(Min(`Value`) as decimal(10, 2)) as Min, CAST(Max(`Value`) as decimal(10, 2)) as Max, CAST(stddev(`Value`) as decimal(10, 2)) as StDev, CAST(sum(`Value`) as decimal(10, 2)) as Sum, CAST(variance(`Value`) as decimal(10, 2)) as Var FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND to_timestamp('2011-01-02T23:59:59+00:00') AND `TagName` IN ('mocked-TAGNAME') GROUP BY `TagName`) SELECT * FROM summary" +SUMMARY_MOCKED_QUERY_CHECK_TAGS = "WITH summary AS (SELECT `TagName`, count(`Value`) as Count, CAST(Avg(`Value`) as decimal(10, 2)) as Avg, CAST(Min(`Value`) as decimal(10, 2)) as Min, CAST(Max(`Value`) as decimal(10, 2)) as Max, CAST(stddev(`Value`) as decimal(10, 2)) as StDev, CAST(sum(`Value`) as decimal(10, 2)) as Sum, CAST(variance(`Value`) as decimal(10, 2)) as Var FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND to_timestamp('2011-01-02T23:59:59+00:00') AND UPPER(`TagName`) IN ('MOCKED-TAGNAME') GROUP BY `TagName`) SELECT * FROM summary" +SUMMARY_MOCKED_QUERY_UOM = "WITH summary AS (SELECT `TagName`, count(`Value`) as Count, CAST(Avg(`Value`) as decimal(10, 2)) as Avg, CAST(Min(`Value`) as decimal(10, 2)) as Min, CAST(Max(`Value`) as decimal(10, 2)) as Max, CAST(stddev(`Value`) as decimal(10, 2)) as StDev, CAST(sum(`Value`) as decimal(10, 2)) as Sum, CAST(variance(`Value`) as decimal(10, 2)) as Var FROM `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_events_mocked-data-type` WHERE `EventTime` BETWEEN to_timestamp('2011-01-01T00:00:00+00:00') AND to_timestamp('2011-01-02T23:59:59+00:00') AND `TagName` IN ('mocked-TAGNAME') GROUP BY `TagName`), uom AS (SELECT summary.*, metadata.`UoM` FROM summary LEFT OUTER JOIN `mocked-buiness-unit`.`sensors`.`mocked-asset_mocked-data-security-level_metadata` AS metadata ON summary.`TagName` = metadata.`TagName`) SELECT * FROM uom" diff --git a/tests/sdk/python/rtdip_sdk/queries/time_series/test_interpolate.py b/tests/sdk/python/rtdip_sdk/queries/time_series/test_interpolate.py index 87a1ed068..858150997 100644 --- a/tests/sdk/python/rtdip_sdk/queries/time_series/test_interpolate.py +++ b/tests/sdk/python/rtdip_sdk/queries/time_series/test_interpolate.py @@ -26,7 +26,6 @@ from tests.sdk.python.rtdip_sdk.queries._test_utils.sdk_test_objects import ( MOCKED_PARAMETER_DICT, INTERPOLATE_MOCKED_QUERY, - INTERPOLATE_MOCKED_QUERY_BACKWARD_FILL, INTERPOLATE_MOCKED_QUERY_CHECK_TAGS, MOCKED_QUERY_OFFSET_LIMIT, INTERPOLATE_MOCKED_QUERY_PIVOT, @@ -36,16 +35,16 @@ MOCKED_INTERPOLATE_PARAMETER_DICT = MOCKED_PARAMETER_DICT.copy() MOCKED_INTERPOLATE_PARAMETER_DICT["time_interval_rate"] = "15" MOCKED_INTERPOLATE_PARAMETER_DICT["time_interval_unit"] = "minute" -MOCKED_INTERPOLATE_PARAMETER_DICT["agg_method"] = "avg" -MOCKED_INTERPOLATE_PARAMETER_DICT["interpolation_method"] = "backward_fill" MOCKED_INTERPOLATE_PARAMETER_DICT["pivot"] = False -def test_interpolate_backward_fill(mocker: MockerFixture): +def test_interpolate(mocker: MockerFixture): + TEST_PARAMETERS = MOCKED_INTERPOLATE_PARAMETER_DICT.copy() + TEST_PARAMETERS["display_uom"] = False _test_base_succeed( mocker, - MOCKED_INTERPOLATE_PARAMETER_DICT, - INTERPOLATE_MOCKED_QUERY_BACKWARD_FILL, + TEST_PARAMETERS, + 
INTERPOLATE_MOCKED_QUERY, interpolate_get, ) @@ -61,18 +60,6 @@ def test_interpolate_uom(mocker: MockerFixture): ) -def test_interpolate_forward_fill(mocker: MockerFixture): - TEST_PARAMETERS = MOCKED_INTERPOLATE_PARAMETER_DICT.copy() - TEST_PARAMETERS["interpolation_method"] = "forward_fill" - TEST_PARAMETERS["display_uom"] = False - _test_base_succeed( - mocker, - TEST_PARAMETERS, - INTERPOLATE_MOCKED_QUERY, - interpolate_get, - ) - - def test_interpolate_check_tags(mocker: MockerFixture): TEST_PARAMETERS = MOCKED_INTERPOLATE_PARAMETER_DICT.copy() TEST_PARAMETERS["case_insensitivity_tag_search"] = True @@ -86,7 +73,6 @@ def test_interpolate_check_tags(mocker: MockerFixture): def test_interpolate_sample_rate_unit(mocker: MockerFixture): TEST_PARAMETERS = MOCKED_INTERPOLATE_PARAMETER_DICT.copy() - TEST_PARAMETERS["interpolation_method"] = "forward_fill" TEST_PARAMETERS["case_insensitivity_tag_search"] = False TEST_PARAMETERS["sample_rate"] = "15" TEST_PARAMETERS["sample_unit"] = "minute" @@ -111,14 +97,13 @@ def test_interpolate_pivot(mocker: MockerFixture): def test_interpolate_offset_limit(mocker: MockerFixture): TEST_PARAMETERS = MOCKED_INTERPOLATE_PARAMETER_DICT.copy() - TEST_PARAMETERS["interpolation_method"] = "forward_fill" TEST_PARAMETERS["pivot"] = False TEST_PARAMETERS["offset"] = 10 TEST_PARAMETERS["limit"] = 10 _test_base_succeed( mocker, TEST_PARAMETERS, - INTERPOLATE_MOCKED_QUERY + MOCKED_QUERY_OFFSET_LIMIT, + INTERPOLATE_MOCKED_QUERY + " " + MOCKED_QUERY_OFFSET_LIMIT.strip(), interpolate_get, ) diff --git a/tests/sdk/python/rtdip_sdk/queries/time_series/test_plot.py b/tests/sdk/python/rtdip_sdk/queries/time_series/test_plot.py index fefab34be..e81a81185 100644 --- a/tests/sdk/python/rtdip_sdk/queries/time_series/test_plot.py +++ b/tests/sdk/python/rtdip_sdk/queries/time_series/test_plot.py @@ -34,6 +34,7 @@ MOCKED_PLOT_PARAMETER_DICT = MOCKED_PARAMETER_DICT.copy() MOCKED_PLOT_PARAMETER_DICT["time_interval_rate"] = "15" MOCKED_PLOT_PARAMETER_DICT["time_interval_unit"] = "minute" +MOCKED_PLOT_PARAMETER_DICT["pivot"] = False def test_plot_success(mocker: MockerFixture): @@ -91,13 +92,13 @@ def test_plot_uom(mocker: MockerFixture): def test_plot_offset_limit(mocker: MockerFixture): - MOCKED_PLOT_PARAMETER_DICT["display_uom"] = False MOCKED_PLOT_PARAMETER_DICT["offset"] = 10 MOCKED_PLOT_PARAMETER_DICT["limit"] = 10 + MOCKED_PLOT_PARAMETER_DICT["display_uom"] = False _test_base_succeed( mocker, MOCKED_PLOT_PARAMETER_DICT, - (PLOT_MOCKED_QUERY + MOCKED_QUERY_OFFSET_LIMIT), + (PLOT_MOCKED_QUERY + " " + MOCKED_QUERY_OFFSET_LIMIT.strip()), plot_get, ) diff --git a/tests/sdk/python/rtdip_sdk/queries/time_series/test_query_builder.py b/tests/sdk/python/rtdip_sdk/queries/time_series/test_query_builder.py index 15e30ea3e..419a73022 100644 --- a/tests/sdk/python/rtdip_sdk/queries/time_series/test_query_builder.py +++ b/tests/sdk/python/rtdip_sdk/queries/time_series/test_query_builder.py @@ -98,8 +98,6 @@ def test_query_builder_interpolate(mocker: MockerFixture): end_date="2021-01-02", time_interval_rate="1", time_interval_unit="hour", - agg_method="avg", - interpolation_method="linear", ) ) assert data == {"test": "data"} diff --git a/tests/sdk/python/rtdip_sdk/queries/time_series/test_raw.py b/tests/sdk/python/rtdip_sdk/queries/time_series/test_raw.py index 2315d69dd..9c66586dd 100644 --- a/tests/sdk/python/rtdip_sdk/queries/time_series/test_raw.py +++ b/tests/sdk/python/rtdip_sdk/queries/time_series/test_raw.py @@ -56,7 +56,7 @@ def test_raw_offset_limit(mocker: MockerFixture): 
_test_base_succeed( mocker, MOCKED_RAW_DICT, - RAW_MOCKED_QUERY + MOCKED_QUERY_OFFSET_LIMIT, + RAW_MOCKED_QUERY + " " + MOCKED_QUERY_OFFSET_LIMIT.strip(), raw_get, ) diff --git a/tests/sdk/python/rtdip_sdk/queries/time_series/test_resample.py b/tests/sdk/python/rtdip_sdk/queries/time_series/test_resample.py index 35bd18d54..bcdd83fab 100644 --- a/tests/sdk/python/rtdip_sdk/queries/time_series/test_resample.py +++ b/tests/sdk/python/rtdip_sdk/queries/time_series/test_resample.py @@ -97,7 +97,7 @@ def test_resample_offset_limit(mocker: MockerFixture): _test_base_succeed( mocker, MOCKED_RESAMPLED_PARAMETER_DICT, - (RESAMPLE_MOCKED_QUERY + MOCKED_QUERY_OFFSET_LIMIT), + (RESAMPLE_MOCKED_QUERY + " " + MOCKED_QUERY_OFFSET_LIMIT.strip()), resample_get, ) diff --git a/tests/sdk/python/rtdip_sdk/queries/time_series/test_summary.py b/tests/sdk/python/rtdip_sdk/queries/time_series/test_summary.py index d706bf328..7d14dec43 100644 --- a/tests/sdk/python/rtdip_sdk/queries/time_series/test_summary.py +++ b/tests/sdk/python/rtdip_sdk/queries/time_series/test_summary.py @@ -77,7 +77,7 @@ def test_summary_offset_limit(mocker: MockerFixture): _test_base_succeed( mocker, MOCKED_SUMMARY_DICT, - SUMMARY_MOCKED_QUERY + MOCKED_QUERY_OFFSET_LIMIT, + SUMMARY_MOCKED_QUERY + " " + MOCKED_QUERY_OFFSET_LIMIT.strip(), summary_get, ) diff --git a/tests/sdk/python/rtdip_sdk/queries/time_series/test_time_weighted_average.py b/tests/sdk/python/rtdip_sdk/queries/time_series/test_time_weighted_average.py index 0f048ce5a..ad8b3b279 100644 --- a/tests/sdk/python/rtdip_sdk/queries/time_series/test_time_weighted_average.py +++ b/tests/sdk/python/rtdip_sdk/queries/time_series/test_time_weighted_average.py @@ -60,18 +60,7 @@ def test_time_weighted_average_check_tags(mocker: MockerFixture): TWA_MOCKED_QUERY_CHECK_TAGS, time_weighted_average_get, ) - - -def test_time_weighted_average_with_window_size_mins(mocker: MockerFixture): MOCKED_TWA_PARAMETER_DICT["case_insensitivity_tag_search"] = False - MOCKED_TWA_PARAMETER_DICT["window_size_mins"] = 15 - - _test_base_succeed( - mocker, - MOCKED_TWA_PARAMETER_DICT, - TWA_MOCKED_QUERY, - time_weighted_average_get, - ) def test_time_weighted_average_metadata_step(mocker: MockerFixture): diff --git a/tests/sdk/python/rtdip_sdk/queries/weather/test_latest.py b/tests/sdk/python/rtdip_sdk/queries/weather/test_latest.py index 5296234a7..e8455ab18 100644 --- a/tests/sdk/python/rtdip_sdk/queries/weather/test_latest.py +++ b/tests/sdk/python/rtdip_sdk/queries/weather/test_latest.py @@ -36,7 +36,6 @@ ACCESS_TOKEN = "mock_databricks_token" DATABRICKS_SQL_CONNECT = "databricks.sql.connect" DATABRICKS_SQL_CONNECT_CURSOR = "databricks.sql.connect.cursor" -INTERPOLATION_METHOD = "test/test/test" MOCKED_QUERY_GRID = "SELECT * FROM `forecast`.`weather`.`mock_region_mock_security_events_mock_data_type_latest` WHERE `Latitude` > 36 AND `Latitude` < 38 AND `Longitude` > -109.1 AND `Longitude` < -107.1 ORDER BY `TagName` " MOCKED_QUERY_POINT = "SELECT * FROM `forecast`.`weather`.`mock_region_mock_security_events_mock_data_type_latest` WHERE `Latitude` == 37 AND `Longitude` == -108.1 ORDER BY `TagName` " MOCKED_QUERY_OFFSET_LIMIT = "LIMIT 10 OFFSET 10 " diff --git a/tests/sdk/python/rtdip_sdk/queries/weather/test_raw.py b/tests/sdk/python/rtdip_sdk/queries/weather/test_raw.py index 0d2ad94eb..90ea418ad 100644 --- a/tests/sdk/python/rtdip_sdk/queries/weather/test_raw.py +++ b/tests/sdk/python/rtdip_sdk/queries/weather/test_raw.py @@ -36,7 +36,6 @@ ACCESS_TOKEN = "mock_databricks_token" DATABRICKS_SQL_CONNECT = 
"databricks.sql.connect" DATABRICKS_SQL_CONNECT_CURSOR = "databricks.sql.connect.cursor" -INTERPOLATION_METHOD = "test/test/test" MOCKED_QUERY_GRID = 'SELECT * FROM `forecast`.`weather`.`mock_region_mock_security_events_mock_data_type` WHERE (`EventTime` BETWEEN to_timestamp("2024-01-01") AND to_timestamp("2024-01-03")) AND (`EnqueuedTime` BETWEEN to_timestamp("2023-12-28") AND to_timestamp("2023-12-31")) AND `Latitude` > 36 AND `Latitude` < 38 AND `Longitude` > -109.1 AND `Longitude` < -107.1 ORDER BY `TagName` ' MOCKED_QUERY_POINT = 'SELECT * FROM `forecast`.`weather`.`mock_region_mock_security_events_mock_data_type` WHERE (`EventTime` BETWEEN to_timestamp("2024-01-01") AND to_timestamp("2024-01-03")) AND (`EnqueuedTime` BETWEEN to_timestamp("2023-12-28") AND to_timestamp("2023-12-31")) AND `Latitude` == 37 AND `Longitude` == -108.1 ORDER BY `TagName` ' MOCKED_QUERY_OFFSET_LIMIT = "LIMIT 10 OFFSET 10 "