diff --git a/.github/workflows/testing_ci.yml b/.github/workflows/testing_ci.yml
index bee3c39..eef4a0a 100644
--- a/.github/workflows/testing_ci.yml
+++ b/.github/workflows/testing_ci.yml
@@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- os: [ubuntu-latest, windows-latest, macOS-latest]
+ os: [ubuntu-latest, windows-latest, macOS-13]
python-version: ['3.7', '3.11']
steps:
diff --git a/README.md b/README.md
index da70e57..4e4394f 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-
Welcome to TSDB
+Welcome to TSDB
*a Python toolbox to ease loading public time-series datasets
*
@@ -24,7 +24,7 @@
-
+
@@ -46,7 +46,7 @@
-> 📣 TSDB now supports a total of 1️⃣6️⃣9️⃣ time-series datasets ‼️
+> 📣 TSDB now supports a total of 1️⃣7️⃣0️⃣ time-series datasets ‼️
TSDB is a part of
@@ -71,7 +71,7 @@ TSDB now is available on
+
+
+
+
+
``` bibtex
-@article{du2023PyPOTS,
+@article{du2023pypots,
title={{PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series}},
author={Wenjie Du},
year={2023},
@@ -137,21 +144,6 @@ doi={10.48550/arXiv.2305.18811},
> PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series.
> arXiv, abs/2305.18811.https://arxiv.org/abs/2305.18811
-or
-
-``` bibtex
-@inproceedings{du2023PyPOTS,
-title={{PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series}},
-booktitle={9th SIGKDD workshop on Mining and Learning from Time Series (MiLeTS'23)},
-author={Wenjie Du},
-year={2023},
-url={https://arxiv.org/abs/2305.18811},
-}
-```
-
-> Wenjie Du. (2023).
-> PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series.
-> In *9th SIGKDD workshop on Mining and Learning from Time Series (MiLeTS'23)*. https://arxiv.org/abs/2305.18811
diff --git a/dataset_profiles/italy_air_quality/README.md b/dataset_profiles/italy_air_quality/README.md
new file mode 100644
index 0000000..94a20cd
--- /dev/null
+++ b/dataset_profiles/italy_air_quality/README.md
@@ -0,0 +1,17 @@
+# Italy Air Quality
+
+## Citing this dataset 🤗
+
+`Vito, Saverio. (2016). Air Quality. UCI Machine Learning Repository. https://doi.org/10.24432/C59K5F`
+
+or
+
+```bibtex
+@misc{vito2016air,
+author = {Vito, Saverio},
+title = {{Air Quality}},
+year = {2016},
+howpublished = {UCI Machine Learning Repository},
+note = {{DOI}: https://doi.org/10.24432/C59K5F}
+}
+```
diff --git a/docs/index.rst b/docs/index.rst
index cee7040..bc66a41 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,51 +1,58 @@
-.. TSDB documentation master file, created by
- sphinx-quickstart on Wed Mar 15 15:23:52 2023.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
+.. PyPOTS documentation index page
+ Created by Wenjie Du
+ License: BSD-3-Clause
Welcome to TSDB documentation!
================================
.. image:: https://pypots.com/figs/pypots_logos/TSDB/logo_FFBG.svg
- :height: 160
+ :height: 180
:align: right
:target: https://github.com/WenjieDu/TSDB
:alt: TSDB logo
-**A Python Toolbox Helping Load Time-Series Datasets Easily**
+**A Python Toolbox to Ease Loading Public Time-Series Datasets**
.. image:: https://img.shields.io/badge/python-v3-E97040?logo=python&logoColor=white
:alt: Python version
+
.. image:: https://img.shields.io/github/v/release/wenjiedu/tsdb?color=EE781F&include_prereleases&label=Release&logo=github&logoColor=white
:alt: the latest release version
:target: https://img.shields.io/github/v/release/wenjiedu/tsdb?color=EE781F&include_prereleases&label=Release&logo=github&logoColor=white
+
.. image:: https://img.shields.io/badge/License-BSD--3-E9BB41?logo=opensourceinitiative&logoColor=white
:alt: License
:target: https://github.com/WenjieDu/TSDB/blob/main/LICENSE
+
.. image:: https://img.shields.io/github/actions/workflow/status/wenjiedu/tsdb/testing_ci.yml?logo=github&color=C8D8E1&label=CI
:alt: GitHub Testing
:target: https://github.com/WenjieDu/TSDB/actions/workflows/testing_ci.yml
+
.. image:: https://img.shields.io/codeclimate/maintainability-percentage/WenjieDu/TSDB?color=3C7699&label=Maintainability&logo=codeclimate
:alt: Code Climate maintainability
:target: https://codeclimate.com/github/WenjieDu/TSDB
+
.. image:: https://img.shields.io/coverallsCoverage/github/WenjieDu/TSDB?branch=main&logo=coveralls&color=75C1C4&label=Coverage
:alt: Coveralls report
:target: https://coveralls.io/github/WenjieDu/TSDB
-.. image:: https://img.shields.io/conda/dn/conda-forge/tsdb?label=Conda%20Downloads&color=AED0ED&logo=anaconda&logoColor=white
+
+.. image:: https://img.shields.io/endpoint?url=https://pypots.com/figs/downloads_badges/conda_tsdb_downloads.json
:alt: Conda downloads
:target: https://anaconda.org/conda-forge/pypots
-.. image:: https://img.shields.io/endpoint?url=https%3A%2F%2Fraw.githubusercontent.com%2FWenjieDu%2FWenjieDu%2Fmain%2Ffigs%2Fprojects%2Ftsdb_downloads.json
+
+.. image:: https://img.shields.io/endpoint?url=https://pypots.com/figs/downloads_badges/pypi_tsdb_downloads.json
:alt: PyPI downloads
:target: https://pepy.tech/project/tsdb
+
.. image:: https://img.shields.io/badge/Contributor%20Covenant-v2.1-4baaaa.svg
:alt: CODE of CONDUCT
+
.. image:: https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FWenjieDu%2FTime_Series_Database&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=Visits+since+April+2022&edge_flat=false
:alt: Visit num
+📣 TSDB now supports a total of 1️⃣6️⃣9️⃣ time-series datasets ‼️
-📣 TSDB now supports a total of 1️⃣6️⃣8️⃣ time-series datasets ‼️
-
-.. image:: https://raw.githubusercontent.com/PyPOTS/pypots.github.io/main/static/figs/pypots_logos/PyPOTS/logo_FFBG.svg
- :height: 160
+.. image:: https://pypots.com/figs/pypots_logos/PyPOTS/logo_FFBG.svg
+ :width: 120
:align: left
:target: https://github.com/WenjieDu/PyPOTS
:alt: PyPOTS logo
@@ -72,7 +79,7 @@ TSDB is available on both `PyPI `_ and `Anaco
Install it with `conda install tsdb`, you may need to specify the channel with option `-c conda-forge`
-or install from PyPI:
+or install via PyPI:
pip install tsdb
@@ -85,12 +92,22 @@ or install from source code:
import tsdb
- tsdb.list_available_datasets() # list all available datasets in TSDB
- data = tsdb.load_dataset('physionet_2012') # select the dataset you need and load it, TSDB will download, extract, and process it automatically
- tsdb.download_and_extract('physionet_2012', './save_it_here') # if you need the raw data, use download_and_extract()
- tsdb.list_cached_data() # datasets you once loaded are cached, and you can check them with list_cached_data()
- tsdb.delete_cached_data(dataset_name='physionet_2012') # you can delete only one specific dataset and preserve others
- tsdb.delete_cached_data() # or you can delete all cache with delete_cached_data() to free disk space
+ # list all available datasets in TSDB
+ tsdb.list()
+ # select the dataset you need and load it, TSDB will download, extract, and process it automatically
+ data = tsdb.load('physionet_2012')
+ # if you need the raw data, use download_and_extract()
+ tsdb.download_and_extract('physionet_2012', './save_it_here')
+ # datasets you once loaded are cached, and you can check them with list_cache()
+ tsdb.list_cache()
+ # you can delete only one specific dataset and preserve others
+ tsdb.delete_cache(dataset_name='physionet_2012')
+ # or you can delete all cache with delete_cache() to free disk space
+ tsdb.delete_cache()
+
+ # to avoid taking up too much space if downloading many datasets,
+ # TSDB cache directory can be migrated to an external disk
+ tsdb.migrate_cache("/mnt/external_disk/TSDB_cache")
That's all. Simple and efficient. Enjoy it! 😃
@@ -117,6 +134,14 @@ and we are pursuing to publish it in prestigious academic venues, e.g. JMLR (tra
`Machine Learning Open Source Software `_). If you use PyPOTS in your work,
please cite it as below and 🌟star `PyPOTS repository `_ to make others notice this library. 🤗
+
+.. image:: https://pypots.com/figs/pypots_logos/Ecosystem/PyPOTS_Ecosystem_Pipeline.png
+ :height: 300
+ :align: center
+ :target: https://pypots.com/ecosystem/
+ :alt: PyPOTS Ecosystem Pipeline
+
+
.. code-block:: bibtex
:linenos:
diff --git a/tsdb/__init__.py b/tsdb/__init__.py
index d5ecb1c..0e5c951 100644
--- a/tsdb/__init__.py
+++ b/tsdb/__init__.py
@@ -21,7 +21,7 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
-__version__ = "0.3.1"
+__version__ = "0.4"
from .data_processing import (
CACHED_DATASET_DIR,
diff --git a/tsdb/data_processing.py b/tsdb/data_processing.py
index 09c93c8..14105d4 100644
--- a/tsdb/data_processing.py
+++ b/tsdb/data_processing.py
@@ -18,6 +18,7 @@
load_beijing_air_quality,
load_ucr_uea_dataset,
load_ais,
+ load_italy_air_quality,
)
from .utils.downloading import download_and_extract
from .utils.file import purge_path, pickle_load, pickle_dump, determine_data_home
@@ -100,6 +101,8 @@ def load(dataset_name: str, use_cache: bool = True) -> dict:
result = load_ett(dataset_saving_path)
elif dataset_name == "beijing_multisite_air_quality":
result = load_beijing_air_quality(dataset_saving_path)
+ elif dataset_name == "italy_air_quality":
+ result = load_italy_air_quality(dataset_saving_path)
elif dataset_name == "vessel_ais":
result = load_ais(dataset_saving_path)
elif "ucr_uea_" in dataset_name:
diff --git a/tsdb/database.py b/tsdb/database.py
index 566b14f..060c550 100644
--- a/tsdb/database.py
+++ b/tsdb/database.py
@@ -30,6 +30,8 @@
# https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/beijing_multisite_air_quality
"beijing_multisite_air_quality": "https://archive.ics.uci.edu/ml/machine-learning-databases/00501/"
"PRSA2017_Data_20130301-20170228.zip",
+ # https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/italy_air_quality
+ "italy_air_quality": "https://archive.ics.uci.edu/static/public/360/air+quality.zip",
#
# https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/vessel_ais
"vessel_ais": "https://zenodo.org/record/8064564/files/parquets.zip",
diff --git a/tsdb/loading_funcs/__init__.py b/tsdb/loading_funcs/__init__.py
index fae0583..cb4770f 100644
--- a/tsdb/loading_funcs/__init__.py
+++ b/tsdb/loading_funcs/__init__.py
@@ -7,11 +7,12 @@
from .beijing_multisite_air_quality import load_beijing_air_quality
from .electricity_load_diagrams import load_electricity
+from .electricity_transformer_temperature import load_ett
+from .italy_air_quality import load_italy_air_quality
from .physionet_2012 import load_physionet2012
from .physionet_2019 import load_physionet2019
from .ucr_uea_datasets import load_ucr_uea_dataset
from .vessel_ais import load_ais
-from .electricity_transformer_temperature import load_ett
__all__ = [
"load_beijing_air_quality",
@@ -21,4 +22,5 @@
"load_ucr_uea_dataset",
"load_ais",
"load_ett",
+ "load_italy_air_quality",
]
diff --git a/tsdb/loading_funcs/italy_air_quality.py b/tsdb/loading_funcs/italy_air_quality.py
new file mode 100644
index 0000000..b7fdf75
--- /dev/null
+++ b/tsdb/loading_funcs/italy_air_quality.py
@@ -0,0 +1,41 @@
+"""
+Scripts related to dataset Italy Air Quality.
+
+For more information please refer to:
+https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/italy_air_quality
+"""
+
+# Created by Wenjie Du
+# License: BSD-3-Clause
+
+import os
+
+import pandas as pd
+
+
+def load_italy_air_quality(local_path):
+ """Load dataset Italy Air Quality.
+
+ Parameters
+ ----------
+ local_path : str,
+ The local path of the directory saving the raw data of Italy Air Quality.
+
+ Returns
+ -------
+ data : dict
+ A dictionary containing X:
+ X : pandas.DataFrame
+ The time-series data of Italy Air Quality.
+ """
+ file_path = os.path.join(local_path, "AirQualityUCI.csv")
+ df = pd.read_csv(file_path, sep=";", decimal=",")
+ # remove empty columns
+ df.drop(columns=["Unnamed: 15", "Unnamed: 16"], inplace=True)
+ # remove rows with all NaN, i.e. Date is NaN
+ df = df[~df["Date"].isna()]
+
+ data = {
+ "X": df,
+ }
+ return data
diff --git a/tsdb/loading_funcs/physionet_2012.py b/tsdb/loading_funcs/physionet_2012.py
index 6eab9db..e8a75e7 100644
--- a/tsdb/loading_funcs/physionet_2012.py
+++ b/tsdb/loading_funcs/physionet_2012.py
@@ -56,12 +56,11 @@ def load_physionet2012(local_path):
) # ensure RecordID's type is int
outcome = outcome.set_index("RecordID")
outcome_collector.append(outcome)
- y = pd.concat(outcome_collector)
-
- df_collector = []
# iterate over all samples
+ set_collector = []
for m_ in time_series_measurements_dir:
+ df_collector = []
raw_data_dir = os.path.join(local_path, m_)
for filename in os.listdir(raw_data_dir):
recordID = int(filename.split(".txt")[0])
@@ -80,11 +79,16 @@ def load_physionet2012(local_path):
df_temp["Age"] = df_temp.loc[0, "Age"]
df_temp["Height"] = df_temp.loc[0, "Height"]
df_collector.append(df_temp)
-
- df = pd.concat(df_collector, sort=True)
- X = df.reset_index(drop=True)
- unique_ids = df["RecordID"].unique()
- y = y.loc[unique_ids]
-
- data = {"X": X, "y": y, "static_features": ["Age", "Gender", "ICUType", "Height"]}
+ df = pd.concat(df_collector, sort=True)
+ set_collector.append(df)
+
+ data = {
+ "set-a": set_collector[0],
+ "set-b": set_collector[1],
+ "set-c": set_collector[2],
+ "outcomes-a": outcome_collector[0],
+ "outcomes-b": outcome_collector[1],
+ "outcomes-c": outcome_collector[2],
+ "static_features": ["Age", "Gender", "ICUType", "Height"],
+ }
return data
diff --git a/tsdb/loading_funcs/physionet_2019.py b/tsdb/loading_funcs/physionet_2019.py
index 8e75273..df2fd9e 100644
--- a/tsdb/loading_funcs/physionet_2019.py
+++ b/tsdb/loading_funcs/physionet_2019.py
@@ -16,11 +16,12 @@
def load_physionet2019(local_path):
time_series_measurements_dir = ["training", "training_setB"]
- label_feature = "SepsisLabel" # feature SepsisLabel contains labels indicating whether patients get sepsis
- time_feature = "ICULOS" # ICU length-of-stay (hours since ICU admit)
+ # label_feature = "SepsisLabel" # feature SepsisLabel contains labels indicating whether patients get sepsis
+ # time_feature = "ICULOS" # ICU length-of-stay (hours since ICU admit)
- df_collector = []
+ set_collector = []
for m_ in time_series_measurements_dir:
+ df_collector = []
raw_data_dir = os.path.join(local_path, m_)
for filename in os.listdir(raw_data_dir):
recordID = filename.split(".psv")[0]
@@ -28,11 +29,12 @@ def load_physionet2019(local_path):
df_temp = pd.read_csv(f, sep="|", header=0)
df_temp["RecordID"] = recordID
df_collector.append(df_temp)
-
- df = pd.concat(df_collector, sort=True)
- df = df.reset_index(drop=True)
- y = df[["RecordID", time_feature, label_feature]]
- X = df.drop(label_feature, axis=1)
-
- data = {"X": X, "y": y, "static_features": ["Age", "Gender", "HospAdmTime"]}
+ df = pd.concat(df_collector, sort=True)
+ set_collector.append(df)
+
+ data = {
+ "training_setA": set_collector[0],
+ "training_setB": set_collector[1],
+ "static_features": ["Age", "Gender", "Unit1", "Unit2", "HospAdmTime"],
+ }
return data