diff --git a/.github/workflows/testing_ci.yml b/.github/workflows/testing_ci.yml
index bee3c39..eef4a0a 100644
--- a/.github/workflows/testing_ci.yml
+++ b/.github/workflows/testing_ci.yml
@@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- os: [ubuntu-latest, windows-latest, macOS-latest]
+ os: [ubuntu-latest, windows-latest, macOS-13]
python-version: ['3.7', '3.11']
steps:
diff --git a/README.md b/README.md
index da70e57..4e4394f 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-
Welcome to TSDB
+Welcome to TSDB
*a Python toolbox to ease loading public time-series datasets
*
@@ -24,7 +24,7 @@
-
+
@@ -46,7 +46,7 @@
-> 📣 TSDB now supports a total of 1️⃣6️⃣9️⃣ time-series datasets ‼️
+> 📣 TSDB now supports a total of 1️⃣7️⃣0️⃣ time-series datasets ‼️
TSDB is a part of
@@ -71,7 +71,7 @@ TSDB now is available on
+
+
+
+
+
``` bibtex
-@article{du2023PyPOTS,
+@article{du2023pypots,
title={{PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series}},
author={Wenjie Du},
year={2023},
@@ -137,21 +144,6 @@ doi={10.48550/arXiv.2305.18811},
> PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series.
> arXiv, abs/2305.18811.https://arxiv.org/abs/2305.18811
-or
-
-``` bibtex
-@inproceedings{du2023PyPOTS,
-title={{PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series}},
-booktitle={9th SIGKDD workshop on Mining and Learning from Time Series (MiLeTS'23)},
-author={Wenjie Du},
-year={2023},
-url={https://arxiv.org/abs/2305.18811},
-}
-```
-
-> Wenjie Du. (2023).
-> PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series.
-> In *9th SIGKDD workshop on Mining and Learning from Time Series (MiLeTS'23)*. https://arxiv.org/abs/2305.18811
diff --git a/dataset_profiles/italy_air_quality/README.md b/dataset_profiles/italy_air_quality/README.md
new file mode 100644
index 0000000..94a20cd
--- /dev/null
+++ b/dataset_profiles/italy_air_quality/README.md
@@ -0,0 +1,17 @@
+# Italy Air Quality
+
+## Citing this dataset 🤗
+
+`Vito, Saverio. (2016). Air Quality. UCI Machine Learning Repository. https://doi.org/10.24432/C59K5F`
+
+or
+
+```bibtex
+@misc{vito2016air,
+author = {Vito, Saverio},
+title = {{Air Quality}},
+year = {2016},
+howpublished = {UCI Machine Learning Repository},
+note = {{DOI}: https://doi.org/10.24432/C59K5F}
+}
+```
diff --git a/docs/index.rst b/docs/index.rst
index cee7040..bc66a41 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,51 +1,58 @@
-.. TSDB documentation master file, created by
- sphinx-quickstart on Wed Mar 15 15:23:52 2023.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
+.. PyPOTS documentation index page
+ Created by Wenjie Du
+ License: BSD-3-Clause
Welcome to TSDB documentation!
================================
.. image:: https://pypots.com/figs/pypots_logos/TSDB/logo_FFBG.svg
- :height: 160
+ :height: 180
:align: right
:target: https://github.com/WenjieDu/TSDB
:alt: TSDB logo
-**A Python Toolbox Helping Load Time-Series Datasets Easily**
+**A Python Toolbox to Ease Loading Public Time-Series Datasets**
.. image:: https://img.shields.io/badge/python-v3-E97040?logo=python&logoColor=white
:alt: Python version
+
.. image:: https://img.shields.io/github/v/release/wenjiedu/tsdb?color=EE781F&include_prereleases&label=Release&logo=github&logoColor=white
:alt: the latest release version
:target: https://img.shields.io/github/v/release/wenjiedu/tsdb?color=EE781F&include_prereleases&label=Release&logo=github&logoColor=white
+
.. image:: https://img.shields.io/badge/License-BSD--3-E9BB41?logo=opensourceinitiative&logoColor=white
:alt: License
:target: https://github.com/WenjieDu/TSDB/blob/main/LICENSE
+
.. image:: https://img.shields.io/github/actions/workflow/status/wenjiedu/tsdb/testing_ci.yml?logo=github&color=C8D8E1&label=CI
:alt: GitHub Testing
:target: https://github.com/WenjieDu/TSDB/actions/workflows/testing_ci.yml
+
.. image:: https://img.shields.io/codeclimate/maintainability-percentage/WenjieDu/TSDB?color=3C7699&label=Maintainability&logo=codeclimate
:alt: Code Climate maintainability
:target: https://codeclimate.com/github/WenjieDu/TSDB
+
.. image:: https://img.shields.io/coverallsCoverage/github/WenjieDu/TSDB?branch=main&logo=coveralls&color=75C1C4&label=Coverage
:alt: Coveralls report
:target: https://coveralls.io/github/WenjieDu/TSDB
-.. image:: https://img.shields.io/conda/dn/conda-forge/tsdb?label=Conda%20Downloads&color=AED0ED&logo=anaconda&logoColor=white
+
+.. image:: https://img.shields.io/endpoint?url=https://pypots.com/figs/downloads_badges/conda_tsdb_downloads.json
:alt: Conda downloads
:target: https://anaconda.org/conda-forge/pypots
-.. image:: https://img.shields.io/endpoint?url=https%3A%2F%2Fraw.githubusercontent.com%2FWenjieDu%2FWenjieDu%2Fmain%2Ffigs%2Fprojects%2Ftsdb_downloads.json
+
+.. image:: https://img.shields.io/endpoint?url=https://pypots.com/figs/downloads_badges/pypi_tsdb_downloads.json
:alt: PyPI downloads
:target: https://pepy.tech/project/tsdb
+
.. image:: https://img.shields.io/badge/Contributor%20Covenant-v2.1-4baaaa.svg
:alt: CODE of CONDUCT
+
.. image:: https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FWenjieDu%2FTime_Series_Database&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=Visits+since+April+2022&edge_flat=false
:alt: Visit num
+📣 TSDB now supports a total of 1️⃣6️⃣9️⃣ time-series datasets ‼️
-📣 TSDB now supports a total of 1️⃣6️⃣8️⃣ time-series datasets ‼️
-
-.. image:: https://raw.githubusercontent.com/PyPOTS/pypots.github.io/main/static/figs/pypots_logos/PyPOTS/logo_FFBG.svg
- :height: 160
+.. image:: https://pypots.com/figs/pypots_logos/PyPOTS/logo_FFBG.svg
+ :width: 120
:align: left
:target: https://github.com/WenjieDu/PyPOTS
:alt: PyPOTS logo
@@ -72,7 +79,7 @@ TSDB is available on both `PyPI `_ and `Anaco
Install it with `conda install tsdb`, you may need to specify the channel with option `-c conda-forge`
-or install from PyPI:
+or install via PyPI:
pip install tsdb
@@ -85,12 +92,22 @@ or install from source code:
import tsdb
- tsdb.list_available_datasets() # list all available datasets in TSDB
- data = tsdb.load_dataset('physionet_2012') # select the dataset you need and load it, TSDB will download, extract, and process it automatically
- tsdb.download_and_extract('physionet_2012', './save_it_here') # if you need the raw data, use download_and_extract()
- tsdb.list_cached_data() # datasets you once loaded are cached, and you can check them with list_cached_data()
- tsdb.delete_cached_data(dataset_name='physionet_2012') # you can delete only one specific dataset and preserve others
- tsdb.delete_cached_data() # or you can delete all cache with delete_cached_data() to free disk space
+ # list all available datasets in TSDB
+ tsdb.list()
+ # select the dataset you need and load it, TSDB will download, extract, and process it automatically
+ data = tsdb.load('physionet_2012')
+ # if you need the raw data, use download_and_extract()
+ tsdb.download_and_extract('physionet_2012', './save_it_here')
+ # datasets you once loaded are cached, and you can check them with list_cache()
+ tsdb.list_cache()
+ # you can delete only one specific dataset and preserve others
+ tsdb.delete_cache(dataset_name='physionet_2012')
+ # or you can delete all cache with delete_cache() to free disk space
+ tsdb.delete_cache()
+
+ # to avoid taking up too much space if downloading many datasets,
+ # TSDB cache directory can be migrated to an external disk
+ tsdb.migrate_cache("/mnt/external_disk/TSDB_cache")
That's all. Simple and efficient. Enjoy it! 😃
@@ -117,6 +134,14 @@ and we are pursuing to publish it in prestigious academic venues, e.g. JMLR (tra
`Machine Learning Open Source Software `_). If you use PyPOTS in your work,
please cite it as below and 🌟star `PyPOTS repository `_ to make others notice this library. 🤗
+
+.. image:: https://pypots.com/figs/pypots_logos/Ecosystem/PyPOTS_Ecosystem_Pipeline.png
+ :height: 300
+ :align: center
+ :target: https://pypots.com/ecosystem/
+ :alt: PyPOTS Ecosystem Pipeline
+
+
.. code-block:: bibtex
:linenos:
diff --git a/tsdb/__init__.py b/tsdb/__init__.py
index d5ecb1c..0e5c951 100644
--- a/tsdb/__init__.py
+++ b/tsdb/__init__.py
@@ -21,7 +21,7 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
-__version__ = "0.3.1"
+__version__ = "0.4"
from .data_processing import (
CACHED_DATASET_DIR,
diff --git a/tsdb/data_processing.py b/tsdb/data_processing.py
index 09c93c8..14105d4 100644
--- a/tsdb/data_processing.py
+++ b/tsdb/data_processing.py
@@ -18,6 +18,7 @@
load_beijing_air_quality,
load_ucr_uea_dataset,
load_ais,
+ load_italy_air_quality,
)
from .utils.downloading import download_and_extract
from .utils.file import purge_path, pickle_load, pickle_dump, determine_data_home
@@ -100,6 +101,8 @@ def load(dataset_name: str, use_cache: bool = True) -> dict:
result = load_ett(dataset_saving_path)
elif dataset_name == "beijing_multisite_air_quality":
result = load_beijing_air_quality(dataset_saving_path)
+ elif dataset_name == "italy_air_quality":
+ result = load_italy_air_quality(dataset_saving_path)
elif dataset_name == "vessel_ais":
result = load_ais(dataset_saving_path)
elif "ucr_uea_" in dataset_name:
diff --git a/tsdb/database.py b/tsdb/database.py
index 566b14f..060c550 100644
--- a/tsdb/database.py
+++ b/tsdb/database.py
@@ -30,6 +30,8 @@
# https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/beijing_multisite_air_quality
"beijing_multisite_air_quality": "https://archive.ics.uci.edu/ml/machine-learning-databases/00501/"
"PRSA2017_Data_20130301-20170228.zip",
+ # https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/italy_air_quality
+ "italy_air_quality": "https://archive.ics.uci.edu/static/public/360/air+quality.zip",
#
# https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/vessel_ais
"vessel_ais": "https://zenodo.org/record/8064564/files/parquets.zip",
diff --git a/tsdb/loading_funcs/__init__.py b/tsdb/loading_funcs/__init__.py
index fae0583..cb4770f 100644
--- a/tsdb/loading_funcs/__init__.py
+++ b/tsdb/loading_funcs/__init__.py
@@ -7,11 +7,12 @@
from .beijing_multisite_air_quality import load_beijing_air_quality
from .electricity_load_diagrams import load_electricity
+from .electricity_transformer_temperature import load_ett
+from .italy_air_quality import load_italy_air_quality
from .physionet_2012 import load_physionet2012
from .physionet_2019 import load_physionet2019
from .ucr_uea_datasets import load_ucr_uea_dataset
from .vessel_ais import load_ais
-from .electricity_transformer_temperature import load_ett
__all__ = [
"load_beijing_air_quality",
@@ -21,4 +22,5 @@
"load_ucr_uea_dataset",
"load_ais",
"load_ett",
+ "load_italy_air_quality",
]
diff --git a/tsdb/loading_funcs/italy_air_quality.py b/tsdb/loading_funcs/italy_air_quality.py
new file mode 100644
index 0000000..b7fdf75
--- /dev/null
+++ b/tsdb/loading_funcs/italy_air_quality.py
@@ -0,0 +1,41 @@
+"""
+Scripts related to dataset Italy Air Quality.
+
+For more information please refer to:
+https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/italy_air_quality
+"""
+
+# Created by Wenjie Du
+# License: BSD-3-Clause
+
+import os
+
+import pandas as pd
+
+
+def load_italy_air_quality(local_path):
+ """Load dataset Italy Air Quality.
+
+ Parameters
+ ----------
+ local_path : str,
+ The local path of the directory saving the raw data of Italy Air Quality.
+
+ Returns
+ -------
+ data : dict
+ A dictionary containing X:
+ X : pandas.DataFrame
+ The time-series data of Italy Air Quality.
+ """
+ file_path = os.path.join(local_path, "AirQualityUCI.csv")
+ df = pd.read_csv(file_path, sep=";", decimal=",")
+ # remove empty columns
+ df.drop(columns=["Unnamed: 15", "Unnamed: 16"], inplace=True)
+ # remove rows with all NaN, i.e. Date is NaN
+ df = df[~df["Date"].isna()]
+
+ data = {
+ "X": df,
+ }
+ return data
diff --git a/tsdb/loading_funcs/physionet_2012.py b/tsdb/loading_funcs/physionet_2012.py
index 6eab9db..e8a75e7 100644
--- a/tsdb/loading_funcs/physionet_2012.py
+++ b/tsdb/loading_funcs/physionet_2012.py
@@ -56,12 +56,11 @@ def load_physionet2012(local_path):
) # ensure RecordID's type is int
outcome = outcome.set_index("RecordID")
outcome_collector.append(outcome)
- y = pd.concat(outcome_collector)
-
- df_collector = []
# iterate over all samples
+ set_collector = []
for m_ in time_series_measurements_dir:
+ df_collector = []
raw_data_dir = os.path.join(local_path, m_)
for filename in os.listdir(raw_data_dir):
recordID = int(filename.split(".txt")[0])
@@ -80,11 +79,16 @@ def load_physionet2012(local_path):
df_temp["Age"] = df_temp.loc[0, "Age"]
df_temp["Height"] = df_temp.loc[0, "Height"]
df_collector.append(df_temp)
-
- df = pd.concat(df_collector, sort=True)
- X = df.reset_index(drop=True)
- unique_ids = df["RecordID"].unique()
- y = y.loc[unique_ids]
-
- data = {"X": X, "y": y, "static_features": ["Age", "Gender", "ICUType", "Height"]}
+ df = pd.concat(df_collector, sort=True)
+ set_collector.append(df)
+
+ data = {
+ "set-a": set_collector[0],
+ "set-b": set_collector[1],
+ "set-c": set_collector[2],
+ "outcomes-a": outcome_collector[0],
+ "outcomes-b": outcome_collector[1],
+ "outcomes-c": outcome_collector[2],
+ "static_features": ["Age", "Gender", "ICUType", "Height"],
+ }
return data
diff --git a/tsdb/loading_funcs/physionet_2019.py b/tsdb/loading_funcs/physionet_2019.py
index 8e75273..df2fd9e 100644
--- a/tsdb/loading_funcs/physionet_2019.py
+++ b/tsdb/loading_funcs/physionet_2019.py
@@ -16,11 +16,12 @@
def load_physionet2019(local_path):
time_series_measurements_dir = ["training", "training_setB"]
- label_feature = "SepsisLabel" # feature SepsisLabel contains labels indicating whether patients get sepsis
- time_feature = "ICULOS" # ICU length-of-stay (hours since ICU admit)
+ # label_feature = "SepsisLabel" # feature SepsisLabel contains labels indicating whether patients get sepsis
+ # time_feature = "ICULOS" # ICU length-of-stay (hours since ICU admit)
- df_collector = []
+ set_collector = []
for m_ in time_series_measurements_dir:
+ df_collector = []
raw_data_dir = os.path.join(local_path, m_)
for filename in os.listdir(raw_data_dir):
recordID = filename.split(".psv")[0]
@@ -28,11 +29,12 @@ def load_physionet2019(local_path):
df_temp = pd.read_csv(f, sep="|", header=0)
df_temp["RecordID"] = recordID
df_collector.append(df_temp)
-
- df = pd.concat(df_collector, sort=True)
- df = df.reset_index(drop=True)
- y = df[["RecordID", time_feature, label_feature]]
- X = df.drop(label_feature, axis=1)
-
- data = {"X": X, "y": y, "static_features": ["Age", "Gender", "HospAdmTime"]}
+ df = pd.concat(df_collector, sort=True)
+ set_collector.append(df)
+
+ data = {
+ "training_setA": set_collector[0],
+ "training_setB": set_collector[1],
+ "static_features": ["Age", "Gender", "Unit1", "Unit2", "HospAdmTime"],
+ }
return data