From b532ed96e644c96b4a7cb5f81a7805510db04258 Mon Sep 17 00:00:00 2001
From: Hao Lyu <20434183+IncubatorShokuhou@users.noreply.github.com>
Date: Wed, 28 Feb 2024 09:31:17 +0800
Subject: [PATCH 1/7] Update README.md (#57)
* Update README.md
AttributeError: module 'tsdb' has no attribute 'list_available_datasets'. It seems that `list_available_datasets` has been replaced by `list`
---------
Co-authored-by: Wenjie Du
---
README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index da70e57..4673ee4 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@
-
+
@@ -81,7 +81,7 @@ or install from source code:
import tsdb
# list all available datasets in TSDB
-tsdb.list_available_datasets()
+tsdb.list()
# select the dataset you need and load it, TSDB will download, extract, and process it automatically
data = tsdb.load('physionet_2012')
# if you need the raw data, use download_and_extract()
From 64e8aee60918273c114c575b3b34ab93c702aebb Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Wed, 13 Mar 2024 10:07:19 +0800
Subject: [PATCH 2/7] docs: update the docs;
---
README.md | 27 ++++++++--------------
docs/index.rst | 63 +++++++++++++++++++++++++++++++++++---------------
2 files changed, 53 insertions(+), 37 deletions(-)
diff --git a/README.md b/README.md
index 4673ee4..f0933ee 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-Welcome to TSDB
+Welcome to TSDB
*a Python toolbox to ease loading public time-series datasets
*
@@ -71,7 +71,7 @@ TSDB now is available on Wenjie Du. (2023).
-> PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series.
-> In *9th SIGKDD workshop on Mining and Learning from Time Series (MiLeTS'23)*. https://arxiv.org/abs/2305.18811
diff --git a/docs/index.rst b/docs/index.rst
index cee7040..bc66a41 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,51 +1,58 @@
-.. TSDB documentation master file, created by
- sphinx-quickstart on Wed Mar 15 15:23:52 2023.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
+.. PyPOTS documentation index page
+ Created by Wenjie Du
+ License: BSD-3-Clause
Welcome to TSDB documentation!
================================
.. image:: https://pypots.com/figs/pypots_logos/TSDB/logo_FFBG.svg
- :height: 160
+ :height: 180
:align: right
:target: https://github.com/WenjieDu/TSDB
:alt: TSDB logo
-**A Python Toolbox Helping Load Time-Series Datasets Easily**
+**A Python Toolbox to Ease Loading Public Time-Series Datasets**
.. image:: https://img.shields.io/badge/python-v3-E97040?logo=python&logoColor=white
:alt: Python version
+
.. image:: https://img.shields.io/github/v/release/wenjiedu/tsdb?color=EE781F&include_prereleases&label=Release&logo=github&logoColor=white
:alt: the latest release version
:target: https://img.shields.io/github/v/release/wenjiedu/tsdb?color=EE781F&include_prereleases&label=Release&logo=github&logoColor=white
+
.. image:: https://img.shields.io/badge/License-BSD--3-E9BB41?logo=opensourceinitiative&logoColor=white
:alt: License
:target: https://github.com/WenjieDu/TSDB/blob/main/LICENSE
+
.. image:: https://img.shields.io/github/actions/workflow/status/wenjiedu/tsdb/testing_ci.yml?logo=github&color=C8D8E1&label=CI
:alt: GitHub Testing
:target: https://github.com/WenjieDu/TSDB/actions/workflows/testing_ci.yml
+
.. image:: https://img.shields.io/codeclimate/maintainability-percentage/WenjieDu/TSDB?color=3C7699&label=Maintainability&logo=codeclimate
:alt: Code Climate maintainability
:target: https://codeclimate.com/github/WenjieDu/TSDB
+
.. image:: https://img.shields.io/coverallsCoverage/github/WenjieDu/TSDB?branch=main&logo=coveralls&color=75C1C4&label=Coverage
:alt: Coveralls report
:target: https://coveralls.io/github/WenjieDu/TSDB
-.. image:: https://img.shields.io/conda/dn/conda-forge/tsdb?label=Conda%20Downloads&color=AED0ED&logo=anaconda&logoColor=white
+
+.. image:: https://img.shields.io/endpoint?url=https://pypots.com/figs/downloads_badges/conda_tsdb_downloads.json
:alt: Conda downloads
:target: https://anaconda.org/conda-forge/pypots
-.. image:: https://img.shields.io/endpoint?url=https%3A%2F%2Fraw.githubusercontent.com%2FWenjieDu%2FWenjieDu%2Fmain%2Ffigs%2Fprojects%2Ftsdb_downloads.json
+
+.. image:: https://img.shields.io/endpoint?url=https://pypots.com/figs/downloads_badges/pypi_tsdb_downloads.json
:alt: PyPI downloads
:target: https://pepy.tech/project/tsdb
+
.. image:: https://img.shields.io/badge/Contributor%20Covenant-v2.1-4baaaa.svg
:alt: CODE of CONDUCT
+
.. image:: https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FWenjieDu%2FTime_Series_Database&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=Visits+since+April+2022&edge_flat=false
:alt: Visit num
+📣 TSDB now supports a total of 1️⃣6️⃣9️⃣ time-series datasets ‼️
-📣 TSDB now supports a total of 1️⃣6️⃣8️⃣ time-series datasets ‼️
-
-.. image:: https://raw.githubusercontent.com/PyPOTS/pypots.github.io/main/static/figs/pypots_logos/PyPOTS/logo_FFBG.svg
- :height: 160
+.. image:: https://pypots.com/figs/pypots_logos/PyPOTS/logo_FFBG.svg
+ :width: 120
:align: left
:target: https://github.com/WenjieDu/PyPOTS
:alt: PyPOTS logo
@@ -72,7 +79,7 @@ TSDB is available on both `PyPI `_ and `Anaco
Install it with `conda install tsdb`, you may need to specify the channel with option `-c conda-forge`
-or install from PyPI:
+or install via PyPI:
pip install tsdb
@@ -85,12 +92,22 @@ or install from source code:
import tsdb
- tsdb.list_available_datasets() # list all available datasets in TSDB
- data = tsdb.load_dataset('physionet_2012') # select the dataset you need and load it, TSDB will download, extract, and process it automatically
- tsdb.download_and_extract('physionet_2012', './save_it_here') # if you need the raw data, use download_and_extract()
- tsdb.list_cached_data() # datasets you once loaded are cached, and you can check them with list_cached_data()
- tsdb.delete_cached_data(dataset_name='physionet_2012') # you can delete only one specific dataset and preserve others
- tsdb.delete_cached_data() # or you can delete all cache with delete_cached_data() to free disk space
+ # list all available datasets in TSDB
+ tsdb.list()
+ # select the dataset you need and load it, TSDB will download, extract, and process it automatically
+ data = tsdb.load('physionet_2012')
+ # if you need the raw data, use download_and_extract()
+ tsdb.download_and_extract('physionet_2012', './save_it_here')
+    # datasets you once loaded are cached, and you can check them with list_cache()
+ tsdb.list_cache()
+ # you can delete only one specific dataset and preserve others
+ tsdb.delete_cache(dataset_name='physionet_2012')
+    # or you can delete all cache with delete_cache() to free disk space
+ tsdb.delete_cache()
+
+ # to avoid taking up too much space if downloading many datasets,
+ # TSDB cache directory can be migrated to an external disk
+ tsdb.migrate_cache("/mnt/external_disk/TSDB_cache")
That's all. Simple and efficient. Enjoy it! 😃
@@ -117,6 +134,14 @@ and we are pursuing to publish it in prestigious academic venues, e.g. JMLR (tra
`Machine Learning Open Source Software `_). If you use PyPOTS in your work,
please cite it as below and 🌟star `PyPOTS repository `_ to make others notice this library. 🤗
+
+.. image:: https://pypots.com/figs/pypots_logos/Ecosystem/PyPOTS_Ecosystem_Pipeline.png
+ :height: 300
+ :align: center
+ :target: https://pypots.com/ecosystem/
+ :alt: PyPOTS Ecosystem Pipeline
+
+
.. code-block:: bibtex
:linenos:
From 7537f28d2aaac2b93fd837dcabaefbaad23d55e5 Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Fri, 24 May 2024 13:09:37 +0800
Subject: [PATCH 3/7] feat: make physionet_2012 dataset returned as three
separated sets;
---
tsdb/loading_funcs/physionet_2012.py | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/tsdb/loading_funcs/physionet_2012.py b/tsdb/loading_funcs/physionet_2012.py
index 6eab9db..e8a75e7 100644
--- a/tsdb/loading_funcs/physionet_2012.py
+++ b/tsdb/loading_funcs/physionet_2012.py
@@ -56,12 +56,11 @@ def load_physionet2012(local_path):
) # ensure RecordID's type is int
outcome = outcome.set_index("RecordID")
outcome_collector.append(outcome)
- y = pd.concat(outcome_collector)
-
- df_collector = []
# iterate over all samples
+ set_collector = []
for m_ in time_series_measurements_dir:
+ df_collector = []
raw_data_dir = os.path.join(local_path, m_)
for filename in os.listdir(raw_data_dir):
recordID = int(filename.split(".txt")[0])
@@ -80,11 +79,16 @@ def load_physionet2012(local_path):
df_temp["Age"] = df_temp.loc[0, "Age"]
df_temp["Height"] = df_temp.loc[0, "Height"]
df_collector.append(df_temp)
-
- df = pd.concat(df_collector, sort=True)
- X = df.reset_index(drop=True)
- unique_ids = df["RecordID"].unique()
- y = y.loc[unique_ids]
-
- data = {"X": X, "y": y, "static_features": ["Age", "Gender", "ICUType", "Height"]}
+ df = pd.concat(df_collector, sort=True)
+ set_collector.append(df)
+
+ data = {
+ "set-a": set_collector[0],
+ "set-b": set_collector[1],
+ "set-c": set_collector[2],
+ "outcomes-a": outcome_collector[0],
+ "outcomes-b": outcome_collector[1],
+ "outcomes-c": outcome_collector[2],
+ "static_features": ["Age", "Gender", "ICUType", "Height"],
+ }
return data
From 289c841c4d9d385b5580769559ebb769cd5ad0bd Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Sat, 25 May 2024 22:16:57 +0800
Subject: [PATCH 4/7] feat: add Italy Air Quality and release v0.4;
---
README.md | 5 ++-
dataset_profiles/italy_air_quality/README.md | 17 +++++++++
tsdb/__init__.py | 2 +-
tsdb/data_processing.py | 3 ++
tsdb/database.py | 2 +
tsdb/loading_funcs/__init__.py | 4 +-
tsdb/loading_funcs/italy_air_quality.py | 39 ++++++++++++++++++++
7 files changed, 68 insertions(+), 4 deletions(-)
create mode 100644 dataset_profiles/italy_air_quality/README.md
create mode 100644 tsdb/loading_funcs/italy_air_quality.py
diff --git a/README.md b/README.md
index f0933ee..4e4394f 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@
-> 📣 TSDB now supports a total of 1️⃣6️⃣9️⃣ time-series datasets ‼️
+> 📣 TSDB now supports a total of 1️⃣7️⃣0️⃣ time-series datasets ‼️
TSDB is a part of
@@ -108,6 +108,7 @@ That's all. Simple and efficient. Enjoy it! 😃
| [PhysioNet Challenge 2012](dataset_profiles/physionet_2012) | Forecasting, Imputation, Classification |
| [PhysioNet Challenge 2019](dataset_profiles/physionet_2019) | Forecasting, Imputation, Classification |
| [Beijing Multi-Site Air-Quality](dataset_profiles/beijing_multisite_air_quality) | Forecasting, Imputation |
+| [Italy Air Quality](dataset_profiles/italy_air_quality) | Forecasting, Imputation |
| [Electricity Load Diagrams](dataset_profiles/electricity_load_diagrams) | Forecasting, Imputation |
| [Electricity Transformer Temperature (ETT)](dataset_profiles/electricity_transformer_temperature) | Forecasting, Imputation |
| [Vessel AIS](dataset_profiles/vessel_ais) | Forecasting, Imputation, Classification |
@@ -121,7 +122,7 @@ and we are pursuing to publish it in prestigious academic venues, e.g. JMLR (tra
please cite PyPOTS project as below and 🌟star this repository to make others notice this library. 🤗 Thank you!
-
+
diff --git a/dataset_profiles/italy_air_quality/README.md b/dataset_profiles/italy_air_quality/README.md
new file mode 100644
index 0000000..94a20cd
--- /dev/null
+++ b/dataset_profiles/italy_air_quality/README.md
@@ -0,0 +1,17 @@
+# Italy Air Quality
+
+## Citing this dataset 🤗
+
+`Vito,Saverio. (2016). Air Quality. UCI Machine Learning Repository. https://doi.org/10.24432/C59K5F`
+
+or
+
+```bibtex
+@misc{vito2016air,
+author = {Vito,Saverio},
+title = {{Air Quality}},
+year = {2016},
+howpublished = {UCI Machine Learning Repository},
+note = {{DOI}: https://doi.org/10.24432/C59K5F}
+}
+```
diff --git a/tsdb/__init__.py b/tsdb/__init__.py
index d5ecb1c..0e5c951 100644
--- a/tsdb/__init__.py
+++ b/tsdb/__init__.py
@@ -21,7 +21,7 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
-__version__ = "0.3.1"
+__version__ = "0.4"
from .data_processing import (
CACHED_DATASET_DIR,
diff --git a/tsdb/data_processing.py b/tsdb/data_processing.py
index 09c93c8..14105d4 100644
--- a/tsdb/data_processing.py
+++ b/tsdb/data_processing.py
@@ -18,6 +18,7 @@
load_beijing_air_quality,
load_ucr_uea_dataset,
load_ais,
+ load_italy_air_quality,
)
from .utils.downloading import download_and_extract
from .utils.file import purge_path, pickle_load, pickle_dump, determine_data_home
@@ -100,6 +101,8 @@ def load(dataset_name: str, use_cache: bool = True) -> dict:
result = load_ett(dataset_saving_path)
elif dataset_name == "beijing_multisite_air_quality":
result = load_beijing_air_quality(dataset_saving_path)
+ elif dataset_name == "italy_air_quality":
+ result = load_italy_air_quality(dataset_saving_path)
elif dataset_name == "vessel_ais":
result = load_ais(dataset_saving_path)
elif "ucr_uea_" in dataset_name:
diff --git a/tsdb/database.py b/tsdb/database.py
index 566b14f..060c550 100644
--- a/tsdb/database.py
+++ b/tsdb/database.py
@@ -30,6 +30,8 @@
# https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/beijing_multisite_air_quality
"beijing_multisite_air_quality": "https://archive.ics.uci.edu/ml/machine-learning-databases/00501/"
"PRSA2017_Data_20130301-20170228.zip",
+ # https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/italy_air_quality
+ "italy_air_quality": "https://archive.ics.uci.edu/static/public/360/air+quality.zip",
#
# https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/vessel_ais
"vessel_ais": "https://zenodo.org/record/8064564/files/parquets.zip",
diff --git a/tsdb/loading_funcs/__init__.py b/tsdb/loading_funcs/__init__.py
index fae0583..cb4770f 100644
--- a/tsdb/loading_funcs/__init__.py
+++ b/tsdb/loading_funcs/__init__.py
@@ -7,11 +7,12 @@
from .beijing_multisite_air_quality import load_beijing_air_quality
from .electricity_load_diagrams import load_electricity
+from .electricity_transformer_temperature import load_ett
+from .italy_air_quality import load_italy_air_quality
from .physionet_2012 import load_physionet2012
from .physionet_2019 import load_physionet2019
from .ucr_uea_datasets import load_ucr_uea_dataset
from .vessel_ais import load_ais
-from .electricity_transformer_temperature import load_ett
__all__ = [
"load_beijing_air_quality",
@@ -21,4 +22,5 @@
"load_ucr_uea_dataset",
"load_ais",
"load_ett",
+ "load_italy_air_quality",
]
diff --git a/tsdb/loading_funcs/italy_air_quality.py b/tsdb/loading_funcs/italy_air_quality.py
new file mode 100644
index 0000000..dadd3e9
--- /dev/null
+++ b/tsdb/loading_funcs/italy_air_quality.py
@@ -0,0 +1,39 @@
+"""
+Scripts related to dataset Italy Air Quality.
+
+For more information please refer to:
+https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/italy_air_quality
+"""
+
+# Created by Wenjie Du
+# License: BSD-3-Clause
+
+import os
+
+import pandas as pd
+
+
+def load_italy_air_quality(local_path):
+ """Load dataset Italy Air Quality.
+
+ Parameters
+ ----------
+ local_path : str,
+        The local path of dir saving the raw data of Italy Air Quality.
+
+ Returns
+ -------
+ data : dict
+ A dictionary contains X:
+ X : pandas.DataFrame
+            The time-series data of Italy Air Quality.
+ """
+ file_path = os.path.join(local_path, "AirQualityUCI.csv")
+ df = pd.read_csv(file_path, sep=";", decimal=",")
+ # remove empty columns
+ df.drop(columns=["Unnamed: 15", "Unnamed: 16"], inplace=True)
+
+ data = {
+ "X": df,
+ }
+ return data
From fbad4e19c2d768c65469db27334827ee6bfb10e7 Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Sat, 25 May 2024 22:34:19 +0800
Subject: [PATCH 5/7] test: use macOS-13 to fix failed python 3.7 not available
on latest macOS;
---
.github/workflows/testing_ci.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/testing_ci.yml b/.github/workflows/testing_ci.yml
index bee3c39..eef4a0a 100644
--- a/.github/workflows/testing_ci.yml
+++ b/.github/workflows/testing_ci.yml
@@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- os: [ubuntu-latest, windows-latest, macOS-latest]
+ os: [ubuntu-latest, windows-latest, macOS-13]
python-version: ['3.7', '3.11']
steps:
From 3d939d1997138f94434412cf8f0e3de6d9edc89c Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Sat, 25 May 2024 22:49:49 +0800
Subject: [PATCH 6/7] feat: remove empty rows;
---
tsdb/loading_funcs/italy_air_quality.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/tsdb/loading_funcs/italy_air_quality.py b/tsdb/loading_funcs/italy_air_quality.py
index dadd3e9..b7fdf75 100644
--- a/tsdb/loading_funcs/italy_air_quality.py
+++ b/tsdb/loading_funcs/italy_air_quality.py
@@ -32,6 +32,8 @@ def load_italy_air_quality(local_path):
df = pd.read_csv(file_path, sep=";", decimal=",")
# remove empty columns
df.drop(columns=["Unnamed: 15", "Unnamed: 16"], inplace=True)
+ # remove rows with all NaN, i.e. Date is NaN
+ df = df[~df["Date"].isna()]
data = {
"X": df,
From c5ba01cdc83ed95d074aad5d476b92ff5affd956 Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Sun, 26 May 2024 10:29:08 +0800
Subject: [PATCH 7/7] feat: return physionet_2019 as two subsets;
---
tsdb/loading_funcs/physionet_2019.py | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/tsdb/loading_funcs/physionet_2019.py b/tsdb/loading_funcs/physionet_2019.py
index 8e75273..df2fd9e 100644
--- a/tsdb/loading_funcs/physionet_2019.py
+++ b/tsdb/loading_funcs/physionet_2019.py
@@ -16,11 +16,12 @@
def load_physionet2019(local_path):
time_series_measurements_dir = ["training", "training_setB"]
- label_feature = "SepsisLabel" # feature SepsisLabel contains labels indicating whether patients get sepsis
- time_feature = "ICULOS" # ICU length-of-stay (hours since ICU admit)
+ # label_feature = "SepsisLabel" # feature SepsisLabel contains labels indicating whether patients get sepsis
+ # time_feature = "ICULOS" # ICU length-of-stay (hours since ICU admit)
- df_collector = []
+ set_collector = []
for m_ in time_series_measurements_dir:
+ df_collector = []
raw_data_dir = os.path.join(local_path, m_)
for filename in os.listdir(raw_data_dir):
recordID = filename.split(".psv")[0]
@@ -28,11 +29,12 @@ def load_physionet2019(local_path):
df_temp = pd.read_csv(f, sep="|", header=0)
df_temp["RecordID"] = recordID
df_collector.append(df_temp)
-
- df = pd.concat(df_collector, sort=True)
- df = df.reset_index(drop=True)
- y = df[["RecordID", time_feature, label_feature]]
- X = df.drop(label_feature, axis=1)
-
- data = {"X": X, "y": y, "static_features": ["Age", "Gender", "HospAdmTime"]}
+ df = pd.concat(df_collector, sort=True)
+ set_collector.append(df)
+
+ data = {
+ "training_setA": set_collector[0],
+ "training_setB": set_collector[1],
+ "static_features": ["Age", "Gender", "Unit1", "Unit2", "HospAdmTime"],
+ }
return data