Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions aeon/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@
"load_daphnet_s06r02e0",
"load_ecg_diff_count_3",
"get_dataset_meta_data",
# Monster datasets
"load_monster_dataset_names",
"load_monster_dataset",
]

from aeon.datasets._data_loaders import (
Expand Down Expand Up @@ -101,6 +104,10 @@
load_human_activity_segmentation_datasets,
load_time_series_segmentation_benchmark,
)
from aeon.datasets.monster_loader import (
load_monster_dataset,
load_monster_dataset_names,
)
from aeon.datasets.rehabpile_loader import (
load_rehab_pile_classification_datasets,
load_rehab_pile_dataset,
Expand Down
152 changes: 152 additions & 0 deletions aeon/datasets/monster_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
"""Dataset loading functions for Monster datasets."""

# Module metadata: the maintainer list is empty and ``__all__`` names the two
# public loader functions defined in this module.
__maintainer__ = []
__all__ = [
    "load_monster_dataset_names",
    "load_monster_dataset",
]

import numpy as np

from aeon.utils.numba.general import z_normalise_series_3d
from aeon.utils.validation._dependencies import _check_soft_dependencies

# Hugging Face Hub author/organisation whose dataset repositories are listed
# and downloaded by the loaders in this module.
ORG_ID = "monster-monash"
# Module-level cache of dataset names; stays None until the first lookup fills it.
_monster_dataset_names = None


def _fetch_monster_dataset_names() -> list[str]:
    """Fetch the list of Monster dataset names from Hugging Face Hub.

    Returns
    -------
    list of str
        Sorted dataset names with the ``ORG_ID`` organisation prefix removed.

    Raises
    ------
    ModuleNotFoundError
        If the optional dependency 'huggingface-hub' is not installed.
    """
    # Use the default (raising) severity: with severity="none" the result was
    # discarded, so a missing dependency only surfaced as a bare import error.
    _check_soft_dependencies("huggingface-hub")
    from huggingface_hub import list_datasets

    prefix = f"{ORG_ID}/"
    # Keep only repositories owned by the organisation and drop the owner part.
    return sorted(
        info.id.split("/")[-1]
        for info in list_datasets(author=ORG_ID)
        if info.id.startswith(prefix)
    )


def _lazy_load_monster_names():
    """Populate the module-level name cache the first time it is needed.

    Later calls return immediately because the cache is no longer ``None``.
    """
    global _monster_dataset_names
    if _monster_dataset_names is not None:
        return
    _monster_dataset_names = _fetch_monster_dataset_names()


def load_monster_dataset_names() -> list[str]:
    """Load the list of available Monster dataset names from Hugging Face Hub.

    The names are fetched once and cached at module level; subsequent calls
    reuse the cached result.

    Returns
    -------
    list of str
        A list of available Monster dataset names. A new list is returned on
        every call, so modifying it does not affect the cached names.
    """
    _lazy_load_monster_names()
    # Return a copy: handing out the cached list itself would let callers
    # mutate the names later used to validate ``dataset_name`` arguments.
    return list(_monster_dataset_names)


def load_monster_dataset(
    dataset_name: str,
    fold: int = 0,
    normalize: bool = True,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Load a Monster dataset from Hugging Face Hub.

    MONSTER, the MONash Scalable Time Series Evaluation Repository,
    introduced in [1]_, is a collection of large datasets for time
    series classification. The collection is hosted on Hugging Face Hub.

    Parameters
    ----------
    dataset_name : str
        The name of the dataset to load (e.g., "CornellWhaleChallenge", "AudioMNIST").
    fold : int, default=0
        The specific cross-validation fold index to load. This determines which
        samples are used for the test set; all remaining samples form the
        training set. Must be a non-negative integer.
    normalize : bool, default=True
        If True, the time series data (X) is Z-normalized (mean=0, std=1) across
        the series length using `z_normalise_series_3d`.

    Returns
    -------
    X_train : np.ndarray
        The training data, shape (n_train_cases, n_channels, n_timepoints).
    y_train : np.ndarray
        The training class labels, shape (n_train_cases,).
    X_test : np.ndarray
        The testing data, shape (n_test_cases, n_channels, n_timepoints).
    y_test : np.ndarray
        The testing class labels, shape (n_test_cases,).

    Raises
    ------
    ModuleNotFoundError
        If required optional dependency 'huggingface-hub' not installed.
    ValueError
        If the `dataset_name` is not recognized or `fold` is not a
        non-negative integer.
    OSError
        If downloading the data, the labels or the test indices fails, for
        example due to network issues or because no index file exists for
        the requested fold.

    Notes
    -----
    The data files are cached locally by the `huggingface-hub`
    library, avoiding repeated downloads. This function
    requires the optional dependency `huggingface-hub`.

    References
    ----------
    .. [1] Dempster, A., Mohammadi Foumani, N., Tan, C. W., Miller,
        L., Mishra, A., Salehi, M., Pelletier, C., Schmidt, D. F.,
        & Webb, G. I. (2025). MONSTER: Monash Scalable
        Time Series Evaluation Repository. arXiv preprint arXiv:2502.15122.

    """
    # Default severity raises an informative ModuleNotFoundError when the
    # optional dependency is missing (the result was previously discarded).
    _check_soft_dependencies("huggingface-hub")
    from huggingface_hub import hf_hub_download
    from huggingface_hub.errors import HfHubHTTPError

    if dataset_name not in load_monster_dataset_names():
        raise ValueError(f"Dataset {dataset_name} not found in the Monster collection.")

    # bool is a subclass of int but would produce a nonsensical file name.
    if isinstance(fold, bool) or not isinstance(fold, (int, np.integer)) or fold < 0:
        raise ValueError(f"fold must be a non-negative integer, but found {fold!r}.")

    repo_id = f"{ORG_ID}/{dataset_name}"

    def _download(filename: str, what: str) -> str:
        """Download one file of the dataset repo and return its local path."""
        try:
            return hf_hub_download(
                repo_id=repo_id, filename=filename, repo_type="dataset"
            )
        except HfHubHTTPError as e:
            raise OSError(
                f"Failed to load {what} for dataset {dataset_name}: {e}"
            ) from e

    # Fetch every required file before doing any heavy computation so that a
    # missing file or network failure is reported without wasted work.
    data_path = _download(f"{dataset_name}_X.npy", "data")
    label_path = _download(f"{dataset_name}_y.npy", "labels")
    test_index_path = _download(
        f"test_indices_fold_{fold}.txt", f"test indices for fold {fold}"
    )

    # Memory-map the series so the file is not read eagerly on load.
    X = np.load(data_path, mmap_mode="r")
    y = np.load(label_path)
    test_index = np.loadtxt(test_index_path, dtype=int)

    if normalize:
        X = z_normalise_series_3d(X)

    # Boolean mask over all cases: True marks the test cases of this fold.
    test_mask = np.zeros(len(y), dtype=bool)
    test_mask[test_index] = True
    train_mask = ~test_mask
    return X[train_mask], y[train_mask], X[test_mask], y[test_mask]
59 changes: 59 additions & 0 deletions aeon/datasets/tests/test_monster_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Test functions for monster dataset loader."""

import numpy as np
import pytest

from aeon.datasets._data_loaders import CONNECTION_ERRORS
from aeon.datasets.monster_loader import (
load_monster_dataset,
load_monster_dataset_names,
)
from aeon.utils.validation._dependencies import _check_soft_dependencies


@pytest.mark.skipif(
    not _check_soft_dependencies("huggingface-hub", severity="none"),
    reason="required soft dependency huggingface-hub not available",
)
@pytest.mark.xfail(raises=CONNECTION_ERRORS)
def test_monster_dataset_names():
    """Check that the Monster name listing is a non-empty list of strings."""
    names = load_monster_dataset_names()
    assert isinstance(names, list)
    assert len(names) > 0
    for entry in names:
        assert isinstance(entry, str)


@pytest.mark.skipif(
    not _check_soft_dependencies("huggingface-hub", severity="none"),
    reason="required soft dependency huggingface-hub not available",
)
@pytest.mark.xfail(raises=CONNECTION_ERRORS)
def test_load_monster_dataset():
    """Check the unknown-name error and the structure of a loaded dataset."""
    # An unknown name must be rejected with the documented message.
    unknown_name = "FOO"
    expected_message = f"Dataset {unknown_name} not found in the Monster collection."
    with pytest.raises(ValueError, match=expected_message):
        load_monster_dataset(unknown_name)

    # A known dataset is split into train/test arrays.
    X_train, y_train, X_test, y_test = load_monster_dataset(
        dataset_name="Pedestrian", fold=0, normalize=True
    )

    for array in (X_train, y_train, X_test, y_test):
        assert isinstance(array, np.ndarray)

    for data in (X_train, X_test):
        assert data.ndim == 3
    for data, labels in ((X_train, y_train), (X_test, y_test)):
        assert len(data) == len(labels)

    # Check normalization: statistics pooled over cases and time points
    # should be close to mean 0 and standard deviation 1.
    pooled_axes = (0, 2)
    mean = np.mean(X_train, axis=pooled_axes)
    std = np.std(X_train, axis=pooled_axes)
    np.testing.assert_array_almost_equal(mean, np.zeros_like(mean), decimal=3)
    np.testing.assert_array_almost_equal(std, np.ones_like(std), decimal=3)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ dependencies = [
[project.optional-dependencies]
all_extras = [
"esig>=1.0.0; platform_system != 'Darwin'",
"huggingface-hub>=0.20.0",
"imbalanced-learn",
"matplotlib>=3.3.2",
"pycatch22>=0.4.5",
Expand Down