aeon-toolkit · rwtarpit · Nov 26, 2025 · Nov 27, 2025 · Nov 27, 2025 · Nov 27, 2025
diff --git a/aeon/datasets/__init__.py b/aeon/datasets/__init__.py
@@ -49,6 +49,9 @@
     "load_daphnet_s06r02e0",
     "load_ecg_diff_count_3",
     "get_dataset_meta_data",
+    # Monster datasets
+    "load_monster_dataset_names",
+    "load_monster_dataset",
 ]
 
 from aeon.datasets._data_loaders import (
@@ -101,6 +104,10 @@
     load_human_activity_segmentation_datasets,
     load_time_series_segmentation_benchmark,
 )
+from aeon.datasets.monster_loader import (
+    load_monster_dataset,
+    load_monster_dataset_names,
+)
 from aeon.datasets.rehabpile_loader import (
     load_rehab_pile_classification_datasets,
     load_rehab_pile_dataset,

diff --git a/aeon/datasets/monster_loader.py b/aeon/datasets/monster_loader.py
@@ -0,0 +1,152 @@
+"""Dataset loading functions for Monster datasets."""
+
+__maintainer__ = []
+__all__ = [
+    "load_monster_dataset_names",
+    "load_monster_dataset",
+]
+
+import numpy as np
+
+from aeon.utils.numba.general import z_normalise_series_3d
+from aeon.utils.validation._dependencies import _check_soft_dependencies
+
+ORG_ID = "monster-monash"  # Hugging Face Hub author/organisation of the datasets
+_monster_dataset_names = None  # cache of dataset names, filled on first lookup
+
+
+def _fetch_monster_dataset_names() -> list[str]:
+    """Query Hugging Face Hub for the Monster datasets and return their names.
+
+    Only repositories whose id starts with ``ORG_ID/`` are kept; the name is the
+    part of the id after the last ``/``. The result is sorted.
+    """
+    _check_soft_dependencies("huggingface-hub", severity="none")
+    from huggingface_hub import list_datasets
+
+    prefix = f"{ORG_ID}/"
+    repo_ids = (info.id for info in list_datasets(author=ORG_ID))
+    names = [rid.split("/")[-1] for rid in repo_ids if rid.startswith(prefix)]
+    return sorted(names)
+
+
+def _lazy_load_monster_names():
+    """Populate the module-level name cache on first use; keep it afterwards."""
+    global _monster_dataset_names
+    names = _monster_dataset_names
+    _monster_dataset_names = _fetch_monster_dataset_names() if names is None else names
+
+
+def load_monster_dataset_names() -> list[str]:
+    """Load the list of available Monster dataset names from Hugging Face Hub.
+
+    Returns
+    -------
+    list of str
+        A fresh copy of the cached names; mutating it leaves the cache intact.
+    """
+    _lazy_load_monster_names()
+    return list(_monster_dataset_names)
+
+
+def load_monster_dataset(
+    dataset_name: str,
+    fold: int = 0,
+    normalize: bool = True,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Load a Monster dataset from Hugging Face Hub.
+
+    MONSTER, the MONash Scalable Time Series Evaluation Repository,
+    introduced in [1]_, is a collection of large datasets for time
+    series classification. The collection is hosted on Hugging Face Hub.
+
+    Parameters
+    ----------
+    dataset_name : str
+        The name of the dataset to load (e.g., "CornellWhaleChallenge", "AudioMNIST").
+    fold : int, default=0
+        The specific cross-validation fold index to load. This determines which
+        samples are used for the test set; all remaining samples are training data.
+    normalize : bool, default=True
+        If True, the time series data (X) is Z-normalized (mean=0, std=1) across
+        the series length using `z_normalise_series_3d`.
+
+    Returns
+    -------
+    X_train : np.ndarray
+        The training data, shape (n_train_cases, n_channels, n_timepoints).
+    y_train : np.ndarray
+        The training class labels, shape (n_train_cases,).
+    X_test : np.ndarray
+        The testing data, shape (n_test_cases, n_channels, n_timepoints).
+    y_test : np.ndarray
+        The testing class labels, shape (n_test_cases,).
+
+    Raises
+    ------
+    ModuleNotFoundError
+        If the required optional dependency 'huggingface-hub' is not installed.
+    ValueError
+        If the `dataset_name` is not recognized.
+    OSError
+        If downloading the data, the labels or the test indices of `fold`
+        fails with a Hugging Face Hub HTTP error.
+
+    Notes
+    -----
+    The data files are cached locally by the `huggingface-hub`
+    library, avoiding repeated downloads. This function
+    requires the optional dependency `huggingface-hub`.
+
+    Each dataset is loaded as one data array and one label array; the train/test
+    split is derived from the test indices stored for the requested fold.
+
+    References
+    ----------
+    .. [1] Dempster, A., Mohammadi Foumani, N., Tan, C. W., Miller,
+        L., Mishra, A., Salehi, M., Pelletier, C., Schmidt, D. F.,
+        & Webb, G. I. (2025). MONSTER: Monash Scalable
+        Time Series Evaluation Repository. arXiv preprint arXiv:2502.15122.
+
+    """
+    _check_soft_dependencies("huggingface-hub", severity="none")
+    from huggingface_hub import hf_hub_download
+    from huggingface_hub.errors import HfHubHTTPError
+
+    # Reject unknown names before any dataset file is requested.
+    if dataset_name not in load_monster_dataset_names():
+        raise ValueError(f"Dataset {dataset_name} not found in the Monster collection.")
+
+    repo_id = f"{ORG_ID}/{dataset_name}"
+
+    def _download(filename: str, what: str) -> str:
+        """Download one file of the dataset repo, mapping Hub errors to OSError."""
+        try:
+            return hf_hub_download(
+                repo_id=repo_id, filename=filename, repo_type="dataset"
+            )
+        except HfHubHTTPError as e:
+            raise OSError(f"Failed to load {what} for {dataset_name}: {e}") from e
+
+    # Memory-map X (read-only) rather than reading the whole file eagerly.
+    X = np.load(_download(f"{dataset_name}_X.npy", "data"), mmap_mode="r")
+    if normalize:
+        X = z_normalise_series_3d(X)
+
+    y = np.load(_download(f"{dataset_name}_y.npy", "labels"))
+
+    # Positions of the test cases for this fold.
+    test_index = np.loadtxt(
+        _download(f"test_indices_fold_{fold}.txt", f"test indices for fold {fold}"),
+        dtype=int,
+    )
+
+    # Boolean mask over all cases: True marks a test case, False a training case.
+    test_bool_index = np.zeros(len(y), dtype=bool)
+    test_bool_index[test_index] = True
+    return (
+        X[~test_bool_index],
+        y[~test_bool_index],
+        X[test_bool_index],
+        y[test_bool_index],
+    )
diff --git a/aeon/datasets/tests/test_monster_loader.py b/aeon/datasets/tests/test_monster_loader.py
@@ -0,0 +1,59 @@
+"""Test functions for monster dataset loader."""
+
+import numpy as np
+import pytest
+
+from aeon.datasets._data_loaders import CONNECTION_ERRORS
+from aeon.datasets.monster_loader import (
+    load_monster_dataset,
+    load_monster_dataset_names,
+)
+from aeon.utils.validation._dependencies import _check_soft_dependencies
+
+
+@pytest.mark.skipif(
+    not _check_soft_dependencies("huggingface-hub", severity="none"),
+    reason="required soft dependency huggingface-hub not available",
+)
+@pytest.mark.xfail(raises=CONNECTION_ERRORS)
+def test_monster_dataset_names():
+    """The name loader returns a non-empty list made up only of strings."""
+    names = load_monster_dataset_names()
+    assert isinstance(names, list)
+    assert len(names) >= 1
+    assert all(isinstance(entry, str) for entry in names)
+
+
+@pytest.mark.skipif(
+    not _check_soft_dependencies("huggingface-hub", severity="none"),
+    reason="required soft dependency huggingface-hub not available",
+)
+@pytest.mark.xfail(raises=CONNECTION_ERRORS)
+def test_load_monster_dataset():
+    """Check the loader rejects unknown names and returns a normalised 3D split."""
+    # An unknown name must be rejected with a ValueError naming the dataset.
+    unknown = "FOO"
+    expected_msg = f"Dataset {unknown} not found in the Monster collection."
+    with pytest.raises(ValueError, match=expected_msg):
+        load_monster_dataset(unknown)
+
+    # A known dataset loads as four arrays: train/test data and labels.
+    split = load_monster_dataset(dataset_name="Pedestrian", fold=0, normalize=True)
+    X_train, y_train, X_test, y_test = split
+    for arr in split:
+        assert isinstance(arr, np.ndarray)
+
+    # Data is 3D and every case has exactly one label.
+    for X, y in ((X_train, y_train), (X_test, y_test)):
+        assert X.ndim == 3
+        assert len(X) == len(y)
+
+    # Pooled over cases and timepoints, each channel should have mean 0, std 1.
+    channel_mean = X_train.mean(axis=(0, 2))
+    channel_std = X_train.std(axis=(0, 2))
+    np.testing.assert_array_almost_equal(
+        channel_mean, np.zeros_like(channel_mean), decimal=3
+    )
+    np.testing.assert_array_almost_equal(
+        channel_std, np.ones_like(channel_std), decimal=3
+    )
diff --git a/pyproject.toml b/pyproject.toml
@@ -58,6 +58,7 @@ dependencies = [
 [project.optional-dependencies]
 all_extras = [
     "esig>=1.0.0; platform_system != 'Darwin'",
+    "huggingface-hub>=0.20.0",
     "imbalanced-learn",
     "matplotlib>=3.3.2",
     "pycatch22>=0.4.5",