Skip to content

Commit b39bd10

Browse files
committed
Merge branch 'develop' of github.com:sccn/EEG-Dash-Data into develop
2 parents 1fe526f + 2664a6c commit b39bd10

File tree

4 files changed

+369
-132
lines changed

4 files changed

+369
-132
lines changed

eegdash/data_utils.py

+87-1
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,94 @@
66
from pathlib import Path
77
import re
88
import json
9+
from mne.io import BaseRaw
10+
from mne._fiff.utils import _find_channels, _read_segments_file
11+
import s3fs
12+
import tempfile
13+
from mne._fiff.utils import _read_segments_file
914

10-
verbose = False
15+
class RawEEGDash(BaseRaw):
    r"""Raw object from EEG-Dash connection with Openneuro S3 file.

    The file is downloaded from S3 into ``cache_dir`` the first time data
    are needed (eagerly when ``preload`` is truthy, lazily on the first
    read otherwise) and subsequently read like a local binary file.

    Parameters
    ----------
    input_fname : path-like
        S3 URI of the data file (e.g. ``s3://bucket/key``).
    metadata : dict
        Recording description; must contain the keys ``'sfreq'``
        (sampling frequency in Hz), ``'nchans'`` (number of channels)
        and ``'n_times'`` (number of samples).
    eog : list | tuple | 'auto'
        Names or indices of channels that should be designated EOG channels.
        If 'auto', the channel names containing ``EOG`` or ``EYE`` are used.
        Defaults to empty tuple.  NOTE(review): currently accepted but not
        used by this constructor — confirm intended wiring.
    preload : bool | str
        Forwarded to :class:`mne.io.BaseRaw`; when True the file is first
        downloaded into the cache and the local path is preloaded.
    cache_dir : str
        Directory where the S3 object is cached locally (default: cwd).
    uint16_codec : str | None
        Accepted for API symmetry with the EEGLAB reader; unused here.
    montage_units : str
        Accepted for API symmetry; unused here.
    verbose : bool | str | int | None
        Control verbosity of the logging output.

    See Also
    --------
    mne.io.Raw : Documentation of attributes and methods.

    Notes
    -----
    .. versionadded:: 0.11.0
    """

    def __init__(
        self,
        input_fname,
        metadata,
        eog=(),
        preload=False,
        *,
        cache_dir='.',
        uint16_codec=None,
        montage_units="auto",
        verbose=None,
    ):
        '''
        Get to work with S3 endpoint first, no caching
        '''
        # Build an mne.Info from the caller-supplied metadata.  Channel
        # names are synthesized (EEG1..EEGn) because the real names are not
        # known until the file itself is parsed.
        sfreq = metadata['sfreq']  # Sampling frequency (Hz)
        n_chans = metadata['nchans']
        n_times = metadata['n_times']
        ch_names = [f'EEG{d}' for d in range(1, n_chans + 1)]
        ch_types = ["eeg"] * n_chans
        info = mne.create_info(ch_names=ch_names, sfreq=sfreq, ch_types=ch_types)

        self.s3file = input_fname
        # Local cache path: same basename as the S3 key, under cache_dir.
        self.filecache = os.path.join(cache_dir, os.path.basename(self.s3file))

        if preload and not os.path.exists(self.filecache):
            # Eager download: fetch into the cache, then hand BaseRaw the
            # local path so it preloads from disk.  When the file is already
            # cached, preload stays True and data are loaded through the
            # _read_segment override below.
            self._download_s3()
            preload = self.filecache

        super().__init__(
            info,
            preload,
            last_samps=[n_times - 1],
            orig_format="double",
            verbose=verbose,
        )

    def _download_s3(self):
        """Download the S3 object into the local cache (anonymous access)."""
        filesystem = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})
        filesystem.download(self.s3file, self.filecache)
        # Point MNE's reader at the cached copy.
        self.filenames = [self.filecache]

    def _read_segment(
        self, start=0, stop=None, sel=None, data_buffer=None, *, verbose=None
    ):
        """Read a range of samples, downloading the file on first access."""
        if not os.path.exists(self.filecache):
            # Not preloaded and not cached yet: fetch from S3 first
            # (this also sets self.filenames to the cached path).
            self._download_s3()
        else:
            # Cached copy already on disk: just point the reader at it.
            self.filenames = [self.filecache]
        return super()._read_segment(start, stop, sel, data_buffer, verbose=verbose)

    def _read_segment_file(self, data, idx, fi, start, stop, cals, mult):
        """Read a chunk of data from the cached file as little-endian float32."""
        _read_segments_file(self, data, idx, fi, start, stop, cals, mult, dtype="<f4")
1197

1298

1399
class BIDSDataset():

requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ pynwb
77
h5py
88
pymongo
99
joblib
10-
-e git+https://github.com/dungscout96/SignalStore.git@cedf682bf589e57c8ba8253a8ff2d7c33eeae97f#egg=signalstore
10+
-e git+https://github.com/dungscout96/SignalStore.git@cedf682bf589e57c8ba8253a8ff2d7c33eeae97f#egg=signalstore
11+
pynwb

tests/test_s3_mne.ipynb

+197
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"create empty mne-python raw object"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 1,
13+
"metadata": {},
14+
"outputs": [
15+
{
16+
"name": "stdout",
17+
"output_type": "stream",
18+
"text": [
19+
"Creating RawArray with float64 data, n_channels=74, n_times=747750\n",
20+
" Range : 0 ... 747749 = 0.000 ... 2990.996 secs\n",
21+
"Ready.\n",
22+
"False\n",
23+
"<class 'mne.io.array.array.RawArray'>\n"
24+
]
25+
}
26+
],
27+
"source": [
28+
"import numpy as np\n",
29+
"import mne\n",
30+
"\n",
31+
"# Create a simple RawArray\n",
32+
"sfreq = 250 # Sampling frequency\n",
33+
"ch_names = [f'EEG{d}' for d in range(1,75)]\n",
34+
"ch_types = [\"eeg\"] * 74\n",
35+
"info = mne.create_info(ch_names=ch_names, sfreq=sfreq, ch_types=ch_types)\n",
36+
"\n",
37+
"data = np.random.randn(74, 747750) # 74 channels, 747750 samples\n",
38+
"raw = mne.io.RawArray(data, info)\n",
39+
"\n",
40+
"print(isinstance(raw, mne.io.Raw)) # True\n",
41+
"print(type(raw)) # <class 'mne.io.array.array.RawArray'>"
42+
]
43+
},
44+
{
45+
"cell_type": "markdown",
46+
"metadata": {},
47+
"source": [
48+
"braindecode call __getitem__ of mne.base.Raw, which then calls _getitem which calls _read_segment of BaseRaw. mne uses _read_segment to read a specific range of the file. We want to test whether S3 file via fsspec can be integrated\n",
49+
"It calls _read_segments_file of the BaseRaw class. Any subclass must implement this method. EEGLAB calls fiff reader function: mne/_fiff/utils.py#L200"
50+
]
51+
},
52+
{
53+
"cell_type": "code",
54+
"execution_count": 1,
55+
"metadata": {},
56+
"outputs": [],
57+
"source": [
58+
"# To be able to make edits to repo without having to restart notebook\n",
59+
"%load_ext autoreload\n",
60+
"%autoreload 2\n",
61+
"import sys \n",
62+
"sys.path.append('..')\n",
63+
"from eegdash.data_utils import RawEEGDash"
64+
]
65+
},
66+
{
67+
"cell_type": "code",
68+
"execution_count": 20,
69+
"metadata": {},
70+
"outputs": [
71+
{
72+
"name": "stdout",
73+
"output_type": "stream",
74+
"text": [
75+
"n_times 747750\n",
76+
"Reading 0 ... 747749 = 0.000 ... 2990.996 secs...\n"
77+
]
78+
}
79+
],
80+
"source": [
81+
"eegdash = RawEEGDash('s3://testspeedeegdash/sub-002_task-FaceRecognition_eeg.set', {'sfreq': 250, 'nchans': 74, 'n_times': 747750}, preload=True)"
82+
]
83+
},
84+
{
85+
"cell_type": "code",
86+
"execution_count": 23,
87+
"metadata": {},
88+
"outputs": [
89+
{
90+
"data": {
91+
"text/plain": [
92+
"array([[5.56413960e+07, 4.40817108e-39, 5.60519386e-45, ...,\n",
93+
" 7.10578003e+01, 7.32979889e+01, 6.95856934e+01],\n",
94+
" [5.97010569e-07, 8.72444220e-39, 0.00000000e+00, ...,\n",
95+
" 4.59728317e+01, 4.81444893e+01, 4.25833282e+01],\n",
96+
" [1.67969504e+08, 8.90820568e-39, 7.00649232e-45, ...,\n",
97+
" 4.93649330e+01, 4.92341499e+01, 4.53561974e+01],\n",
98+
" ...,\n",
99+
" [1.00893489e-43, 7.84727140e-44, 8.90820568e-39, ...,\n",
100+
" 4.53600616e+01, 4.10236855e+01, 4.29333000e+01],\n",
101+
" [1.07449142e-38, 8.40779079e-45, 1.02856414e-38, ...,\n",
102+
" 5.49960251e+01, 4.60316620e+01, 4.72489014e+01],\n",
103+
" [4.13273465e-39, 1.12103877e-44, 1.01938998e-38, ...,\n",
104+
" 4.54888268e+01, 3.74752045e+01, 2.96322441e+01]],\n",
105+
" shape=(74, 747750))"
106+
]
107+
},
108+
"execution_count": 23,
109+
"metadata": {},
110+
"output_type": "execute_result"
111+
}
112+
],
113+
"source": [
114+
"eegdash.get_data()"
115+
]
116+
},
117+
{
118+
"cell_type": "markdown",
119+
"metadata": {},
120+
"source": [
121+
"integrate with braindecode"
122+
]
123+
},
124+
{
125+
"cell_type": "code",
126+
"execution_count": 33,
127+
"metadata": {},
128+
"outputs": [],
129+
"source": [
130+
"from braindecode.datasets import BaseDataset, BaseConcatDataset\n",
131+
"eegdash_braindecode = BaseConcatDataset([BaseDataset(eegdash)])"
132+
]
133+
},
134+
{
135+
"cell_type": "code",
136+
"execution_count": 34,
137+
"metadata": {},
138+
"outputs": [],
139+
"source": [
140+
"from braindecode.preprocessing import (\n",
141+
" preprocess, Preprocessor, create_fixed_length_windows)\n",
142+
"windows_ds = create_fixed_length_windows(eegdash_braindecode, start_offset_samples=0, stop_offset_samples=None,\n",
143+
" window_size_samples=1000,\n",
144+
" window_stride_samples=1000, drop_last_window=True,\n",
145+
" preload=False)"
146+
]
147+
},
148+
{
149+
"cell_type": "code",
150+
"execution_count": 36,
151+
"metadata": {},
152+
"outputs": [
153+
{
154+
"data": {
155+
"text/plain": [
156+
"(74, 1000)"
157+
]
158+
},
159+
"execution_count": 36,
160+
"metadata": {},
161+
"output_type": "execute_result"
162+
}
163+
],
164+
"source": [
165+
"windows_ds[0][0].shape"
166+
]
167+
},
168+
{
169+
"cell_type": "code",
170+
"execution_count": null,
171+
"metadata": {},
172+
"outputs": [],
173+
"source": []
174+
}
175+
],
176+
"metadata": {
177+
"kernelspec": {
178+
"display_name": ".venv",
179+
"language": "python",
180+
"name": "python3"
181+
},
182+
"language_info": {
183+
"codemirror_mode": {
184+
"name": "ipython",
185+
"version": 3
186+
},
187+
"file_extension": ".py",
188+
"mimetype": "text/x-python",
189+
"name": "python",
190+
"nbconvert_exporter": "python",
191+
"pygments_lexer": "ipython3",
192+
"version": "3.10.12"
193+
}
194+
},
195+
"nbformat": 4,
196+
"nbformat_minor": 2
197+
}

0 commit comments

Comments
 (0)