Skip to content

Commit a524b75

Browse files
committed
integrate pybids. refactor mongodb fields into config file. test braindecode BIDSDataset vs pybids
1 parent 4cc0a7f commit a524b75

File tree

6 files changed

+830
-106
lines changed

6 files changed

+830
-106
lines changed

notebooks/test_pybids_braindecode_BIDSDataset.ipynb

+641
Large diffs are not rendered by default.

pyproject.toml

+3-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "eegdash"
7-
version = "0.0.7"
7+
version = "0.0.8"
88
authors = [
99
{ name="Young Truong", email="[email protected]" },
1010
{ name="Arnaud Delorme", email="[email protected]" },
@@ -27,6 +27,8 @@ dependencies = [
2727
"h5py",
2828
"pymongo",
2929
"joblib",
30+
"braindecode",
31+
"mne-bids",
3032
]
3133
[project.urls]
3234
Homepage = "https://github.com/sccn/EEG-Dash-Data"

scripts/data_ingest.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,20 @@ def main():
2121
failed_ds = ['ds005873', "ds004148"]
2222
hed_datasets = ['ds004853','ds004852','ds004851','ds004850','ds004849','ds004844','ds004843','ds004842','ds004841','ds004661','ds004660','ds004657','ds004362','ds004123','ds004122','ds004121','ds004120','ds004119','ds004118','ds004117','ds004106','ds004105','ds003645','ds003061','ds002893','ds002691','ds002680','ds002578']
2323
eeglab_datasets = ["ds004362", "ds005514", "ds002181","ds004554", "ds005697", "ds004151", "ds003800", "ds004350","ds004105", "ds004785", "ds004504", "ds004122", "ds004118","ds004121", "ds004635", "ds005787", "ds005512", "ds005079","ds004120", "ds004119", "ds005178", "ds004019", "ds005342","ds004745", "ds004502", "ds005505", "ds005034", "ds004563","ds002680", "ds003774", "ds004123", "ds003805", "ds005506","ds003838", "ds005507", "ds004040", "ds005511", "ds002718","ds002691", "ds003690", "ds003061", "ds005672", "ds003775","ds004106", "ds005410", "ds005508", "ds005510", "ds005509","ds002578", "ds003620"]
24-
datasets = ["ds002578", "ds003620"]
24+
# datasets = ["ds002578", "ds003620"]
25+
failed_ds = set(['ds004770', 'ds005261', 'ds000247', 'ds003420', 'ds005557'])
26+
datasets = ["ds004841","ds004770","ds004561","ds005261","ds000247","ds005131","ds003753","ds003420","ds005028","ds005557","ds005170","ds004840","ds004855","ds004718","ds002725","ds005565","ds004408","ds004796","ds002550","ds004511","ds002893","ds003682","ds004817","ds000248","ds003190","ds004819","ds005089","ds003822","ds003670","ds005048","ds004917","ds004574","ds004852","ds004357","ds003082","ds005574","ds005397","ds004519","ds004602","ds004784","ds005491","ds003846","ds002799","ds004024","ds005815","ds003694","ds005429","ds004771","ds003518","ds004977","ds003702","ds004577","ds005207","ds005866","ds004127","ds003574","ds004703","ds005779","ds004398","ds003523","ds005558","ds004212","ds004347","ds005185","ds005489","ds005398","ds004588","ds001787","ds003505","ds005670","ds003568","ds003703","ds005811","ds004370","ds005340","ds003987","ds004865","ds005363","ds005121","ds004078","ds003392","ds004317","ds004851","ds004033","ds004011","ds003876","ds004166","ds005691","ds005087","ds004330","ds004256","ds004315","ds005279","ds005420","ds003474","ds002034","ds003509","ds004186","ds003825","ds005868","ds003516","ds004587","ds005415","ds004942","ds004348","ds003633","ds004598","ds005383","ds003195","ds004473","ds005403","ds002908","ds004621","ds005863","ds003848","ds004625","ds005594","ds002336","ds004043","ds003517","ds005083","ds004368","ds004584","ds004012","ds003374","ds005624","ds005810","ds003506","ds005106","ds004284","ds005620","ds004738","ds004849","ds005234","ds003570","ds003490","ds002720","ds005307","ds002094","ds002833","ds002218","ds000117","ds004117","ds005021","ds004194","ds005356","ds004264","ds004446","ds004980","ds002722","ds004457","ds004505","ds004853","ds002885","ds004580","ds003944","ds005545","ds004279","ds005876","ds004532","ds004346","ds003816","ds005385","ds004572","ds005095","ds004696","ds004460","ds004902","ds005189","ds005274","ds004075","ds004447","ds004295","ds003519","ds004107","ds004952","ds003458","ds002724","ds003004","ds005571","ds00310
4","ds004200","ds002791","ds004015","ds005592","ds004262","ds004850","ds005273","ds002712","ds004520","ds004444","ds004582","ds002723","ds004017","ds004595","ds004626","ds003751","ds004475","ds000246","ds004515","ds003421","ds002158","ds004951","ds005522","ds004883","ds004483","ds005065","ds004624","ds004802","ds004993","ds004278","ds004816","ds003739","ds005873","ds004389","ds003194","ds004356","ds004367","ds004369","ds004381","ds004196","ds005692","ds002338","ds004022","ds004579","ds004859","ds005416","ds004603","ds004752","ds003768","ds003947","ds004229","ds005530","ds004844","ds005555","ds004998","ds004843","ds004477","ds001785","ds005688","ds003766","ds004276","ds005540","ds004152","ds004944","ds001971","ds003352","ds003626","ds002814","ds003645","ds005007","ds004551","ds005586","ds001784","ds004809","ds003922","ds004388","ds003810","ds004306","ds004642","ds003478","ds004100","ds003969","ds004000","ds005411","ds004842","ds005305","ds005494","ds004995","ds005114","ds004854","ds003638","ds004521","ds002761","ds001849","ds003844","ds003039","ds004706","ds004252","ds004448","ds005795","ds003602","ds005169","ds003380","ds004018","ds004080","ds004324","ds003887","ds004789","ds004860","ds004837","ds005241","ds003688","ds005107","ds002721","ds003655","ds004395","ds004147","ds003483","ds003555","ds005486","ds005520","ds005262","ds002778","ds004661","ds003885","ds004657","ds005523","ds003498","ds003522","ds005406","ds003710","ds003343","ds003708","ds002001","ds005345","ds004067","ds003078","ds003801","ds005059","ds003029","ds001810","ds005296","ds004660"]
2527
for ds in datasets:
26-
obj.add_bids_dataset(dataset=ds, data_dir=f'/mnt/nemar/openneuro/{ds}', raw_format='eeglab', overwrite=True)
28+
if ds in failed_ds:
29+
continue
30+
try:
31+
print(f'Processing {ds}')
32+
obj.add_bids_dataset(dataset=ds, data_dir=f'/mnt/nemar/openneuro/{ds}', overwrite=True)
33+
except Exception as e:
34+
print(e)
35+
failed_ds.add(ds)
36+
pass
37+
print(f'Failed datasets: {list(failed_ds)}')
2738

2839
if __name__ == "__main__":
2940
main()

src/eegdash/config.json

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{
2+
"required_fields": ["data_name"],
3+
"attributes": {
4+
"data_name": "str",
5+
"dataset": "str",
6+
"bidspath": "str",
7+
"subject": "str",
8+
"task": "str",
9+
"session": "str",
10+
"run": "str",
11+
"sampling_frequency": "float",
12+
"modality": "str",
13+
"nchans": "int",
14+
"ntimes": "int"
15+
},
16+
"description_fields": ["subject", "session", "run", "task", "age", "gender", "sex"],
17+
"bids_dependencies_files": [
18+
"dataset_description.json",
19+
"participants.tsv",
20+
"events.tsv",
21+
"events.json",
22+
"eeg.json",
23+
"electrodes.tsv",
24+
"channels.tsv",
25+
"coordsystem.json"
26+
],
27+
"accepted_query_fields": ["data_name", "dataset"]
28+
}

src/eegdash/data_utils.py

+55-56
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from mne_bids import (
1818
BIDSPath,
1919
)
20+
from bids import BIDSLayout
2021

2122
class EEGDashBaseDataset(BaseDataset):
2223
"""Returns samples from an mne.io.Raw object along with a target.
@@ -96,7 +97,7 @@ def __getitem__(self, index):
9697

9798
def __len__(self):
    """Number of samples: metadata 'ntimes' before load, raw length after."""
    # Raw data not fetched yet -- fall back to the length stored in the record.
    if self._raw is None:
        return self.record['ntimes']
    return len(self._raw)
102103

@@ -216,39 +217,49 @@ def _read_segment_file(self, data, idx, fi, start, stop, cals, mult):
216217
_read_segments_file(self, data, idx, fi, start, stop, cals, mult, dtype="<f4")
217218

218219

219-
class BIDSDataset():
220+
class EEGBIDSDataset():
220221
ALLOWED_FILE_FORMAT = ['eeglab', 'brainvision', 'biosemi', 'european']
221-
RAW_EXTENSION = {
222-
'eeglab': '.set',
223-
'brainvision': '.vhdr',
224-
'biosemi': '.bdf',
225-
'european': '.edf'
226-
}
222+
RAW_EXTENSIONS = {
223+
'.set': ['.set', '.fdt'], # eeglab
224+
'.edf': ['.edf'], # european
225+
'.vhdr': ['.eeg', '.vhdr', '.vmrk', '.dat', '.raw'], # brainvision
226+
'.bdf': ['.bdf'], # biosemi
227+
}
227228
METADATA_FILE_EXTENSIONS = ['eeg.json', 'channels.tsv', 'electrodes.tsv', 'events.tsv', 'events.json']
228229
def __init__(self,
             data_dir=None,  # location of BIDS dataset
             dataset='',     # dataset name
             ):
    """Index an EEG BIDS dataset on disk using pybids.

    Parameters
    ----------
    data_dir : str | Path
        Root directory of the BIDS dataset; must exist.
    dataset : str
        Dataset identifier; ``data_dir`` is expected to end with it.

    Raises
    ------
    ValueError
        If ``data_dir`` is missing or nonexistent, does not end with
        ``dataset``, contains no EEG recordings, or is not an EEG dataset.
    """
    if data_dir is None or not os.path.exists(data_dir):
        raise ValueError('data_dir must be specified and must exist')
    self.bidsdir = Path(data_dir)
    self.dataset = dataset
    # Explicit raises instead of asserts: asserts are stripped under
    # `python -O`, and the originals passed ValueError instances as assert
    # messages (they still raised AssertionError, never ValueError).
    if not str(self.bidsdir).endswith(self.dataset):
        raise ValueError(f'data_dir {data_dir} must end with dataset name {dataset}')
    self.layout = BIDSLayout(data_dir)

    # get all recording files in the bids directory
    self.files = self.get_recordings(self.layout)
    if not self.files:
        raise ValueError('Unable to construct EEG dataset. No EEG recordings found.')
    if not self.check_eeg_dataset():
        raise ValueError('Dataset is not an EEG dataset.')
252+
253+
def check_eeg_dataset(self):
    """Return True when the first indexed recording's modality is EEG."""
    modality = self.get_bids_file_attribute('modality', self.files[0])
    return modality.lower() == 'eeg'
255+
256+
def get_recordings(self, layout: BIDSLayout):
    """Return filenames of the dataset's raw EEG recordings.

    Tries each raw-file extension in ``RAW_EXTENSIONS`` order and returns
    the matches for the first extension that yields any files; returns an
    empty list when no extension matches.

    Fixes: the original iterated ``RAW_EXTENSIONS.items()`` but never used
    the values, and seeded ``files = []`` redundantly before the loop.
    """
    for extension in self.RAW_EXTENSIONS:
        files = layout.get(extension=extension, return_type='filename')
        if files:
            return files
    return []
252263

253264
def get_relative_bidspath(self, filename):
254265
bids_parent_dir = self.bidsdir.parent
@@ -301,11 +312,6 @@ def get_bids_file_inheritance(self, path, basename, extension):
301312
filepath = path / file
302313
bids_files.append(filepath)
303314

304-
# cur_file_basename = file[:file.rfind('_')] # TODO: change to just search for any file with extension
305-
# if file.endswith(extension) and cur_file_basename in basename:
306-
# filepath = path / file
307-
# bids_files.append(filepath)
308-
309315
# check if file is in top level directory
310316
if any(file in os.listdir(path) for file in top_level_files):
311317
return bids_files
@@ -338,7 +344,7 @@ def get_bids_metadata_files(self, filepath, metadata_file_extension):
338344

339345
def scan_directory(self, directory, extension):
340346
result_files = []
341-
directory_to_ignore = ['.git']
347+
directory_to_ignore = ['.git', '.datalad', 'derivatives', 'code']
342348
with os.scandir(directory) as entries:
343349
for entry in entries:
344350
if entry.is_file() and entry.name.endswith(extension):
@@ -419,32 +425,22 @@ def resolve_bids_json(self, json_files: list):
419425
json_dict.update(json.load(f))
420426
return json_dict
421427

422-
def sfreq(self, data_filepath):
423-
json_files = self.get_bids_metadata_files(data_filepath, 'eeg.json')
424-
if len(json_files) == 0:
425-
raise ValueError('No eeg.json found')
426-
427-
metadata = self.resolve_bids_json(json_files)
428-
if 'SamplingFrequency' not in metadata:
429-
raise ValueError('SamplingFrequency not found in metadata')
430-
else:
431-
return metadata['SamplingFrequency']
432-
433-
def task(self, data_filepath):
434-
return self.get_property_from_filename('task', data_filepath)
435-
436-
def session(self, data_filepath):
437-
return self.get_property_from_filename('session', data_filepath)
438-
439-
def run(self, data_filepath):
440-
return self.get_property_from_filename('run', data_filepath)
441-
442-
def subject(self, data_filepath):
443-
return self.get_property_from_filename('sub', data_filepath)
444-
445-
def num_channels(self, data_filepath):
446-
channels_tsv = pd.read_csv(self.get_bids_metadata_files(data_filepath, 'channels.tsv')[0], sep='\t')
447-
return len(channels_tsv)
428+
def get_bids_file_attribute(self, attribute, data_filepath):
    """Resolve a short attribute name for a recording via the pybids layout.

    ``attribute`` is one of this project's short names (e.g. 'sfreq',
    'nchans'); it is translated to the corresponding BIDS entity or sidecar
    metadata key and looked up on the matching BIDSFile. Returns None when
    the attribute is unknown or the key is absent.
    """
    # project short name -> BIDS entity / sidecar metadata key
    key_by_attribute = {
        'sfreq': 'SamplingFrequency',
        'modality': 'datatype',
        'task': 'task',
        'session': 'session',
        'run': 'run',
        'subject': 'subject',
        # NOTE(review): RecordingDuration is in seconds, not a sample
        # count -- confirm callers of 'ntimes' expect a duration here.
        'ntimes': 'RecordingDuration',
        'nchans': 'EEGChannelCount',
    }
    entities = self.layout.parse_file_entities(data_filepath)
    matching_file = self.layout.get(**entities)[0]
    all_attributes = matching_file.get_entities(metadata='all')
    return all_attributes.get(key_by_attribute.get(attribute), None)
448444

449445
def channel_labels(self, data_filepath):
450446
channels_tsv = pd.read_csv(self.get_bids_metadata_files(data_filepath, 'channels.tsv')[0], sep='\t')
@@ -462,9 +458,12 @@ def num_times(self, data_filepath):
462458
def subject_participant_tsv(self, data_filepath):
    '''Get participants_tsv info of a subject based on filepath'''
    tsv_path = self.get_bids_metadata_files(data_filepath, 'participants.tsv')[0]
    participants = pd.read_csv(tsv_path, sep='\t')
    if participants.empty:
        return {}
    # set 'participant_id' as index so the subject row can be selected by label
    participants.set_index('participant_id', inplace=True)
    subject_id = f"sub-{self.get_bids_file_attribute('subject', data_filepath)}"
    # NOTE(review): raises KeyError when the subject is absent from the TSV
    return participants.loc[subject_id].to_dict()
469468

470469
def eeg_json(self, data_filepath):

0 commit comments

Comments
 (0)