Skip to content

Commit a524b75

Browse files
committed
integrate pybids. refactor mongodb fields into config file. test braindecode BIDSDataset vs pybids
1 parent 4cc0a7f commit a524b75

File tree

6 files changed

+830
-106
lines changed

6 files changed

+830
-106
lines changed

notebooks/test_pybids_braindecode_BIDSDataset.ipynb

+641
Large diffs are not rendered by default.

pyproject.toml

+3-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "eegdash"
7-
version = "0.0.7"
7+
version = "0.0.8"
88
authors = [
99
{ name="Young Truong", email="[email protected]" },
1010
{ name="Arnaud Delorme", email="[email protected]" },
@@ -27,6 +27,8 @@ dependencies = [
2727
"h5py",
2828
"pymongo",
2929
"joblib",
30+
"braindecode",
31+
"mne-bids",
3032
]
3133
[project.urls]
3234
Homepage = "https://github.com/sccn/EEG-Dash-Data"

scripts/data_ingest.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,20 @@ def main():
2121
failed_ds = ['ds005873', "ds004148"]
2222
hed_datasets = ['ds004853','ds004852','ds004851','ds004850','ds004849','ds004844','ds004843','ds004842','ds004841','ds004661','ds004660','ds004657','ds004362','ds004123','ds004122','ds004121','ds004120','ds004119','ds004118','ds004117','ds004106','ds004105','ds003645','ds003061','ds002893','ds002691','ds002680','ds002578']
2323
eeglab_datasets = ["ds004362", "ds005514", "ds002181","ds004554", "ds005697", "ds004151", "ds003800", "ds004350","ds004105", "ds004785", "ds004504", "ds004122", "ds004118","ds004121", "ds004635", "ds005787", "ds005512", "ds005079","ds004120", "ds004119", "ds005178", "ds004019", "ds005342","ds004745", "ds004502", "ds005505", "ds005034", "ds004563","ds002680", "ds003774", "ds004123", "ds003805", "ds005506","ds003838", "ds005507", "ds004040", "ds005511", "ds002718","ds002691", "ds003690", "ds003061", "ds005672", "ds003775","ds004106", "ds005410", "ds005508", "ds005510", "ds005509","ds002578", "ds003620"]
24-
datasets = ["ds002578", "ds003620"]
24+
# datasets = ["ds002578", "ds003620"]
25+
failed_ds = set(['ds004770', 'ds005261', 'ds000247', 'ds003420', 'ds005557'])
26+
datasets = ["ds004841","ds004770","ds004561","ds005261","ds000247","ds005131","ds003753","ds003420","ds005028","ds005557","ds005170","ds004840","ds004855","ds004718","ds002725","ds005565","ds004408","ds004796","ds002550","ds004511","ds002893","ds003682","ds004817","ds000248","ds003190","ds004819","ds005089","ds003822","ds003670","ds005048","ds004917","ds004574","ds004852","ds004357","ds003082","ds005574","ds005397","ds004519","ds004602","ds004784","ds005491","ds003846","ds002799","ds004024","ds005815","ds003694","ds005429","ds004771","ds003518","ds004977","ds003702","ds004577","ds005207","ds005866","ds004127","ds003574","ds004703","ds005779","ds004398","ds003523","ds005558","ds004212","ds004347","ds005185","ds005489","ds005398","ds004588","ds001787","ds003505","ds005670","ds003568","ds003703","ds005811","ds004370","ds005340","ds003987","ds004865","ds005363","ds005121","ds004078","ds003392","ds004317","ds004851","ds004033","ds004011","ds003876","ds004166","ds005691","ds005087","ds004330","ds004256","ds004315","ds005279","ds005420","ds003474","ds002034","ds003509","ds004186","ds003825","ds005868","ds003516","ds004587","ds005415","ds004942","ds004348","ds003633","ds004598","ds005383","ds003195","ds004473","ds005403","ds002908","ds004621","ds005863","ds003848","ds004625","ds005594","ds002336","ds004043","ds003517","ds005083","ds004368","ds004584","ds004012","ds003374","ds005624","ds005810","ds003506","ds005106","ds004284","ds005620","ds004738","ds004849","ds005234","ds003570","ds003490","ds002720","ds005307","ds002094","ds002833","ds002218","ds000117","ds004117","ds005021","ds004194","ds005356","ds004264","ds004446","ds004980","ds002722","ds004457","ds004505","ds004853","ds002885","ds004580","ds003944","ds005545","ds004279","ds005876","ds004532","ds004346","ds003816","ds005385","ds004572","ds005095","ds004696","ds004460","ds004902","ds005189","ds005274","ds004075","ds004447","ds004295","ds003519","ds004107","ds004952","ds003458","ds002724","ds003004","ds005571","ds00310
4","ds004200","ds002791","ds004015","ds005592","ds004262","ds004850","ds005273","ds002712","ds004520","ds004444","ds004582","ds002723","ds004017","ds004595","ds004626","ds003751","ds004475","ds000246","ds004515","ds003421","ds002158","ds004951","ds005522","ds004883","ds004483","ds005065","ds004624","ds004802","ds004993","ds004278","ds004816","ds003739","ds005873","ds004389","ds003194","ds004356","ds004367","ds004369","ds004381","ds004196","ds005692","ds002338","ds004022","ds004579","ds004859","ds005416","ds004603","ds004752","ds003768","ds003947","ds004229","ds005530","ds004844","ds005555","ds004998","ds004843","ds004477","ds001785","ds005688","ds003766","ds004276","ds005540","ds004152","ds004944","ds001971","ds003352","ds003626","ds002814","ds003645","ds005007","ds004551","ds005586","ds001784","ds004809","ds003922","ds004388","ds003810","ds004306","ds004642","ds003478","ds004100","ds003969","ds004000","ds005411","ds004842","ds005305","ds005494","ds004995","ds005114","ds004854","ds003638","ds004521","ds002761","ds001849","ds003844","ds003039","ds004706","ds004252","ds004448","ds005795","ds003602","ds005169","ds003380","ds004018","ds004080","ds004324","ds003887","ds004789","ds004860","ds004837","ds005241","ds003688","ds005107","ds002721","ds003655","ds004395","ds004147","ds003483","ds003555","ds005486","ds005520","ds005262","ds002778","ds004661","ds003885","ds004657","ds005523","ds003498","ds003522","ds005406","ds003710","ds003343","ds003708","ds002001","ds005345","ds004067","ds003078","ds003801","ds005059","ds003029","ds001810","ds005296","ds004660"]
2527
for ds in datasets:
26-
obj.add_bids_dataset(dataset=ds, data_dir=f'/mnt/nemar/openneuro/{ds}', raw_format='eeglab', overwrite=True)
28+
if ds in failed_ds:
29+
continue
30+
try:
31+
print(f'Processing {ds}')
32+
obj.add_bids_dataset(dataset=ds, data_dir=f'/mnt/nemar/openneuro/{ds}', overwrite=True)
33+
except Exception as e:
34+
print(e)
35+
failed_ds.add(ds)
36+
pass
37+
print(f'Failed datasets: {list(failed_ds)}')
2738

2839
if __name__ == "__main__":
2940
main()

src/eegdash/config.json

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{
2+
"required_fields": ["data_name"],
3+
"attributes": {
4+
"data_name": "str",
5+
"dataset": "str",
6+
"bidspath": "str",
7+
"subject": "str",
8+
"task": "str",
9+
"session": "str",
10+
"run": "str",
11+
"sampling_frequency": "float",
12+
"modality": "str",
13+
"nchans": "int",
14+
"ntimes": "int"
15+
},
16+
"description_fields": ["subject", "session", "run", "task", "age", "gender", "sex"],
17+
"bids_dependencies_files": [
18+
"dataset_description.json",
19+
"participants.tsv",
20+
"events.tsv",
21+
"events.json",
22+
"eeg.json",
23+
"electrodes.tsv",
24+
"channels.tsv",
25+
"coordsystem.json"
26+
],
27+
"accepted_query_fields": ["data_name", "dataset"]
28+
}

src/eegdash/data_utils.py

+55-56
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from mne_bids import (
1818
BIDSPath,
1919
)
20+
from bids import BIDSLayout
2021

2122
class EEGDashBaseDataset(BaseDataset):
2223
"""Returns samples from an mne.io.Raw object along with a target.
@@ -96,7 +97,7 @@ def __getitem__(self, index):
9697

9798
def __len__(self):
    """Number of samples: metadata 'ntimes' before load, raw length after."""
    # Raw data not fetched yet -- fall back to the length stored in the record.
    if self._raw is None:
        return self.record['ntimes']
    return len(self._raw)
102103

@@ -216,39 +217,49 @@ def _read_segment_file(self, data, idx, fi, start, stop, cals, mult):
216217
_read_segments_file(self, data, idx, fi, start, stop, cals, mult, dtype="<f4")
217218

218219

219-
class BIDSDataset():
220+
class EEGBIDSDataset():
220221
ALLOWED_FILE_FORMAT = ['eeglab', 'brainvision', 'biosemi', 'european']
221-
RAW_EXTENSION = {
222-
'eeglab': '.set',
223-
'brainvision': '.vhdr',
224-
'biosemi': '.bdf',
225-
'european': '.edf'
226-
}
222+
RAW_EXTENSIONS = {
223+
'.set': ['.set', '.fdt'], # eeglab
224+
'.edf': ['.edf'], # european
225+
'.vhdr': ['.eeg', '.vhdr', '.vmrk', '.dat', '.raw'], # brainvision
226+
'.bdf': ['.bdf'], # biosemi
227+
}
227228
METADATA_FILE_EXTENSIONS = ['eeg.json', 'channels.tsv', 'electrodes.tsv', 'events.tsv', 'events.json']
228229
def __init__(self,
             data_dir=None,  # location of BIDS dataset
             dataset='',     # dataset name
             ):
    """Index an EEG BIDS dataset on disk using pybids.

    Parameters
    ----------
    data_dir : str | Path
        Root directory of the BIDS dataset; must exist.
    dataset : str
        Dataset identifier; ``data_dir`` is expected to end with it.

    Raises
    ------
    ValueError
        If ``data_dir`` is missing or nonexistent, does not end with
        ``dataset``, contains no EEG recordings, or is not an EEG dataset.
    """
    if data_dir is None or not os.path.exists(data_dir):
        raise ValueError('data_dir must be specified and must exist')
    self.bidsdir = Path(data_dir)
    self.dataset = dataset
    # Explicit raises instead of asserts: asserts are stripped under
    # `python -O`, and the originals passed ValueError instances as assert
    # messages (they still raised AssertionError, never ValueError).
    if not str(self.bidsdir).endswith(self.dataset):
        raise ValueError(f'data_dir {data_dir} must end with dataset name {dataset}')
    self.layout = BIDSLayout(data_dir)

    # get all recording files in the bids directory
    self.files = self.get_recordings(self.layout)
    if not self.files:
        raise ValueError('Unable to construct EEG dataset. No EEG recordings found.')
    if not self.check_eeg_dataset():
        raise ValueError('Dataset is not an EEG dataset.')
252+
253+
def check_eeg_dataset(self):
    """Return True when the first indexed recording's modality is EEG."""
    modality = self.get_bids_file_attribute('modality', self.files[0])
    return modality.lower() == 'eeg'
255+
256+
def get_recordings(self, layout: BIDSLayout):
    """Return filenames of the dataset's raw EEG recordings.

    Tries each raw-file extension in ``RAW_EXTENSIONS`` order and returns
    the matches for the first extension that yields any files; returns an
    empty list when no extension matches.

    Fixes: the original iterated ``RAW_EXTENSIONS.items()`` but never used
    the values, and seeded ``files = []`` redundantly before the loop.
    """
    for extension in self.RAW_EXTENSIONS:
        files = layout.get(extension=extension, return_type='filename')
        if files:
            return files
    return []
252263

253264
def get_relative_bidspath(self, filename):
254265
bids_parent_dir = self.bidsdir.parent
@@ -301,11 +312,6 @@ def get_bids_file_inheritance(self, path, basename, extension):
301312
filepath = path / file
302313
bids_files.append(filepath)
303314

304-
# cur_file_basename = file[:file.rfind('_')] # TODO: change to just search for any file with extension
305-
# if file.endswith(extension) and cur_file_basename in basename:
306-
# filepath = path / file
307-
# bids_files.append(filepath)
308-
309315
# check if file is in top level directory
310316
if any(file in os.listdir(path) for file in top_level_files):
311317
return bids_files
@@ -338,7 +344,7 @@ def get_bids_metadata_files(self, filepath, metadata_file_extension):
338344

339345
def scan_directory(self, directory, extension):
340346
result_files = []
341-
directory_to_ignore = ['.git']
347+
directory_to_ignore = ['.git', '.datalad', 'derivatives', 'code']
342348
with os.scandir(directory) as entries:
343349
for entry in entries:
344350
if entry.is_file() and entry.name.endswith(extension):
@@ -419,32 +425,22 @@ def resolve_bids_json(self, json_files: list):
419425
json_dict.update(json.load(f))
420426
return json_dict
421427

422-
def sfreq(self, data_filepath):
423-
json_files = self.get_bids_metadata_files(data_filepath, 'eeg.json')
424-
if len(json_files) == 0:
425-
raise ValueError('No eeg.json found')
426-
427-
metadata = self.resolve_bids_json(json_files)
428-
if 'SamplingFrequency' not in metadata:
429-
raise ValueError('SamplingFrequency not found in metadata')
430-
else:
431-
return metadata['SamplingFrequency']
432-
433-
def task(self, data_filepath):
434-
return self.get_property_from_filename('task', data_filepath)
435-
436-
def session(self, data_filepath):
437-
return self.get_property_from_filename('session', data_filepath)
438-
439-
def run(self, data_filepath):
440-
return self.get_property_from_filename('run', data_filepath)
441-
442-
def subject(self, data_filepath):
443-
return self.get_property_from_filename('sub', data_filepath)
444-
445-
def num_channels(self, data_filepath):
446-
channels_tsv = pd.read_csv(self.get_bids_metadata_files(data_filepath, 'channels.tsv')[0], sep='\t')
447-
return len(channels_tsv)
428+
def get_bids_file_attribute(self, attribute, data_filepath):
    """Resolve a short attribute name for a recording via the pybids layout.

    ``attribute`` is one of this project's short names (e.g. 'sfreq',
    'nchans'); it is translated to the corresponding BIDS entity or sidecar
    metadata key and looked up on the matching BIDSFile. Returns None when
    the attribute is unknown or the key is absent.
    """
    # project short name -> BIDS entity / sidecar metadata key
    key_by_attribute = {
        'sfreq': 'SamplingFrequency',
        'modality': 'datatype',
        'task': 'task',
        'session': 'session',
        'run': 'run',
        'subject': 'subject',
        # NOTE(review): RecordingDuration is in seconds, not a sample
        # count -- confirm callers of 'ntimes' expect a duration here.
        'ntimes': 'RecordingDuration',
        'nchans': 'EEGChannelCount',
    }
    entities = self.layout.parse_file_entities(data_filepath)
    matching_file = self.layout.get(**entities)[0]
    all_attributes = matching_file.get_entities(metadata='all')
    return all_attributes.get(key_by_attribute.get(attribute), None)
448444

449445
def channel_labels(self, data_filepath):
450446
channels_tsv = pd.read_csv(self.get_bids_metadata_files(data_filepath, 'channels.tsv')[0], sep='\t')
@@ -462,9 +458,12 @@ def num_times(self, data_filepath):
462458
def subject_participant_tsv(self, data_filepath):
    '''Get participants_tsv info of a subject based on filepath'''
    tsv_path = self.get_bids_metadata_files(data_filepath, 'participants.tsv')[0]
    participants = pd.read_csv(tsv_path, sep='\t')
    if participants.empty:
        return {}
    # set 'participant_id' as index so the subject row can be selected by label
    participants.set_index('participant_id', inplace=True)
    subject_id = f"sub-{self.get_bids_file_attribute('subject', data_filepath)}"
    # NOTE(review): raises KeyError when the subject is absent from the TSV
    return participants.loc[subject_id].to_dict()
469468

470469
def eeg_json(self, data_filepath):

0 commit comments

Comments
 (0)