Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions data/make_subject_specific_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import random
from sklearn.model_selection import train_test_split
import shutil
import numpy as np

log = logging.getLogger(__name__)

Expand All @@ -19,16 +20,18 @@ def main(cfg: DictConfig) -> None:
log.info(f'Working directory {os.getcwd()}')

data_path = cfg.data_prep.data_path


manifest_path = os.path.join(data_path, "manifest.tsv")
manifest_path = os.path.join(data_path, f"subject_manifests/{cfg.data_prep.subj}/manifest.tsv")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems fine, but any reason for this change?

assert os.path.exists(manifest_path)
manifest = []
with open(manifest_path) as fd:
rd = csv.reader(fd, delimiter="\t", quotechar='"')
for row in rd:
manifest.append(row)
if row[1] == cfg.data_prep.subj:
manifest.append(row)

label_path = os.path.join(data_path, "labels.tsv")
label_path = os.path.join(data_path, f"subject_labels/{cfg.data_prep.subj}/labels.tsv")
assert os.path.exists(label_path)
labels = []
with open(label_path) as fd:
Expand All @@ -38,22 +41,23 @@ def main(cfg: DictConfig) -> None:

out_path = cfg.data_prep.out_path
Path(out_path).mkdir(exist_ok=True, parents=True)
src = os.path.join(data_path, "localization")
src = os.path.join(data_path, f"subject_metadata/{cfg.data_prep.subj}/localization")
dest = os.path.join(out_path, "localization")
if not os.path.exists(dest):
shutil.copytree(src, dest)

src = os.path.join(data_path, "all_ordered_electrodes.json")
src = os.path.join(data_path, f"subject_metadata/{cfg.data_prep.subj}/all_ordered_electrodes.json")
dest = os.path.join(out_path, "all_ordered_electrodes.json")
if not os.path.exists(dest):
shutil.copy(src, dest)


new_manifest, new_labels = [], []
for manifest_record, labels_record in zip(manifest, labels):
if manifest_record[1] == cfg.data_prep.subj:
new_manifest.append(manifest_record)
new_labels.append(labels_record)

manifest_path = os.path.join(out_path, "manifest.tsv")
with open(manifest_path, 'w', newline='') as tsvfile:
writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
Expand All @@ -67,6 +71,4 @@ def main(cfg: DictConfig) -> None:
writer.writerow(record)

if __name__ == "__main__":
main()


main()
19 changes: 17 additions & 2 deletions data/multi_electrode_subj_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,21 @@ def get_subj_data(self):
electrodes = reader.selected_electrodes
localization_df = trial_data.get_brain_region_localization_df()

seeg_data = np.concatenate(seeg_data, axis=1)
return seeg_data, trials, electrodes, localization_df
# Determine total size
total_words = sum(arr.shape[1] for arr in seeg_data)
n_electrodes, _, n_samples = seeg_data[0].shape

neural_data_path = "neural_data_memmap.dat"
neural_data = np.memmap(neural_data_path, dtype=seeg_data[0].dtype,
mode='w+', shape=(n_electrodes, total_words, n_samples))

# Fill from seeg_data
current_idx = 0
for arr in seeg_data:
word_count = arr.shape[1]
neural_data[:, current_idx:current_idx + word_count, :] = arr
current_idx += word_count
del arr

return neural_data, trials, electrodes, localization_df

38 changes: 36 additions & 2 deletions data/speech_nonspeech_subject_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,24 @@ def get_subj_data(self, subject):

electrodes = reader.selected_electrodes
localization_df = trial_data.get_brain_region_localization_df()
neural_data = np.concatenate(seeg_data, axis=1)

# neural_data = np.concatenate(seeg_data, axis=1)

total_words = sum(arr.shape[1] for arr in seeg_data)
n_electrodes, _, n_samples = seeg_data[0].shape

neural_data_path = "neural_data_memmap.dat"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Since the memmap path `neural_data_memmap.dat` is fixed, could one run overwrite the file while another run is still using it? Maybe we can give each run a unique filename (e.g. via `tempfile` or a run ID)?

  2. We may also want to delete this file after the run finishes, so the memmaps don't accumulate on disk.

neural_data = np.memmap(neural_data_path, dtype=seeg_data[0].dtype,
mode='w+', shape=(n_electrodes, total_words, n_samples))

# Fill from seeg_data
current_idx = 0
for arr in seeg_data:
word_count = arr.shape[1]
neural_data[:, current_idx:current_idx + word_count, :] = arr
current_idx += word_count
del arr

labels_df = pd.concat(words) #NOTE the index will not be unique, but the location will
#TODO: pretty sure we are missing the get_subj_data method here
return labels_df, neural_data, trials, electrodes, localization_df
Expand Down Expand Up @@ -79,7 +96,24 @@ def get_subj_data(self, subject):

electrodes = reader.selected_electrodes
localization_df = trial_data.get_brain_region_localization_df()
neural_data = np.concatenate(seeg_data, axis=1)

# neural_data = np.concatenate(seeg_data, axis=1)

total_words = sum(arr.shape[1] for arr in seeg_data)
n_electrodes, _, n_samples = seeg_data[0].shape

neural_data_path = "neural_data_memmap.dat"
neural_data = np.memmap(neural_data_path, dtype=seeg_data[0].dtype,
mode='w+', shape=(n_electrodes, total_words, n_samples))

# Fill from seeg_data
current_idx = 0
for arr in seeg_data:
word_count = arr.shape[1]
neural_data[:, current_idx:current_idx + word_count, :] = arr
current_idx += word_count
del arr

labels_df = pd.concat(words) #NOTE the index will not be unique, but the location will
#TODO: pretty sure we are missing the get_subj_data method here
return labels_df, neural_data, trials, electrodes, localization_df
25 changes: 23 additions & 2 deletions data/subject_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,28 @@ def get_subj_data(self, subject, index_subsample=None):
#get electrode labels
electrodes = reader.selected_electrodes
localization_df = trial_data.get_brain_region_localization_df()
neural_data = np.concatenate(seeg_data, axis=1)
#neural_data is [n_electrodes, n_words, n_samples]

# neural_data = np.concatenate(seeg_data, axis=1)
# neural_data is [n_electrodes, n_words, n_samples]

# Determine total size
total_words = sum(arr.shape[1] for arr in seeg_data)
n_electrodes, _, n_samples = seeg_data[0].shape

neural_data_path = "neural_data_memmap.dat"
neural_data = np.memmap(neural_data_path, dtype=seeg_data[0].dtype,
mode='w+', shape=(n_electrodes, total_words, n_samples))

# Fill from seeg_data
current_idx = 0
for arr in seeg_data:
word_count = arr.shape[1]
neural_data[:, current_idx:current_idx + word_count, :] = arr
current_idx += word_count
del arr

# You can read from this later like a regular array:
# neural_data = np.memmap(neural_data_path, dtype=np.float32, mode='r', shape=(...))

words_df = pd.concat(words) #NOTE the index will not be unique, but the location will
return words_df, neural_data, trials, electrodes, localization_df
Loading