update 1 file and create 2 files: add data_postprocessing script

chadHGY · chadHGY · commit 1b0c5be9f5e2 · 2024-06-13T16:43:43.000+02:00
diff --git a/README.md b/README.md
@@ -33,19 +33,24 @@ conda activate cam
 pip install -r requirements.txt
 ```
 
-3. Data Preprocessing:
-Please make sure you have gone through [FreeSurfer's](https://surfer.nmr.mgh.harvard.edu/fswiki/recon-all) `recon-all pipeline` to extract the cortical surface features. The surface features should be found under each subject's `surf` directory.
+# Data
+To easily demonstrate the usage of CAM, we provide a toy dataset in the `data` directory. The toy dataset contains 10 subjects from the [IXI dataset](https://brain-development.org/ixi-dataset/). For each subject, we extract 4 cortical surface features using FreeSurfer (Curvature, Sulci, Thickness, Volume).
 
+1. Data Preprocessing:
+Please make sure you have gone through [FreeSurfer's](https://surfer.nmr.mgh.harvard.edu/fswiki/recon-all) `recon-all pipeline` to extract the cortical surface features. The surface features should be found under each subject's `surf` directory. You can find the already processed data in the `data/freesurfer` directory.
 
-4. Data Postprocessing:
+2. Data Postprocessing:
 Here we provide a simple script to convert the surface features to a numpy array. 
 ```bash
-python data_postprocessing.py --data_dir /path/to/your/freesurfer/output --output_dir /path/to/your/postprocessed/data
-```
+# training set
+python src/data_postprocessing.py --freesurfer_dir data/freesurfer/ --subject_list data/train_subjects.txt  --output_dir data/sphere/train/ --in_ch thickness volume curv sulc --annot_file aparc --hemi lh
 
+# validation set
+python src/data_postprocessing.py --freesurfer_dir data/freesurfer/ --subject_list data/val_subjects.txt  --output_dir data/sphere/val/ --in_ch thickness volume curv sulc --annot_file aparc --hemi lh
 
-# Data
-To easily demonstrate the usage of CAM, we provide a toy dataset in the `data` directory. The toy dataset contains 10 subjects, each with 3 cortical surface features extracted by FreeSurfer (Thickness, Sulc, Curvature).
+# testing set
+python src/data_postprocessing.py --freesurfer_dir data/freesurfer/ --subject_list data/test_subjects.txt  --output_dir data/sphere/test/ --in_ch thickness volume curv sulc --annot_file aparc --hemi lh
+```
 
 
 # Training
@@ -73,4 +78,7 @@ If you find this repository useful for your research, please use the following.
 
 # Acknowledgments/References
 1. IXI data: https://brain-development.org/ixi-dataset/
+2. Sphere postprocessing code borrowed from:
+    - [surface-vision-transformers](https://github.com/metrics-lab/surface-vision-transformers)
+    - [SPHARM-Net](https://github.com/Shape-Lab/SPHARM-Net)
 3. We would like to thank all participants in this study, who made this work possible. This work was supported by the German Research Foundation (DFG) Emmy Noether with reference 513851350 (TW), the Cluster of Excellence with reference 390727645 (TW) and the BMBF-funded de.NBI Cloud within the German Network for Bioinformatics Infrastructure (de.NBI) (031A532B, 031A533A, 031A533B, 031A534A, 031A535A, 031A537A, 031A537B, 031A537C, 031A537D, 031A538A).
diff --git a/src/data_postprocessing.py b/src/data_postprocessing.py
@@ -0,0 +1,156 @@
+"""
+https://github.com/metrics-lab/surface-vision-transformers/blob/main/tools/preprocessing.py
+
+triangle_indices_ico_6_sub_ico_1 -> ico6_80_561
+    num_patches: 80 
+    num_vertices: 561 
+
+triangle_indices_ico_6_sub_ico_2 -> ico6_320_153 
+    num_patches: 320
+    num_vertices: 153 
+"""
+
+# %% import
+import argparse
+
+import joblib
+import pandas as pd
+import pyrootutils
+from joblib import Parallel, delayed
+from tqdm import tqdm
+
+pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
+from src.utils.feature_extract import RunningStats, get_patch_data
+from src.utils.spharmnet.lib.io import read_mesh
+
+# %% args
+# ------------------------------------------------------------------------------
+# Command-line interface of the postprocessing script.
+# NOTE: the parser is built and evaluated at module level, so importing this
+# module already parses sys.argv.
+parser = argparse.ArgumentParser(description="Extract sphere from freesurfer")
+
+# paths
+# NOTE(review): main() in this file never reads args.freesurfer_dir; subject
+# folders are resolved from the entries of --subject_list relative to the
+# project root. Confirm whether this option is still needed.
+parser.add_argument(
+    "--freesurfer_dir",
+    type=str,
+    default="data/freesurfer/",
+    help="Path to FreeSurfer output directory",
+)
+# Text file read line by line by main(); each line is joined onto the project root.
+parser.add_argument(
+    "--subject_list",
+    type=str,
+    default="data/train_subjects.txt",
+    help="List of subjects to process",
+)
+# Mesh file of the target sphere, loaded by main() with read_mesh().
+parser.add_argument(
+    "--ico6_sphere_path",
+    type=str,
+    default="src/utils/ico6.vtk",
+    help="Path to ico6 sphere",
+)
+
+# Directory (relative to the project root) receiving one <subject>.pkl per
+# subject plus stats.pkl; main() creates it if missing.
+parser.add_argument(
+    "--output_dir",
+    type=str,
+    default="data/sphere/train/",
+    help="FreeSurfer sphere output directory",
+)
+
+# features
+# nargs="+" collects one or more feature names into a list of strings.
+parser.add_argument(
+    "--in_ch",
+    type=str,
+    default=["thickness", "volume", "curv", "sulc"],
+    nargs="+",
+    help="List of geometry to process",
+)
+parser.add_argument(
+    "--annot_file",
+    type=str,
+    default="aparc",
+    choices=["aparc", "aparc.a2009s"],
+    help="Manual labels (e.g. aparc for ?h.aparc.annot)",
+)
+parser.add_argument(
+    "--hemi",
+    type=str,
+    default="lh",
+    choices=["lh", "rh"],
+    help="Hemisphere for data generation",
+)
+# Forwarded unchanged to joblib.Parallel(n_jobs=...) in main().
+parser.add_argument(
+    "--n_jobs",
+    type=int,
+    default=-1,
+    help="# of CPU n_jobs for parallel data generation",
+)
+# parse_known_args() tolerates unrecognised arguments: they are collected in
+# `unknown`, which is not used anywhere else in the visible code
+# (presumably so the "# %%" cells also run inside an interactive kernel — TODO confirm).
+args, unknown = parser.parse_known_args()
+
+
+# %% main
+# ------------------------------------------------------------------------------
+def main(args):
+    """Convert the FreeSurfer surface data of every listed subject into patch pickles.
+
+    Each subject is processed by ``get_patch_data`` (in parallel through
+    joblib) and the result is written to ``<output_dir>/<subject>.pkl``.
+    The per-channel mean/std over all successfully processed subjects is
+    written to ``<output_dir>/stats.pkl``.
+
+    Args:
+        args: Parsed command-line namespace. The attributes ``output_dir``,
+            ``subject_list``, ``ico6_sphere_path``, ``in_ch``, ``annot_file``,
+            ``hemi`` and ``n_jobs`` are read.
+
+    Raises:
+        RuntimeError: If not a single subject could be processed, because
+            there is nothing to compute the statistics from.
+    """
+    # init
+    proj_root_dir = pyrootutils.find_root()
+    out_dir = proj_root_dir / args.output_dir
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    # load subject list
+    # Blank lines are skipped: joined onto the project root they would resolve
+    # to the root directory itself and be processed as a bogus subject.
+    with open(proj_root_dir / args.subject_list, "r") as f:
+        subjects = [proj_root_dir / line.strip() for line in f if line.strip()]
+
+    # load ico mesh & triangle indices
+    ico_v, _ = read_mesh(
+        str(proj_root_dir / args.ico6_sphere_path)
+    )  # ico_v: ico vertices (40962, 3)
+    patch_ids_path = proj_root_dir / "src/utils/ico6_320_153.csv"
+    triangle_mesh_indices = pd.read_csv(patch_ids_path)
+
+    # extract feature
+    # ------------------------------------------------------------------------------
+    print(f"Extractiing {args.subject_list}: {args.in_ch}")
+    sphere_data = Parallel(n_jobs=args.n_jobs)(
+        delayed(get_patch_data)(
+            ico_v=ico_v,
+            triangle_mesh_indices=triangle_mesh_indices,
+            in_ch=args.in_ch,
+            annot_file=args.annot_file,
+            sub=sub,
+            hemi=args.hemi,
+        )
+        for sub in tqdm(subjects, desc=f"{args.hemi}")
+    )
+
+    # store sphere data & phenotypic data in pkl file
+    # ------------------------------------------------------------------------------
+    running_stats = {channel: RunningStats() for channel in args.in_ch}
+    failed_subjects = []
+    n_saved = 0
+    # Results come back in input order, so they can be paired with `subjects`
+    # to name the ones that failed.
+    for sub_path, (sub_folder, feat_patches, roi_anno, structure_map) in zip(
+        subjects, sphere_data
+    ):
+        # get_patch_data signals a failure by returning None for every field;
+        # such subjects are skipped instead of crashing on `None.name`.
+        if sub_folder is None:
+            failed_subjects.append(sub_path.name)
+            continue
+
+        # save to pkl file
+        sub = sub_folder.name
+        pkl_file = f"{out_dir}/{sub}.pkl"
+        joblib.dump(
+            {
+                "feat_patches": feat_patches,
+                "roi_anno": roi_anno,
+                "structure_map": structure_map,
+            },
+            pkl_file,
+        )
+        n_saved += 1
+
+        # update running stats (mean, std) for each channel
+        for channel in args.in_ch:
+            running_stats[channel].update(feat_patches[channel])
+
+    if failed_subjects:
+        print(
+            f"Skipped {len(failed_subjects)} subject(s) that could not be processed: "
+            f"{failed_subjects}"
+        )
+    if n_saved == 0:
+        # Without this guard the statistics below would divide by zero.
+        raise RuntimeError(
+            f"No subject from {args.subject_list} could be processed; stats.pkl was not written."
+        )
+
+    running_stats = {
+        channel: {
+            "mean": running_stats[channel].get_mean(),
+            "std": running_stats[channel].get_std(),
+        }
+        for channel in args.in_ch
+    }
+    print(f"Running stats: {running_stats}")
+    joblib.dump(running_stats, f"{out_dir}/stats.pkl")
+
+
+# %% main
+if __name__ == "__main__":
+    main(args)
diff --git a/src/utils/feature_extract.py b/src/utils/feature_extract.py
@@ -0,0 +1,153 @@
+#%% import
+import pyrootutils
+import numpy as np
+import traceback
+import pandas as pd
+import os
+
+pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
+from src.utils.spharmnet.lib.io import read_annot, read_feat, read_mesh
+from src.utils.spharmnet.lib.sphere import TriangleSearch
+
+
+# %% extract patch data from surface
+def get_patch_data(
+    ico_v: np.ndarray,
+    triangle_mesh_indices: pd.DataFrame,
+    in_ch: "list[str] | tuple[str, ...]" = ("area", "sphere", "thickness", "volume", "curv", "sulc", "inflated.H"),
+    annot_file: str = "aparc",
+    sub: "str | os.PathLike" = "data/freesurfer/sub-IXI031",
+    hemi: str = "lh",
+) -> "tuple[Path | None, dict[str, np.ndarray] | None, np.ndarray | None, list | None]":
+    """Resample one subject's surface features and labels onto the ico sphere and cut them into patches.
+
+    Args:
+        ico_v: Vertex coordinates of the target sphere; they are queried
+            against the subject's native ``<hemi>.sphere`` mesh.
+        triangle_mesh_indices: Table whose ``.values`` index the resampled
+            vertices to build the patches (the indexed result is transposed).
+        in_ch: Feature names; each one is read from ``<sub>/surf/<hemi>.<name>``.
+        annot_file: Annotation name; read from
+            ``<sub>/label/<hemi>.<annot_file>.annot``.
+        sub: Subject directory (string or path-like).
+        hemi: Hemisphere prefix used in all file names.
+
+    Returns:
+        ``(sub, feat_patches, label_patches, structure_map)`` where ``sub`` is
+        the subject directory as a ``pathlib.Path``, ``feat_patches`` maps each
+        feature name to its patch array, ``label_patches`` holds the patch-wise
+        label indices and ``structure_map`` is a list of
+        ``(index, structure name)`` pairs.
+        On any error the problem is printed and ``(None, None, None, None)``
+        is returned instead of raising.
+    """
+    from pathlib import Path  # function-scope import keeps this block self-contained
+
+    def _fail(message, show_traceback=True):
+        # Report the problem for this subject and hand back the failure sentinel.
+        print(f"\tsub: {sub} | {message}")
+        if show_traceback:
+            traceback.print_exc()
+        return None, None, None, None
+
+    # paths
+    # Coerce to Path so that plain strings (including the default) support "/".
+    sub = Path(sub)
+    surf_dir = sub / "surf"
+    label_dir = sub / "label"
+
+    # load native sphere
+    # ------------------------------------------------------------------------------
+    try:
+        sphere_path = os.path.join(surf_dir, hemi + "." + "sphere")
+        native_v, native_f = read_mesh(sphere_path)
+    except FileNotFoundError:
+        return _fail(f"Error: File {sphere_path} not found.", show_traceback=False)
+    except Exception as e:
+        return _fail(f"An error occurred while reading the mesh:\n{e}")
+    try:
+        # For every ico vertex: index of a native triangle plus one weight per corner
+        # (presumably the enclosing triangle and barycentric weights — see TriangleSearch).
+        tree = TriangleSearch(native_v, native_f)
+        triangle_idx, bary_coeff = tree.query(ico_v)
+    except Exception as e:
+        return _fail(f"An error occurred during triangle search and query:\n{e}")
+
+    # extract sphere features
+    # ------------------------------------------------------------------------------
+    try:
+        feat_patches = {feat_name: None for feat_name in in_ch}
+        for feat_name in in_ch:
+            # load surface feature
+            feat_path = os.path.join(surf_dir, hemi + "." + feat_name)
+            try:
+                feat = read_feat(feat_path)  # one value per native vertex
+            except Exception as feat_read_error:
+                return _fail(f"Error reading feature '{feat_name}' from '{feat_path}': {feat_read_error}")
+
+            # remesh surf feature: native vertices -> ico vertices
+            try:
+                # Gather the feature at the triangle corners, weight by bary_coeff
+                # and sum over the corner axis.
+                feat_remesh = np.multiply(feat[native_f[triangle_idx]], bary_coeff).sum(-1)
+                # Explicit check instead of `assert`, which is stripped under `python -O`.
+                if feat_remesh.shape[0] != ico_v.shape[0]:
+                    raise ValueError("feat_remesh.shape[0] != ico_v.shape[0]")
+            except Exception as feat_processing_error:
+                return _fail(f"Error processing feature '{feat_name}': {feat_processing_error}")
+
+            # extract triangle patches
+            try:
+                feat_patches[feat_name] = feat_remesh[triangle_mesh_indices.values].T  # num_patches x num_vertices
+            except Exception as feat_extract_error:
+                return _fail(f"Error extracting feature '{feat_name}': {feat_extract_error}")
+
+    except Exception as e:
+        return _fail(f"An error occurred during feature extraction:\n{e}")
+
+    # extract labels
+    # ------------------------------------------------------------------------------
+    try:
+        # load annotation
+        num_vert = native_v.shape[0]
+        label_arr = np.zeros(num_vert, dtype=np.int16)
+        annot = os.path.join(label_dir, hemi + "." + annot_file + ".annot")
+        try:
+            # vertices: vertex indices, label: per-vertex label IDs,
+            # structure_ls: structure names, structure_id_ls: structure IDs
+            vertices, label, structure_ls, structure_id_ls = read_annot(annot)
+        except Exception as annot_read_error:
+            return _fail(f"Error reading annotation from '{annot}': {annot_read_error}")
+
+        # remesh roi label: native vertices -> ico vertices
+        try:
+            # Replace each label ID by its position in structure_id_ls; unknown IDs become 0.
+            label = [structure_id_ls.index(lab) if lab in structure_id_ls else 0 for lab in label]
+            label_arr[vertices] = label
+            # Labels cannot be interpolated: take the label of the triangle corner
+            # with the largest weight.
+            label_remesh = label_arr[native_f[triangle_idx, np.argmax(bary_coeff, axis=1)]]
+            if label_remesh.shape[0] != ico_v.shape[0]:
+                raise ValueError("label_remesh.shape[0] != ico_v.shape[0]")
+        except Exception as label_processing_error:
+            return _fail(f"Error processing label: {label_processing_error}")
+
+        # extract triangle patches
+        try:
+            label_remesh = label_remesh[triangle_mesh_indices.values].T  # num_patches x num_vertices
+        except Exception as label_extract_error:
+            return _fail(f"Error extracting label: {label_extract_error}")
+
+    except Exception as e:
+        return _fail(f"An error occurred during label extraction:\n{e}")
+
+    # extract structure map
+    structure_map = list(enumerate(structure_ls))
+    return sub, feat_patches, label_remesh, structure_map
+
+
+#%% calculate running stats
+class RunningStats:
+    """Accumulate per-sample moments to derive an overall mean and standard deviation.
+
+    Every ``update`` call contributes the mean and the mean of squares of one
+    sample with equal weight. The result therefore matches the element-wise
+    statistics of the pooled data only when all samples have the same number
+    of elements.
+    """
+
+    def __init__(self):
+        self.N = 0  # number of samples passed to update()
+        self.mean = 0.0  # running SUM of per-sample means (divided by N on read)
+        self.M2 = 0.0  # running SUM of per-sample means of squares
+
+    def update(self, data):
+        """Add one sample (any array-like of numbers) to the statistics."""
+        # Work in float64: squaring in the input dtype overflows for small
+        # integer types and loses precision for float32.
+        data = np.asarray(data, dtype=np.float64)
+        self.N += 1
+        self.mean += np.mean(data)
+        self.M2 += np.mean(data**2)
+
+    def get_mean(self):
+        """Return the average of the per-sample means.
+
+        Raises ZeroDivisionError if update() has never been called.
+        """
+        return self.mean / self.N
+
+    def get_std(self):
+        """Return the standard deviation computed as sqrt(E[x^2] - E[x]^2).
+
+        Raises ZeroDivisionError if update() has never been called.
+        """
+        mean = self.mean / self.N
+        m2 = self.M2 / self.N
+        # Rounding can push the variance slightly below zero for (near-)constant
+        # data; clamp so that sqrt does not return NaN.
+        return np.sqrt(max(m2 - mean**2, 0.0))
+# %%