PavlidisLab · rachadele · Feb 15, 2025 · Feb 15, 2025 · Feb 15, 2025 · Feb 15, 2025
diff --git a/bin/adata_functions.py b/bin/adata_functions.py
@@ -51,40 +51,46 @@ def setup(organism="homo_sapiens", version="2024-07-01"):
     return(outdir)
 
 #clean up cellxgene ontologies
-def rename_cells(obs):
-
-#if organism == "Homo sapiens":
-    mapping = dict(obs[["cell_type","cell_type_ontology_term_id"]].drop_duplicates().values)
-    # add new CL terms to mapping dict
-    mapping["L2/3 intratelencephalic projecting glutamatergic neuron"] = "CL:4030059"
-    mapping["L4/5 intratelencephalic projecting glutamatergic neuron"] = "CL:4030062"
-    mapping["L5/6 near-projecting glutamatergic neuron"] = "CL:4030067"
-    mapping["L6 intratelencephalic projecting glutamatergic neuron"] = "CL:4030065"
-
-    #change to cat.rename_categories
-    obs["cell_type"] = obs["cell_type"].replace({
-            "astrocyte of the cerebral cortex": "astrocyte",
-            "cerebral cortex endothelial cell": "endothelial cell",
-            "L2/3 intratelencephalic projecting glutamatergic neuron of the primary motor cortex": "L2/3 intratelencephalic projecting glutamatergic neuron",
-            "L4/5 intratelencephalic projecting glutamatergic neuron of the primary motor cortex": "L4/5 intratelencephalic projecting glutamatergic neuron",
-            "L5/6 near-projecting glutamatergic neuron of the primary motor cortex": "L5/6 near-projecting glutamatergic neuron",
-            "L6 intratelencephalic projecting glutamatergic neuron of the primary motor cortex": "L6 intratelencephalic projecting glutamatergic neuron"
-        })
-
-    obs["cell_type_ontology_term_id"] = obs["cell_type"].map(mapping)
+
+def rename_cells(obs, rename_file="/space/grp/rschwartz/rschwartz/cell_annotation_cortex.nf/meta/rename_cells.tsv"):
+    # Read renaming terms from file
+    rename_df = pd.read_csv(rename_file, sep="\t")
+
+    # Ensure expected columns exist
+    if not {'cell_type', 'new_cell_type', 'cell_type_ontology_term_id'}.issubset(rename_df.columns):
+        raise ValueError("Rename file must contain 'cell_type', 'new_cell_type', and 'cell_type_ontology_term_id' columns.")
+
+    # Create mapping dictionaries
+    rename_mapping = dict(zip(rename_df['cell_type'], rename_df['new_cell_type']))
+    ontology_mapping = dict(zip(rename_df['new_cell_type'], rename_df['cell_type_ontology_term_id']))
+
+    # Apply renaming
+    obs['cell_type'] = obs['cell_type'].replace(rename_mapping)
+
+    if pd.api.types.is_categorical_dtype(obs['cell_type_ontology_term_id']):
+        # Add any new categories to 'cell_type_ontology_term_id'
+        new_categories = list(set(ontology_mapping.values()) - set(obs['cell_type_ontology_term_id'].cat.categories))
+        if new_categories:
+            obs['cell_type_ontology_term_id'] = obs['cell_type_ontology_term_id'].cat.add_categories(new_categories)
+
+    # Assign new ontology IDs only for replaced cells
+    replaced_cells = obs['cell_type'].isin(rename_mapping.values())
+    obs.loc[replaced_cells, 'cell_type_ontology_term_id'] = obs.loc[replaced_cells, 'cell_type'].map(ontology_mapping)
+
     return obs
 
+
 # Subsample x cells from each cell type if there are n>x cells present
 # ensures equal representation of cell types in reference
-def subsample_cells(data, filtered_ids, subsample=500, seed=42, organism="Homo sapiens"):
+def subsample_cells(data, filtered_ids, subsample=500, seed=42, organism="Homo sapiens", rename_file="/space/grp/rschwartz/rschwartz/cell_annotation_cortex.nf/meta/rename_cells.tsv"):
     random.seed(seed)         # For `random`
     np.random.seed(seed)      # For `numpy`
     scvi.settings.seed = seed # For `scvi`
 
     # Filter data based on filtered_ids
     obs = data[data['soma_joinid'].isin(filtered_ids)]
-    
-    obs = rename_cells(obs)
+
+    obs = rename_cells(obs, rename_file=rename_file)
 
     celltypes = obs["cell_type"].unique()
     final_idx = []
@@ -166,7 +172,7 @@ def split_and_extract_data(data, split_column, subsample=500, organism=None, cen
 
     return refs
 
-def get_brain_obs(census, organism, organ="brain", primary_data=True, disease="normal"):
+def get_cellxgene_obs(census, organism, organ="brain", primary_data=True, disease="normal"):
     value_filter = (
         f"tissue_general == '{organ}' and "
         f"is_primary_data == {str(primary_data)} and "
@@ -176,34 +182,32 @@ def get_brain_obs(census, organism, organ="brain", primary_data=True, disease="n
 
 
 def get_census(census_version="2024-07-01", organism="homo_sapiens", subsample=5, assay=None, tissue=None, organ="brain",
-               ref_collections=["Transcriptomic cytoarchitecture reveals principles of human neocortex organization"," SEA-AD: Seattle Alzheimer’s Disease Brain Cell Atlas"], seed=42):
+               ref_collections=["Transcriptomic cytoarchitecture reveals principles of human neocortex organization"," SEA-AD: Seattle Alzheimer’s Disease Brain Cell Atlas"], 
+               rename_file="/space/grp/rschwartz/rschwartz/cell_annotation_cortex.nf/meta/rename_cells.tsv",seed=42):
 
     census = cellxgene_census.open_soma(census_version=census_version)
     dataset_info = census.get("census_info").get("datasets").read().concat().to_pandas()
 
-    brain_obs = brain_obs = get_brain_obs(census, organism, organ=organ, primary_data=True, disease="normal")
+    cellxgene_obs = cellxgene_obs = get_cellxgene_obs(census, organism, organ=organ, primary_data=True, disease="normal")
 
-    brain_obs = brain_obs.merge(dataset_info, on="dataset_id", suffixes=(None,"_y"))
-    brain_obs.drop(columns=['soma_joinid_y'], inplace=True)
-    brain_obs_filtered = brain_obs[brain_obs['collection_name'].isin(ref_collections)]
+    cellxgene_obs = cellxgene_obs.merge(dataset_info, on="dataset_id", suffixes=(None,"_y"))
+    cellxgene_obs.drop(columns=['soma_joinid_y'], inplace=True)
+    cellxgene_obs_filtered = cellxgene_obs[cellxgene_obs['collection_name'].isin(ref_collections)]
+    # eventually change this to filter out "restricted cell types" from passed file
+    restricted_celltypes_hs=["unknown", "glutamatergic neuron"]
+    restricted_celltypes_mmus=["unknown"]
     if organism == "homo_sapiens":
-        brain_obs_filtered = brain_obs_filtered[~brain_obs_filtered['cell_type'].isin(["unknown", "glutamatergic neuron"])] # remove non specific cells
+        cellxgene_obs_filtered = cellxgene_obs_filtered[~cellxgene_obs_filtered['cell_type'].isin(restricted_celltypes_hs)] # remove non specific cells
 
     elif organism == "mus_musculus":
-         brain_obs_filtered = brain_obs_filtered[~brain_obs_filtered['cell_type'].isin([# remove non specific cells
-                                                                                        "unknown",
-                                                                                        "hippocampal neuron", 
-                                                                                        "cortical interneuron", 
-                                                                                        "meis2 expressing cortical GABAergic cell", 
-                                                                                        "glutamatergic neuron"])]
-        # pd.DataFrame(brain_obs_filtered[["cell_type","collection_name","dataset_title"]].value_counts().reset_index()).to_csv("/space/grp/rschwartz/rschwartz/cell_annotation_cortex.nf/meta/mm_census_brain_info.tsv",sep='\t',index=False)
+         cellxgene_obs_filtered = cellxgene_obs_filtered[~cellxgene_obs_filtered['cell_type'].isin(restricted_celltypes_mmus)]
     else:
        raise ValueError("Unsupported organism")
 
     if assay:
-        brain_obs_filtered = brain_obs_filtered[brain_obs_filtered["assay"].isin(assay)]
+        cellxgene_obs_filtered = cellxgene_obs_filtered[cellxgene_obs_filtered["assay"].isin(assay)]
     if tissue:
-        brain_obs_filtered = brain_obs_filtered[brain_obs_filtered["tissue"].isin(tissue)]
+        cellxgene_obs_filtered = cellxgene_obs_filtered[cellxgene_obs_filtered["tissue"].isin(tissue)]
 
     # Adjust organism naming for compatibility
     organism_name_mapping = {
@@ -216,20 +220,20 @@ def get_census(census_version="2024-07-01", organism="homo_sapiens", subsample=5
         "assay", "cell_type", "cell_type_ontology_term_id", "tissue",
         "tissue_general", "suspension_type",
         "disease", "dataset_id", "development_stage",
-        "soma_joinid"
+        "soma_joinid", "obervation_joinid"
     ]
 
     refs = {}
 
     # Get embeddings for all data together
-    filtered_ids = brain_obs_filtered['soma_joinid'].values
+    filtered_ids = cellxgene_obs_filtered['soma_joinid'].values
     adata = extract_data(
-        brain_obs_filtered, filtered_ids,
+        cellxgene_obs_filtered, filtered_ids,
         subsample=subsample, organism=organism,
         census=census, obs_filter=None,
         cell_columns=cell_columns, dataset_info=dataset_info, seed = seed
     )
-    adata.obs=rename_cells(adata.obs) 
+    adata.obs=rename_cells(adata.obs, rename_file=rename_file) 
 
 
     # Creating the key dynamically based on tissue and assay

diff --git a/bin/get_census_adata.py b/bin/get_census_adata.py
@@ -37,9 +37,11 @@ def parse_arguments():
     parser.add_argument('--census_version', type=str, default='2024-07-01', help='Census version (e.g., 2024-07-01)')
     parser.add_argument('--ref_collections', type=str, nargs = '+', default = ["A taxonomy of transcriptomic cell types across the isocortex and hippocampal formation"]) 
     parser.add_argument('--seed', type=int, default=42)
+    parser.add_argument('--organ', type=str, default="brain")
     parser.add_argument('--assay', type=str, nargs = "+", help="Assays to use from reference", default=None)
     parser.add_argument('--tissue', type=str, nargs="+", default = None, help = "Cortex region to pull from (default: all)")
     parser.add_argument('--subsample', type=str, help="Number of cells per cell type to subsample from reference", default=500)
+    parser.add_argument('--rename_file', type=str, default="/space/grp/rschwartz/rschwartz/cell_annotation_cortex.nf/meta/rename_cells.tsv")
     if __name__ == "__main__":
         known_args, _ = parser.parse_known_args()
         return known_args
@@ -57,10 +59,11 @@ def main():
    assay = args.assay
    subsample = args.subsample
    tissue = args.tissue
-
+   organ=args.organ
+   rename_file=args.rename_file
    refs=get_census(organism=organism, 
-                     subsample=subsample, census_version=census_version, 
-                        ref_collections=ref_collections, assay=assay, tissue=tissue, seed=SEED)
+                     subsample=subsample, census_version=census_version, organ=organ,
+                        ref_collections=ref_collections, assay=assay, tissue=tissue, rename_file=rename_file, seed=SEED)
 
    print("finished fetching anndata")
    outdir="refs"

diff --git a/main.nf b/main.nf
@@ -22,62 +22,7 @@ process save_params_to_file {
     echo "ref collections: ${params.ref_collections}" >> params.txt
     """
 }
-// process parseJsonMeta {
-//     input:
-//         path study_json_file
-//     output:
-//         path study_meta_file
-//     script:
-//         """
-//         python $projectDir/bin/parse_json.py --json_file ${study_json_file}
-//         """
-// }
-//
-
-
-// process parseStudies {
-    // input:
-        // path study_meta_file
-    // output:
-        // tuple val(study_name), val(mapped_organism)
-
-    // script:
-
-
-    // """
-    // study_name = study_meta_file.split("_")[0]
-    // readLines("${study_meta_file}").first() { line ->
-    //
-        // def organism = line.split(" ")[1]
-        //
-        //
-    //
-    //  
-        // # Output the tuple
-        // echo "$study_name $organism
-    // done
-    // """
-// }
-
 
-// process getStudies {
-
-    // input:
-        // val study_name, val organism
-
-    // output:
-        // 
-        // path(${params.studies_dir}/${experiment}")
-        //
-
-    // script:
-
-    // """
-    // gemma-cli-sc getSingleCellDataMatrix -e ${study_name} \\
-    // --format mex --scale-type count --use-ensembl-ids \\
-    // -o /space/scratch/gemma-single-cell-data-ensembl-id/${organism}/${study_name}
-    // """
-// }
 
 process runSetup {
     //conda '/home/rschwartz/anaconda3/envs/scanpyenv'
@@ -127,6 +72,7 @@ process getCensusAdata {
     val census_version
     val subsample_ref
     val ref_collections
+    val organ
 
     output:
     path "refs/*.h5ad", emit: ref_paths_adata
@@ -137,9 +83,11 @@ process getCensusAdata {
     # Run the python script to generate the files
     python $projectDir/bin/get_census_adata.py \\
         --organism ${organism} \\
+        --organ ${organ} \\
         --census_version ${census_version} \\
         --subsample_ref ${subsample_ref} \\
         --ref_collections ${ref_collections} \\
+        --rename_file ${params.rename_file} \\
         --seed ${params.seed}
 
     # After running the python script, all .h5ad files will be saved in the refs/ directory inside a work directory
@@ -217,7 +165,7 @@ workflow {
     ref_collections = params.ref_collections.collect { "\"${it}\"" }.join(' ')
 
     // Get reference data and save to files
-    getCensusAdata(params.organism, params.census_version, params.subsample_ref, ref_collections)
+    getCensusAdata(params.organism, params.census_version, params.subsample_ref, ref_collections, params.organ)
     getCensusAdata.out.ref_paths_adata.flatten()
     .set { ref_paths_adata }
 

diff --git a/meta/rename_cells.tsv b/meta/rename_cells.tsv
@@ -0,0 +1,10 @@
+cell_type	new_cell_type	cell_type_ontology_term_id
+astrocyte of the cerebral cortex	astrocyte	CL:0000127
+cerebral cortex endothelial cell	endothelial cell	CL:0000115
+L2/3 intratelencephalic projecting glutamatergic neuron of the primary motor cortex	L2/3 intratelencephalic projecting glutamatergic neuron	CL:4030059
+L4/5 intratelencephalic projecting glutamatergic neuron of the primary motor cortex	L4/5 intratelencephalic projecting glutamatergic neuron	CL:4030062
+L5/6 near-projecting glutamatergic neuron of the primary motor cortex	L5/6 near-projecting glutamatergic neuron	CL:4030067
+L6 intratelencephalic projecting glutamatergic neuron of the primary motor cortex	L6 intratelencephalic projecting glutamatergic neuron	CL:4030065
+central nervous system macrophage	macrophage	CL:0000235
+vascular leptomeningeal cell (Mmus)	vascular leptomeningeal cell	CL:4023051
+cortical interneuron	corticothalamic-projecting glutamatergic cortical neuron	CL:4023013
diff --git a/nextflow.config b/nextflow.config
@@ -9,7 +9,8 @@ params.subsample_ref = 50
 params.studies_dir = "/space/scratch/gemma-single-cell-data-ensembl-id/mus_musculus"
 params.ref_collections = ["A taxonomy of transcriptomic cell types across the isocortex and hippocampal formation"]
 params.outdir = "$projectDir/results/${params.organism}_subsample_ref_${params.subsample_ref}_${java.time.LocalDateTime.now().format(java.time.format.DateTimeFormatter.ofPattern('yyyy-MM-dd_HH-mm-ss'))}"
-
+params.organ="brain"
+params.rename_file="$projectDir/meta/rename_cells.tsv"
 process {
   cache = 'standard'  // Options: 'standard' (default), 'deep', 'lenient', or 'false'
   executor = 'local'

diff --git a/params.hs.json b/params.hs.json
@@ -8,5 +8,6 @@
     "ref_collections": [
         "Transcriptomic cytoarchitecture reveals principles of human neocortex organization", 
         "SEA-AD: Seattle Alzheimer’s Disease Brain Cell Atlas"
-    ]
+    ],
+    "organ": "brain"
 }
diff --git a/params.mm.json b/params.mm.json
@@ -9,5 +9,6 @@
         "A taxonomy of transcriptomic cell types across the isocortex and hippocampal formation",
         "An integrated transcriptomic and epigenomic atlas of mouse primary motor cortex cell types",
         "Adult mouse cortical cell taxonomy revealed by single cell transcriptomics"
-    ]
+    ],
+    "organ": "brain"
 }