Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Development #12

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 48 additions & 44 deletions bin/adata_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,40 +51,46 @@ def setup(organism="homo_sapiens", version="2024-07-01"):
return(outdir)

#clean up cellxgene ontologies
def rename_cells(obs):

#if organism == "Homo sapiens":
mapping = dict(obs[["cell_type","cell_type_ontology_term_id"]].drop_duplicates().values)
# add new CL terms to mapping dict
mapping["L2/3 intratelencephalic projecting glutamatergic neuron"] = "CL:4030059"
mapping["L4/5 intratelencephalic projecting glutamatergic neuron"] = "CL:4030062"
mapping["L5/6 near-projecting glutamatergic neuron"] = "CL:4030067"
mapping["L6 intratelencephalic projecting glutamatergic neuron"] = "CL:4030065"

#change to cat.rename_categories
obs["cell_type"] = obs["cell_type"].replace({
"astrocyte of the cerebral cortex": "astrocyte",
"cerebral cortex endothelial cell": "endothelial cell",
"L2/3 intratelencephalic projecting glutamatergic neuron of the primary motor cortex": "L2/3 intratelencephalic projecting glutamatergic neuron",
"L4/5 intratelencephalic projecting glutamatergic neuron of the primary motor cortex": "L4/5 intratelencephalic projecting glutamatergic neuron",
"L5/6 near-projecting glutamatergic neuron of the primary motor cortex": "L5/6 near-projecting glutamatergic neuron",
"L6 intratelencephalic projecting glutamatergic neuron of the primary motor cortex": "L6 intratelencephalic projecting glutamatergic neuron"
})

obs["cell_type_ontology_term_id"] = obs["cell_type"].map(mapping)

def rename_cells(obs, rename_file="/space/grp/rschwartz/rschwartz/cell_annotation_cortex.nf/meta/rename_cells.tsv"):
# Read renaming terms from file
rename_df = pd.read_csv(rename_file, sep="\t")

# Ensure expected columns exist
if not {'cell_type', 'new_cell_type', 'cell_type_ontology_term_id'}.issubset(rename_df.columns):
raise ValueError("Rename file must contain 'cell_type', 'new_cell_type', and 'cell_type_ontology_term_id' columns.")

# Create mapping dictionaries
rename_mapping = dict(zip(rename_df['cell_type'], rename_df['new_cell_type']))
ontology_mapping = dict(zip(rename_df['new_cell_type'], rename_df['cell_type_ontology_term_id']))

# Apply renaming
obs['cell_type'] = obs['cell_type'].replace(rename_mapping)

if pd.api.types.is_categorical_dtype(obs['cell_type_ontology_term_id']):
# Add any new categories to 'cell_type_ontology_term_id'
new_categories = list(set(ontology_mapping.values()) - set(obs['cell_type_ontology_term_id'].cat.categories))
if new_categories:
obs['cell_type_ontology_term_id'] = obs['cell_type_ontology_term_id'].cat.add_categories(new_categories)

# Assign new ontology IDs only for replaced cells
replaced_cells = obs['cell_type'].isin(rename_mapping.values())
obs.loc[replaced_cells, 'cell_type_ontology_term_id'] = obs.loc[replaced_cells, 'cell_type'].map(ontology_mapping)

return obs


# Subsample x cells from each cell type if there are n>x cells present
# ensures equal representation of cell types in reference
def subsample_cells(data, filtered_ids, subsample=500, seed=42, organism="Homo sapiens"):
def subsample_cells(data, filtered_ids, subsample=500, seed=42, organism="Homo sapiens", rename_file="/space/grp/rschwartz/rschwartz/cell_annotation_cortex.nf/meta/rename_cells.tsv"):
random.seed(seed) # For `random`
np.random.seed(seed) # For `numpy`
scvi.settings.seed = seed # For `scvi`

# Filter data based on filtered_ids
obs = data[data['soma_joinid'].isin(filtered_ids)]
obs = rename_cells(obs)

obs = rename_cells(obs, rename_file=rename_file)

celltypes = obs["cell_type"].unique()
final_idx = []
Expand Down Expand Up @@ -166,7 +172,7 @@ def split_and_extract_data(data, split_column, subsample=500, organism=None, cen

return refs

def get_brain_obs(census, organism, organ="brain", primary_data=True, disease="normal"):
def get_cellxgene_obs(census, organism, organ="brain", primary_data=True, disease="normal"):
value_filter = (
f"tissue_general == '{organ}' and "
f"is_primary_data == {str(primary_data)} and "
Expand All @@ -176,34 +182,32 @@ def get_brain_obs(census, organism, organ="brain", primary_data=True, disease="n


def get_census(census_version="2024-07-01", organism="homo_sapiens", subsample=5, assay=None, tissue=None, organ="brain",
ref_collections=["Transcriptomic cytoarchitecture reveals principles of human neocortex organization"," SEA-AD: Seattle Alzheimer’s Disease Brain Cell Atlas"], seed=42):
ref_collections=["Transcriptomic cytoarchitecture reveals principles of human neocortex organization"," SEA-AD: Seattle Alzheimer’s Disease Brain Cell Atlas"],
rename_file="/space/grp/rschwartz/rschwartz/cell_annotation_cortex.nf/meta/rename_cells.tsv",seed=42):

census = cellxgene_census.open_soma(census_version=census_version)
dataset_info = census.get("census_info").get("datasets").read().concat().to_pandas()

brain_obs = brain_obs = get_brain_obs(census, organism, organ=organ, primary_data=True, disease="normal")
cellxgene_obs = cellxgene_obs = get_cellxgene_obs(census, organism, organ=organ, primary_data=True, disease="normal")

brain_obs = brain_obs.merge(dataset_info, on="dataset_id", suffixes=(None,"_y"))
brain_obs.drop(columns=['soma_joinid_y'], inplace=True)
brain_obs_filtered = brain_obs[brain_obs['collection_name'].isin(ref_collections)]
cellxgene_obs = cellxgene_obs.merge(dataset_info, on="dataset_id", suffixes=(None,"_y"))
cellxgene_obs.drop(columns=['soma_joinid_y'], inplace=True)
cellxgene_obs_filtered = cellxgene_obs[cellxgene_obs['collection_name'].isin(ref_collections)]
# eventually change this to filter out "restricted cell types" from passed file
restricted_celltypes_hs=["unknown", "glutamatergic neuron"]
restricted_celltypes_mmus=["unknown"]
if organism == "homo_sapiens":
brain_obs_filtered = brain_obs_filtered[~brain_obs_filtered['cell_type'].isin(["unknown", "glutamatergic neuron"])] # remove non specific cells
cellxgene_obs_filtered = cellxgene_obs_filtered[~cellxgene_obs_filtered['cell_type'].isin(restricted_celltypes_hs)] # remove non specific cells

elif organism == "mus_musculus":
brain_obs_filtered = brain_obs_filtered[~brain_obs_filtered['cell_type'].isin([# remove non specific cells
"unknown",
"hippocampal neuron",
"cortical interneuron",
"meis2 expressing cortical GABAergic cell",
"glutamatergic neuron"])]
# pd.DataFrame(brain_obs_filtered[["cell_type","collection_name","dataset_title"]].value_counts().reset_index()).to_csv("/space/grp/rschwartz/rschwartz/cell_annotation_cortex.nf/meta/mm_census_brain_info.tsv",sep='\t',index=False)
cellxgene_obs_filtered = cellxgene_obs_filtered[~cellxgene_obs_filtered['cell_type'].isin(restricted_celltypes_mmus)]
else:
raise ValueError("Unsupported organism")

if assay:
brain_obs_filtered = brain_obs_filtered[brain_obs_filtered["assay"].isin(assay)]
cellxgene_obs_filtered = cellxgene_obs_filtered[cellxgene_obs_filtered["assay"].isin(assay)]
if tissue:
brain_obs_filtered = brain_obs_filtered[brain_obs_filtered["tissue"].isin(tissue)]
cellxgene_obs_filtered = cellxgene_obs_filtered[cellxgene_obs_filtered["tissue"].isin(tissue)]

# Adjust organism naming for compatibility
organism_name_mapping = {
Expand All @@ -216,20 +220,20 @@ def get_census(census_version="2024-07-01", organism="homo_sapiens", subsample=5
"assay", "cell_type", "cell_type_ontology_term_id", "tissue",
"tissue_general", "suspension_type",
"disease", "dataset_id", "development_stage",
"soma_joinid"
"soma_joinid", "obervation_joinid"
]

refs = {}

# Get embeddings for all data together
filtered_ids = brain_obs_filtered['soma_joinid'].values
filtered_ids = cellxgene_obs_filtered['soma_joinid'].values
adata = extract_data(
brain_obs_filtered, filtered_ids,
cellxgene_obs_filtered, filtered_ids,
subsample=subsample, organism=organism,
census=census, obs_filter=None,
cell_columns=cell_columns, dataset_info=dataset_info, seed = seed
)
adata.obs=rename_cells(adata.obs)
adata.obs=rename_cells(adata.obs, rename_file=rename_file)


# Creating the key dynamically based on tissue and assay
Expand Down
9 changes: 6 additions & 3 deletions bin/get_census_adata.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,11 @@ def parse_arguments():
parser.add_argument('--census_version', type=str, default='2024-07-01', help='Census version (e.g., 2024-07-01)')
parser.add_argument('--ref_collections', type=str, nargs = '+', default = ["A taxonomy of transcriptomic cell types across the isocortex and hippocampal formation"])
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--organ', type=str, default="brain")
parser.add_argument('--assay', type=str, nargs = "+", help="Assays to use from reference", default=None)
parser.add_argument('--tissue', type=str, nargs="+", default = None, help = "Cortex region to pull from (default: all)")
parser.add_argument('--subsample', type=str, help="Number of cells per cell type to subsample from reference", default=500)
parser.add_argument('--rename_file', type=str, default="/space/grp/rschwartz/rschwartz/cell_annotation_cortex.nf/meta/rename_cells.tsv")
if __name__ == "__main__":
known_args, _ = parser.parse_known_args()
return known_args
Expand All @@ -57,10 +59,11 @@ def main():
assay = args.assay
subsample = args.subsample
tissue = args.tissue

organ=args.organ
rename_file=args.rename_file
refs=get_census(organism=organism,
subsample=subsample, census_version=census_version,
ref_collections=ref_collections, assay=assay, tissue=tissue, seed=SEED)
subsample=subsample, census_version=census_version, organ=organ,
ref_collections=ref_collections, assay=assay, tissue=tissue, rename_file=rename_file, seed=SEED)

print("finished fetching anndata")
outdir="refs"
Expand Down
60 changes: 4 additions & 56 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -22,62 +22,7 @@ process save_params_to_file {
echo "ref collections: ${params.ref_collections}" >> params.txt
"""
}
// process parseJsonMeta {
// input:
// path study_json_file
// output:
// path study_meta_file
// script:
// """
// python $projectDir/bin/parse_json.py --json_file ${study_json_file}
// """
// }
//


// process parseStudies {
// input:
// path study_meta_file
// output:
// tuple val(study_name), val(mapped_organism)

// script:


// """
// study_name = study_meta_file.split("_")[0]
// readLines("${study_meta_file}").first() { line ->
//
// def organism = line.split(" ")[1]
//
//
//
//
// # Output the tuple
// echo "$study_name $organism
// done
// """
// }


// process getStudies {

// input:
// val study_name, val organism

// output:
//
// path(${params.studies_dir}/${experiment}")
//

// script:

// """
// gemma-cli-sc getSingleCellDataMatrix -e ${study_name} \\
// --format mex --scale-type count --use-ensembl-ids \\
// -o /space/scratch/gemma-single-cell-data-ensembl-id/${organism}/${study_name}
// """
// }

process runSetup {
//conda '/home/rschwartz/anaconda3/envs/scanpyenv'
Expand Down Expand Up @@ -127,6 +72,7 @@ process getCensusAdata {
val census_version
val subsample_ref
val ref_collections
val organ

output:
path "refs/*.h5ad", emit: ref_paths_adata
Expand All @@ -137,9 +83,11 @@ process getCensusAdata {
# Run the python script to generate the files
python $projectDir/bin/get_census_adata.py \\
--organism ${organism} \\
--organ ${organ} \\
--census_version ${census_version} \\
--subsample_ref ${subsample_ref} \\
--ref_collections ${ref_collections} \\
--rename_file ${params.rename_file} \\
--seed ${params.seed}

# After running the python script, all .h5ad files will be saved in the refs/ directory inside a work directory
Expand Down Expand Up @@ -217,7 +165,7 @@ workflow {
ref_collections = params.ref_collections.collect { "\"${it}\"" }.join(' ')

// Get reference data and save to files
getCensusAdata(params.organism, params.census_version, params.subsample_ref, ref_collections)
getCensusAdata(params.organism, params.census_version, params.subsample_ref, ref_collections, params.organ)
getCensusAdata.out.ref_paths_adata.flatten()
.set { ref_paths_adata }

Expand Down
10 changes: 10 additions & 0 deletions meta/rename_cells.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
cell_type new_cell_type cell_type_ontology_term_id
astrocyte of the cerebral cortex astrocyte CL:0000127
cerebral cortex endothelial cell endothelial cell CL:0000115
L2/3 intratelencephalic projecting glutamatergic neuron of the primary motor cortex L2/3 intratelencephalic projecting glutamatergic neuron CL:4030059
L4/5 intratelencephalic projecting glutamatergic neuron of the primary motor cortex L4/5 intratelencephalic projecting glutamatergic neuron CL:4030062
L5/6 near-projecting glutamatergic neuron of the primary motor cortex L5/6 near-projecting glutamatergic neuron CL:4030067
L6 intratelencephalic projecting glutamatergic neuron of the primary motor cortex L6 intratelencephalic projecting glutamatergic neuron CL:4030065
central nervous system macrophage macrophage CL:0000235
vascular leptomeningeal cell (Mmus) vascular leptomeningeal cell CL:4023051
cortical interneuron corticothalamic-projecting glutamatergic cortical neuron CL:4023013
3 changes: 2 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ params.subsample_ref = 50
params.studies_dir = "/space/scratch/gemma-single-cell-data-ensembl-id/mus_musculus"
params.ref_collections = ["A taxonomy of transcriptomic cell types across the isocortex and hippocampal formation"]
params.outdir = "$projectDir/results/${params.organism}_subsample_ref_${params.subsample_ref}_${java.time.LocalDateTime.now().format(java.time.format.DateTimeFormatter.ofPattern('yyyy-MM-dd_HH-mm-ss'))}"

params.organ="brain"
params.rename_file="$projectDir/meta/rename_cells.tsv"
process {
cache = 'standard' // Options: 'standard' (default), 'deep', 'lenient', or 'false'
executor = 'local'
Expand Down
3 changes: 2 additions & 1 deletion params.hs.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@
"ref_collections": [
"Transcriptomic cytoarchitecture reveals principles of human neocortex organization",
"SEA-AD: Seattle Alzheimer’s Disease Brain Cell Atlas"
]
],
"organ": "brain"
}
3 changes: 2 additions & 1 deletion params.mm.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@
"A taxonomy of transcriptomic cell types across the isocortex and hippocampal formation",
"An integrated transcriptomic and epigenomic atlas of mouse primary motor cortex cell types",
"Adult mouse cortical cell taxonomy revealed by single cell transcriptomics"
]
],
"organ": "brain"
}
Loading