From c31cf6ece2d9e316471a42688fd945445a5bc19f Mon Sep 17 00:00:00 2001
From: Hunter Craft <118154470+hunterckx@users.noreply.github.com>
Date: Thu, 6 Feb 2025 21:55:51 -0800
Subject: [PATCH] feat: create catalog build python package (#251) (#258)

---
 .gitignore                                    |   1 +
 .../brc-analytics-catalog/common/entities.ts  |   6 +-
 catalog-build/build-catalog.ts                |   6 +-
 catalog-build/build-files-from-ncbi.py        | 123 +-------------
 .../package/catalog_build/__init__.py         |   3 +
 catalog-build/package/catalog_build/build.py  | 160 ++++++++++++++++++
 catalog-build/package/setup.py                |   8 +
 7 files changed, 180 insertions(+), 127 deletions(-)
 create mode 100644 catalog-build/package/catalog_build/__init__.py
 create mode 100644 catalog-build/package/catalog_build/build.py
 create mode 100644 catalog-build/package/setup.py

diff --git a/.gitignore b/.gitignore
index 007591f..fa70ebd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,3 +34,4 @@ npm-debug.log*
 
 # python
 venv
+__pycache__
diff --git a/app/apis/catalog/brc-analytics-catalog/common/entities.ts b/app/apis/catalog/brc-analytics-catalog/common/entities.ts
index 81a6aa4..94917b4 100644
--- a/app/apis/catalog/brc-analytics-catalog/common/entities.ts
+++ b/app/apis/catalog/brc-analytics-catalog/common/entities.ts
@@ -20,9 +20,9 @@ export interface BRCDataCatalogGenome {
   length: number;
   level: string;
   ncbiTaxonomyId: string;
-  scaffoldCount: number;
-  scaffoldL50: number;
-  scaffoldN50: number;
+  scaffoldCount: number | null;
+  scaffoldL50: number | null;
+  scaffoldN50: number | null;
   species: string;
   speciesTaxonomyId: string;
   strain: string | null;
diff --git a/catalog-build/build-catalog.ts b/catalog-build/build-catalog.ts
index fc6d84d..e433d96 100644
--- a/catalog-build/build-catalog.ts
+++ b/catalog-build/build-catalog.ts
@@ -37,9 +37,9 @@ async function buildGenomes(): Promise<BRCDataCatalogGenome[]> {
     length: parseNumber(row.length),
     level: row.level,
     ncbiTaxonomyId: row.taxonomyId,
-    scaffoldCount: parseNumber(row.scaffoldCount),
-    scaffoldL50: parseNumber(row.scaffoldL50),
-    scaffoldN50: parseNumber(row.scaffoldN50),
+    scaffoldCount: parseNumberOrNull(row.scaffoldCount),
+    scaffoldL50: parseNumberOrNull(row.scaffoldL50),
+    scaffoldN50: parseNumberOrNull(row.scaffoldN50),
     species: row.species,
     speciesTaxonomyId: row.speciesTaxonomyId,
     strain: parseStringOrNull(row.strain),
diff --git a/catalog-build/build-files-from-ncbi.py b/catalog-build/build-files-from-ncbi.py
index 43cf3ea..2d68b9a 100644
--- a/catalog-build/build-files-from-ncbi.py
+++ b/catalog-build/build-files-from-ncbi.py
@@ -1,8 +1,4 @@
-import pandas as pd
-import requests
-import urllib.parse
-import re
-import yaml
+from package.catalog_build import build_files
 
 ASSEMBLIES_PATH = "catalog-build/source/assemblies.yml"
 
@@ -19,120 +15,5 @@
     5653: "Kinetoplastea",
 }
 
-def read_assemblies():
-    with open(ASSEMBLIES_PATH) as stream:
-        return pd.DataFrame(yaml.safe_load(stream)["assemblies"])
-
-def get_paginated_ncbi_results(base_url, query_description):
-    page = 1
-    next_page_token = None
-    results = []
-    while next_page_token or page == 1:
-        print(f"Requesting page {page} of {query_description}")
-        request_url = f"{base_url}?page_size=1000{"&page_token=" + next_page_token if next_page_token else ""}"
-        page_data = requests.get(request_url).json()
-        if len(page_data["reports"][0].get("errors", [])) > 0:
-            raise Exception(page_data["reports"][0])
-        results += page_data["reports"]
-        next_page_token = page_data.get("next_page_token")
-        page += 1
-    return results
-
-def get_taxonomic_groups(lineage):
-    return [TAXONOMIC_GROUPS_BY_TAXONOMY_ID[tax_id] for tax_id in lineage if tax_id in TAXONOMIC_GROUPS_BY_TAXONOMY_ID]
-
-def get_species_row(taxon_info):
-    species_info = taxon_info["taxonomy"]["classification"]["species"]
-    return {
-        "taxonomyId": taxon_info["taxonomy"]["tax_id"],
-        "species": species_info["name"],
-        "speciesTaxonomyId": species_info["id"],
-        "taxonomicGroup": ",".join(get_taxonomic_groups(taxon_info["taxonomy"]["parents"]))
-    }
-
-def get_species_df(taxonomy_ids):
-    species_info = get_paginated_ncbi_results(f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{",".join([str(id) for id in taxonomy_ids])}/dataset_report", "taxa")
-    return pd.DataFrame([get_species_row(info) for info in species_info])
-
-def get_genome_row(genome_info):
-    refseq_category = genome_info["assembly_info"].get("refseq_category")
-    return {
-        "strain": genome_info["organism"].get("infraspecific_names", {}).get("strain", ""),
-        "taxonomyId": genome_info["organism"]["tax_id"],
-        "accession": genome_info["accession"],
-        "isRef": refseq_category == "reference genome",
-        "level": genome_info["assembly_info"]["assembly_level"],
-        "chromosomeCount": genome_info["assembly_stats"].get("total_number_of_chromosomes"),
-        "length": genome_info["assembly_stats"]["total_sequence_length"],
-        "scaffoldCount": genome_info["assembly_stats"]["number_of_scaffolds"],
-        "scaffoldN50": genome_info["assembly_stats"]["scaffold_n50"],
-        "scaffoldL50": genome_info["assembly_stats"]["scaffold_l50"],
-        "coverage": genome_info["assembly_stats"].get("genome_coverage"),
-        "gcPercent": genome_info["assembly_stats"]["gc_percent"],
-        "annotationStatus": genome_info["annotation_info"].get("status"),
-        "pairedAccession": genome_info.get("paired_accession"),
-    }
-
-def get_genomes_df(accessions):
-    genomes_info = get_paginated_ncbi_results(f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{",".join(accessions)}/dataset_report", "genomes")
-    return pd.DataFrame(data=[get_genome_row(info) for info in genomes_info])
-
-def _id_to_gene_model_url(asm_id):
-    hubs_url = "https://hgdownload.soe.ucsc.edu/hubs/"
-    components = [asm_id[0:3], asm_id[4:7], asm_id[7:10], asm_id[10:13], asm_id, "genes"]
-    url = urllib.parse.urljoin(hubs_url, "/".join(components))
-    # url looks something like https://hgdownload.soe.ucsc.edu/hubs/GCF/030/504/385/GCF_030504385.1/genes/
-    # and contains html content with links to gene models.
-    # we need to scrape this to get the gtf
-    print(f"fetching url {url}")
-    response = requests.get(url)
-    try:
-        response.raise_for_status()
-    except Exception:
-        # FIXME?: Some accessions don't have a gene folder
-        return None
-    # find link to gtf, should ideally be ncbiRefSeq, but augustus will do
-    html_content = response.text
-    pattern = rf"{asm_id.replace('.', r'\.')}.*?\.gtf\.gz"
-    augustus_file = None
-    for match in re.findall(pattern, html_content):
-        if "ncbiRefSeq" in match:
-            return urllib.parse.urljoin(f"{url}/", match)
-        elif "augustus" in match:
-            augustus_file = match
-    if augustus_file:
-        return urllib.parse.urljoin(f"{url}/", augustus_file)
-    # No match, I guess that's OK ?
-    return None
-
-def add_gene_model_url(genomes_df: pd.DataFrame):
-    return pd.concat([genomes_df, genomes_df["accession"].apply(_id_to_gene_model_url).rename("geneModelUrl")], axis="columns")
-
-def build_files():
-    print("Building files")
-
-    source_list_df = read_assemblies()
-
-    base_genomes_df = get_genomes_df(source_list_df["accession"])
-
-    species_df = get_species_df(base_genomes_df["taxonomyId"])
-
-    genomes_with_species_df = base_genomes_df.merge(species_df, how="left", on="taxonomyId")
-
-    assemblies_df = pd.DataFrame(requests.get(UCSC_ASSEMBLIES_URL).json()["data"])[["ucscBrowser", "genBank", "refSeq"]]
-
-    gen_bank_merge_df = genomes_with_species_df.merge(assemblies_df, how="left", left_on="accession", right_on="genBank")
-    ref_seq_merge_df = genomes_with_species_df.merge(assemblies_df, how="left", left_on="accession", right_on="refSeq")
-
-    unmatched_accessions = genomes_with_species_df["accession"][~(genomes_with_species_df["accession"].isin(assemblies_df["genBank"]) | genomes_with_species_df["accession"].isin(assemblies_df["refSeq"]))]
-    if len(unmatched_accessions) > 0:
-        print(f"{len(unmatched_accessions)} accessions had no match in assembly list: {", ".join(unmatched_accessions)}")
-
-    genomes_df = add_gene_model_url(gen_bank_merge_df.combine_first(ref_seq_merge_df))
-
-    genomes_df.to_csv(GENOMES_OUTPUT_PATH, index=False, sep="\t")
-
-    print(f"Wrote to {GENOMES_OUTPUT_PATH}")
-
 if __name__ == "__main__":
-    build_files()
+    build_files(ASSEMBLIES_PATH, GENOMES_OUTPUT_PATH, UCSC_ASSEMBLIES_URL, {"taxonomicGroup": TAXONOMIC_GROUPS_BY_TAXONOMY_ID})
diff --git a/catalog-build/package/catalog_build/__init__.py b/catalog-build/package/catalog_build/__init__.py
new file mode 100644
index 0000000..6eba703
--- /dev/null
+++ b/catalog-build/package/catalog_build/__init__.py
@@ -0,0 +1,3 @@
+from . import build
+
+build_files = build.build_files
diff --git a/catalog-build/package/catalog_build/build.py b/catalog-build/package/catalog_build/build.py
new file mode 100644
index 0000000..e17f340
--- /dev/null
+++ b/catalog-build/package/catalog_build/build.py
@@ -0,0 +1,160 @@
+import pandas as pd
+import yaml
+import requests
+import urllib.parse
+import re
+
+def read_assemblies(assemblies_path):
+    with open(assemblies_path) as stream:
+        return pd.DataFrame(yaml.safe_load(stream)["assemblies"])
+
+def get_paginated_ncbi_results(base_url, query_description):
+    page = 1
+    next_page_token = None
+    results = []
+    while next_page_token or page == 1:
+        print(f"Requesting page {page} of {query_description}")
+        request_url = f"{base_url}?page_size=1000{"&page_token=" + next_page_token if next_page_token else ""}"
+        page_data = requests.get(request_url).json()
+        if len(page_data["reports"][0].get("errors", [])) > 0:
+            raise Exception(page_data["reports"][0])
+        results += page_data["reports"]
+        next_page_token = page_data.get("next_page_token")
+        page += 1
+    return results
+
+def match_taxonomic_group(tax_id, lineage, taxonomic_groups):
+    if tax_id not in taxonomic_groups:
+        return None
+    taxon_info = taxonomic_groups[tax_id]
+    name, exclude = (taxon_info["value"], taxon_info.get("exclude")) if isinstance(taxon_info, dict) else (taxon_info, None)
+    if exclude is None:
+        return name
+    if isinstance(exclude, int):
+        exclude = [exclude]
+    if all(tid not in lineage for tid in exclude):
+        return name
+    return None
+
+def get_taxonomic_groups(lineage, taxonomic_groups):
+    return [group for group in (match_taxonomic_group(tax_id, lineage, taxonomic_groups) for tax_id in lineage) if group is not None]
+
+def get_taxonomic_group_sets(lineage, taxonomic_group_sets):
+    return {field: ",".join(get_taxonomic_groups(lineage, taxonomic_groups)) for field, taxonomic_groups in taxonomic_group_sets.items()}
+
+def get_species_row(taxon_info, taxonomic_group_sets):
+    species_info = taxon_info["taxonomy"]["classification"]["species"]
+    return {
+        "taxonomyId": taxon_info["taxonomy"]["tax_id"],
+        "species": species_info["name"],
+        "speciesTaxonomyId": species_info["id"],
+        **get_taxonomic_group_sets(taxon_info["taxonomy"]["parents"], taxonomic_group_sets)
+    }
+
+def get_species_df(taxonomy_ids, taxonomic_group_sets):
+    species_info = get_paginated_ncbi_results(f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{",".join([str(id) for id in taxonomy_ids])}/dataset_report", "taxa")
+    return pd.DataFrame([get_species_row(info, taxonomic_group_sets) for info in species_info])
+
+def get_genome_row(genome_info):
+    refseq_category = genome_info["assembly_info"].get("refseq_category")
+    return {
+        "strain": genome_info["organism"].get("infraspecific_names", {}).get("strain", ""),
+        "taxonomyId": genome_info["organism"]["tax_id"],
+        "accession": genome_info["accession"],
+        "isRef": refseq_category == "reference genome",
+        "level": genome_info["assembly_info"]["assembly_level"],
+        "chromosomeCount": genome_info["assembly_stats"].get("total_number_of_chromosomes"),
+        "length": genome_info["assembly_stats"]["total_sequence_length"],
+        "scaffoldCount": genome_info["assembly_stats"].get("number_of_scaffolds"),
+        "scaffoldN50": genome_info["assembly_stats"].get("scaffold_n50"),
+        "scaffoldL50": genome_info["assembly_stats"].get("scaffold_l50"),
+        "coverage": genome_info["assembly_stats"].get("genome_coverage"),
+        "gcPercent": genome_info["assembly_stats"]["gc_percent"],
+        "annotationStatus": genome_info.get("annotation_info", {}).get("status"),
+        "pairedAccession": genome_info.get("paired_accession"),
+    }
+
+def get_genomes_df(accessions):
+    genomes_info = get_paginated_ncbi_results(f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{",".join(accessions)}/dataset_report", "genomes")
+    return pd.DataFrame(data=[get_genome_row(info) for info in genomes_info])
+
+def _id_to_gene_model_url(asm_id):
+    hubs_url = "https://hgdownload.soe.ucsc.edu/hubs/"
+    components = [asm_id[0:3], asm_id[4:7], asm_id[7:10], asm_id[10:13], asm_id, "genes"]
+    url = urllib.parse.urljoin(hubs_url, "/".join(components))
+    # url looks something like https://hgdownload.soe.ucsc.edu/hubs/GCF/030/504/385/GCF_030504385.1/genes/
+    # and contains html content with links to gene models.
+    # we need to scrape this to get the gtf
+    print(f"fetching url {url}")
+    response = requests.get(url)
+    try:
+        response.raise_for_status()
+    except Exception:
+        # FIXME?: Some accessions don't have a gene folder
+        return None
+    # find link to gtf, should ideally be ncbiRefSeq, but augustus will do
+    html_content = response.text
+    pattern = rf"{asm_id.replace('.', r'\.')}.*?\.gtf\.gz"
+    augustus_file = None
+    for match in re.findall(pattern, html_content):
+        if "ncbiRefSeq" in match:
+            return urllib.parse.urljoin(f"{url}/", match)
+        elif "augustus" in match:
+            augustus_file = match
+    if augustus_file:
+        return urllib.parse.urljoin(f"{url}/", augustus_file)
+    # No match, I guess that's OK ?
+    return None
+
+def add_gene_model_url(genomes_df: pd.DataFrame):
+    return pd.concat([genomes_df, genomes_df["accession"].apply(_id_to_gene_model_url).rename("geneModelUrl")], axis="columns")
+
+def report_missing_values_from(values_name, message_predicate, all_values_series, *partial_values_series):
+    present_values_mask = all_values_series.astype(bool)
+    present_values_mask[:] = False
+    for series in partial_values_series:
+        present_values_mask |= all_values_series.isin(series)
+    report_missing_values(values_name, message_predicate, all_values_series, present_values_mask)
+
+def report_missing_values(values_name, message_predicate, values_series, present_values_mask):
+    missing_values = values_series[~present_values_mask]
+    if len(missing_values) > 0:
+        if len(missing_values) > len(values_series)/2:
+            present_values = values_series[present_values_mask]
+            print(f"Only {len(present_values)} of {len(values_series)} {values_name} {message_predicate}: {", ".join(present_values)}")
+        else:
+            print(f"{len(missing_values)} {values_name} not {message_predicate}: {", ".join(missing_values)}")
+
+def build_files(assemblies_path, genomes_output_path, ucsc_assemblies_url, taxonomic_group_sets={}, do_gene_model_urls=True):
+    print("Building files")
+
+    source_list_df = read_assemblies(assemblies_path)
+
+    base_genomes_df = get_genomes_df(source_list_df["accession"])
+
+    report_missing_values_from("accessions", "found on NCBI", source_list_df["accession"], base_genomes_df["accession"])
+
+    species_df = get_species_df(base_genomes_df["taxonomyId"], taxonomic_group_sets)
+
+    report_missing_values_from("species", "found on NCBI", base_genomes_df["taxonomyId"], species_df["taxonomyId"])
+
+    genomes_with_species_df = base_genomes_df.merge(species_df, how="left", on="taxonomyId")
+
+    assemblies_df = pd.DataFrame(requests.get(ucsc_assemblies_url).json()["data"])[["ucscBrowser", "genBank", "refSeq"]]
+
+    gen_bank_merge_df = genomes_with_species_df.merge(assemblies_df, how="left", left_on="accession", right_on="genBank")
+    ref_seq_merge_df = genomes_with_species_df.merge(assemblies_df, how="left", left_on="accession", right_on="refSeq")
+
+    report_missing_values_from("accessions", "matched in assembly list", genomes_with_species_df["accession"], assemblies_df["genBank"], assemblies_df["refSeq"])
+
+    genomes_df = gen_bank_merge_df.combine_first(ref_seq_merge_df)
+
+    if do_gene_model_urls:
+        genomes_df = add_gene_model_url(genomes_df)
+        report_missing_values("accessions", "matched with gene model URLs", genomes_df["accession"], genomes_df["geneModelUrl"].astype(bool))
+    else:
+        genomes_df["geneModelUrl"] = ""
+
+    genomes_df.to_csv(genomes_output_path, index=False, sep="\t")
+
+    print(f"Wrote to {genomes_output_path}")
diff --git a/catalog-build/package/setup.py b/catalog-build/package/setup.py
new file mode 100644
index 0000000..7356474
--- /dev/null
+++ b/catalog-build/package/setup.py
@@ -0,0 +1,8 @@
+from setuptools import setup
+
+setup(
+    name="catalog_build",
+    version="1.2.1",
+    packages=["catalog_build"],
+    install_requires=["pandas", "requests", "PyYAML"],
+)