Skip to content

New source: SILVA taxonomy #348

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/pyobo/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
from .selventa import SCHEMGetter, SCOMPGetter, SDISGetter, SFAMGetter
from .sgd import SGDGetter
from .signor import SignorGetter
from .silva import SILVAGetter
from .slm import SLMGetter
from .umls import UMLSGetter, UMLSSTyGetter
from .unimod import UnimodGetter
Expand Down Expand Up @@ -139,6 +140,7 @@
"SDISGetter",
"SFAMGetter",
"SGDGetter",
"SILVAGetter",
"SLMGetter",
"SignorGetter",
"UMLSGetter",
Expand Down
160 changes: 160 additions & 0 deletions src/pyobo/sources/silva.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""Convert SILVA small subunit (ssu) taxonomy to OBO format."""

import logging
from collections.abc import Iterable

import pandas as pd
from tqdm.auto import tqdm

from pyobo.struct import Obo, Reference, Term, TypeDef, default_reference
from pyobo.struct.typedef import has_taxonomy_rank
from pyobo.utils.path import ensure_path

__all__ = [
    "SILVAGetter",
]

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)

#: The prefix used for every SILVA taxon reference created in this module.
PREFIX = "silva.taxon"

#: A mapping from SILVA rank names to TAXRANK references.
#: Each reference's name is the same string as its (lowercase) SILVA rank key.
SILVA_RANK_TO_TAXRANK = {
    rank_name: Reference(prefix="TAXRANK", identifier=taxrank_id, name=rank_name)
    for rank_name, taxrank_id in (
        ("domain", "0000037"),
        ("major_clade", "0001004"),
        ("superkingdom", "0000022"),
        ("kingdom", "0000017"),
        ("subkingdom", "0000029"),
        ("superphylum", "0000027"),
        ("phylum", "0000001"),
        ("subphylum", "0000008"),
        ("infraphylum", "0000040"),
        ("superclass", "0000015"),
        ("class", "0000002"),
        ("subclass", "0000007"),
        ("infraclass", "0000019"),
        ("superorder", "0000020"),
        ("order", "0000003"),
        ("suborder", "0000014"),
        ("superfamily", "0000018"),
        ("family", "0000004"),
        ("subfamily", "0000024"),
        ("genus", "0000005"),
    )
}

#: URLs for the SILVA files.
# NOTE(review): both file names embed release "138.2" while the download helper is also
# given a ``version`` argument -- confirm the two cannot drift apart.
SILVA_TAXONOMY_URL = (
    "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/"
    "tax_slv_ssu_138.2.txt.gz"
)
SILVA_TAXMAP_URL = (
    "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/"
    "taxmap_slv_ssu_ref_nr_138.2.txt.gz"
)

# Relation used to attach ``ena.embl`` sequence references to SILVA taxon terms; it is
# applied to rows of SILVA's taxmap file in ``iter_terms_silva``.
# Both the constant's name and the definition text below are explicit placeholders.
# Open review question: what does it actually mean for a SILVA taxon to be mapped to an ENA
# record for a sequence? One hypothesis raised in review is simply "the sequence was derived
# from a sample taken from an individual of this species" -- this still needs to be confirmed.
RELATION_NEEDS_NEW_NAME = TypeDef(
    reference=default_reference(PREFIX, "has_related_sequence", name="has related sequence"),
    # FIXME!
    definition="This relation represents a connection between a species and ENA records that are "
    "annotated by SILVA's 'taxmap' that these are related in some way. It's crucial to make more explicit "
    "what this relation is. Old text: Indicates that the genome sequence represented by an ENA accession is "
    "classified under this taxon by SILVA.",
    is_metadata_tag=True,
)


class SILVAGetter(Obo):
    """Ontology source that exposes the SILVA taxonomy through the :class:`Obo` interface."""

    ontology = PREFIX
    bioversions_key = PREFIX
    typedefs = [has_taxonomy_rank, RELATION_NEEDS_NEW_NAME]
    #: The three top-level SILVA taxa, given as (identifier, name) pairs.
    root_terms = [
        Reference(prefix=PREFIX, identifier=root_id, name=root_name)
        for root_id, root_name in (
            ("2", "Archaea"),
            ("3", "Bacteria"),
            ("4", "Eukaryota"),
        )
    ]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Return the SILVA terms for this getter's version.

        :param force: Passed through unchanged to :func:`iter_terms_silva`.
        :returns: The iterable of terms produced by :func:`iter_terms_silva`.
        """
        terms = iter_terms_silva(version=self._version_or_raise, force=force)
        return terms

Check warning on line 74 in src/pyobo/sources/silva.py

View check run for this annotation

Codecov / codecov/patch

src/pyobo/sources/silva.py#L74

Added line #L74 was not covered by tests


def iter_terms_silva(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over SILVA terms from the main taxonomy and taxmap files.

    The main taxonomy file is read first: every row with a non-empty taxonomy path
    becomes a term that is named after the last path element, annotated with its
    TAXRANK rank when the rank is known, and linked to the term of its parent path
    when that path was already seen. The taxmap file is read second and adds one
    ``ena.embl`` reference per distinct (taxon, accession) pair to the matching term.

    :param version: Version string forwarded to :func:`ensure_path` for both downloads.
    :param force: Forwarded to :func:`ensure_path` for both downloads.
    :yields: All terms built from the main taxonomy file, after the taxmap
        annotations have been applied.
    """
    # --- Process the main taxonomy file ---
    taxonomy_path = ensure_path(PREFIX, url=SILVA_TAXONOMY_URL, version=version, force=force)
    tax_df = pd.read_csv(
        taxonomy_path,
        sep="\t",
        header=None,
        names=["taxonomy", "taxon_id", "rank", "ignore", "introduced"],
        dtype=str,
    ).fillna("")

    #: a dictionary that maps the joined taxonomy path (with trailing ";") to taxon_id
    tax_path_to_id: dict[str, str] = {}

    #: maps taxon_id to the Term object
    silva_taxon_id_to_term: dict[str, Term] = {}

    # Plain tuples of (row index, taxonomy, taxon_id, rank); only these columns are used.
    taxonomy_rows = tax_df[["taxonomy", "taxon_id", "rank"]].itertuples(index=True, name=None)
    for idx, tax_str, silva_taxon_id, rank_raw in tqdm(
        taxonomy_rows,
        total=len(tax_df),
        desc=f"[{PREFIX}] processing main taxonomy",
        unit_scale=True,
    ):
        tax_str = tax_str.strip()
        silva_taxon_id = silva_taxon_id.strip()
        rank_raw = rank_raw.strip()
        rank = rank_raw.lower()

        # Split taxonomy string by ";" and discard empty parts.
        parts = [p.strip() for p in tax_str.split(";") if p.strip()]
        if not parts:
            logger.warning("Row %s: empty taxonomy string: %s", idx, tax_str)
            continue

        # The term's name is the last element
        # (e.g. for "Bacteria;Actinomycetota;", name is "Actinomycetota").
        term = Term(reference=Reference(prefix=PREFIX, identifier=silva_taxon_id, name=parts[-1]))

        taxrank = SILVA_RANK_TO_TAXRANK.get(rank)
        if taxrank is not None:
            term.annotate_object(has_taxonomy_rank, taxrank)
        else:
            logger.warning(
                "Row %s: unknown rank '%s' for taxonomy: %s (taxon id: %s)",
                idx,
                rank_raw,
                tax_str,
                silva_taxon_id,
            )

        # Determine the parent by joining all but the last element.
        if len(parts) > 1:
            parent_key = ";".join(parts[:-1]) + ";"  # e.g. "Bacteria;"
            parent_id = tax_path_to_id.get(parent_key)
            if parent_id:
                # The parent's name is the second-to-last path element, i.e. the name the
                # parent term itself received; including it makes the OBO output readable.
                term.append_parent(
                    Reference(prefix=PREFIX, identifier=parent_id, name=parts[-2])
                )

        tax_path_to_id[";".join(parts) + ";"] = silva_taxon_id
        silva_taxon_id_to_term[silva_taxon_id] = term

    # --- Process the taxmap file ---
    # This file has a header with columns: primaryAccession, start, stop, path, organism_name, taxid
    taxmap_path = ensure_path(PREFIX, url=SILVA_TAXMAP_URL, version=version, force=force)
    taxmap_df = pd.read_csv(taxmap_path, sep="\t", dtype=str, usecols=[0, 5])

    # Only the accession and taxid columns are loaded, so the same pair can occur on
    # several rows; remember the pairs already annotated so each is added only once.
    seen_pairs: set[tuple[str, str]] = set()

    taxmap_rows = tqdm(taxmap_df.values, desc=f"[{PREFIX}] processing taxmap", unit_scale=True)
    # Enumerate so warnings report the taxmap row, not a leftover taxonomy-loop index.
    for row_idx, (ena_embl_id, species_silva_taxon_id) in enumerate(taxmap_rows):
        if pd.isna(ena_embl_id) or pd.isna(species_silva_taxon_id):
            continue

        # FIXME please add a comment on why we're only doing this based on the species ID.
        #  does SILVA not make mappings for other ranks?
        term = silva_taxon_id_to_term.get(species_silva_taxon_id)
        if term is None:
            logger.warning(
                "Row %s in taxmap: species_taxon_id %s not found in main taxonomy",
                row_idx,
                species_silva_taxon_id,
            )
            continue

        pair = (species_silva_taxon_id, ena_embl_id)
        if pair in seen_pairs:
            continue
        seen_pairs.add(pair)
        term.annotate_object(
            RELATION_NEEDS_NEW_NAME, Reference(prefix="ena.embl", identifier=ena_embl_id)
        )

    # Yield all terms from the main taxonomy.
    yield from silva_taxon_id_to_term.values()

Check warning on line 156 in src/pyobo/sources/silva.py

View check run for this annotation

Codecov / codecov/patch

src/pyobo/sources/silva.py#L156

Added line #L156 was not covered by tests


# Script entry point: when this module is executed directly, instantiate the getter
# and hand control to its ``cli()`` method.
if __name__ == "__main__":
    SILVAGetter().cli()
Loading