Skip to content

Commit

Permalink
fix(ids): Fix the case where canonical IDs are not supplied.
Browse files (browse the repository at this point in the history)
  • Loading branch information
aaronmussig committed Jan 17, 2025
1 parent d1530c6 commit 95730be
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -154,3 +154,4 @@ node_modules/
docs/*.rst
!docs/index.rst

/test
10 changes: 8 additions & 2 deletions gtdb_itol_decorate/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,27 @@
from gtdb_itol_decorate.itol import get_phylum_colours, write_color_datastrip, \
get_internal_nodes_with_labels, write_internal_node_labels, write_tree_colours, write_collapse_file, \
write_popup_file
from gtdb_itol_decorate.newick import load_newick_to_tree, validate_dendropy_namespace, \
from gtdb_itol_decorate.newick import load_newick_to_tree, assert_no_duplicate_taxa, \
get_canonical_mapping, validate_sets, strip_tree_labels, set_node_desc_taxa, set_taxon_label_for_internal_nodes
from gtdb_itol_decorate.util import log


def main(tree_path: Path, tax_path: Path, out_dir: Path):

# Create the output directory
log(f'Creating output directory: {out_dir}')
out_dir.mkdir(exist_ok=True)

# Read and validate the tree
log(f'Reading tree from: {tree_path}')
tree = load_newick_to_tree(str(tree_path))
log(f'Found {len(tree.leaf_nodes()):,} leaf nodes in the tree.')
validate_dendropy_namespace((x.label for x in tree.taxon_namespace))
assert_no_duplicate_taxa((x.label for x in tree.taxon_namespace))

# Create a mapping from the canonical genome ID to the tree
d_canonical_to_gid = get_canonical_mapping((x.label for x in tree.taxon_namespace))

# Read and validate the taxonomy file
log(f'Reading taxonomy from: {tax_path}')
d_tax = load_taxonomy_file(str(tax_path), set(d_canonical_to_gid.keys()))
log(f'Read the taxonomy for {len(d_tax):,} genomes.')
Expand Down
2 changes: 1 addition & 1 deletion gtdb_itol_decorate/gtdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def load_taxonomy_file(path: str, limit_to_gids: set):
with open(path) as f:
for line in f.readlines():
gid, tax = line.strip().split('\t')
gid = canonical_gid(gid)
# gid = canonical_gid(gid)
if gid not in limit_to_gids:
continue
if gid in out:
Expand Down
7 changes: 5 additions & 2 deletions gtdb_itol_decorate/newick.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import sys

from collections import Counter, deque, defaultdict

import dendropy
Expand Down Expand Up @@ -27,7 +29,7 @@ def load_newick_to_tree(path: str) -> dendropy.Tree:
preserve_underscores=True)


def validate_dendropy_namespace(taxa):
def assert_no_duplicate_taxa(taxa):
taxa_count = Counter(taxa)
duplicates = {k: v for k, v in taxa_count.items() if v > 1}
if len(duplicates) > 0:
Expand All @@ -48,7 +50,8 @@ def get_lca_str(node: dendropy.Node):
def get_canonical_mapping(gids):
out = dict()
for gid in gids:
out[canonical_gid(gid)] = gid
out[gid] = gid
# out[canonical_gid(gid)] = gid
return out


Expand Down

0 comments on commit 95730be

Please sign in to comment.