From f2e15e4fa094e3acaf82be734269abb85799987a Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Thu, 23 May 2024 15:30:16 +0200 Subject: [PATCH 01/29] Add row_diff_traverse, row_diff_successor --- .github/workflows/main.yml | 2 - .../representation/base/sequence_graph.cpp | 44 +++++++++++++++++++ .../representation/base/sequence_graph.hpp | 19 ++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5c83b75f3f..f8022c5fdb 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,8 +7,6 @@ on: tags: - 'v*' pull_request: - branches: - - master env: REGISTRY: ghcr.io diff --git a/metagraph/src/graph/representation/base/sequence_graph.cpp b/metagraph/src/graph/representation/base/sequence_graph.cpp index bd6d55c485..185817b046 100644 --- a/metagraph/src/graph/representation/base/sequence_graph.cpp +++ b/metagraph/src/graph/representation/base/sequence_graph.cpp @@ -420,6 +420,50 @@ void DeBruijnGraph::call_unitigs(const CallPath &callback, ::mtg::graph::call_sequences(*this, callback, num_threads, true, min_tip_size, kmers_in_single_form); } +void DeBruijnGraph::row_diff_traverse(size_t num_threads, + size_t max_length, + const bit_vector &rd_succ, + sdsl::bit_vector *terminal) const { + sdsl::bit_vector visited(max_index() + 1, false); + auto finalised = visited; + std::vector distance(max_index() + 1); + assert(terminal->size() == visited.size()); + assert(rd_succ.size() == visited.size()); + auto set_terminal = [&](int v) { + distance[v] = 0; + (*terminal)[v] = true; + }; + call_nodes([&](node_index v) { + if (visited[v]) { + return; + } + static std::stack path; + while (!visited[v]) { + path.push(v); + visited[v] = true; + if (!has_no_outgoing(v)) { + v = row_diff_successor(v, rd_succ); + } + } + if (!finalised[v]) { + set_terminal(v); + finalised[v] = true; + } + node_index u = v; + while (!path.empty()) { + std::tie(u, v) = std::tie(path.top(), u); + if (!finalised[u]) { + distance[u] = distance[v] + 1; + if (distance[u] == max_length) { + set_terminal(u); + } + finalised[u] = true; + } + path.pop(); + } + }); +} + /** * Traverse graph and iterate over all nodes */ diff --git a/metagraph/src/graph/representation/base/sequence_graph.hpp b/metagraph/src/graph/representation/base/sequence_graph.hpp index a1b8625b62..44106ad4e9 100644 --- a/metagraph/src/graph/representation/base/sequence_graph.hpp +++ b/metagraph/src/graph/representation/base/sequence_graph.hpp @@ -1,6 +1,8 @@ #ifndef __SEQUENCE_GRAPH_HPP__ #define __SEQUENCE_GRAPH_HPP__ +#include "common/vectors/bit_vector.hpp" + #include #include #include @@ -203,6 +205,7 @@ class DeBruijnGraph : public SequenceGraph { const std::function &stop_early = [](){ return false; }) const; virtual size_t outdegree(node_index) const = 0; + virtual bool has_no_outgoing(node_index node) const { return outdegree(node) == 0; } virtual bool has_single_outgoing(node_index node) const { return outdegree(node) == 1; } virtual bool has_multiple_outgoing(node_index node) const { return outdegree(node) > 1; } @@ -243,6 +246,22 @@ class DeBruijnGraph : public SequenceGraph { // Call all nodes that have no incoming edges virtual void call_source_nodes(const std::function &callback) const; + + virtual void row_diff_traverse(size_t num_threads, + size_t max_length, + const bit_vector &rd_succ, + sdsl::bit_vector *terminal) const; + + virtual node_index row_diff_successor(node_index node, const bit_vector &rd_succ) const { + node_index succ = npos; + adjacent_outgoing_nodes(node, [&](node_index adjacent_node) { + if(rd_succ[adjacent_node]) { + succ = adjacent_node; + } + }); + assert(succ != npos && "a row diff successor must exist"); + return succ; + } }; From f3fb0b065de71e8c78dd6bed80e3f638d4af7be6 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Tue, 2 Jul 2024 22:38:57 +0200 Subject: [PATCH 02/29] Update, propagate to DBGSuccinct --- metagraph/src/annotation/row_diff_builder.cpp | 17 +++++-- .../representation/base/sequence_graph.cpp | 46 ++++++++++++++----- .../representation/base/sequence_graph.hpp | 13 ++---- .../representation/succinct/dbg_succinct.cpp | 21 +++++++++ .../representation/succinct/dbg_succinct.hpp | 9 ++++ 5 files changed, 80 insertions(+), 26 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index db7d36befa..c9972b2a93 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -264,7 +264,16 @@ void sum_and_call_counts(const fs::path &dir, } } -rd_succ_bv_type route_at_forks(const graph::DBGSuccinct &graph, +uint64_t from_graph_index(const graph::DeBruijnGraph &graph, + graph::DeBruijnGraph::node_index idx) { + if (auto* g = dynamic_cast(&graph)) { + return g->kmer_to_boss_index(idx); + } else { + return idx; + } +} + +rd_succ_bv_type route_at_forks(const graph::DeBruijnGraph &graph, const std::string &rd_succ_filename, const std::string &count_vectors_dir, const std::string &row_count_extension) { @@ -282,7 +291,7 @@ rd_succ_bv_type route_at_forks(const graph::DBGSuccinct &graph, logger->trace("RowDiff successors will be set to the adjacent nodes with" " the largest number of labels"); - const bit_vector &last = graph.get_boss().get_last(); + const bit_vector &last = *graph.get_last(); graph::DeBruijnGraph::node_index graph_idx = to_node(0); std::vector outgoing_counts; @@ -293,12 +302,12 @@ rd_succ_bv_type route_at_forks(const graph::DBGSuccinct &graph, [&](int32_t count) { // TODO: skip single outgoing outgoing_counts.push_back(count); - if (last[graph.kmer_to_boss_index(graph_idx)]) { + if (last[from_graph_index(graph, graph_idx)]) { // pick the node with the largest count size_t max_pos = std::max_element(outgoing_counts.rbegin(), outgoing_counts.rend()) - outgoing_counts.rbegin(); - rd_succ_bv[graph.kmer_to_boss_index(graph_idx - max_pos)] = true; + rd_succ_bv[from_graph_index(graph, graph_idx - max_pos)] = true; outgoing_counts.resize(0); } graph_idx++; diff --git a/metagraph/src/graph/representation/base/sequence_graph.cpp b/metagraph/src/graph/representation/base/sequence_graph.cpp index 185817b046..7d7d3eabee 100644 --- a/metagraph/src/graph/representation/base/sequence_graph.cpp +++ b/metagraph/src/graph/representation/base/sequence_graph.cpp @@ -5,6 +5,7 @@ #include #include "common/logger.hpp" +#include "common/vectors/bit_vector_dyn.hpp" #include "common/seq_tools/reverse_complement.hpp" #include "common/threads/threading.hpp" #include "common/vectors/vector_algorithm.hpp" @@ -420,11 +421,23 @@ void DeBruijnGraph::call_unitigs(const CallPath &callback, ::mtg::graph::call_sequences(*this, callback, num_threads, true, min_tip_size, kmers_in_single_form); } +std::shared_ptr DeBruijnGraph::get_last() const { + bit_vector_dyn last_bv(max_index() + 1); + call_nodes([&](node_index v) { + std::pair last; + call_outgoing_kmers(v, [&](node_index u, char c) { + last = std::max(last, std::pair{c, u}); + }); + last_bv.set(last.second, true); + }); + return std::make_shared(std::move(last_bv)); +} + void DeBruijnGraph::row_diff_traverse(size_t num_threads, size_t max_length, const bit_vector &rd_succ, sdsl::bit_vector *terminal) const { - sdsl::bit_vector visited(max_index() + 1, false); + sdsl::bit_vector visited(max_index() + 1); auto finalised = visited; std::vector distance(max_index() + 1); assert(terminal->size() == visited.size()); @@ -434,9 +447,6 @@ void DeBruijnGraph::row_diff_traverse(size_t num_threads, (*terminal)[v] = true; }; call_nodes([&](node_index v) { - if (visited[v]) { - return; - } static std::stack path; while (!visited[v]) { path.push(v); @@ -445,25 +455,37 @@ void DeBruijnGraph::row_diff_traverse(size_t num_threads, v = row_diff_successor(v, rd_succ); } } + // Either a sink, or a cyclic dependency if (!finalised[v]) { set_terminal(v); finalised[v] = true; } - node_index u = v; - while (!path.empty()) { - std::tie(u, v) = std::tie(path.top(), u); - if (!finalised[u]) { - distance[u] = distance[v] + 1; - if (distance[u] == max_length) { - set_terminal(u); + node_index succ; + while (!empty(path)) { + succ = std::exchange(v, path.top()); + if (!finalised[v]) { + distance[v] = distance[succ] + 1; + if (distance[v] == max_length) { + set_terminal(v); } - finalised[u] = true; + finalised[v] = true; } path.pop(); } }); } +node_index DeBruijnGraph::row_diff_successor(node_index node, const bit_vector &rd_succ) const { + node_index succ = npos; + adjacent_outgoing_nodes(node, [&](node_index adjacent_node) { + if(rd_succ[adjacent_node]) { + succ = adjacent_node; + } + }); + assert(succ != npos && "a row diff successor must exist"); + return succ; +} + /** * Traverse graph and iterate over all nodes */ diff --git a/metagraph/src/graph/representation/base/sequence_graph.hpp b/metagraph/src/graph/representation/base/sequence_graph.hpp index 44106ad4e9..9f8a2ff374 100644 --- a/metagraph/src/graph/representation/base/sequence_graph.hpp +++ b/metagraph/src/graph/representation/base/sequence_graph.hpp @@ -247,21 +247,14 @@ class DeBruijnGraph : public SequenceGraph { // Call all nodes that have no incoming edges virtual void call_source_nodes(const std::function &callback) const; + virtual std::shared_ptr get_last() const; + virtual void row_diff_traverse(size_t num_threads, size_t max_length, const bit_vector &rd_succ, sdsl::bit_vector *terminal) const; - virtual node_index row_diff_successor(node_index node, const bit_vector &rd_succ) const { - node_index succ = npos; - adjacent_outgoing_nodes(node, [&](node_index adjacent_node) { - if(rd_succ[adjacent_node]) { - succ = adjacent_node; - } - }); - assert(succ != npos && "a row diff successor must exist"); - return succ; - } + virtual node_index row_diff_successor(node_index node, const bit_vector &rd_succ) const; }; diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp index 915c76af49..a7906e009c 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp @@ -516,6 +516,27 @@ ::call_source_nodes(const std::function &callback) const { }); } +std::shared_ptr DBGSuccinct +::get_last() const { + return std::shared_ptr(&get_boss().get_last(), [](const bit_vector*) { + // Do not destruct BOSS's last with shared_ptr + }); +} + +void DBGSuccinct +::row_diff_traverse(size_t num_threads, + size_t max_length, + const bit_vector &rd_succ, + sdsl::bit_vector *terminal) const { + return get_boss().row_diff_traverse(num_threads, max_length, rd_succ, terminal); +} + +node_index DBGSuccinct +::row_diff_successor(node_index node, const bit_vector &rd_succ) const { + return get_boss().row_diff_successor(node, rd_succ); +} + + size_t DBGSuccinct::outdegree(node_index node) const { assert(node > 0 && node <= num_nodes()); diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp index bdbabe3104..7011d183fd 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp @@ -174,6 +174,15 @@ class DBGSuccinct : public DeBruijnGraph { virtual void call_source_nodes(const std::function &callback) const override final; + virtual std::shared_ptr get_last() const override final; + + virtual void row_diff_traverse(size_t num_threads, + size_t max_length, + const bit_vector &rd_succ, + sdsl::bit_vector *terminal) const override final; + + virtual node_index row_diff_successor(node_index node, const bit_vector &rd_succ) const override final; + uint64_t kmer_to_boss_index(node_index kmer_index) const; node_index boss_to_kmer_index(uint64_t boss_index) const; From d7878bf6447b3edea3f56ef66e5eece2db06ae09 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Tue, 8 Oct 2024 00:02:12 +0200 Subject: [PATCH 03/29] Use graph::DeBruijnGraph in build_pred_succ and assign_anchors --- .../src/annotation/annotation_converters.cpp | 6 +- metagraph/src/annotation/row_diff_builder.cpp | 108 ++++++++++-------- metagraph/src/annotation/row_diff_builder.hpp | 4 +- 3 files changed, 66 insertions(+), 52 deletions(-) diff --git a/metagraph/src/annotation/annotation_converters.cpp b/metagraph/src/annotation/annotation_converters.cpp index 061b16ee21..99f310fc23 100644 --- a/metagraph/src/annotation/annotation_converters.cpp +++ b/metagraph/src/annotation/annotation_converters.cpp @@ -10,6 +10,7 @@ #include #include "row_diff_builder.hpp" +#include "cli/load/load_graph.hpp" #include "common/logger.hpp" #include "common/algorithms.hpp" #include "common/hashers/hash.hpp" @@ -1539,12 +1540,13 @@ void convert_to_row_diff(const std::vector &files, if (out_dir.empty()) out_dir = "./"; + auto graph = cli::load_critical_dbg(graph_fname); if (construction_stage != RowDiffStage::COUNT_LABELS) - build_pred_succ(graph_fname, graph_fname, out_dir, + build_pred_succ(*graph, graph_fname, out_dir, ".row_count", get_num_threads()); if (construction_stage == RowDiffStage::CONVERT) { - assign_anchors(graph_fname, graph_fname, out_dir, max_path_length, + assign_anchors(*graph, graph_fname, out_dir, max_path_length, ".row_reduction", get_num_threads()); const std::string anchors_fname = graph_fname + kRowDiffAnchorExt; diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index c9972b2a93..e14fe63e17 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -273,6 +273,15 @@ uint64_t from_graph_index(const graph::DeBruijnGraph &graph, } } +graph::DeBruijnGraph::node_index to_graph_index(const graph::DeBruijnGraph &graph, + uint64_t idx) { + if (auto* g = dynamic_cast(&graph)) { + return g->boss_to_kmer_index(idx); + } else { + return idx; + } +} + rd_succ_bv_type route_at_forks(const graph::DeBruijnGraph &graph, const std::string &rd_succ_filename, const std::string &count_vectors_dir, @@ -335,7 +344,7 @@ rd_succ_bv_type route_at_forks(const graph::DeBruijnGraph &graph, return rd_succ; } -void build_pred_succ(const std::string &graph_fname, +void build_pred_succ(const graph::DeBruijnGraph &graph, const std::string &outfbase, const std::string &count_vectors_dir, const std::string &row_count_extension, @@ -351,21 +360,16 @@ void build_pred_succ(const std::string &graph_fname, logger->trace("Building and writing successor and predecessor files to {}.*", outfbase); - graph::DBGSuccinct graph(2); - logger->trace("Loading graph..."); - if (!graph.load(graph_fname)) { - logger->error("Cannot load graph from {}", graph_fname); - std::exit(1); + std::optional dummy; + auto* succinct = dynamic_cast(&graph); + if (succinct) { + dummy = succinct->get_boss().mark_all_dummy_edges(num_threads); } // assign row-diff successors at forks rd_succ_bv_type rd_succ = route_at_forks(graph, outfbase + kRowDiffForkSuccExt, count_vectors_dir, row_count_extension); - const BOSS &boss = graph.get_boss(); - - sdsl::bit_vector dummy = boss.mark_all_dummy_edges(num_threads); - // create the succ/pred files, indexed using annotation indices uint32_t width = sdsl::bits::hi(graph.num_nodes()) + 1; sdsl::int_vector_buffer<> succ(outfbase + ".succ", std::ios::out, BUFFER_SIZE, width); @@ -377,7 +381,7 @@ void build_pred_succ(const std::string &graph_fname, !common::get_verbose()); const uint64_t BS = 1'000'000; - // traverse BOSS table in parallel processing blocks of size |BS| + // traverse graph in parallel processing blocks of size |BS| // use static scheduling to make threads process ordered contiguous blocks #pragma omp parallel for ordered num_threads(num_threads) schedule(dynamic) for (uint64_t start = 1; start <= graph.num_nodes(); start += BS) { @@ -387,34 +391,49 @@ void build_pred_succ(const std::string &graph_fname, std::vector pred_boundary_buf; for (uint64_t i = start; i < std::min(start + BS, graph.num_nodes() + 1); ++i) { - BOSS::edge_index boss_idx = graph.kmer_to_boss_index(i); - if (!dummy[boss_idx]) { - BOSS::edge_index next = boss.fwd(boss_idx); - assert(next); - if (!dummy[next]) { - while (rd_succ.size() && !rd_succ[next]) { - next--; - assert(!boss.get_last(next)); + if (succinct) { // Legacy code for DBGSuccinct + const BOSS &boss = succinct->get_boss(); + BOSS::edge_index boss_idx = succinct->kmer_to_boss_index(i); + if (!(*dummy)[boss_idx]) { + BOSS::edge_index next = boss.fwd(boss_idx); + assert(next); + if (!(*dummy)[next]) { + while (rd_succ.size() && !rd_succ[next]) { + next--; + assert(!boss.get_last(next)); + } + succ_buf.push_back(to_row(succinct->boss_to_kmer_index(next))); + succ_boundary_buf.push_back(0); } - succ_buf.push_back(to_row(graph.boss_to_kmer_index(next))); - succ_boundary_buf.push_back(0); - } - // compute predecessors only for row-diff successors - if (rd_succ.size() ? rd_succ[boss_idx] : boss.get_last(boss_idx)) { - BOSS::TAlphabet d = boss.get_node_last_value(boss_idx); - BOSS::edge_index back_idx = boss.bwd(boss_idx); - boss.call_incoming_to_target(back_idx, d, - [&](BOSS::edge_index pred) { - // dummy predecessors are ignored - if (!dummy[pred]) { - uint64_t node_index = graph.boss_to_kmer_index(pred); - pred_buf.push_back(to_row(node_index)); - pred_boundary_buf.push_back(0); + // compute predecessors only for row-diff successors + if (rd_succ.size() ? rd_succ[boss_idx] : boss.get_last(boss_idx)) { + BOSS::TAlphabet d = boss.get_node_last_value(boss_idx); + BOSS::edge_index back_idx = boss.bwd(boss_idx); + boss.call_incoming_to_target(back_idx, d, + [&](BOSS::edge_index pred) { + // dummy predecessors are ignored + if (!(*dummy)[pred]) { + uint64_t node_index = succinct->boss_to_kmer_index(pred); + pred_buf.push_back(to_row(node_index)); + pred_boundary_buf.push_back(0); + } } - } - ); + ); + } + } + } else { + auto j = graph.row_diff_successor(i, rd_succ); + succ_buf.push_back(to_row(j)); + succ_boundary_buf.push_back(0); + + if(rd_succ[i]) { + graph.adjacent_incoming_nodes(i, [&](auto pred) { + pred_buf.push_back(to_row(pred)); + pred_boundary_buf.push_back(0); + }); } } + succ_boundary_buf.push_back(1); pred_boundary_buf.push_back(1); ++progress_bar; @@ -433,7 +452,7 @@ void build_pred_succ(const std::string &graph_fname, logger->trace("Pred/succ nodes written to {}.pred/succ", outfbase); } -void assign_anchors(const std::string &graph_fname, +void assign_anchors(const graph::DeBruijnGraph &graph, const std::string &outfbase, const std::filesystem::path &count_vectors_dir, uint32_t max_length, @@ -445,13 +464,6 @@ void assign_anchors(const std::string &graph_fname, return; } - graph::DBGSuccinct graph(2); - logger->trace("Loading graph..."); - if (!graph.load(graph_fname)) { - logger->error("Cannot load graph from {}", graph_fname); - std::exit(1); - } - const BOSS &boss = graph.get_boss(); const uint64_t num_rows = graph.num_nodes(); bool optimize_anchors = false; @@ -460,7 +472,7 @@ void assign_anchors(const std::string &graph_fname, optimize_anchors = true; } - sdsl::bit_vector anchors_bv(boss.get_last().size(), false); + sdsl::bit_vector anchors_bv(graph.get_last()->size(), false); if (optimize_anchors) { logger->trace("Making every row with negative reduction an anchor..."); @@ -470,7 +482,7 @@ void assign_anchors(const std::string &graph_fname, [&](int32_t count) { // check if the reduction is negative if (count < 0) - anchors_bv[graph.kmer_to_boss_index(to_node(i))] = true; + anchors_bv[from_graph_index(graph, to_node(i))] = true; i++; } ); @@ -501,11 +513,11 @@ void assign_anchors(const std::string &graph_fname, if (rd_succ.size()) { logger->trace("Assigning anchors for RowDiff successors {}...", rd_succ_fname); - boss.row_diff_traverse(num_threads, max_length, rd_succ, &anchors_bv); + graph.row_diff_traverse(num_threads, max_length, rd_succ, &anchors_bv); } else { logger->warn("Assigning anchors without chosen RowDiff successors." " The last outgoing edges will be used for routing."); - boss.row_diff_traverse(num_threads, max_length, boss.get_last(), &anchors_bv); + graph.row_diff_traverse(num_threads, max_length, *graph.get_last(), &anchors_bv); } } @@ -514,7 +526,7 @@ void assign_anchors(const std::string &graph_fname, sdsl::bit_vector anchors(num_rows, false); for (BOSS::edge_index i = 1; i < anchors_bv.size(); ++i) { if (anchors_bv[i]) { - uint64_t graph_idx = graph.boss_to_kmer_index(i); + uint64_t graph_idx = to_graph_index(graph, i); assert(to_row(graph_idx) < num_rows); anchors[to_row(graph_idx)] = 1; } diff --git a/metagraph/src/annotation/row_diff_builder.hpp b/metagraph/src/annotation/row_diff_builder.hpp index f57fe4c38c..47e3ffa116 100644 --- a/metagraph/src/annotation/row_diff_builder.hpp +++ b/metagraph/src/annotation/row_diff_builder.hpp @@ -16,13 +16,13 @@ void count_labels_per_row(const std::vector &source_files, const std::string &row_count_fname, bool with_coordinates = false); -void build_pred_succ(const std::string &graph_filename, +void build_pred_succ(const graph::DeBruijnGraph &graph, const std::string &outfbase, const std::string &count_vectors_dir, const std::string &row_count_extension, uint32_t num_threads); -void assign_anchors(const std::string &graph_filename, +void assign_anchors(const graph::DeBruijnGraph &graph, const std::string &outfbase, const std::filesystem::path &dest_dir, uint32_t max_length, From 3da66a968c9ce757a3855ff67de8ea0abaec32a0 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Tue, 8 Oct 2024 02:00:05 +0200 Subject: [PATCH 04/29] Update download-artifact --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f8022c5fdb..823f192880 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -211,7 +211,7 @@ jobs: python-version: 3.8 - name: fetch static binary - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v4 with: path: artifacts @@ -283,7 +283,7 @@ jobs: run: git submodule update --init --recursive - name: fetch static binary - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v4 with: path: artifacts From 3a1633228a5be50b96a5d9c2e59a73d880530330 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Tue, 8 Oct 2024 14:00:02 +0200 Subject: [PATCH 05/29] Apply suggestions from code review Co-authored-by: Harun Mustafa --- metagraph/src/annotation/row_diff_builder.cpp | 2 +- metagraph/src/graph/representation/succinct/dbg_succinct.cpp | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index e14fe63e17..f975175f02 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -472,7 +472,7 @@ void assign_anchors(const graph::DeBruijnGraph &graph, optimize_anchors = true; } - sdsl::bit_vector anchors_bv(graph.get_last()->size(), false); + sdsl::bit_vector anchors_bv(graph.max_index() + 1, false); if (optimize_anchors) { logger->trace("Making every row with negative reduction an anchor..."); diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp index a7906e009c..26745d13f2 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp @@ -518,9 +518,7 @@ ::call_source_nodes(const std::function &callback) const { std::shared_ptr DBGSuccinct ::get_last() const { - return std::shared_ptr(&get_boss().get_last(), [](const bit_vector*) { - // Do not destruct BOSS's last with shared_ptr - }); + return std::shared_ptr(std::shared_ptr{}, &get_boss().get_last()); } void DBGSuccinct From 76b29b7f15cfea90f9a02d800a20837104a5fe01 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Tue, 8 Oct 2024 23:12:26 +0200 Subject: [PATCH 06/29] Merge master to rowdiff (#504) * Update download-artifact * Touch commit for workflows * Update upload-artifact to @v4 * Own artifact name for _noAVX --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 823f192880..0b320cb88b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -125,9 +125,9 @@ jobs: run: mv metagraph/build/metagraph_${{ matrix.alphabet }} metagraph/build/metagraph_${{ matrix.alphabet }}_noAVX - name: upload static binary if: ${{ matrix.build_static == 'ON' && matrix.compiler == 'g++-11' }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: metagraph_${{ matrix.alphabet }}_linux_x86 + name: metagraph_${{ matrix.alphabet }}${{ matrix.with_avx == 'OFF' && '_noAVX' || '' }}_linux_x86 path: metagraph/build/metagraph_${{ matrix.alphabet }}${{ matrix.with_avx == 'OFF' && '_noAVX' || '' }} - name: run unit tests From 3a9f1c64220fa04a6fde32f0cdfbc0fa9affa9b8 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 9 Oct 2024 00:58:26 +0200 Subject: [PATCH 07/29] Try simplifying build_pred_succ, temporarily rollback to graph.get_last()->size() --- metagraph/src/annotation/row_diff_builder.cpp | 49 ++++++------------- .../representation/succinct/dbg_succinct.cpp | 2 +- 2 files changed, 15 insertions(+), 36 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index f975175f02..753fc24c49 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -391,47 +391,26 @@ void build_pred_succ(const graph::DeBruijnGraph &graph, std::vector pred_boundary_buf; for (uint64_t i = start; i < std::min(start + BS, graph.num_nodes() + 1); ++i) { + bool find_succ = true; if (succinct) { // Legacy code for DBGSuccinct const BOSS &boss = succinct->get_boss(); BOSS::edge_index boss_idx = succinct->kmer_to_boss_index(i); - if (!(*dummy)[boss_idx]) { - BOSS::edge_index next = boss.fwd(boss_idx); - assert(next); - if (!(*dummy)[next]) { - while (rd_succ.size() && !rd_succ[next]) { - next--; - assert(!boss.get_last(next)); - } - succ_buf.push_back(to_row(succinct->boss_to_kmer_index(next))); - succ_boundary_buf.push_back(0); - } - // compute predecessors only for row-diff successors - if (rd_succ.size() ? rd_succ[boss_idx] : boss.get_last(boss_idx)) { - BOSS::TAlphabet d = boss.get_node_last_value(boss_idx); - BOSS::edge_index back_idx = boss.bwd(boss_idx); - boss.call_incoming_to_target(back_idx, d, - [&](BOSS::edge_index pred) { - // dummy predecessors are ignored - if (!(*dummy)[pred]) { - uint64_t node_index = succinct->boss_to_kmer_index(pred); - pred_buf.push_back(to_row(node_index)); - pred_boundary_buf.push_back(0); - } - } - ); - } + BOSS::edge_index next = boss.fwd(boss_idx); + assert(next); + if ((*dummy)[next]) { + find_succ = false; } - } else { + } + if(find_succ) { auto j = graph.row_diff_successor(i, rd_succ); succ_buf.push_back(to_row(j)); succ_boundary_buf.push_back(0); - - if(rd_succ[i]) { - graph.adjacent_incoming_nodes(i, [&](auto pred) { - pred_buf.push_back(to_row(pred)); - pred_boundary_buf.push_back(0); - }); - } + } + if(rd_succ[from_graph_index(graph, i)]) { + graph.adjacent_incoming_nodes(i, [&](auto pred) { + pred_buf.push_back(to_row(pred)); + pred_boundary_buf.push_back(0); + }); } succ_boundary_buf.push_back(1); @@ -472,7 +451,7 @@ void assign_anchors(const graph::DeBruijnGraph &graph, optimize_anchors = true; } - sdsl::bit_vector anchors_bv(graph.max_index() + 1, false); + sdsl::bit_vector anchors_bv(graph.get_last()->size(), false); if (optimize_anchors) { logger->trace("Making every row with negative reduction an anchor..."); diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp index 26745d13f2..6d325c7f49 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp @@ -531,7 +531,7 @@ ::row_diff_traverse(size_t num_threads, node_index DBGSuccinct ::row_diff_successor(node_index node, const bit_vector &rd_succ) const { - return get_boss().row_diff_successor(node, rd_succ); + return boss_to_kmer_index(get_boss().row_diff_successor(kmer_to_boss_index(node), rd_succ)); } From b18d4635d2b781e2e28a856e3618085d6fa28742 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 9 Oct 2024 01:02:58 +0200 Subject: [PATCH 08/29] Special handling of last.size() --- metagraph/src/annotation/row_diff_builder.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 753fc24c49..87665f716d 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -272,7 +272,6 @@ uint64_t from_graph_index(const graph::DeBruijnGraph &graph, return idx; } } - graph::DeBruijnGraph::node_index to_graph_index(const graph::DeBruijnGraph &graph, uint64_t idx) { if (auto* g = dynamic_cast(&graph)) { @@ -281,6 +280,13 @@ graph::DeBruijnGraph::node_index to_graph_index(const graph::DeBruijnGraph &grap return idx; } } +size_t get_last_size(const graph::DeBruijnGraph &graph) { + if (dynamic_cast(&graph)) { + return graph.get_last()->size(); + } else { + return graph.max_index() + 1; + } +} rd_succ_bv_type route_at_forks(const graph::DeBruijnGraph &graph, const std::string &rd_succ_filename, @@ -451,7 +457,7 @@ void assign_anchors(const graph::DeBruijnGraph &graph, optimize_anchors = true; } - sdsl::bit_vector anchors_bv(graph.get_last()->size(), false); + sdsl::bit_vector anchors_bv(get_last_size(graph), false); if (optimize_anchors) { logger->trace("Making every row with negative reduction an anchor..."); From 629cb09153328888970192e8310338d8eeaf4cbb Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 9 Oct 2024 02:23:50 +0200 Subject: [PATCH 09/29] Use last instead of rd_succ if it's empty --- metagraph/src/annotation/row_diff_builder.cpp | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 87665f716d..b0009aeb66 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -373,8 +373,8 @@ void build_pred_succ(const graph::DeBruijnGraph &graph, } // assign row-diff successors at forks - rd_succ_bv_type rd_succ = route_at_forks(graph, outfbase + kRowDiffForkSuccExt, - count_vectors_dir, row_count_extension); + auto rd_succ = route_at_forks(graph, outfbase + kRowDiffForkSuccExt, + count_vectors_dir, row_count_extension); // create the succ/pred files, indexed using annotation indices uint32_t width = sdsl::bits::hi(graph.num_nodes()) + 1; @@ -401,22 +401,29 @@ void build_pred_succ(const graph::DeBruijnGraph &graph, if (succinct) { // Legacy code for DBGSuccinct const BOSS &boss = succinct->get_boss(); BOSS::edge_index boss_idx = succinct->kmer_to_boss_index(i); - BOSS::edge_index next = boss.fwd(boss_idx); - assert(next); + BOSS::edge_index next = boss.fwd(boss_idx); + assert(next); if ((*dummy)[next]) { find_succ = false; - } - } - if(find_succ) { - auto j = graph.row_diff_successor(i, rd_succ); - succ_buf.push_back(to_row(j)); - succ_boundary_buf.push_back(0); + } } - if(rd_succ[from_graph_index(graph, i)]) { - graph.adjacent_incoming_nodes(i, [&](auto pred) { - pred_buf.push_back(to_row(pred)); - pred_boundary_buf.push_back(0); - }); + auto with_rd_succ = [&](bit_vector const& rd_succ) { + if(find_succ) { + auto j = graph.row_diff_successor(i, rd_succ); + succ_buf.push_back(to_row(j)); + succ_boundary_buf.push_back(0); + } + if(rd_succ[from_graph_index(graph, i)]) { + graph.adjacent_incoming_nodes(i, [&](auto pred) { + pred_buf.push_back(to_row(pred)); + pred_boundary_buf.push_back(0); + }); + } + }; + if (rd_succ.size()) { + with_rd_succ(rd_succ); + } else { + with_rd_succ(*graph.get_last()); } succ_boundary_buf.push_back(1); From 71c19a1dc3bcb27fdb397a3a0cc114d75817e24c Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 9 Oct 2024 12:22:03 +0200 Subject: [PATCH 10/29] Add checks to fix integration tests --- metagraph/src/annotation/row_diff_builder.cpp | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index b0009aeb66..0e15dfa5ea 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -397,33 +397,37 @@ void build_pred_succ(const graph::DeBruijnGraph &graph, std::vector pred_boundary_buf; for (uint64_t i = start; i < std::min(start + BS, graph.num_nodes() + 1); ++i) { - bool find_succ = true; + bool skip_succ = false, skip_all = false; if (succinct) { // Legacy code for DBGSuccinct - const BOSS &boss = succinct->get_boss(); - BOSS::edge_index boss_idx = succinct->kmer_to_boss_index(i); - BOSS::edge_index next = boss.fwd(boss_idx); - assert(next); - if ((*dummy)[next]) { - find_succ = false; - } + BOSS::edge_index boss_idx = from_graph_index(graph, i); + if((*dummy)[boss_idx]) { + skip_all = true; + } else { + skip_succ = (*dummy)[succinct->get_boss().fwd(boss_idx)]; + } } auto with_rd_succ = [&](bit_vector const& rd_succ) { - if(find_succ) { + if(!skip_succ) { auto j = graph.row_diff_successor(i, rd_succ); succ_buf.push_back(to_row(j)); succ_boundary_buf.push_back(0); } if(rd_succ[from_graph_index(graph, i)]) { graph.adjacent_incoming_nodes(i, [&](auto pred) { + if (dummy && (*dummy)[from_graph_index(graph, pred)]) { + return; + } pred_buf.push_back(to_row(pred)); pred_boundary_buf.push_back(0); }); } }; - if (rd_succ.size()) { - with_rd_succ(rd_succ); - } else { - with_rd_succ(*graph.get_last()); + if (!skip_all) { + if (rd_succ.size()) { + with_rd_succ(rd_succ); + } else { + with_rd_succ(*graph.get_last()); + } } succ_boundary_buf.push_back(1); From 90abc0810814ef86017ee343cc0e23e8ae295e1d Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 9 Oct 2024 16:57:29 +0200 Subject: [PATCH 11/29] Use BOSS index space in DBGSuccinct --- .../binary_matrix/row_diff/row_diff.cpp | 7 +- .../binary_matrix/row_diff/row_diff.hpp | 4 +- .../int_matrix/row_diff/int_row_diff.hpp | 4 +- .../int_matrix/row_diff/tuple_row_diff.hpp | 4 +- metagraph/src/annotation/row_diff_builder.cpp | 59 +++----- .../alignment/aligner_seeder_methods.cpp | 2 +- metagraph/src/graph/alignment/alignment.cpp | 4 +- .../src/graph/alignment/annotation_buffer.cpp | 2 +- .../graph_extensions/node_first_cache.cpp | 4 +- .../representation/base/sequence_graph.cpp | 4 +- .../representation/base/sequence_graph.hpp | 2 +- .../graph/representation/canonical_dbg.cpp | 18 +-- .../src/graph/representation/masked_graph.cpp | 14 +- .../representation/succinct/dbg_succinct.cpp | 138 ++++++++---------- .../representation/succinct/dbg_succinct.hpp | 9 +- .../tests/annotation/test_converters.cpp | 49 ++++--- 16 files changed, 134 insertions(+), 190 deletions(-) diff --git a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp index 81f23f581b..a061d87355 100644 --- a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp +++ b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp @@ -60,12 +60,11 @@ IRowDiff::get_rd_ids(const std::vector &row_ids) const { for (size_t i = 0; i < row_ids.size(); ++i) { Row row = row_ids[i]; - graph::boss::BOSS::edge_index boss_edge = graph_->kmer_to_boss_index( - graph::AnnotatedSequenceGraph::anno_to_graph_index(row)); + graph::boss::BOSS::edge_index boss_edge = + graph::AnnotatedSequenceGraph::anno_to_graph_index(row); while (true) { - row = graph::AnnotatedSequenceGraph::graph_to_anno_index( - graph_->boss_to_kmer_index(boss_edge)); + row = graph::AnnotatedSequenceGraph::graph_to_anno_index(boss_edge); auto [it, is_new] = node_to_rd.try_emplace(row, node_to_rd.size()); rd_paths_trunc[i].push_back(it.value()); diff --git a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp index f2842d58f4..d9d3ebb5b5 100644 --- a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp +++ b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp @@ -123,9 +123,7 @@ std::vector RowDiff::get_column(Column column) co std::vector result; // TODO: implement a more efficient algorithm for (Row row = 0; row < num_rows(); ++row) { - auto edge = graph_->kmer_to_boss_index( - graph::AnnotatedSequenceGraph::anno_to_graph_index(row) - ); + auto edge = graph::AnnotatedSequenceGraph::anno_to_graph_index(row); if (!boss.get_W(edge)) continue; diff --git a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp index 36a04eace6..535adc0084 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp @@ -86,9 +86,7 @@ std::vector IntRowDiff::get_column(Column j) cons // TODO: implement a more efficient algorithm std::vector result; for (Row i = 0; i < num_rows(); ++i) { - auto edge = graph_->kmer_to_boss_index( - graph::AnnotatedSequenceGraph::anno_to_graph_index(i) - ); + auto edge = graph::AnnotatedSequenceGraph::anno_to_graph_index(i); if (!boss.get_W(edge)) continue; diff --git a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp index 8c9df1cfa5..0b05d4c9fa 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp @@ -69,9 +69,7 @@ std::vector TupleRowDiff::get_column(Column j) co // TODO: implement a more efficient algorithm std::vector result; for (Row i = 0; i < num_rows(); ++i) { - auto edge = graph_->kmer_to_boss_index( - graph::AnnotatedSequenceGraph::anno_to_graph_index(i) - ); + auto edge = graph::AnnotatedSequenceGraph::anno_to_graph_index(i); if (!boss.get_W(edge)) continue; diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 0e15dfa5ea..3b2e98ee4d 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -264,30 +264,6 @@ void sum_and_call_counts(const fs::path &dir, } } -uint64_t from_graph_index(const graph::DeBruijnGraph &graph, - graph::DeBruijnGraph::node_index idx) { - if (auto* g = dynamic_cast(&graph)) { - return g->kmer_to_boss_index(idx); - } else { - return idx; - } -} -graph::DeBruijnGraph::node_index to_graph_index(const graph::DeBruijnGraph &graph, - uint64_t idx) { - if (auto* g = dynamic_cast(&graph)) { - return g->boss_to_kmer_index(idx); - } else { - return idx; - } -} -size_t get_last_size(const graph::DeBruijnGraph &graph) { - if (dynamic_cast(&graph)) { - return graph.get_last()->size(); - } else { - return graph.max_index() + 1; - } -} - rd_succ_bv_type route_at_forks(const graph::DeBruijnGraph &graph, const std::string &rd_succ_filename, const std::string &count_vectors_dir, @@ -317,21 +293,21 @@ rd_succ_bv_type route_at_forks(const graph::DeBruijnGraph &graph, [&](int32_t count) { // TODO: skip single outgoing outgoing_counts.push_back(count); - if (last[from_graph_index(graph, graph_idx)]) { + if (last[graph_idx]) { // pick the node with the largest count size_t max_pos = std::max_element(outgoing_counts.rbegin(), outgoing_counts.rend()) - outgoing_counts.rbegin(); - rd_succ_bv[from_graph_index(graph, graph_idx - max_pos)] = true; + rd_succ_bv[graph_idx - max_pos] = true; outgoing_counts.resize(0); } graph_idx++; } ); - if (graph_idx != graph.num_nodes() + 1) { - logger->error("Size the count vectors is incompatible with the" - " graph: {} != {}", graph_idx - 1, graph.num_nodes()); + if (graph_idx != graph.max_index() + 1) { + logger->error("Size of the count vectors is incompatible with the" + " graph: {} != {}", graph_idx - 1, graph.max_index()); exit(1); } @@ -377,29 +353,29 @@ void build_pred_succ(const graph::DeBruijnGraph &graph, count_vectors_dir, row_count_extension); // create the succ/pred files, indexed using annotation indices - uint32_t width = sdsl::bits::hi(graph.num_nodes()) + 1; + uint32_t width = sdsl::bits::hi(graph.max_index()) + 1; sdsl::int_vector_buffer<> succ(outfbase + ".succ", std::ios::out, BUFFER_SIZE, width); sdsl::int_vector_buffer<1> succ_boundary(outfbase + ".succ_boundary", std::ios::out, BUFFER_SIZE); sdsl::int_vector_buffer<> pred(outfbase + ".pred", std::ios::out, BUFFER_SIZE, width); sdsl::int_vector_buffer<1> pred_boundary(outfbase + ".pred_boundary", std::ios::out, BUFFER_SIZE); - ProgressBar progress_bar(graph.num_nodes(), "Compute succ/pred", std::cerr, + ProgressBar progress_bar(graph.max_index(), "Compute succ/pred", std::cerr, !common::get_verbose()); const uint64_t BS = 1'000'000; // traverse graph in parallel processing blocks of size |BS| // use static scheduling to make threads process ordered contiguous blocks #pragma omp parallel for ordered num_threads(num_threads) schedule(dynamic) - for (uint64_t start = 1; start <= graph.num_nodes(); start += BS) { + for (uint64_t start = 1; start <= graph.max_index(); start += BS) { std::vector succ_buf; std::vector succ_boundary_buf; std::vector pred_buf; std::vector pred_boundary_buf; - for (uint64_t i = start; i < std::min(start + BS, graph.num_nodes() + 1); ++i) { + for (uint64_t i = start; i < std::min(start + BS, graph.max_index() + 1); ++i) { bool skip_succ = false, skip_all = false; if (succinct) { // Legacy code for DBGSuccinct - BOSS::edge_index boss_idx = from_graph_index(graph, i); + BOSS::edge_index boss_idx = i; if((*dummy)[boss_idx]) { skip_all = true; } else { @@ -412,9 +388,9 @@ void build_pred_succ(const graph::DeBruijnGraph &graph, succ_buf.push_back(to_row(j)); succ_boundary_buf.push_back(0); } - if(rd_succ[from_graph_index(graph, i)]) { + if(rd_succ[i]) { graph.adjacent_incoming_nodes(i, [&](auto pred) { - if (dummy && (*dummy)[from_graph_index(graph, pred)]) { + if (dummy && (*dummy)[pred]) { return; } pred_buf.push_back(to_row(pred)); @@ -429,7 +405,6 @@ void build_pred_succ(const graph::DeBruijnGraph &graph, with_rd_succ(*graph.get_last()); } } - succ_boundary_buf.push_back(1); pred_boundary_buf.push_back(1); ++progress_bar; @@ -460,7 +435,7 @@ void assign_anchors(const graph::DeBruijnGraph &graph, return; } - const uint64_t num_rows = graph.num_nodes(); + const uint64_t num_rows = graph.max_index(); bool optimize_anchors = false; for (const auto &p : fs::directory_iterator(count_vectors_dir)) { @@ -468,7 +443,7 @@ void assign_anchors(const graph::DeBruijnGraph &graph, optimize_anchors = true; } - sdsl::bit_vector anchors_bv(get_last_size(graph), false); + sdsl::bit_vector anchors_bv(graph.max_index() + 1, false); if (optimize_anchors) { logger->trace("Making every row with negative reduction an anchor..."); @@ -478,7 +453,7 @@ void assign_anchors(const graph::DeBruijnGraph &graph, [&](int32_t count) { // check if the reduction is negative if (count < 0) - anchors_bv[from_graph_index(graph, to_node(i))] = true; + anchors_bv[to_node(i)] = true; i++; } ); @@ -522,7 +497,7 @@ void assign_anchors(const graph::DeBruijnGraph &graph, sdsl::bit_vector anchors(num_rows, false); for (BOSS::edge_index i = 1; i < anchors_bv.size(); ++i) { if (anchors_bv[i]) { - uint64_t graph_idx = to_graph_index(graph, i); + uint64_t graph_idx = i; assert(to_row(graph_idx) < num_rows); anchors[to_row(graph_idx)] = 1; } @@ -946,7 +921,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, // reduction (zero diff) __atomic_add_fetch(&row_nbits_block[chunk_idx], 1, __ATOMIC_RELAXED); } - } else { + } else if (succ) { bool is_anchor = anchor[row_idx]; // add current bit if this node is an anchor // or if the successor has zero diff diff --git a/metagraph/src/graph/alignment/aligner_seeder_methods.cpp b/metagraph/src/graph/alignment/aligner_seeder_methods.cpp index 306c7f6a0d..84755b6fa6 100644 --- a/metagraph/src/graph/alignment/aligner_seeder_methods.cpp +++ b/metagraph/src/graph/alignment/aligner_seeder_methods.cpp @@ -104,7 +104,7 @@ void suffix_to_prefix(const DBGSuccinct &dbg_succ, const auto &[first, last, seed_length] = final_range; assert(seed_length == boss.get_k()); for (boss::BOSS::edge_index i = first; i <= last; ++i) { - DBGSuccinct::node_index node = dbg_succ.boss_to_kmer_index(i); + DBGSuccinct::node_index node = i; if (node) callback(node); } diff --git a/metagraph/src/graph/alignment/alignment.cpp b/metagraph/src/graph/alignment/alignment.cpp index b1bdd0d8a7..fe1127fd41 100644 --- a/metagraph/src/graph/alignment/alignment.cpp +++ b/metagraph/src/graph/alignment/alignment.cpp @@ -550,7 +550,7 @@ void Alignment::reverse_complement(const DeBruijnGraph &graph, // the node is present in the underlying graph, so use // lower-level methods const auto &boss = dbg_succ.get_boss(); - boss::BOSS::edge_index edge = dbg_succ.kmer_to_boss_index(nodes_[0]); + boss::BOSS::edge_index edge = nodes_[0]; boss::BOSS::TAlphabet edge_label = boss.get_W(edge) % boss.alph_size; // TODO: This picks the node which is found by always traversing @@ -565,7 +565,7 @@ void Alignment::reverse_complement(const DeBruijnGraph &graph, return; } - nodes_[0] = dbg_succ.boss_to_kmer_index(edge); + nodes_[0] = edge; assert(nodes_[0]); sequence_.push_back(boss.decode(edge_label)); assert(graph.get_node_sequence(nodes_[0]) diff --git a/metagraph/src/graph/alignment/annotation_buffer.cpp b/metagraph/src/graph/alignment/annotation_buffer.cpp index 4020f312a7..a644bf2933 100644 --- a/metagraph/src/graph/alignment/annotation_buffer.cpp +++ b/metagraph/src/graph/alignment/annotation_buffer.cpp @@ -78,7 +78,7 @@ void AnnotationBuffer::fetch_queued_annotations() { continue; } - if (boss && !boss->get_W(dbg_succ->kmer_to_boss_index(base_path[i]))) { + if (boss && !boss->get_W(base_path[i])) { // skip dummy nodes if (node_to_cols_.try_emplace(base_path[i], 0).second && has_coordinates()) label_coords_.emplace_back(); diff --git a/metagraph/src/graph/graph_extensions/node_first_cache.cpp b/metagraph/src/graph/graph_extensions/node_first_cache.cpp index a945acf12f..975c830f63 100644 --- a/metagraph/src/graph/graph_extensions/node_first_cache.cpp +++ b/metagraph/src/graph/graph_extensions/node_first_cache.cpp @@ -38,11 +38,11 @@ void NodeFirstCache::call_incoming_kmers(node_index node, const IncomingEdgeCallback &callback) const { assert(node > 0 && node <= dbg_succ_.num_nodes()); - edge_index edge = dbg_succ_.kmer_to_boss_index(node); + edge_index edge = node; call_incoming_edges(edge, [&](edge_index prev_edge) { - node_index prev = dbg_succ_.boss_to_kmer_index(prev_edge); + node_index prev = prev_edge; if (prev != DeBruijnGraph::npos) callback(prev, get_first_char(prev_edge, edge)); } diff --git a/metagraph/src/graph/representation/base/sequence_graph.cpp b/metagraph/src/graph/representation/base/sequence_graph.cpp index 7d7d3eabee..1b933c5b71 100644 --- a/metagraph/src/graph/representation/base/sequence_graph.cpp +++ b/metagraph/src/graph/representation/base/sequence_graph.cpp @@ -24,11 +24,11 @@ static_assert(!(kBlockSize & 0xFF)); /*************** SequenceGraph ***************/ void SequenceGraph::call_nodes(const std::function &callback, - const std::function &stop_early) const { + const std::function &terminate) const { assert(num_nodes() == max_index()); const auto nnodes = num_nodes(); - for (node_index i = 1; i <= nnodes && !stop_early(); ++i) { + for (node_index i = 1; i <= nnodes && !terminate(); ++i) { callback(i); } } diff --git a/metagraph/src/graph/representation/base/sequence_graph.hpp b/metagraph/src/graph/representation/base/sequence_graph.hpp index 9f8a2ff374..e79e2f3612 100644 --- a/metagraph/src/graph/representation/base/sequence_graph.hpp +++ b/metagraph/src/graph/representation/base/sequence_graph.hpp @@ -62,7 +62,7 @@ class SequenceGraph { const std::function &callback) const = 0; virtual void call_nodes(const std::function &callback, - const std::function &stop_early = [](){ return false; }) const; + const std::function &terminate = [](){ return false; }) const; virtual uint64_t num_nodes() const = 0; virtual uint64_t max_index() const { return num_nodes(); }; diff --git a/metagraph/src/graph/representation/canonical_dbg.cpp b/metagraph/src/graph/representation/canonical_dbg.cpp index fdd4bd683e..d19b1c2385 100644 --- a/metagraph/src/graph/representation/canonical_dbg.cpp +++ b/metagraph/src/graph/representation/canonical_dbg.cpp @@ -115,7 +115,7 @@ ::map_to_nodes_sequentially(std::string_view sequence, sequence.substr(1)); boss.map_to_edges(sequence.substr(1), [&](boss::BOSS::edge_index edge) { - path.push_back(dbg_succ->boss_to_kmer_index(edge)); + path.push_back(edge); ++it; }, []() { return false; }, @@ -601,17 +601,14 @@ ::adjacent_incoming_rc_strand(node_index node, //-> TCAAGCAGAAGACGGCATACGAGATCCTCT const boss::BOSS &boss = dbg_succ_->get_boss(); - boss::BOSS::edge_index rc_edge = get_cache().get_prefix_rc( - dbg_succ_->kmer_to_boss_index(node), - spelling_hint - ); + boss::BOSS::edge_index rc_edge = get_cache().get_prefix_rc(node, spelling_hint); if (!rc_edge) return; boss.call_outgoing(rc_edge, [&](boss::BOSS::edge_index adjacent_edge) { assert(dbg_succ_); - node_index prev = dbg_succ_->boss_to_kmer_index(adjacent_edge); + node_index prev = adjacent_edge; if (prev == DeBruijnGraph::npos) return; @@ -658,24 +655,21 @@ ::adjacent_outgoing_rc_strand(node_index node, const std::function &callback) const { // rshift rc // ATGGCT -> TGGCT* -> *AGCCA - if (const auto *dbg_succ_ = get_dbg_succ(*graph_)) { + if (get_dbg_succ(*graph_)) { // AGAGGATCTCGTATGCCGTCTTCTGCTTGAG //-> GAGGATCTCGTATGCCGTCTTCTGCTTGAG //-> CTCAAGCAGAAGACGGCATACGAGATCCTC auto &cache = get_cache(); - boss::BOSS::edge_index rc_edge = cache.get_suffix_rc( - dbg_succ_->kmer_to_boss_index(node), - spelling_hint - ); + boss::BOSS::edge_index rc_edge = cache.get_suffix_rc(node, spelling_hint); if (!rc_edge) return; cache.call_incoming_edges(rc_edge, [&](edge_index prev_edge) { - node_index prev = dbg_succ_->boss_to_kmer_index(prev_edge); + node_index prev = prev_edge; if (!prev) return; diff --git a/metagraph/src/graph/representation/masked_graph.cpp b/metagraph/src/graph/representation/masked_graph.cpp index 319a936237..7cfe13be27 100644 --- a/metagraph/src/graph/representation/masked_graph.cpp +++ b/metagraph/src/graph/representation/masked_graph.cpp @@ -91,14 +91,14 @@ bit_vector_stat get_boss_mask(const DBGSuccinct &dbg_succ, sdsl::bit_vector mask_bv(dbg_succ.get_boss().num_edges() + 1, false); if (only_valid_nodes_in_mask) { kmers_in_graph.call_ones([&](auto i) { - assert(dbg_succ.kmer_to_boss_index(i)); - mask_bv[dbg_succ.kmer_to_boss_index(i)] = true; + assert(i); + mask_bv[i] = true; }); } else { dbg_succ.call_nodes([&](auto i) { - assert(dbg_succ.kmer_to_boss_index(i)); + assert(i); if (kmers_in_graph[i]) - mask_bv[dbg_succ.kmer_to_boss_index(i)] = true; + mask_bv[i] = true; }); } return bit_vector_stat(std::move(mask_bv)); @@ -112,9 +112,6 @@ void MaskedDeBruijnGraph::call_sequences(const CallPath &callback, only_valid_nodes_in_mask_); dbg_succ->get_boss().call_sequences([&](std::string&& sequence, auto&& path) { - for (auto &node : path) { - node = dbg_succ->boss_to_kmer_index(node); - } callback(sequence, path); }, num_threads, kmers_in_single_form, &mask); @@ -133,9 +130,6 @@ void MaskedDeBruijnGraph::call_unitigs(const CallPath &callback, only_valid_nodes_in_mask_); dbg_succ->get_boss().call_unitigs([&](std::string&& sequence, auto&& path) { - for (auto &node : path) { - node = dbg_succ->boss_to_kmer_index(node); - } callback(sequence, path); }, num_threads, min_tip_size, kmers_in_single_form, &mask); diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp index 6d325c7f49..7a84cd31aa 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp @@ -82,29 +82,25 @@ bool DBGSuccinct::find(std::string_view sequence, // Traverse the outgoing edge node_index DBGSuccinct::traverse(node_index node, char next_char) const { - assert(node > 0 && node <= num_nodes()); + assert(node > 0 && node <= max_index()); // return npos if the character is invalid if (boss_graph_->encode(next_char) == boss_graph_->alph_size) return npos; // dbg node is a boss edge - BOSS::edge_index boss_edge = kmer_to_boss_index(node); + BOSS::edge_index boss_edge = node; boss_edge = boss_graph_->fwd(boss_edge); - return boss_to_kmer_index( - boss_graph_->pick_edge(boss_edge, boss_graph_->encode(next_char)) - ); + return boss_graph_->pick_edge(boss_edge, boss_graph_->encode(next_char)); } // Traverse the incoming edge node_index DBGSuccinct::traverse_back(node_index node, char prev_char) const { - assert(node > 0 && node <= num_nodes()); + assert(node > 0 && node <= max_index()); // dbg node is a boss edge - BOSS::edge_index edge = boss_graph_->bwd(kmer_to_boss_index(node)); - return boss_to_kmer_index( - boss_graph_->pick_incoming_edge(edge, boss_graph_->encode(prev_char)) - ); + BOSS::edge_index edge = boss_graph_->bwd(node); + return boss_graph_->pick_incoming_edge(edge, boss_graph_->encode(prev_char)); } template @@ -128,10 +124,10 @@ inline void call_outgoing(const BOSS &boss, void DBGSuccinct::call_outgoing_kmers(node_index node, const OutgoingEdgeCallback &callback) const { - assert(node > 0 && node <= num_nodes()); + assert(node > 0 && node <= max_index()); - call_outgoing(*boss_graph_, kmer_to_boss_index(node), [&](auto i) { - auto next = boss_to_kmer_index(i); + call_outgoing(*boss_graph_, node, [&](auto i) { + auto next = i; if (next != npos) callback(next, boss_graph_->decode(boss_graph_->get_W(i) % boss_graph_->alph_size)); @@ -140,9 +136,9 @@ void DBGSuccinct::call_outgoing_kmers(node_index node, void DBGSuccinct::call_incoming_kmers(node_index node, const IncomingEdgeCallback &callback) const { - assert(node > 0 && node <= num_nodes()); + assert(node > 0 && node <= max_index()); - auto edge = kmer_to_boss_index(node); + auto edge = node; boss_graph_->call_incoming_to_target(boss_graph_->bwd(edge), boss_graph_->get_node_last_value(edge), @@ -150,7 +146,7 @@ void DBGSuccinct::call_incoming_kmers(node_index node, assert(boss_graph_->get_W(incoming_boss_edge) % boss_graph_->alph_size == boss_graph_->get_node_last_value(edge)); - auto prev = boss_to_kmer_index(incoming_boss_edge); + auto prev = incoming_boss_edge; if (prev != npos) { callback(prev, boss_graph_->decode( @@ -164,10 +160,10 @@ void DBGSuccinct::call_incoming_kmers(node_index node, void DBGSuccinct::adjacent_outgoing_nodes(node_index node, const std::function &callback) const { - assert(node > 0 && node <= num_nodes()); + assert(node > 0 && node <= max_index()); - call_outgoing(*boss_graph_, kmer_to_boss_index(node), [&](auto i) { - auto next = boss_to_kmer_index(i); + call_outgoing(*boss_graph_, node, [&](auto i) { + auto next = i; if (next != npos) callback(next); }); @@ -175,9 +171,9 @@ void DBGSuccinct::adjacent_outgoing_nodes(node_index node, void DBGSuccinct::adjacent_incoming_nodes(node_index node, const std::function &callback) const { - assert(node > 0 && node <= num_nodes()); + assert(node > 0 && node <= max_index()); - auto edge = kmer_to_boss_index(node); + auto edge = node; boss_graph_->call_incoming_to_target(boss_graph_->bwd(edge), boss_graph_->get_node_last_value(edge), @@ -185,13 +181,22 @@ void DBGSuccinct::adjacent_incoming_nodes(node_index node, assert(boss_graph_->get_W(incoming_boss_edge) % boss_graph_->alph_size == boss_graph_->get_node_last_value(edge)); - auto prev = boss_to_kmer_index(incoming_boss_edge); + auto prev = incoming_boss_edge; if (prev != npos) callback(prev); } ); } +void DBGSuccinct::call_nodes(const std::function &callback, + const std::function &terminate) const { + for (node_index i = 1; i <= max_index() && !terminate(); ++i) { + if (!valid_edges_ || (*valid_edges_)[i]) { + callback(i); + } + } +} + void DBGSuccinct::add_sequence(std::string_view sequence, const std::function &on_insertion) { if (sequence.size() < get_k()) @@ -223,7 +228,7 @@ void DBGSuccinct::add_sequence(std::string_view sequence, // Call all new nodes inserted including the dummy ones, unless they // are masked out. - on_insertion(boss_to_kmer_index(new_boss_edge)); + on_insertion(new_boss_edge); } assert(!valid_edges_.get() || !(*valid_edges_)[0]); @@ -234,9 +239,9 @@ void DBGSuccinct::add_sequence(std::string_view sequence, } std::string DBGSuccinct::get_node_sequence(node_index node) const { - assert(node > 0 && node <= num_nodes()); + assert(node > 0 && node <= max_index()); - auto boss_edge = kmer_to_boss_index(node); + auto boss_edge = node; return boss_graph_->get_node_str(boss_edge) + boss_graph_->decode(boss_graph_->get_W(boss_edge) % boss_graph_->alph_size); @@ -256,7 +261,7 @@ void DBGSuccinct::map_to_nodes_sequentially(std::string_view sequence, boss_graph_->map_to_edges( sequence, - [&](BOSS::edge_index i) { callback(boss_to_kmer_index(i)); }, + [&](BOSS::edge_index i) { callback(i); }, terminate, [&]() { if (!is_missing()) @@ -297,7 +302,7 @@ ::call_nodes_with_suffix_matching_longest_prefix( assert(first == last); auto edge = boss_graph_->pick_edge(last, encoded.back()); if (edge) { - auto kmer_index = boss_to_kmer_index(edge); + auto kmer_index = edge; if (kmer_index != npos) { assert(str.size() == get_k()); assert(get_node_sequence(kmer_index) == str); @@ -322,7 +327,7 @@ ::call_nodes_with_suffix_matching_longest_prefix( boss_graph_->call_incoming_to_target(boss_graph_->bwd(e), boss_graph_->get_node_last_value(e), [&](BOSS::edge_index incoming_edge_idx) { - auto kmer_index = boss_to_kmer_index(incoming_edge_idx); + auto kmer_index = incoming_edge_idx; if (kmer_index != npos) { assert(get_node_sequence(kmer_index).substr(get_k() - match_size) == str.substr(0, match_size)); @@ -344,7 +349,7 @@ ::call_nodes_with_suffix_matching_longest_prefix( boss_graph_->call_incoming_to_target(boss_graph_->bwd(e), boss_graph_->get_node_last_value(e), [&](BOSS::edge_index incoming_edge_idx) { - auto kmer_index = boss_to_kmer_index(incoming_edge_idx); + auto kmer_index = incoming_edge_idx; if (kmer_index != npos) { assert(get_node_sequence(kmer_index).substr(get_k() - match_size) == str.substr(0, match_size)); @@ -361,13 +366,13 @@ void DBGSuccinct::traverse(node_index start, const char *end, const std::function &callback, const std::function &terminate) const { - assert(start > 0 && start <= num_nodes()); + assert(start > 0 && start <= max_index()); assert(end >= begin); if (terminate()) return; - auto edge = kmer_to_boss_index(start); + auto edge = start; assert(edge); BOSS::TAlphabet w; @@ -379,7 +384,7 @@ void DBGSuccinct::traverse(node_index start, edge = boss_graph_->fwd(edge, w % boss_graph_->alph_size); edge = boss_graph_->pick_edge(edge, boss_graph_->encode(*begin)); - start = boss_to_kmer_index(edge); + start = edge; if (start == npos) return; @@ -442,13 +447,13 @@ void DBGSuccinct::map_to_nodes(std::string_view sequence, for (size_t i = 0; i < boss_edges.size() && !terminate(); ++i) { // the definition of a canonical k-mer is redefined: // use k-mer with smaller index in the BOSS table. - callback(boss_to_kmer_index(boss_edges[i])); + callback(boss_edges[i]); } } else { boss_graph_->map_to_edges( sequence, - [&](BOSS::edge_index i) { callback(boss_to_kmer_index(i)); }, + [&](BOSS::edge_index i) { callback(i); }, terminate, [&]() { if (!is_missing()) @@ -467,9 +472,6 @@ void DBGSuccinct::call_sequences(const CallPath &callback, assert(boss_graph_.get()); boss_graph_->call_sequences( [&](std::string&& seq, auto&& path) { - for (auto &node : path) { - node = boss_to_kmer_index(node); - } callback(std::move(seq), std::move(path)); }, num_threads, @@ -484,9 +486,6 @@ void DBGSuccinct::call_unitigs(const CallPath &callback, assert(boss_graph_.get()); boss_graph_->call_unitigs( [&](std::string&& seq, auto&& path) { - for (auto &node : path) { - node = boss_to_kmer_index(node); - } callback(std::move(seq), std::move(path)); }, num_threads, @@ -500,7 +499,7 @@ ::call_kmers(const std::function &callback const std::function &stop_early) const { assert(boss_graph_.get()); boss_graph_->call_kmers([&](auto index, const std::string &seq) { - auto node = boss_to_kmer_index(index); + auto node = index; assert(node != npos); callback(node, seq); }, stop_early); @@ -509,7 +508,7 @@ ::call_kmers(const std::function &callback void DBGSuccinct ::call_source_nodes(const std::function &callback) const { boss_graph_->call_start_edges([&](auto boss_edge) { - auto node = boss_to_kmer_index(boss_edge); + auto node = boss_edge; assert(node != npos); assert(!indegree(node)); callback(node); @@ -529,16 +528,12 @@ ::row_diff_traverse(size_t num_threads, return get_boss().row_diff_traverse(num_threads, max_length, rd_succ, terminal); } -node_index DBGSuccinct -::row_diff_successor(node_index node, const bit_vector &rd_succ) const { - return boss_to_kmer_index(get_boss().row_diff_successor(kmer_to_boss_index(node), rd_succ)); -} size_t DBGSuccinct::outdegree(node_index node) const { - assert(node > 0 && node <= num_nodes()); + assert(node > 0 && node <= max_index()); - auto boss_edge = kmer_to_boss_index(node); + auto boss_edge = node; if (boss_edge == 1) return boss_graph_->succ_last(1) - 1; @@ -562,9 +557,9 @@ size_t DBGSuccinct::outdegree(node_index node) const { } bool DBGSuccinct::has_single_outgoing(node_index node) const { - assert(node > 0 && node <= num_nodes()); + assert(node > 0 && node <= max_index()); - auto boss_edge = kmer_to_boss_index(node); + auto boss_edge = node; if (boss_edge == 1) return boss_graph_->succ_last(1) == 2; @@ -588,9 +583,9 @@ bool DBGSuccinct::has_single_outgoing(node_index node) const { } bool DBGSuccinct::has_multiple_outgoing(node_index node) const { - assert(node > 0 && node <= num_nodes()); + assert(node > 0 && node <= max_index()); - auto boss_edge = kmer_to_boss_index(node); + auto boss_edge = node; if (boss_edge == 1) return boss_graph_->succ_last(1) > 2; @@ -605,9 +600,9 @@ bool DBGSuccinct::has_multiple_outgoing(node_index node) const { } size_t DBGSuccinct::indegree(node_index node) const { - assert(node > 0 && node <= num_nodes()); + assert(node > 0 && node <= max_index()); - auto boss_edge = kmer_to_boss_index(node); + auto boss_edge = node; if (boss_edge == 1) return 1; @@ -621,9 +616,9 @@ size_t DBGSuccinct::indegree(node_index node) const { } bool DBGSuccinct::has_no_incoming(node_index node) const { - assert(node > 0 && node <= num_nodes()); + assert(node > 0 && node <= max_index()); - auto boss_edge = kmer_to_boss_index(node); + auto boss_edge = node; if (boss_edge == 1) return false; @@ -637,9 +632,9 @@ bool DBGSuccinct::has_no_incoming(node_index node) const { } bool DBGSuccinct::has_single_incoming(node_index node) const { - assert(node > 0 && node <= num_nodes()); + assert(node > 0 && node <= max_index()); - auto boss_edge = kmer_to_boss_index(node); + auto boss_edge = node; if (boss_edge == 1) return false; @@ -664,6 +659,10 @@ uint64_t DBGSuccinct::num_nodes() const { : boss_graph_->num_edges(); } +uint64_t DBGSuccinct::max_index() const { + return boss_graph_->num_edges(); +} + bool DBGSuccinct::load_without_mask(const std::string &filename) { // release the old mask valid_edges_.reset(); @@ -907,29 +906,6 @@ void DBGSuccinct::mask_dummy_kmers(size_t num_threads, bool with_pruning) { assert(!(*valid_edges_)[0]); } -uint64_t DBGSuccinct::kmer_to_boss_index(node_index node) const { - assert(node > 0); - assert(node <= num_nodes()); - - if (!valid_edges_.get()) - return node; - - return valid_edges_->select1(node); -} - -DBGSuccinct::node_index DBGSuccinct::boss_to_kmer_index(uint64_t boss_index) const { - assert(boss_index <= boss_graph_->num_edges()); - assert(!valid_edges_.get() || boss_index < valid_edges_->size()); - - if (!valid_edges_.get() || !boss_index) - return boss_index; - - if (!(*valid_edges_)[boss_index]) - return npos; - - return valid_edges_->rank1(boss_index); -} - void DBGSuccinct ::initialize_bloom_filter_from_fpr(double false_positive_rate, uint32_t max_num_hash_functions) { diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp index 7011d183fd..27eba6cd4f 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp @@ -37,6 +37,9 @@ class DBGSuccinct : public DeBruijnGraph { virtual void adjacent_incoming_nodes(node_index node, const std::function &callback) const override final; + virtual void call_nodes(const std::function &callback, + const std::function &terminate = [](){ return false; }) const; + // Insert sequence to graph and invoke callback |on_insertion| for each new // node index augmenting the range [1,...,max_index], including those not // pointing to any real node in graph. That is, the callback is invoked for @@ -110,6 +113,7 @@ class DBGSuccinct : public DeBruijnGraph { * edges in the BOSS graph (because an edge in the BOSS graph represents a k-mer). */ virtual uint64_t num_nodes() const override final; + virtual uint64_t max_index() const override final; virtual void mask_dummy_kmers(size_t num_threads, bool with_pruning) final; @@ -181,11 +185,6 @@ class DBGSuccinct : public DeBruijnGraph { const bit_vector &rd_succ, sdsl::bit_vector *terminal) const override final; - virtual node_index row_diff_successor(node_index node, const bit_vector &rd_succ) const override final; - - uint64_t kmer_to_boss_index(node_index kmer_index) const; - node_index boss_to_kmer_index(uint64_t boss_index) const; - void initialize_bloom_filter_from_fpr(double false_positive_rate, uint32_t max_num_hash_functions = -1); diff --git a/metagraph/tests/annotation/test_converters.cpp b/metagraph/tests/annotation/test_converters.cpp index cd39b07b47..f0d2c79611 100644 --- a/metagraph/tests/annotation/test_converters.cpp +++ b/metagraph/tests/annotation/test_converters.cpp @@ -279,8 +279,10 @@ TEST(RowDiff, ConvertFromColumnCompressedSameLabels) { std::unique_ptr graph = create_graph(3, { "ACGTCAC" }); graph->serialize(graph_fname); - ColumnCompressed source_annot(5); - source_annot.add_labels({ 0, 1, 2, 3, 4 }, labels); + ColumnCompressed source_annot(graph->max_index()); + std::vector edges(graph->max_index()); + std::iota(begin(edges), end(edges), 0); + source_annot.add_labels(edges, labels); source_annot.serialize(annot_fname); convert_to_row_diff({ annot_fname }, graph_fname, 1e9, max_depth, dst_dir, dst_dir, RowDiffStage::COMPUTE_REDUCTION); @@ -293,7 +295,7 @@ TEST(RowDiff, ConvertFromColumnCompressedSameLabels) { .load_anchor(graph_fname + matrix::kRowDiffAnchorExt); ASSERT_EQ(labels.size(), annotator.num_labels()); - ASSERT_EQ(5u, annotator.num_objects()); + ASSERT_EQ(graph->max_index(), annotator.num_objects()); EXPECT_EQ(labels.size() * expected_relations[max_depth - 1], annotator.num_relations()); @@ -326,8 +328,10 @@ TEST(RowDiff, ConvertFromColumnCompressedSameLabelsMultipleColumns) { std::vector sources; for (const std::string &label : labels) { - ColumnCompressed source_annot(5); - source_annot.add_labels({ 0, 1, 2, 3, 4 }, { label }); + ColumnCompressed source_annot(graph->max_index()); + std::vector edges(graph->max_index()); + std::iota(begin(edges), end(edges), 0); + source_annot.add_labels(edges, { label }); const std::string annot_fname = dst_dir/(label + ColumnCompressed<>::kExtension); source_annot.serialize(annot_fname); @@ -346,7 +350,7 @@ TEST(RowDiff, ConvertFromColumnCompressedSameLabelsMultipleColumns) { .load_anchor(graph_fname + matrix::kRowDiffAnchorExt); ASSERT_EQ(1, annotator.num_labels()); - ASSERT_EQ(5u, annotator.num_objects()); + ASSERT_EQ(graph->max_index(), annotator.num_objects()); EXPECT_EQ(expected_relations[max_depth - 1], annotator.num_relations()); for (uint32 idx = 0; idx < annotator.num_objects(); ++idx) { @@ -382,13 +386,15 @@ void test_row_diff(uint32_t k, graph->mask_dummy_kmers(1, false); graph->serialize(graph_fname); - ColumnCompressed initial_annotation(graph->num_nodes()); + ColumnCompressed initial_annotation(graph->max_index()); std::unordered_set all_labels; - for (uint32_t anno_idx = 0; anno_idx < graph->num_nodes(); ++anno_idx) { + uint32_t anno_idx = 0; + graph->call_nodes([&](uint32_t node_idx) { const std::vector &labels = annotations[anno_idx]; - initial_annotation.add_labels({anno_idx}, labels); + initial_annotation.add_labels({node_idx - 1}, labels); std::for_each(labels.begin(), labels.end(), [&](auto l) { all_labels.insert(l); }); - } + ++anno_idx; + }); initial_annotation.serialize(annot_fname); @@ -402,11 +408,16 @@ void test_row_diff(uint32_t k, .load_anchor(graph_fname + matrix::kRowDiffAnchorExt); ASSERT_EQ(all_labels.size(), annotator.num_labels()); - ASSERT_EQ(graph->num_nodes(), annotator.num_objects()); + ASSERT_EQ(graph->max_index(), annotator.num_objects()); - for (uint32_t anno_idx = 0; anno_idx < graph->num_nodes(); ++anno_idx) { - ASSERT_THAT(annotator.get_labels(anno_idx), + anno_idx = 0; + graph->call_nodes([&](uint32_t node_idx) { + ASSERT_THAT(annotator.get_labels(node_idx - 1), UnorderedElementsAreArray(annotations[anno_idx])); + ++anno_idx; + }); + + for (uint32_t anno_idx = 0; anno_idx < graph->max_index(); ++anno_idx) { } std::filesystem::remove_all(dst_dir); @@ -433,14 +444,16 @@ void test_row_diff_separate_columns(uint32_t k, graph->serialize(graph_fname); std::map> col_annotations; - for (uint32_t anno_idx = 0; anno_idx < graph->num_nodes(); ++anno_idx) { + uint32_t anno_idx = 0; + graph->call_nodes([&](auto node_idx) { for (const auto &label : annotations[anno_idx]) { - col_annotations[label].push_back(anno_idx); + col_annotations[label].push_back(node_idx - 1); } - } + ++anno_idx; + }); for (const auto& [label, indices] : col_annotations) { - ColumnCompressed initial_annotation(graph->num_nodes()); + ColumnCompressed initial_annotation(graph->max_index()); initial_annotation.add_labels(indices, {label}); std::string annot_fname = dst_dir/("anno_" + label + ColumnCompressed<>::kExtension); @@ -460,7 +473,7 @@ void test_row_diff_separate_columns(uint32_t k, const_cast &>(annotator.get_matrix()) .load_anchor(graph_fname + matrix::kRowDiffAnchorExt); - ASSERT_EQ(graph->num_nodes(), annotator.num_objects()); + ASSERT_EQ(graph->max_index(), annotator.num_objects()); std::vector actual_indices; annotator.call_objects(label, From f019a056da09ff2d4f462f0dfe75ef0ace9bb49f Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 9 Oct 2024 18:21:32 +0200 Subject: [PATCH 12/29] override final for call_nodes + add select/rank node + some fixes --- .../representation/succinct/dbg_succinct.cpp | 22 +++++ .../representation/succinct/dbg_succinct.hpp | 5 +- .../annotation/row_diff/test_row_diff.cpp | 88 +++++++++---------- .../tests/annotation/test_converters.cpp | 10 +-- 4 files changed, 75 insertions(+), 50 deletions(-) diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp index 7a84cd31aa..77c548b0fe 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp @@ -906,6 +906,28 @@ void DBGSuccinct::mask_dummy_kmers(size_t num_threads, bool with_pruning) { assert(!(*valid_edges_)[0]); } +node_index DBGSuccinct::select_node(uint64_t rank) const { + assert(rank <= num_nodes()); + + if (!valid_edges_.get() || !rank) + return rank; + + return valid_edges_->select1(rank); +} + +uint64_t DBGSuccinct::rank_node(node_index node) const { + assert(node <= boss_graph_->num_edges()); + assert(!valid_edges_.get() || node < valid_edges_->size()); + + if (!valid_edges_.get() || !node) + return node; + + if (!(*valid_edges_)[node]) + return npos; + + return valid_edges_->rank1(node); +} + void DBGSuccinct ::initialize_bloom_filter_from_fpr(double false_positive_rate, uint32_t max_num_hash_functions) { diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp index 27eba6cd4f..978947fd04 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp @@ -38,7 +38,7 @@ class DBGSuccinct : public DeBruijnGraph { const std::function &callback) const override final; virtual void call_nodes(const std::function &callback, - const std::function &terminate = [](){ return false; }) const; + const std::function &terminate = [](){ return false; }) const override final; // Insert sequence to graph and invoke callback |on_insertion| for each new // node index augmenting the range [1,...,max_index], including those not @@ -178,6 +178,9 @@ class DBGSuccinct : public DeBruijnGraph { virtual void call_source_nodes(const std::function &callback) const override final; + node_index select_node(uint64_t boss_index) const; + uint64_t rank_node(node_index kmer_index) const; + virtual std::shared_ptr get_last() const override final; virtual void row_diff_traverse(size_t num_threads, diff --git a/metagraph/tests/annotation/row_diff/test_row_diff.cpp b/metagraph/tests/annotation/row_diff/test_row_diff.cpp index 158171a44e..59f2f03b5b 100644 --- a/metagraph/tests/annotation/row_diff/test_row_diff.cpp +++ b/metagraph/tests/annotation/row_diff/test_row_diff.cpp @@ -95,28 +95,28 @@ TEST(RowDiff, GetRows) { annot.load_anchor(fterm_temp.name()); auto rows = annot.get_rows({ 3, 3, 3, 3, 5, 5, 6, 7, 8, 9, 10, 11 }); - EXPECT_EQ("CTAG", graph.get_node_sequence(4)); + EXPECT_EQ("CTAG", graph.get_node_sequence(graph.select_node(4))); ASSERT_THAT(rows[3], ElementsAre(0, 1)); - EXPECT_EQ("AGCT", graph.get_node_sequence(6)); + EXPECT_EQ("AGCT", graph.get_node_sequence(graph.select_node(6))); ASSERT_THAT(rows[5], ElementsAre(1)); - EXPECT_EQ("CTCT", graph.get_node_sequence(7)); + EXPECT_EQ("CTCT", graph.get_node_sequence(graph.select_node(7))); ASSERT_THAT(rows[6], ElementsAre(0)); - EXPECT_EQ("TAGC", graph.get_node_sequence(8)); + EXPECT_EQ("TAGC", graph.get_node_sequence(graph.select_node(8))); ASSERT_THAT(rows[7], ElementsAre(1)); - EXPECT_EQ("ACTA", graph.get_node_sequence(9)); + EXPECT_EQ("ACTA", graph.get_node_sequence(graph.select_node(9))); ASSERT_THAT(rows[8], ElementsAre(1)); - EXPECT_EQ("ACTC", graph.get_node_sequence(10)); + EXPECT_EQ("ACTC", graph.get_node_sequence(graph.select_node(10))); ASSERT_THAT(rows[9], ElementsAre(0)); - EXPECT_EQ("GCTA", graph.get_node_sequence(11)); + EXPECT_EQ("GCTA", graph.get_node_sequence(graph.select_node(11))); ASSERT_THAT(rows[10], ElementsAre(1)); - EXPECT_EQ("TCTA", graph.get_node_sequence(12)); + EXPECT_EQ("TCTA", graph.get_node_sequence(graph.select_node(12))); ASSERT_THAT(rows[11], ElementsAre(0)); } @@ -149,28 +149,28 @@ TEST(RowDiff, GetAnnotation) { RowDiff annot(&graph, std::move(mat)); annot.load_anchor(fterm_temp.name()); - EXPECT_EQ("CTAG", graph.get_node_sequence(4)); + EXPECT_EQ("CTAG", graph.get_node_sequence(graph.select_node(4))); ASSERT_THAT(annot.get_rows({3})[0], ElementsAre(0, 1)); - EXPECT_EQ("AGCT", graph.get_node_sequence(6)); + EXPECT_EQ("AGCT", graph.get_node_sequence(graph.select_node(6))); ASSERT_THAT(annot.get_rows({5})[0], ElementsAre(1)); - EXPECT_EQ("CTCT", graph.get_node_sequence(7)); + EXPECT_EQ("CTCT", graph.get_node_sequence(graph.select_node(7))); ASSERT_THAT(annot.get_rows({6})[0], ElementsAre(0)); - EXPECT_EQ("TAGC", graph.get_node_sequence(8)); + EXPECT_EQ("TAGC", graph.get_node_sequence(graph.select_node(8))); ASSERT_THAT(annot.get_rows({7})[0], ElementsAre(1)); - EXPECT_EQ("ACTA", graph.get_node_sequence(9)); + EXPECT_EQ("ACTA", graph.get_node_sequence(graph.select_node(9))); ASSERT_THAT(annot.get_rows({8})[0], ElementsAre(1)); - EXPECT_EQ("ACTC", graph.get_node_sequence(10)); + EXPECT_EQ("ACTC", graph.get_node_sequence(graph.select_node(10))); ASSERT_THAT(annot.get_rows({9})[0], ElementsAre(0)); - EXPECT_EQ("GCTA", graph.get_node_sequence(11)); + EXPECT_EQ("GCTA", graph.get_node_sequence(graph.select_node(11))); ASSERT_THAT(annot.get_rows({10})[0], ElementsAre(1)); - EXPECT_EQ("TCTA", graph.get_node_sequence(12)); + EXPECT_EQ("TCTA", graph.get_node_sequence(graph.select_node(12))); ASSERT_THAT(annot.get_rows({11})[0], ElementsAre(0)); } @@ -205,28 +205,28 @@ TEST(RowDiff, GetAnnotationMasked) { RowDiff annot(&graph, std::move(mat)); annot.load_anchor(fterm_temp.name()); - EXPECT_EQ("CTAG", graph.get_node_sequence(1)); + EXPECT_EQ("CTAG", graph.get_node_sequence(graph.select_node(1))); ASSERT_THAT(annot.get_rows({0})[0], ElementsAre(0, 1)); - EXPECT_EQ("AGCT", graph.get_node_sequence(2)); + EXPECT_EQ("AGCT", graph.get_node_sequence(graph.select_node(2))); ASSERT_THAT(annot.get_rows({1})[0], ElementsAre(1)); - EXPECT_EQ("CTCT", graph.get_node_sequence(3)); + EXPECT_EQ("CTCT", graph.get_node_sequence(graph.select_node(3))); ASSERT_THAT(annot.get_rows({2})[0], ElementsAre(0)); - EXPECT_EQ("TAGC", graph.get_node_sequence(4)); + EXPECT_EQ("TAGC", graph.get_node_sequence(graph.select_node(4))); ASSERT_THAT(annot.get_rows({3})[0], ElementsAre(1)); - EXPECT_EQ("ACTA", graph.get_node_sequence(5)); + EXPECT_EQ("ACTA", graph.get_node_sequence(graph.select_node(5))); ASSERT_THAT(annot.get_rows({4})[0], ElementsAre(1)); - EXPECT_EQ("ACTC", graph.get_node_sequence(6)); + EXPECT_EQ("ACTC", graph.get_node_sequence(graph.select_node(6))); ASSERT_THAT(annot.get_rows({5})[0], ElementsAre(0)); - EXPECT_EQ("GCTA", graph.get_node_sequence(7)); + EXPECT_EQ("GCTA", graph.get_node_sequence(graph.select_node(7))); ASSERT_THAT(annot.get_rows({6})[0], ElementsAre(1)); - EXPECT_EQ("TCTA", graph.get_node_sequence(8)); + EXPECT_EQ("TCTA", graph.get_node_sequence(graph.select_node(8))); ASSERT_THAT(annot.get_rows({7})[0], ElementsAre(0)); } @@ -260,34 +260,34 @@ TEST(RowDiff, GetAnnotationBifurcation) { RowDiff annot(&graph, std::move(mat)); annot.load_anchor(fterm_temp.name()); - EXPECT_EQ("CTAG", graph.get_node_sequence(4)); + EXPECT_EQ("CTAG", graph.get_node_sequence(graph.select_node(4))); ASSERT_THAT(annot.get_rows({3})[0], ElementsAre(0, 1)); - EXPECT_EQ("CTAT", graph.get_node_sequence(5)); + EXPECT_EQ("CTAT", graph.get_node_sequence(graph.select_node(5))); ASSERT_THAT(annot.get_rows({4})[0], ElementsAre(1)); - EXPECT_EQ("TACT", graph.get_node_sequence(6)); + EXPECT_EQ("TACT", graph.get_node_sequence(graph.select_node(6))); ASSERT_THAT(annot.get_rows({5})[0], ElementsAre(0)); - EXPECT_EQ("AGCT", graph.get_node_sequence(7)); + EXPECT_EQ("AGCT", graph.get_node_sequence(graph.select_node(7))); ASSERT_THAT(annot.get_rows({6})[0], ElementsAre(0, 1)); - EXPECT_EQ("CTCT", graph.get_node_sequence(8)); + EXPECT_EQ("CTCT", graph.get_node_sequence(graph.select_node(8))); ASSERT_THAT(annot.get_rows({7})[0], ElementsAre(1)); - EXPECT_EQ("TAGC", graph.get_node_sequence(9)); + EXPECT_EQ("TAGC", graph.get_node_sequence(graph.select_node(9))); ASSERT_THAT(annot.get_rows({8})[0], ElementsAre(0, 1)); - EXPECT_EQ("ACTA", graph.get_node_sequence(12)); + EXPECT_EQ("ACTA", graph.get_node_sequence(graph.select_node(12))); ASSERT_THAT(annot.get_rows({11})[0], ElementsAre(0)); - EXPECT_EQ("ACTC", graph.get_node_sequence(13)); + EXPECT_EQ("ACTC", graph.get_node_sequence(graph.select_node(13))); ASSERT_THAT(annot.get_rows({12})[0], ElementsAre(1)); - EXPECT_EQ("GCTA", graph.get_node_sequence(14)); + EXPECT_EQ("GCTA", graph.get_node_sequence(graph.select_node(14))); ASSERT_THAT(annot.get_rows({13})[0], ElementsAre(0, 1)); - EXPECT_EQ("TCTA", graph.get_node_sequence(15)); + EXPECT_EQ("TCTA", graph.get_node_sequence(graph.select_node(15))); ASSERT_THAT(annot.get_rows({14})[0], ElementsAre(1)); } @@ -321,34 +321,34 @@ TEST(RowDiff, GetAnnotationBifurcationMasked) { RowDiff annot(&graph, std::move(mat)); annot.load_anchor(fterm_temp.name()); - EXPECT_EQ("CTAG", graph.get_node_sequence(1)); + EXPECT_EQ("CTAG", graph.get_node_sequence(graph.select_node(1))); ASSERT_THAT(annot.get_rows({0})[0], ElementsAre(0, 1)); - EXPECT_EQ("CTAT", graph.get_node_sequence(2)); + EXPECT_EQ("CTAT", graph.get_node_sequence(graph.select_node(2))); ASSERT_THAT(annot.get_rows({1})[0], ElementsAre(1)); - EXPECT_EQ("TACT", graph.get_node_sequence(3)); + EXPECT_EQ("TACT", graph.get_node_sequence(graph.select_node(3))); ASSERT_THAT(annot.get_rows({2})[0], ElementsAre(0)); - EXPECT_EQ("AGCT", graph.get_node_sequence(4)); + EXPECT_EQ("AGCT", graph.get_node_sequence(graph.select_node(4))); ASSERT_THAT(annot.get_rows({3})[0], ElementsAre(0, 1)); - EXPECT_EQ("CTCT", graph.get_node_sequence(5)); + EXPECT_EQ("CTCT", graph.get_node_sequence(graph.select_node(5))); ASSERT_THAT(annot.get_rows({4})[0], ElementsAre(1)); - EXPECT_EQ("TAGC", graph.get_node_sequence(6)); + EXPECT_EQ("TAGC", graph.get_node_sequence(graph.select_node(6))); ASSERT_THAT(annot.get_rows({5})[0], ElementsAre(0, 1)); - EXPECT_EQ("ACTA", graph.get_node_sequence(7)); + EXPECT_EQ("ACTA", graph.get_node_sequence(graph.select_node(7))); ASSERT_THAT(annot.get_rows({6})[0], ElementsAre(0)); - EXPECT_EQ("ACTC", graph.get_node_sequence(8)); + EXPECT_EQ("ACTC", graph.get_node_sequence(graph.select_node(8))); ASSERT_THAT(annot.get_rows({7})[0], ElementsAre(1)); - EXPECT_EQ("GCTA", graph.get_node_sequence(9)); + EXPECT_EQ("GCTA", graph.get_node_sequence(graph.select_node(9))); ASSERT_THAT(annot.get_rows({8})[0], ElementsAre(0, 1)); - EXPECT_EQ("TCTA", graph.get_node_sequence(10)); + EXPECT_EQ("TCTA", graph.get_node_sequence(graph.select_node(10))); ASSERT_THAT(annot.get_rows({9})[0], ElementsAre(1)); } diff --git a/metagraph/tests/annotation/test_converters.cpp b/metagraph/tests/annotation/test_converters.cpp index f0d2c79611..85b2a2d5dc 100644 --- a/metagraph/tests/annotation/test_converters.cpp +++ b/metagraph/tests/annotation/test_converters.cpp @@ -189,9 +189,9 @@ TEST(RowDiff, succ) { */ const std::vector expected_succ = { 3, 0, 4, 2 }; - const std::vector expected_succ_boundary = { 1, 0, 1, 0, 1, 0, 1, 0, 1 }; + const std::vector expected_succ_boundary = { 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1 }; const std::vector expected_pred = { 2, 4, 1, 3 }; - const std::vector expected_pred_boundary = { 0, 1, 1, 0, 1, 0, 1, 0, 1 }; + const std::vector expected_pred_boundary = { 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1 }; for (uint32_t max_depth : { 1, 3, 5 }) { std::filesystem::remove_all(dst_dir); @@ -210,8 +210,8 @@ TEST(RowDiff, succ) { sdsl::int_vector_buffer succ(succ_file, std::ios::in); ASSERT_EQ(expected_succ.size(), succ.size()); - for (uint32_t i = 0; i < succ.size(); ++i) { - EXPECT_EQ(expected_succ[i], succ[i]) << max_depth << " " << i; + for (uint32_t i = 0; i < expected_succ.size(); ++i) { + EXPECT_EQ(expected_succ[i] + 1, graph->rank_node(succ[i] + 1)) << max_depth << " " << i; } sdsl::int_vector_buffer<1> succ_boundary(succ_boundary_file, std::ios::in); @@ -223,7 +223,7 @@ TEST(RowDiff, succ) { sdsl::int_vector_buffer pred(pred_file, std::ios::in); EXPECT_EQ(expected_pred.size(), pred.size()); for (uint32_t i = 0; i < pred.size(); ++i) { - EXPECT_EQ(expected_pred[i], pred[i]) << max_depth << " " << i; + EXPECT_EQ(expected_pred[i] + 1, graph->rank_node(pred[i] + 1)) << max_depth << " " << i; } sdsl::int_vector_buffer<1> pred_boundary(pred_boundary_file, std::ios::in); From 358e44585dfe55b04f004f15ef3cfee6fcc16591 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 9 Oct 2024 22:34:39 +0200 Subject: [PATCH 13/29] num_nodes() -> max_index() for dbg_succ_ --- metagraph/src/graph/graph_extensions/node_first_cache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metagraph/src/graph/graph_extensions/node_first_cache.cpp b/metagraph/src/graph/graph_extensions/node_first_cache.cpp index 975c830f63..a56f7b73b7 100644 --- a/metagraph/src/graph/graph_extensions/node_first_cache.cpp +++ b/metagraph/src/graph/graph_extensions/node_first_cache.cpp @@ -36,7 +36,7 @@ void NodeFirstCache::call_incoming_edges(edge_index edge, void NodeFirstCache::call_incoming_kmers(node_index node, const IncomingEdgeCallback &callback) const { - assert(node > 0 && node <= dbg_succ_.num_nodes()); + assert(node > 0 && node <= dbg_succ_.max_index()); edge_index edge = node; From 1068116c43ffce21c7ed03a56fa62705f823fca1 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 9 Oct 2024 23:36:16 +0200 Subject: [PATCH 14/29] Add is_valid checks to nodes in DBGSuccinct --- .../graph_extensions/node_first_cache.cpp | 4 +- .../graph/representation/canonical_dbg.cpp | 7 ++- .../representation/succinct/dbg_succinct.cpp | 54 ++++++++++--------- .../representation/succinct/dbg_succinct.hpp | 1 + 4 files changed, 34 insertions(+), 32 deletions(-) diff --git a/metagraph/src/graph/graph_extensions/node_first_cache.cpp b/metagraph/src/graph/graph_extensions/node_first_cache.cpp index a56f7b73b7..2534dbbeda 100644 --- a/metagraph/src/graph/graph_extensions/node_first_cache.cpp +++ b/metagraph/src/graph/graph_extensions/node_first_cache.cpp @@ -36,14 +36,14 @@ void NodeFirstCache::call_incoming_edges(edge_index edge, void NodeFirstCache::call_incoming_kmers(node_index node, const IncomingEdgeCallback &callback) const { - assert(node > 0 && node <= dbg_succ_.max_index()); + assert(dbg_succ_.is_valid(node)); edge_index edge = node; call_incoming_edges(edge, [&](edge_index prev_edge) { node_index prev = prev_edge; - if (prev != DeBruijnGraph::npos) + if (dbg_succ_.is_valid(prev)) callback(prev, get_first_char(prev_edge, edge)); } ); diff --git a/metagraph/src/graph/representation/canonical_dbg.cpp b/metagraph/src/graph/representation/canonical_dbg.cpp index d19b1c2385..097011e4bc 100644 --- a/metagraph/src/graph/representation/canonical_dbg.cpp +++ b/metagraph/src/graph/representation/canonical_dbg.cpp @@ -285,7 +285,6 @@ void CanonicalDBG::call_incoming_kmers(node_index node, SmallVector parents(alphabet.size(), npos); // "- has_sentinel_" because there can't be a dummy sink with another non-dummy edge size_t max_num_edges_left = parents.size() - has_sentinel_; - auto incoming_kmer_callback = [&](node_index prev, char c) { assert(has_sentinel_ || c != boss::BOSS::kSentinel); assert(c == boss::BOSS::kSentinel || traverse_back(node, c) == prev); @@ -609,7 +608,7 @@ ::adjacent_incoming_rc_strand(node_index node, boss.call_outgoing(rc_edge, [&](boss::BOSS::edge_index adjacent_edge) { assert(dbg_succ_); node_index prev = adjacent_edge; - if (prev == DeBruijnGraph::npos) + if (!dbg_succ_->is_valid(prev)) return; char c = boss.decode(boss.get_W(adjacent_edge) % boss.alph_size); @@ -655,7 +654,7 @@ ::adjacent_outgoing_rc_strand(node_index node, const std::function &callback) const { // rshift rc // ATGGCT -> TGGCT* -> *AGCCA - if (get_dbg_succ(*graph_)) { + if (const auto *dbg_succ_ = get_dbg_succ(*graph_)) { // AGAGGATCTCGTATGCCGTCTTCTGCTTGAG //-> GAGGATCTCGTATGCCGTCTTCTGCTTGAG //-> CTCAAGCAGAAGACGGCATACGAGATCCTC @@ -670,7 +669,7 @@ ::adjacent_outgoing_rc_strand(node_index node, cache.call_incoming_edges(rc_edge, [&](edge_index prev_edge) { node_index prev = prev_edge; - if (!prev) + if (!dbg_succ_->is_valid(prev)) return; char c = cache.get_first_char(prev_edge, rc_edge); diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp index 77c548b0fe..6c602cdbde 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp @@ -82,7 +82,7 @@ bool DBGSuccinct::find(std::string_view sequence, // Traverse the outgoing edge node_index DBGSuccinct::traverse(node_index node, char next_char) const { - assert(node > 0 && node <= max_index()); + assert(is_valid(node)); // return npos if the character is invalid if (boss_graph_->encode(next_char) == boss_graph_->alph_size) @@ -96,7 +96,7 @@ node_index DBGSuccinct::traverse(node_index node, char next_char) const { // Traverse the incoming edge node_index DBGSuccinct::traverse_back(node_index node, char prev_char) const { - assert(node > 0 && node <= max_index()); + assert(is_valid(node)); // dbg node is a boss edge BOSS::edge_index edge = boss_graph_->bwd(node); @@ -124,11 +124,11 @@ inline void call_outgoing(const BOSS &boss, void DBGSuccinct::call_outgoing_kmers(node_index node, const OutgoingEdgeCallback &callback) const { - assert(node > 0 && node <= max_index()); + assert(is_valid(node)); call_outgoing(*boss_graph_, node, [&](auto i) { auto next = i; - if (next != npos) + if (is_valid(next)) callback(next, boss_graph_->decode(boss_graph_->get_W(i) % boss_graph_->alph_size)); }); @@ -136,7 +136,7 @@ void DBGSuccinct::call_outgoing_kmers(node_index node, void DBGSuccinct::call_incoming_kmers(node_index node, const IncomingEdgeCallback &callback) const { - assert(node > 0 && node <= max_index()); + assert(is_valid(node)); auto edge = node; @@ -147,7 +147,7 @@ void DBGSuccinct::call_incoming_kmers(node_index node, == boss_graph_->get_node_last_value(edge)); auto prev = incoming_boss_edge; - if (prev != npos) { + if (is_valid(prev)) { callback(prev, boss_graph_->decode( boss_graph_->get_minus_k_value(incoming_boss_edge, get_k() - 2).first @@ -160,18 +160,18 @@ void DBGSuccinct::call_incoming_kmers(node_index node, void DBGSuccinct::adjacent_outgoing_nodes(node_index node, const std::function &callback) const { - assert(node > 0 && node <= max_index()); + assert(is_valid(node)); call_outgoing(*boss_graph_, node, [&](auto i) { auto next = i; - if (next != npos) + if (is_valid(next)) callback(next); }); } void DBGSuccinct::adjacent_incoming_nodes(node_index node, const std::function &callback) const { - assert(node > 0 && node <= max_index()); + assert(is_valid(node)); auto edge = node; @@ -182,7 +182,7 @@ void DBGSuccinct::adjacent_incoming_nodes(node_index node, == boss_graph_->get_node_last_value(edge)); auto prev = incoming_boss_edge; - if (prev != npos) + if (is_valid(prev)) callback(prev); } ); @@ -191,7 +191,7 @@ void DBGSuccinct::adjacent_incoming_nodes(node_index node, void DBGSuccinct::call_nodes(const std::function &callback, const std::function &terminate) const { for (node_index i = 1; i <= max_index() && !terminate(); ++i) { - if (!valid_edges_ || (*valid_edges_)[i]) { + if (is_valid(i)) { callback(i); } } @@ -239,7 +239,7 @@ void DBGSuccinct::add_sequence(std::string_view sequence, } std::string DBGSuccinct::get_node_sequence(node_index node) const { - assert(node > 0 && node <= max_index()); + assert(is_valid(node)); auto boss_edge = node; @@ -303,7 +303,7 @@ ::call_nodes_with_suffix_matching_longest_prefix( auto edge = boss_graph_->pick_edge(last, encoded.back()); if (edge) { auto kmer_index = edge; - if (kmer_index != npos) { + if (is_valid(kmer_index)) { assert(str.size() == get_k()); assert(get_node_sequence(kmer_index) == str); callback(kmer_index, get_k()); @@ -328,7 +328,7 @@ ::call_nodes_with_suffix_matching_longest_prefix( boss_graph_->get_node_last_value(e), [&](BOSS::edge_index incoming_edge_idx) { auto kmer_index = incoming_edge_idx; - if (kmer_index != npos) { + if (is_valid(kmer_index)) { assert(get_node_sequence(kmer_index).substr(get_k() - match_size) == str.substr(0, match_size)); nodes.emplace_back(kmer_index); @@ -350,7 +350,7 @@ ::call_nodes_with_suffix_matching_longest_prefix( boss_graph_->get_node_last_value(e), [&](BOSS::edge_index incoming_edge_idx) { auto kmer_index = incoming_edge_idx; - if (kmer_index != npos) { + if (is_valid(kmer_index)) { assert(get_node_sequence(kmer_index).substr(get_k() - match_size) == str.substr(0, match_size)); callback(kmer_index, match_size); @@ -366,7 +366,7 @@ void DBGSuccinct::traverse(node_index start, const char *end, const std::function &callback, const std::function &terminate) const { - assert(start > 0 && start <= max_index()); + assert(is_valid(start)); assert(end >= begin); if (terminate()) @@ -500,7 +500,7 @@ ::call_kmers(const std::function &callback assert(boss_graph_.get()); boss_graph_->call_kmers([&](auto index, const std::string &seq) { auto node = index; - assert(node != npos); + assert(is_valid(node)); callback(node, seq); }, stop_early); } @@ -509,7 +509,7 @@ void DBGSuccinct ::call_source_nodes(const std::function &callback) const { boss_graph_->call_start_edges([&](auto boss_edge) { auto node = boss_edge; - assert(node != npos); + assert(is_valid(node)); assert(!indegree(node)); callback(node); }); @@ -531,7 +531,7 @@ ::row_diff_traverse(size_t num_threads, size_t DBGSuccinct::outdegree(node_index node) const { - assert(node > 0 && node <= max_index()); + assert(is_valid(node)); auto boss_edge = node; @@ -557,7 +557,7 @@ size_t DBGSuccinct::outdegree(node_index node) const { } bool DBGSuccinct::has_single_outgoing(node_index node) const { - assert(node > 0 && node <= max_index()); + assert(is_valid(node)); auto boss_edge = node; @@ -583,7 +583,7 @@ bool DBGSuccinct::has_single_outgoing(node_index node) const { } bool DBGSuccinct::has_multiple_outgoing(node_index node) const { - assert(node > 0 && node <= max_index()); + assert(is_valid(node)); auto boss_edge = node; @@ -600,7 +600,7 @@ bool DBGSuccinct::has_multiple_outgoing(node_index node) const { } size_t DBGSuccinct::indegree(node_index node) const { - assert(node > 0 && node <= max_index()); + assert(is_valid(node)); auto boss_edge = node; @@ -616,7 +616,7 @@ size_t DBGSuccinct::indegree(node_index node) const { } bool DBGSuccinct::has_no_incoming(node_index node) const { - assert(node > 0 && node <= max_index()); + assert(is_valid(node)); auto boss_edge = node; @@ -632,7 +632,7 @@ bool DBGSuccinct::has_no_incoming(node_index node) const { } bool DBGSuccinct::has_single_incoming(node_index node) const { - assert(node > 0 && node <= max_index()); + assert(is_valid(node)); auto boss_edge = node; @@ -906,6 +906,9 @@ void DBGSuccinct::mask_dummy_kmers(size_t num_threads, bool with_pruning) { assert(!(*valid_edges_)[0]); } +bool DBGSuccinct::is_valid(node_index node) const { + return 0 < node && node <= max_index() && (!valid_edges_ || (*valid_edges_)[node]); +} node_index DBGSuccinct::select_node(uint64_t rank) const { assert(rank <= num_nodes()); @@ -916,8 +919,7 @@ node_index DBGSuccinct::select_node(uint64_t rank) const { } uint64_t DBGSuccinct::rank_node(node_index node) const { - assert(node <= boss_graph_->num_edges()); - assert(!valid_edges_.get() || node < valid_edges_->size()); + assert(node <= max_index()); if (!valid_edges_.get() || !node) return node; diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp index 978947fd04..6f61d74679 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp @@ -178,6 +178,7 @@ class DBGSuccinct : public DeBruijnGraph { virtual void call_source_nodes(const std::function &callback) const override final; + bool is_valid(node_index node) const; node_index select_node(uint64_t boss_index) const; uint64_t rank_node(node_index kmer_index) const; From 42707fff46b921488477aead760e24693401d8a6 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Thu, 10 Oct 2024 00:42:35 +0200 Subject: [PATCH 15/29] Fix DBGSuccinct tests --- .../graph/succinct/test_dbg_succinct.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/metagraph/tests/graph/succinct/test_dbg_succinct.cpp b/metagraph/tests/graph/succinct/test_dbg_succinct.cpp index 88a25e2716..1bc51ffd59 100644 --- a/metagraph/tests/graph/succinct/test_dbg_succinct.cpp +++ b/metagraph/tests/graph/succinct/test_dbg_succinct.cpp @@ -20,9 +20,9 @@ TEST(DBGSuccinct, get_degree_with_source_dummy) { + std::string(k, 'T')); // dummy source k-mer: '$$$$$' - EXPECT_EQ(std::string(k, '$'), graph->get_node_sequence(1)); - EXPECT_EQ(1ull, graph->outdegree(1)); - EXPECT_EQ(1ull, graph->indegree(1)); + EXPECT_EQ(std::string(k, '$'), graph->get_node_sequence(graph->select_node(1))); + EXPECT_EQ(1ull, graph->outdegree(graph->select_node(1))); + EXPECT_EQ(1ull, graph->indegree(graph->select_node(1))); // 'AAAAA' auto node_A = graph->kmer_to_node(std::string(k, 'A')); @@ -40,7 +40,7 @@ TEST(DBGSuccinct, get_degree_with_source_dummy) { graph->mask_dummy_kmers(1, false); // dummy source k-mer: '$$$$$' - EXPECT_NE(std::string(k, '$'), graph->get_node_sequence(1)); + EXPECT_NE(std::string(k, '$'), graph->get_node_sequence(graph->select_node(1))); // 'AAAAA' node_A = graph->kmer_to_node(std::string(k, 'A')); @@ -65,9 +65,9 @@ TEST(DBGSuccinct, get_degree_with_source_and_sink_dummy) { + std::string(k - 1, 'T')); // dummy source k-mer: '$$$$$' - EXPECT_EQ(std::string(k, '$'), graph->get_node_sequence(1)); - EXPECT_EQ(1ull, graph->outdegree(1)); - EXPECT_EQ(1ull, graph->indegree(1)); + EXPECT_EQ(std::string(k, '$'), graph->get_node_sequence(graph->select_node(1))); + EXPECT_EQ(1ull, graph->outdegree(graph->select_node(1))); + EXPECT_EQ(1ull, graph->indegree(graph->select_node(1))); // 'AAAAA' auto node_A = graph->kmer_to_node(std::string(k, 'A')); @@ -85,7 +85,7 @@ TEST(DBGSuccinct, get_degree_with_source_and_sink_dummy) { graph->mask_dummy_kmers(1, false); // dummy source k-mer: '$$$$$' - EXPECT_NE(std::string(k, '$'), graph->get_node_sequence(1)); + EXPECT_NE(std::string(k, '$'), graph->get_node_sequence(graph->select_node(1))); // 'AAAAA' node_A = graph->kmer_to_node(std::string(k, 'A')); @@ -109,7 +109,7 @@ TEST(DBGSuccinct, is_single_outgoing_simple) { uint64_t single_outgoing_counter = 0; for (DBGSuccinct::node_index i = 1; i <= graph->num_nodes(); ++i) { - if (graph->outdegree(i) == 1) + if (graph->outdegree(graph->select_node(i)) == 1) single_outgoing_counter++; } @@ -126,7 +126,7 @@ TEST(DBGSuccinct, is_single_outgoing_for_multiple_valid_edges) { uint64_t single_outgoing_counter = 0; for (DBGSuccinct::node_index i = 1; i <= graph->num_nodes(); ++i) { - if (graph->outdegree(i) == 1) + if (graph->outdegree(graph->select_node(i)) == 1) single_outgoing_counter++; } From a49f07f9c1cb99657007bfe684e2e82e8c898d33 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Thu, 10 Oct 2024 01:13:23 +0200 Subject: [PATCH 16/29] Fix AnnotatedDBG test group --- .../tests/annotation/test_annotated_dbg.cpp | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/metagraph/tests/annotation/test_annotated_dbg.cpp b/metagraph/tests/annotation/test_annotated_dbg.cpp index 278f01f1f0..b3f7cc931b 100644 --- a/metagraph/tests/annotation/test_annotated_dbg.cpp +++ b/metagraph/tests/annotation/test_annotated_dbg.cpp @@ -512,8 +512,8 @@ TEST(AnnotatedDBG, ExtendGraphAddTwoPathsWithoutDummy) { ); EXPECT_EQ(num_nodes, anno_graph.get_graph().num_nodes()); - EXPECT_TRUE(anno_graph.get_annotator().num_objects() + k - < dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) + EXPECT_EQ(anno_graph.get_annotator().num_objects(), + dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) << dynamic_cast(anno_graph.get_graph()).get_boss(); EXPECT_FALSE(anno_graph.label_exists("First")); @@ -540,7 +540,7 @@ TEST(AnnotatedDBG, ExtendGraphAddTwoPathsWithoutDummy) { ); anno_graph.annotator_->insert_rows(edge_to_row_idx(inserted_nodes)); - EXPECT_EQ(anno_graph.get_graph().num_nodes() + 1, inserted_nodes.size()); + EXPECT_EQ(anno_graph.get_graph().max_index() + 1, inserted_nodes.size()); ASSERT_EQ(std::vector { "First" }, anno_graph.get_labels(seq_first, 1)); @@ -559,8 +559,8 @@ TEST(AnnotatedDBG, ExtendGraphAddTwoPathsWithoutDummy) { EXPECT_TRUE(anno_graph.label_exists("Third")); EXPECT_FALSE(anno_graph.label_exists("Fourth")); - EXPECT_TRUE(anno_graph.get_annotator().num_objects() + k - < dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) + EXPECT_EQ(anno_graph.get_annotator().num_objects(), + dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) << dynamic_cast(anno_graph.get_graph()).get_boss(); EXPECT_EQ(std::vector { "First" }, @@ -630,8 +630,8 @@ TEST(AnnotatedDBG, ExtendGraphAddTwoPathsWithoutDummyParallel) { std::make_unique>(graph->max_index()) ); - EXPECT_TRUE(anno_graph.get_annotator().num_objects() + k - < dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) + EXPECT_EQ(anno_graph.get_annotator().num_objects(), + dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) << dynamic_cast(anno_graph.get_graph()).get_boss(); EXPECT_FALSE(anno_graph.label_exists("First")); @@ -664,7 +664,7 @@ TEST(AnnotatedDBG, ExtendGraphAddTwoPathsWithoutDummyParallel) { ); anno_graph.annotator_->insert_rows(edge_to_row_idx(inserted_nodes)); - EXPECT_EQ(anno_graph.get_graph().num_nodes() + 1, inserted_nodes.size()); + EXPECT_EQ(anno_graph.get_graph().max_index() + 1, inserted_nodes.size()); ASSERT_EQ(std::vector { "First" }, anno_graph.get_labels(seq_first, 1)); @@ -688,8 +688,8 @@ TEST(AnnotatedDBG, ExtendGraphAddTwoPathsWithoutDummyParallel) { EXPECT_TRUE(anno_graph.label_exists("Third")); EXPECT_FALSE(anno_graph.label_exists("Fourth")); - EXPECT_TRUE(anno_graph.get_annotator().num_objects() + k - < dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) + EXPECT_EQ(anno_graph.get_annotator().num_objects(), + dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) << dynamic_cast(anno_graph.get_graph()).get_boss(); EXPECT_EQ(std::vector { "First" }, @@ -770,8 +770,8 @@ TEST(AnnotatedDBG, ExtendGraphAddTwoPathsPruneDummy) { EXPECT_FALSE(anno_graph.label_exists("Third")); EXPECT_FALSE(anno_graph.label_exists("Fourth")); - EXPECT_TRUE(anno_graph.get_annotator().num_objects() + 1 - < dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) + EXPECT_EQ(anno_graph.get_annotator().num_objects(), + dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) << dynamic_cast(anno_graph.get_graph()).get_boss(); ASSERT_EQ(std::vector { "First" }, @@ -786,7 +786,7 @@ TEST(AnnotatedDBG, ExtendGraphAddTwoPathsPruneDummy) { ); anno_graph.annotator_->insert_rows(edge_to_row_idx(inserted_nodes)); - EXPECT_EQ(anno_graph.get_graph().num_nodes() + 1, inserted_nodes.size()); + EXPECT_EQ(anno_graph.get_graph().max_index() + 1, inserted_nodes.size()); ASSERT_EQ(std::vector { "First" }, anno_graph.get_labels(seq_first, 1)); @@ -805,8 +805,8 @@ TEST(AnnotatedDBG, ExtendGraphAddTwoPathsPruneDummy) { EXPECT_TRUE(anno_graph.label_exists("Third")); EXPECT_FALSE(anno_graph.label_exists("Fourth")); - EXPECT_TRUE(anno_graph.get_annotator().num_objects() + 1 - < dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) + EXPECT_EQ(anno_graph.get_annotator().num_objects(), + dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) << dynamic_cast(anno_graph.get_graph()).get_boss(); EXPECT_EQ(std::vector { "First" }, @@ -893,8 +893,8 @@ TEST(AnnotatedDBG, ExtendGraphAddTwoPathsPruneDummyParallel) { EXPECT_FALSE(anno_graph.label_exists("Third")); EXPECT_FALSE(anno_graph.label_exists("Fourth")); - EXPECT_TRUE(anno_graph.get_annotator().num_objects() + 1 - < dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) + EXPECT_EQ(anno_graph.get_annotator().num_objects(), + dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) << dynamic_cast(anno_graph.get_graph()).get_boss(); ASSERT_EQ(std::vector { "First" }, @@ -909,7 +909,7 @@ TEST(AnnotatedDBG, ExtendGraphAddTwoPathsPruneDummyParallel) { ); anno_graph.annotator_->insert_rows(edge_to_row_idx(inserted_nodes)); - EXPECT_EQ(anno_graph.get_graph().num_nodes() + 1, inserted_nodes.size()); + EXPECT_EQ(anno_graph.get_graph().max_index() + 1, inserted_nodes.size()); ASSERT_EQ(std::vector { "First" }, anno_graph.get_labels(seq_first, 1)); @@ -933,8 +933,8 @@ TEST(AnnotatedDBG, ExtendGraphAddTwoPathsPruneDummyParallel) { EXPECT_TRUE(anno_graph.label_exists("Third")); EXPECT_FALSE(anno_graph.label_exists("Fourth")); - EXPECT_TRUE(anno_graph.get_annotator().num_objects() + 1 - < dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) + EXPECT_EQ(anno_graph.get_annotator().num_objects(), + dynamic_cast(anno_graph.get_graph()).get_boss().num_edges()) << dynamic_cast(anno_graph.get_graph()).get_boss(); EXPECT_EQ(std::vector { "First" }, From 68dd5f5382e6fe203561225d753e9cf1c9e96caa Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Thu, 10 Oct 2024 02:13:20 +0200 Subject: [PATCH 17/29] Fix RowDiff tests --- .../annotation/row_diff/test_row_diff.cpp | 87 ++++++++++++------- 1 file changed, 55 insertions(+), 32 deletions(-) diff --git a/metagraph/tests/annotation/row_diff/test_row_diff.cpp b/metagraph/tests/annotation/row_diff/test_row_diff.cpp index 59f2f03b5b..4eb7b2e0fb 100644 --- a/metagraph/tests/annotation/row_diff/test_row_diff.cpp +++ b/metagraph/tests/annotation/row_diff/test_row_diff.cpp @@ -187,18 +187,30 @@ TEST(RowDiff, GetAnnotationMasked) { graph.mask_dummy_kmers(1, false); // build annotation - sdsl::bit_vector bterminal = { 0, 0, 0, 0, 1, 0, 1, 0 }; + sdsl::bit_vector bterminal_masked = { 0, 0, 0, 0, 1, 0, 1, 0 }; + sdsl::bit_vector bterminal(graph.max_index() + 1); + sdsl::bit_vector cols_masked[2] = { + { 1, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 1, 0, 1, 1 } + }; + sdsl::bit_vector cols_concrete[2]; + cols_concrete[0].resize(graph.max_index() + 1); + cols_concrete[1].resize(graph.max_index() + 1); + graph.call_nodes([&](auto i) { + auto rank = graph.rank_node(i) - 1; + bterminal[i - 1] = bterminal_masked[rank]; + cols_concrete[0][i - 1] = cols_masked[0][rank]; + cols_concrete[1][i - 1] = cols_masked[1][rank]; + }); anchor_bv_type terminal(bterminal); utils::TempFile fterm_temp; std::ofstream fterm(fterm_temp.name(), ios::binary); terminal.serialize(fterm); fterm.flush(); - + std::vector> cols(2); - cols[0] = std::make_unique( - std::initializer_list({ 1, 0, 0, 0, 0, 0, 0, 0 })); - cols[1] = std::make_unique( - std::initializer_list({ 0, 0, 0, 0, 1, 0, 1, 1 })); + cols[0] = std::make_unique(std::move(cols_concrete[0])); + cols[1] = std::make_unique(std::move(cols_concrete[1])); ColumnMajor mat(std::move(cols)); @@ -206,28 +218,28 @@ TEST(RowDiff, GetAnnotationMasked) { annot.load_anchor(fterm_temp.name()); EXPECT_EQ("CTAG", graph.get_node_sequence(graph.select_node(1))); - ASSERT_THAT(annot.get_rows({0})[0], ElementsAre(0, 1)); + ASSERT_THAT(annot.get_rows({graph.select_node(1) - 1})[0], ElementsAre(0, 1)); EXPECT_EQ("AGCT", graph.get_node_sequence(graph.select_node(2))); - ASSERT_THAT(annot.get_rows({1})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph.select_node(2) - 1})[0], ElementsAre(1)); EXPECT_EQ("CTCT", graph.get_node_sequence(graph.select_node(3))); - ASSERT_THAT(annot.get_rows({2})[0], ElementsAre(0)); + ASSERT_THAT(annot.get_rows({graph.select_node(3) - 1})[0], ElementsAre(0)); EXPECT_EQ("TAGC", graph.get_node_sequence(graph.select_node(4))); - ASSERT_THAT(annot.get_rows({3})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph.select_node(4) - 1})[0], ElementsAre(1)); EXPECT_EQ("ACTA", graph.get_node_sequence(graph.select_node(5))); - ASSERT_THAT(annot.get_rows({4})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph.select_node(5) - 1})[0], ElementsAre(1)); EXPECT_EQ("ACTC", graph.get_node_sequence(graph.select_node(6))); - ASSERT_THAT(annot.get_rows({5})[0], ElementsAre(0)); + ASSERT_THAT(annot.get_rows({graph.select_node(6) - 1})[0], ElementsAre(0)); EXPECT_EQ("GCTA", graph.get_node_sequence(graph.select_node(7))); - ASSERT_THAT(annot.get_rows({6})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph.select_node(7) - 1})[0], ElementsAre(1)); EXPECT_EQ("TCTA", graph.get_node_sequence(graph.select_node(8))); - ASSERT_THAT(annot.get_rows({7})[0], ElementsAre(0)); + ASSERT_THAT(annot.get_rows({graph.select_node(8) - 1})[0], ElementsAre(0)); } /** @@ -299,57 +311,68 @@ TEST(RowDiff, GetAnnotationBifurcationMasked) { graph.mask_dummy_kmers(1, false); // build annotation - sdsl::bit_vector bterminal = { 0, 1, 0, 0, 0, 0, 1, 0, 1, 0 }; + sdsl::bit_vector bterminal_masked = { 0, 1, 0, 0, 0, 0, 1, 0, 1, 0 }; + sdsl::bit_vector bterminal(graph.max_index() + 1); + sdsl::bit_vector cols_masked[2] = { + {0, 0, 1, 0, 0, 0, 1, 0, 1, 0 }, + {0, 1, 1, 0, 0, 0, 0, 0, 1, 0 } + }; + sdsl::bit_vector cols_concrete[2]; + cols_concrete[0].resize(graph.max_index() + 1); + cols_concrete[1].resize(graph.max_index() + 1); + graph.call_nodes([&](auto i) { + auto rank = graph.rank_node(i) - 1; + bterminal[i - 1] = bterminal_masked[rank]; + cols_concrete[0][i - 1] = cols_masked[0][rank]; + cols_concrete[1][i - 1] = cols_masked[1][rank]; + }); anchor_bv_type terminal(bterminal); utils::TempFile fterm_temp; std::ofstream fterm(fterm_temp.name(), ios::binary); terminal.serialize(fterm); fterm.flush(); + + std::vector> cols(2); + cols[0] = std::make_unique(std::move(cols_concrete[0])); + cols[1] = std::make_unique(std::move(cols_concrete[1])); Vector diffs = { 1, 0, 1, 0, 0, 1 }; sdsl::bit_vector boundary = { 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1 }; - - std::vector> cols(2); - cols[0] = std::make_unique( - std::initializer_list({0, 0, 1, 0, 0, 0, 1, 0, 1, 0 })); - cols[1] = std::make_unique( - std::initializer_list({0, 1, 1, 0, 0, 0, 0, 0, 1, 0 })); - ColumnMajor mat(std::move(cols)); RowDiff annot(&graph, std::move(mat)); annot.load_anchor(fterm_temp.name()); EXPECT_EQ("CTAG", graph.get_node_sequence(graph.select_node(1))); - ASSERT_THAT(annot.get_rows({0})[0], ElementsAre(0, 1)); + ASSERT_THAT(annot.get_rows({graph.select_node(1) - 1})[0], ElementsAre(0, 1)); EXPECT_EQ("CTAT", graph.get_node_sequence(graph.select_node(2))); - ASSERT_THAT(annot.get_rows({1})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph.select_node(2) - 1})[0], ElementsAre(1)); EXPECT_EQ("TACT", graph.get_node_sequence(graph.select_node(3))); - ASSERT_THAT(annot.get_rows({2})[0], ElementsAre(0)); + ASSERT_THAT(annot.get_rows({graph.select_node(3) - 1})[0], ElementsAre(0)); EXPECT_EQ("AGCT", graph.get_node_sequence(graph.select_node(4))); - ASSERT_THAT(annot.get_rows({3})[0], ElementsAre(0, 1)); + ASSERT_THAT(annot.get_rows({graph.select_node(4) - 1})[0], ElementsAre(0, 1)); EXPECT_EQ("CTCT", graph.get_node_sequence(graph.select_node(5))); - ASSERT_THAT(annot.get_rows({4})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph.select_node(5) - 1})[0], ElementsAre(1)); EXPECT_EQ("TAGC", graph.get_node_sequence(graph.select_node(6))); - ASSERT_THAT(annot.get_rows({5})[0], ElementsAre(0, 1)); + ASSERT_THAT(annot.get_rows({graph.select_node(6) - 1})[0], ElementsAre(0, 1)); EXPECT_EQ("ACTA", graph.get_node_sequence(graph.select_node(7))); - ASSERT_THAT(annot.get_rows({6})[0], ElementsAre(0)); + ASSERT_THAT(annot.get_rows({graph.select_node(7) - 1})[0], ElementsAre(0)); EXPECT_EQ("ACTC", graph.get_node_sequence(graph.select_node(8))); - ASSERT_THAT(annot.get_rows({7})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph.select_node(8) - 1})[0], ElementsAre(1)); EXPECT_EQ("GCTA", graph.get_node_sequence(graph.select_node(9))); - ASSERT_THAT(annot.get_rows({8})[0], ElementsAre(0, 1)); + ASSERT_THAT(annot.get_rows({graph.select_node(9) - 1})[0], ElementsAre(0, 1)); EXPECT_EQ("TCTA", graph.get_node_sequence(graph.select_node(10))); - ASSERT_THAT(annot.get_rows({9})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph.select_node(10) - 1})[0], ElementsAre(1)); } } // namespace From ee0c4a86d2585910d037a2a8ef447bd1e9d9ca2c Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Thu, 10 Oct 2024 11:57:09 +0200 Subject: [PATCH 18/29] Return npos in certain callbacks for dummy nodes --- .../src/graph/representation/succinct/dbg_succinct.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp index 6c602cdbde..9ca7e09820 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp @@ -261,7 +261,7 @@ void DBGSuccinct::map_to_nodes_sequentially(std::string_view sequence, boss_graph_->map_to_edges( sequence, - [&](BOSS::edge_index i) { callback(i); }, + [&](BOSS::edge_index i) { callback(is_valid(i) ? i : npos); }, terminate, [&]() { if (!is_missing()) @@ -385,7 +385,7 @@ void DBGSuccinct::traverse(node_index start, edge = boss_graph_->pick_edge(edge, boss_graph_->encode(*begin)); start = edge; - if (start == npos) + if (!is_valid(start)) return; callback(start); @@ -447,13 +447,13 @@ void DBGSuccinct::map_to_nodes(std::string_view sequence, for (size_t i = 0; i < boss_edges.size() && !terminate(); ++i) { // the definition of a canonical k-mer is redefined: // use k-mer with smaller index in the BOSS table. - callback(boss_edges[i]); + callback(is_valid(boss_edges[i]) ? boss_edges[i] : npos); } } else { boss_graph_->map_to_edges( sequence, - [&](BOSS::edge_index i) { callback(i); }, + [&](BOSS::edge_index i) { callback(is_valid(i) ? i : npos); }, terminate, [&]() { if (!is_missing()) From a46f6a46dfa59e26bc09ac2ba5dd37e9235b359b Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Thu, 10 Oct 2024 13:38:05 +0200 Subject: [PATCH 19/29] Validate edges on BOSS edge -> DBG node transition --- .../alignment/aligner_seeder_methods.cpp | 2 +- metagraph/src/graph/alignment/alignment.cpp | 2 +- .../graph/representation/canonical_dbg.cpp | 4 +-- .../representation/succinct/dbg_succinct.cpp | 25 ++++++++++++++----- .../representation/succinct/dbg_succinct.hpp | 4 +++ 5 files changed, 27 insertions(+), 10 deletions(-) diff --git a/metagraph/src/graph/alignment/aligner_seeder_methods.cpp b/metagraph/src/graph/alignment/aligner_seeder_methods.cpp index 84755b6fa6..1a6d7e03d4 100644 --- a/metagraph/src/graph/alignment/aligner_seeder_methods.cpp +++ b/metagraph/src/graph/alignment/aligner_seeder_methods.cpp @@ -104,7 +104,7 @@ void suffix_to_prefix(const DBGSuccinct &dbg_succ, const auto &[first, last, seed_length] = final_range; assert(seed_length == boss.get_k()); for (boss::BOSS::edge_index i = first; i <= last; ++i) { - DBGSuccinct::node_index node = i; + DBGSuccinct::node_index node = dbg_succ.validate_edge(i); if (node) callback(node); } diff --git a/metagraph/src/graph/alignment/alignment.cpp b/metagraph/src/graph/alignment/alignment.cpp index fe1127fd41..ef1f4fb29b 100644 --- a/metagraph/src/graph/alignment/alignment.cpp +++ b/metagraph/src/graph/alignment/alignment.cpp @@ -565,7 +565,7 @@ void Alignment::reverse_complement(const DeBruijnGraph &graph, return; } - nodes_[0] = edge; + nodes_[0] = dbg_succ.validate_edge(edge); assert(nodes_[0]); sequence_.push_back(boss.decode(edge_label)); assert(graph.get_node_sequence(nodes_[0]) diff --git a/metagraph/src/graph/representation/canonical_dbg.cpp b/metagraph/src/graph/representation/canonical_dbg.cpp index 097011e4bc..5ba11f69fe 100644 --- a/metagraph/src/graph/representation/canonical_dbg.cpp +++ b/metagraph/src/graph/representation/canonical_dbg.cpp @@ -668,8 +668,8 @@ ::adjacent_outgoing_rc_strand(node_index node, cache.call_incoming_edges(rc_edge, [&](edge_index prev_edge) { - node_index prev = prev_edge; - if (!dbg_succ_->is_valid(prev)) + node_index prev = dbg_succ_->validate_edge(prev_edge); + if (!prev) return; char c = cache.get_first_char(prev_edge, rc_edge); diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp index 9ca7e09820..95489f3e08 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp @@ -261,7 +261,7 @@ void DBGSuccinct::map_to_nodes_sequentially(std::string_view sequence, boss_graph_->map_to_edges( sequence, - [&](BOSS::edge_index i) { callback(is_valid(i) ? i : npos); }, + [&](BOSS::edge_index i) { callback(validate_edge(i)); }, terminate, [&]() { if (!is_missing()) @@ -447,13 +447,13 @@ void DBGSuccinct::map_to_nodes(std::string_view sequence, for (size_t i = 0; i < boss_edges.size() && !terminate(); ++i) { // the definition of a canonical k-mer is redefined: // use k-mer with smaller index in the BOSS table. - callback(is_valid(boss_edges[i]) ? boss_edges[i] : npos); + callback(validate_edge(boss_edges[i])); } } else { boss_graph_->map_to_edges( sequence, - [&](BOSS::edge_index i) { callback(is_valid(i) ? i : npos); }, + [&](BOSS::edge_index i) { callback(validate_edge(i)); }, terminate, [&]() { if (!is_missing()) @@ -472,6 +472,9 @@ void DBGSuccinct::call_sequences(const CallPath &callback, assert(boss_graph_.get()); boss_graph_->call_sequences( [&](std::string&& seq, auto&& path) { + for (auto &node : path) { + node = validate_edge(node); + } callback(std::move(seq), std::move(path)); }, num_threads, @@ -486,6 +489,9 @@ void DBGSuccinct::call_unitigs(const CallPath &callback, assert(boss_graph_.get()); boss_graph_->call_unitigs( [&](std::string&& seq, auto&& path) { + for (auto &node : path) { + node = validate_edge(node); + } callback(std::move(seq), std::move(path)); }, num_threads, @@ -522,12 +528,16 @@ ::get_last() const { void DBGSuccinct ::row_diff_traverse(size_t num_threads, - size_t max_length, - const bit_vector &rd_succ, - sdsl::bit_vector *terminal) const { + size_t max_length, + const bit_vector &rd_succ, + sdsl::bit_vector *terminal) const { return get_boss().row_diff_traverse(num_threads, max_length, rd_succ, terminal); } +node_index DBGSuccinct +::row_diff_successor(node_index node, const bit_vector &rd_succ) const { + return get_boss().row_diff_successor(node, rd_succ); +} size_t DBGSuccinct::outdegree(node_index node) const { @@ -909,6 +919,9 @@ void DBGSuccinct::mask_dummy_kmers(size_t num_threads, bool with_pruning) { bool DBGSuccinct::is_valid(node_index node) const { return 0 < node && node <= max_index() && (!valid_edges_ || (*valid_edges_)[node]); } +node_index DBGSuccinct::validate_edge(node_index node) const { + return is_valid(node) ? node : npos; +} node_index DBGSuccinct::select_node(uint64_t rank) const { assert(rank <= num_nodes()); diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp index 6f61d74679..69ead431d8 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp @@ -179,6 +179,7 @@ class DBGSuccinct : public DeBruijnGraph { virtual void call_source_nodes(const std::function &callback) const override final; bool is_valid(node_index node) const; + node_index validate_edge(node_index node) const; node_index select_node(uint64_t boss_index) const; uint64_t rank_node(node_index kmer_index) const; @@ -188,6 +189,9 @@ class DBGSuccinct : public DeBruijnGraph { size_t max_length, const bit_vector &rd_succ, sdsl::bit_vector *terminal) const override final; + + virtual node_index row_diff_successor(node_index node, + const bit_vector &rd_succ) const override final; void initialize_bloom_filter_from_fpr(double false_positive_rate, uint32_t max_num_hash_functions = -1); From 60851dae0de716b15611e8a8fd1d87abc7ab3f4b Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Thu, 10 Oct 2024 13:56:22 +0200 Subject: [PATCH 20/29] More validate_edge checks --- .../annotation/binary_matrix/row_diff/row_diff.cpp | 4 +++- .../src/graph/representation/canonical_dbg.cpp | 2 +- .../src/graph/representation/masked_graph.cpp | 6 ++++++ .../graph/representation/succinct/dbg_succinct.cpp | 14 +++++++++----- 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp index a061d87355..a81bef121c 100644 --- a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp +++ b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp @@ -64,7 +64,9 @@ IRowDiff::get_rd_ids(const std::vector &row_ids) const { graph::AnnotatedSequenceGraph::anno_to_graph_index(row); while (true) { - row = graph::AnnotatedSequenceGraph::graph_to_anno_index(boss_edge); + row = graph::AnnotatedSequenceGraph::graph_to_anno_index( + graph_->validate_edge(boss_edge) + ); auto [it, is_new] = node_to_rd.try_emplace(row, node_to_rd.size()); rd_paths_trunc[i].push_back(it.value()); diff --git a/metagraph/src/graph/representation/canonical_dbg.cpp b/metagraph/src/graph/representation/canonical_dbg.cpp index 5ba11f69fe..2b11f695ea 100644 --- a/metagraph/src/graph/representation/canonical_dbg.cpp +++ b/metagraph/src/graph/representation/canonical_dbg.cpp @@ -115,7 +115,7 @@ ::map_to_nodes_sequentially(std::string_view sequence, sequence.substr(1)); boss.map_to_edges(sequence.substr(1), [&](boss::BOSS::edge_index edge) { - path.push_back(edge); + path.push_back(dbg_succ->validate_edge(edge)); ++it; }, []() { return false; }, diff --git a/metagraph/src/graph/representation/masked_graph.cpp b/metagraph/src/graph/representation/masked_graph.cpp index 7cfe13be27..68b721b099 100644 --- a/metagraph/src/graph/representation/masked_graph.cpp +++ b/metagraph/src/graph/representation/masked_graph.cpp @@ -112,6 +112,9 @@ void MaskedDeBruijnGraph::call_sequences(const CallPath &callback, only_valid_nodes_in_mask_); dbg_succ->get_boss().call_sequences([&](std::string&& sequence, auto&& path) { + for (auto &node : path) { + node = dbg_succ->validate_edge(node); + } callback(sequence, path); }, num_threads, kmers_in_single_form, &mask); @@ -130,6 +133,9 @@ void MaskedDeBruijnGraph::call_unitigs(const CallPath &callback, only_valid_nodes_in_mask_); dbg_succ->get_boss().call_unitigs([&](std::string&& sequence, auto&& path) { + for (auto &node : path) { + node = dbg_succ->validate_edge(node); + } callback(sequence, path); }, num_threads, min_tip_size, kmers_in_single_form, &mask); diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp index 95489f3e08..69b51859fd 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp @@ -91,7 +91,9 @@ node_index DBGSuccinct::traverse(node_index node, char next_char) const { // dbg node is a boss edge BOSS::edge_index boss_edge = node; boss_edge = boss_graph_->fwd(boss_edge); - return boss_graph_->pick_edge(boss_edge, boss_graph_->encode(next_char)); + return validate_edge( + boss_graph_->pick_edge(boss_edge, boss_graph_->encode(next_char)) + ); } // Traverse the incoming edge @@ -100,7 +102,9 @@ node_index DBGSuccinct::traverse_back(node_index node, char prev_char) const { // dbg node is a boss edge BOSS::edge_index edge = boss_graph_->bwd(node); - return boss_graph_->pick_incoming_edge(edge, boss_graph_->encode(prev_char)); + return validate_edge( + boss_graph_->pick_incoming_edge(edge, boss_graph_->encode(prev_char)) + ); } template @@ -228,7 +232,7 @@ void DBGSuccinct::add_sequence(std::string_view sequence, // Call all new nodes inserted including the dummy ones, unless they // are masked out. - on_insertion(new_boss_edge); + on_insertion(validate_edge(new_boss_edge)); } assert(!valid_edges_.get() || !(*valid_edges_)[0]); @@ -472,7 +476,7 @@ void DBGSuccinct::call_sequences(const CallPath &callback, assert(boss_graph_.get()); boss_graph_->call_sequences( [&](std::string&& seq, auto&& path) { - for (auto &node : path) { + for (auto &node : path) { node = validate_edge(node); } callback(std::move(seq), std::move(path)); @@ -489,7 +493,7 @@ void DBGSuccinct::call_unitigs(const CallPath &callback, assert(boss_graph_.get()); boss_graph_->call_unitigs( [&](std::string&& seq, auto&& path) { - for (auto &node : path) { + for (auto &node : path) { node = validate_edge(node); } callback(std::move(seq), std::move(path)); From 91ea56f30e8899282d9b43c9f3fb5e38a7605e58 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Thu, 10 Oct 2024 16:33:54 +0200 Subject: [PATCH 21/29] Use is_valid in adjacent_outgoing_rc_strand --- metagraph/src/graph/representation/canonical_dbg.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metagraph/src/graph/representation/canonical_dbg.cpp b/metagraph/src/graph/representation/canonical_dbg.cpp index 2b11f695ea..985a25ada6 100644 --- a/metagraph/src/graph/representation/canonical_dbg.cpp +++ b/metagraph/src/graph/representation/canonical_dbg.cpp @@ -668,8 +668,8 @@ ::adjacent_outgoing_rc_strand(node_index node, cache.call_incoming_edges(rc_edge, [&](edge_index prev_edge) { - node_index prev = dbg_succ_->validate_edge(prev_edge); - if (!prev) + node_index prev = prev_edge; + if (!dbg_succ_->is_valid(prev)) return; char c = cache.get_first_char(prev_edge, rc_edge); From 1976be1a785b44b316ec5a518c4cb2e93997c55e Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Thu, 10 Oct 2024 19:31:54 +0200 Subject: [PATCH 22/29] Fix identation + annotations without succ --- metagraph/src/annotation/row_diff_builder.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 3b2e98ee4d..ff07c788af 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -388,7 +388,7 @@ void build_pred_succ(const graph::DeBruijnGraph &graph, succ_buf.push_back(to_row(j)); succ_boundary_buf.push_back(0); } - if(rd_succ[i]) { + if(rd_succ[i]) { graph.adjacent_incoming_nodes(i, [&](auto pred) { if (dummy && (*dummy)[pred]) { return; @@ -921,7 +921,7 @@ void convert_batch_to_row_diff(const std::string &pred_succ_fprefix, // reduction (zero diff) __atomic_add_fetch(&row_nbits_block[chunk_idx], 1, __ATOMIC_RELAXED); } - } else if (succ) { + } else if (succ || anchor[row_idx]) { bool is_anchor = anchor[row_idx]; // add current bit if this node is an anchor // or if the successor has zero diff From 4e971da44e1a96345c6e2a0f3840834c6dd3dfa5 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Thu, 10 Oct 2024 19:44:41 +0200 Subject: [PATCH 23/29] Fix RowDiff test --- .../binary_matrix/row_diff/row_diff.cpp | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp index a81bef121c..712e098564 100644 --- a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp +++ b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp @@ -64,22 +64,22 @@ IRowDiff::get_rd_ids(const std::vector &row_ids) const { graph::AnnotatedSequenceGraph::anno_to_graph_index(row); while (true) { - row = graph::AnnotatedSequenceGraph::graph_to_anno_index( - graph_->validate_edge(boss_edge) - ); - - auto [it, is_new] = node_to_rd.try_emplace(row, node_to_rd.size()); - rd_paths_trunc[i].push_back(it.value()); - - // If a node had been reached before, we interrupt the diff path. - // The annotation for that node will have been reconstructed earlier - // than for other nodes in this path as well. Thus, we will start - // reconstruction from that node and don't need its successors. - if (!is_new) - break; - - if (anchor_[row]) - break; + if (graph_->is_valid(boss_edge)) { + row = graph::AnnotatedSequenceGraph::graph_to_anno_index(boss_edge); + + auto [it, is_new] = node_to_rd.try_emplace(row, node_to_rd.size()); + rd_paths_trunc[i].push_back(it.value()); + + // If a node had been reached before, we interrupt the diff path. + // The annotation for that node will have been reconstructed earlier + // than for other nodes in this path as well. Thus, we will start + // reconstruction from that node and don't need its successors. + if (!is_new) + break; + + if (anchor_[row]) + break; + } boss_edge = boss.row_diff_successor(boss_edge, rd_succ); } From 7737e26af9cb3daae75c1630e803b44c3a211601 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Thu, 10 Oct 2024 20:08:26 +0200 Subject: [PATCH 24/29] Move get_last, row_diff_traverse, row_diff_successor into row_diff_builder --- metagraph/src/annotation/row_diff_builder.cpp | 102 ++++++++++++++++-- .../representation/base/sequence_graph.cpp | 66 ------------ .../representation/base/sequence_graph.hpp | 9 -- .../representation/succinct/dbg_succinct.cpp | 19 ---- .../representation/succinct/dbg_succinct.hpp | 10 -- 5 files changed, 93 insertions(+), 113 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index ff07c788af..8fcb511a3d 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -11,6 +11,7 @@ #include "common/elias_fano/elias_fano_merger.hpp" #include "common/utils/file_utils.hpp" #include "common/vectors/bit_vector_sdsl.hpp" +#include "common/vectors/bit_vector_dyn.hpp" #include "graph/annotated_dbg.hpp" const uint64_t BLOCK_SIZE = 1 << 25; @@ -26,6 +27,7 @@ namespace annot { using namespace mtg::annot::matrix; using mtg::common::logger; using mtg::graph::boss::BOSS; +using node_index = graph::DeBruijnGraph::node_index; namespace fs = std::filesystem; using anchor_bv_type = RowDiff::anchor_bv_type; @@ -264,6 +266,23 @@ void sum_and_call_counts(const fs::path &dir, } } +std::shared_ptr get_last(const graph::DeBruijnGraph &graph) { + if (auto* dbg_succ = dynamic_cast(&graph)) { + return std::shared_ptr( + std::shared_ptr{}, &dbg_succ->get_boss().get_last()); + } else { + bit_vector_dyn last_bv(graph.max_index() + 1); + graph.call_nodes([&](node_index v) { + std::pair last; + graph.call_outgoing_kmers(v, [&](node_index u, char c) { + last = std::max(last, std::pair{c, u}); + }); + last_bv.set(last.second, true); + }); + return std::make_shared(std::move(last_bv)); + } +} + rd_succ_bv_type route_at_forks(const graph::DeBruijnGraph &graph, const std::string &rd_succ_filename, const std::string &count_vectors_dir, @@ -282,7 +301,7 @@ rd_succ_bv_type route_at_forks(const graph::DeBruijnGraph &graph, logger->trace("RowDiff successors will be set to the adjacent nodes with" " the largest number of labels"); - const bit_vector &last = *graph.get_last(); + const bit_vector &last = *get_last(graph); graph::DeBruijnGraph::node_index graph_idx = to_node(0); std::vector outgoing_counts; @@ -326,6 +345,71 @@ rd_succ_bv_type route_at_forks(const graph::DeBruijnGraph &graph, return rd_succ; } +node_index row_diff_successor(const graph::DeBruijnGraph &graph, + node_index node, + const bit_vector &rd_succ) { + if (auto* dbg_succ = dynamic_cast(&graph)) { + return dbg_succ->get_boss().row_diff_successor(node, rd_succ); + } else { + node_index succ = graph::DeBruijnGraph::npos; + graph.adjacent_outgoing_nodes(node, [&](node_index adjacent_node) { + if(rd_succ[adjacent_node]) { + succ = adjacent_node; + } + }); + assert(succ != graph::DeBruijnGraph::npos && "a row diff successor must exist"); + return succ; + } +} + +void row_diff_traverse(const graph::DeBruijnGraph &graph, + size_t num_threads, + size_t max_length, + const bit_vector &rd_succ, + sdsl::bit_vector *terminal) { + if (auto* dbg_succ = dynamic_cast(&graph)) { + return dbg_succ->get_boss().row_diff_traverse( + num_threads, max_length, rd_succ, terminal); + } else { + sdsl::bit_vector visited(graph.max_index() + 1); + auto finalised = visited; + std::vector distance(graph.max_index() + 1); + assert(terminal->size() == visited.size()); + assert(rd_succ.size() == visited.size()); + auto set_terminal = [&](int v) { + distance[v] = 0; + (*terminal)[v] = true; + }; + graph.call_nodes([&](node_index v) { + static std::stack path; + while (!visited[v]) { + path.push(v); + visited[v] = true; + if (!graph.has_no_outgoing(v)) { + v = row_diff_successor(graph, v, rd_succ); + } + } + // Either a sink, or a cyclic dependency + if (!finalised[v]) { + set_terminal(v); + finalised[v] = true; + } + node_index succ; + while (!empty(path)) { + succ = std::exchange(v, path.top()); + if (!finalised[v]) { + distance[v] = distance[succ] + 1; + if (distance[v] == max_length) { + set_terminal(v); + } + finalised[v] = true; + } + path.pop(); + } + }); + } +} + void build_pred_succ(const graph::DeBruijnGraph &graph, const std::string &outfbase, const std::string &count_vectors_dir, @@ -366,13 +450,13 @@ void build_pred_succ(const graph::DeBruijnGraph &graph, // traverse graph in parallel processing blocks of size |BS| // use static scheduling to make threads process ordered contiguous blocks #pragma omp parallel for ordered num_threads(num_threads) schedule(dynamic) - for (uint64_t start = 1; start <= graph.max_index(); start += BS) { - std::vector succ_buf; + for (node_index start = 1; start <= graph.max_index(); start += BS) { + std::vector succ_buf; std::vector succ_boundary_buf; - std::vector pred_buf; + std::vector pred_buf; std::vector pred_boundary_buf; - for (uint64_t i = start; i < std::min(start + BS, graph.max_index() + 1); ++i) { + for (node_index i = start; i < std::min(start + BS, graph.max_index() + 1); ++i) { bool skip_succ = false, skip_all = false; if (succinct) { // Legacy code for DBGSuccinct BOSS::edge_index boss_idx = i; @@ -384,7 +468,7 @@ void build_pred_succ(const graph::DeBruijnGraph &graph, } auto with_rd_succ = [&](bit_vector const& rd_succ) { if(!skip_succ) { - auto j = graph.row_diff_successor(i, rd_succ); + auto j = row_diff_successor(graph, i, rd_succ); succ_buf.push_back(to_row(j)); succ_boundary_buf.push_back(0); } @@ -402,7 +486,7 @@ void build_pred_succ(const graph::DeBruijnGraph &graph, if (rd_succ.size()) { with_rd_succ(rd_succ); } else { - with_rd_succ(*graph.get_last()); + with_rd_succ(*get_last(graph)); } } succ_boundary_buf.push_back(1); @@ -484,11 +568,11 @@ void assign_anchors(const graph::DeBruijnGraph &graph, if (rd_succ.size()) { logger->trace("Assigning anchors for RowDiff successors {}...", rd_succ_fname); - graph.row_diff_traverse(num_threads, max_length, rd_succ, &anchors_bv); + row_diff_traverse(graph, num_threads, max_length, rd_succ, &anchors_bv); } else { logger->warn("Assigning anchors without chosen RowDiff successors." " The last outgoing edges will be used for routing."); - graph.row_diff_traverse(num_threads, max_length, *graph.get_last(), &anchors_bv); + row_diff_traverse(graph, num_threads, max_length, *get_last(graph), &anchors_bv); } } diff --git a/metagraph/src/graph/representation/base/sequence_graph.cpp b/metagraph/src/graph/representation/base/sequence_graph.cpp index 1b933c5b71..89ecdfa16f 100644 --- a/metagraph/src/graph/representation/base/sequence_graph.cpp +++ b/metagraph/src/graph/representation/base/sequence_graph.cpp @@ -5,7 +5,6 @@ #include #include "common/logger.hpp" -#include "common/vectors/bit_vector_dyn.hpp" #include "common/seq_tools/reverse_complement.hpp" #include "common/threads/threading.hpp" #include "common/vectors/vector_algorithm.hpp" @@ -421,71 +420,6 @@ void DeBruijnGraph::call_unitigs(const CallPath &callback, ::mtg::graph::call_sequences(*this, callback, num_threads, true, min_tip_size, kmers_in_single_form); } -std::shared_ptr DeBruijnGraph::get_last() const { - bit_vector_dyn last_bv(max_index() + 1); - call_nodes([&](node_index v) { - std::pair last; - call_outgoing_kmers(v, [&](node_index u, char c) { - last = std::max(last, std::pair{c, u}); - }); - last_bv.set(last.second, true); - }); - return std::make_shared(std::move(last_bv)); -} - -void DeBruijnGraph::row_diff_traverse(size_t num_threads, - size_t max_length, - const bit_vector &rd_succ, - sdsl::bit_vector *terminal) const { - sdsl::bit_vector visited(max_index() + 1); - auto finalised = visited; - std::vector distance(max_index() + 1); - assert(terminal->size() == visited.size()); - assert(rd_succ.size() == visited.size()); - auto set_terminal = [&](int v) { - distance[v] = 0; - (*terminal)[v] = true; - }; - call_nodes([&](node_index v) { - static std::stack path; - while (!visited[v]) { - path.push(v); - visited[v] = true; - if (!has_no_outgoing(v)) { - v = row_diff_successor(v, rd_succ); - } - } - // Either a sink, or a cyclic dependency - if (!finalised[v]) { - set_terminal(v); - finalised[v] = true; - } - node_index succ; - while (!empty(path)) { - succ = std::exchange(v, path.top()); - if (!finalised[v]) { - distance[v] = distance[succ] + 1; - if (distance[v] == max_length) { - set_terminal(v); - } - finalised[v] = true; - } - path.pop(); - } - }); -} - -node_index DeBruijnGraph::row_diff_successor(node_index node, const bit_vector &rd_succ) const { - node_index succ = npos; - adjacent_outgoing_nodes(node, [&](node_index adjacent_node) { - if(rd_succ[adjacent_node]) { - succ = adjacent_node; - } - }); - assert(succ != npos && "a row diff successor must exist"); - return succ; -} - /** * Traverse graph and iterate over all nodes */ diff --git a/metagraph/src/graph/representation/base/sequence_graph.hpp b/metagraph/src/graph/representation/base/sequence_graph.hpp index e79e2f3612..72ff596e82 100644 --- a/metagraph/src/graph/representation/base/sequence_graph.hpp +++ b/metagraph/src/graph/representation/base/sequence_graph.hpp @@ -246,15 +246,6 @@ class DeBruijnGraph : public SequenceGraph { // Call all nodes that have no incoming edges virtual void call_source_nodes(const std::function &callback) const; - - virtual std::shared_ptr get_last() const; - - virtual void row_diff_traverse(size_t num_threads, - size_t max_length, - const bit_vector &rd_succ, - sdsl::bit_vector *terminal) const; - - virtual node_index row_diff_successor(node_index node, const bit_vector &rd_succ) const; }; diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp index 69b51859fd..bf46dbb865 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp @@ -525,25 +525,6 @@ ::call_source_nodes(const std::function &callback) const { }); } -std::shared_ptr DBGSuccinct -::get_last() const { - return std::shared_ptr(std::shared_ptr{}, &get_boss().get_last()); -} - -void DBGSuccinct -::row_diff_traverse(size_t num_threads, - size_t max_length, - const bit_vector &rd_succ, - sdsl::bit_vector *terminal) const { - return get_boss().row_diff_traverse(num_threads, max_length, rd_succ, terminal); -} - -node_index DBGSuccinct -::row_diff_successor(node_index node, const bit_vector &rd_succ) const { - return get_boss().row_diff_successor(node, rd_succ); -} - - size_t DBGSuccinct::outdegree(node_index node) const { assert(is_valid(node)); diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp index 69ead431d8..77a603b150 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp @@ -183,16 +183,6 @@ class DBGSuccinct : public DeBruijnGraph { node_index select_node(uint64_t boss_index) const; uint64_t rank_node(node_index kmer_index) const; - virtual std::shared_ptr get_last() const override final; - - virtual void row_diff_traverse(size_t num_threads, - size_t max_length, - const bit_vector &rd_succ, - sdsl::bit_vector *terminal) const override final; - - virtual node_index row_diff_successor(node_index node, - const bit_vector &rd_succ) const override final; - void initialize_bloom_filter_from_fpr(double false_positive_rate, uint32_t max_num_hash_functions = -1); From 04740f1c1b5d77126601c31c1f194b05ea9ecb4a Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Thu, 10 Oct 2024 21:38:44 +0200 Subject: [PATCH 25/29] Preserve lifetime of get_last --- metagraph/src/annotation/row_diff_builder.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 8fcb511a3d..7c19ba98c0 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -296,23 +296,24 @@ rd_succ_bv_type route_at_forks(const graph::DeBruijnGraph &graph, if (utils::ends_with(p.path(), row_count_extension)) optimize_forks = true; } - + // Other graphs may not support consecutive access + optimize_forks &= (bool)dynamic_cast(&graph); if (optimize_forks) { logger->trace("RowDiff successors will be set to the adjacent nodes with" " the largest number of labels"); - const bit_vector &last = *get_last(graph); + auto last = get_last(graph); graph::DeBruijnGraph::node_index graph_idx = to_node(0); std::vector outgoing_counts; - sdsl::bit_vector rd_succ_bv(last.size(), false); + sdsl::bit_vector rd_succ_bv(last->size(), false); sum_and_call_counts(count_vectors_dir, row_count_extension, "row counts", [&](int32_t count) { // TODO: skip single outgoing outgoing_counts.push_back(count); - if (last[graph_idx]) { + if ((*last)[graph_idx]) { // pick the node with the largest count size_t max_pos = std::max_element(outgoing_counts.rbegin(), outgoing_counts.rend()) @@ -486,7 +487,8 @@ void build_pred_succ(const graph::DeBruijnGraph &graph, if (rd_succ.size()) { with_rd_succ(rd_succ); } else { - with_rd_succ(*get_last(graph)); + auto last = get_last(graph); + with_rd_succ(*last); } } succ_boundary_buf.push_back(1); @@ -572,7 +574,8 @@ void assign_anchors(const graph::DeBruijnGraph &graph, } else { logger->warn("Assigning anchors without chosen RowDiff successors." " The last outgoing edges will be used for routing."); - row_diff_traverse(graph, num_threads, max_length, *get_last(graph), &anchors_bv); + auto last = get_last(graph); + row_diff_traverse(graph, num_threads, max_length, *last, &anchors_bv); } } From e2347d623523d88768ee107bf9ad99d1d559559a Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Tue, 15 Oct 2024 03:52:12 +0200 Subject: [PATCH 26/29] Fix integration tests --- metagraph/integration_tests/test_annotate.py | 25 +++++++++++-------- metagraph/integration_tests/test_query.py | 6 ++--- .../integration_tests/test_transform_anno.py | 20 +++++++++------ 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/metagraph/integration_tests/test_annotate.py b/metagraph/integration_tests/test_annotate.py index 38b5f90ba4..4cabf26cdc 100644 --- a/metagraph/integration_tests/test_annotate.py +++ b/metagraph/integration_tests/test_annotate.py @@ -67,8 +67,9 @@ def test_simple_all_graphs(self, graph_repr): self.assertEqual(res.returncode, 0) out = res.stdout.decode().split('\n')[2:] self.assertEqual('labels: 100', out[0]) - self.assertEqual('objects: 46960', out[1]) - self.assertEqual('density: 0.0185072', out[2]) + self.assertIn((out[1], out[2]), [ + ('objects: 47633', 'density: 0.0182458'), # DBGSuccinct with dummy nodes + ('objects: 46960', 'density: 0.0185072')]) self.assertEqual('representation: ' + anno_repr, out[3]) # TODO: add 'hashstr' once the canonical mode is implemented for it @@ -150,8 +151,9 @@ def test_simple_all_graphs_from_kmc(self, graph_repr): self.assertEqual(res.returncode, 0) out = res.stdout.decode().split('\n')[2:] self.assertEqual('labels: 1', out[0]) - self.assertEqual('objects: 469983', out[1]) - self.assertEqual('density: 1', out[2]) + self.assertIn((out[1], out[2]), [ + ('objects: 471169', 'density: 0.997483'), # DBGSuccinct with dummy nodes + ('objects: 469983', 'density: 1')]) self.assertEqual('representation: ' + anno_repr, out[3]) @parameterized.expand(GRAPH_TYPES) @@ -190,8 +192,9 @@ def test_simple_all_graphs_from_kmc_both(self, graph_repr): self.assertEqual(res.returncode, 0) out = res.stdout.decode().split('\n')[2:] self.assertEqual('labels: 1', out[0]) - self.assertEqual('objects: 802920', out[1]) - self.assertEqual('density: 0.585342', out[2]) + self.assertIn((out[1], out[2]), [ + ('objects: 804179', 'density: 0.584426'), # DBGSuccinct with dummy nodes + ('objects: 802920', 'density: 0.585342')]) self.assertEqual('representation: ' + anno_repr, out[3]) # both strands @@ -208,8 +211,9 @@ def test_simple_all_graphs_from_kmc_both(self, graph_repr): self.assertEqual(res.returncode, 0) out = res.stdout.decode().split('\n')[2:] self.assertEqual('labels: 1', out[0]) - self.assertEqual('objects: 802920', out[1]) - self.assertEqual('density: 1', out[2]) + self.assertIn((out[1], out[2]), [ + ('objects: 804179', 'density: 0.998434'), # DBGSuccinct with dummy nodes + ('objects: 802920', 'density: 1')]) self.assertEqual('representation: ' + anno_repr, out[3]) # TODO: add 'hashstr' once the canonical mode is implemented for it @@ -310,8 +314,9 @@ def test_annotate_with_disk_swap(self): self.assertEqual(res.returncode, 0) out = res.stdout.decode().split('\n')[2:] self.assertEqual('labels: 100', out[0]) - self.assertEqual('objects: 46960', out[1]) - self.assertEqual('density: 0.0185072', out[2]) + self.assertIn((out[1], out[2]), [ + ('objects: 47633', 'density: 0.0182458'), # DBGSuccinct with dummy nodes + ('objects: 46960', 'density: 0.0185072')]) self.assertEqual('representation: ' + anno_repr, out[3]) @parameterized.expand(GRAPH_TYPES) diff --git a/metagraph/integration_tests/test_query.py b/metagraph/integration_tests/test_query.py index 25434a7b22..917356e3ac 100644 --- a/metagraph/integration_tests/test_query.py +++ b/metagraph/integration_tests/test_query.py @@ -127,7 +127,7 @@ def check_suffix(anno_repr, suffix): out = res.stdout.decode().split('\n')[2:] assert('labels: 100' == out[0]) if cls.graph_repr != 'hashfast' and (cls.graph_repr != 'succinct' or cls.mask_dummy): - assert('objects: 46960' == out[1]) + assert(out[1] in ['objects: 47633', 'objects: 46960']) if cls.anno_repr.endswith('_noswap'): cls.anno_repr = cls.anno_repr[:-len('_noswap')] @@ -601,7 +601,7 @@ def check_suffix(anno_repr, suffix): assert(res.returncode == 0) out = res.stdout.decode().split('\n')[2:] assert('labels: 3' == out[0]) - assert('objects: 12' == out[1]) + assert(out[1] in ['objects: 18', 'objects: 12']) if cls.anno_repr.endswith('_noswap'): cls.anno_repr = cls.anno_repr[:-len('_noswap')] @@ -697,7 +697,7 @@ def check_suffix(anno_repr, suffix): out = res.stdout.decode().split('\n')[2:] assert('labels: 1' == out[0]) if cls.graph_repr != 'hashfast' and (cls.graph_repr != 'succinct' or cls.mask_dummy): - assert('objects: 46960' == out[1]) + assert(out[1] in ['objects: 47633', 'objects: 46960']) if cls.anno_repr.endswith('_noswap'): cls.anno_repr = cls.anno_repr[:-len('_noswap')] diff --git a/metagraph/integration_tests/test_transform_anno.py b/metagraph/integration_tests/test_transform_anno.py index a1887db9e0..3961c98bbf 100644 --- a/metagraph/integration_tests/test_transform_anno.py +++ b/metagraph/integration_tests/test_transform_anno.py @@ -56,8 +56,9 @@ def setUp(self): self.assertEqual(res.returncode, 0) out = res.stdout.decode().split('\n')[2:] self.assertEqual('labels: 100', out[0]) - self.assertEqual('objects: 46960', out[1]) - self.assertEqual('density: 0.0185072', out[2]) + self.assertIn((out[1], out[2]), [ + ('objects: 47633', 'density: 0.0182458'), # DBGSuccinct with dummy nodes + ('objects: 46960', 'density: 0.0185072')]) self.assertEqual(f'representation: {self.anno_repr}', out[3]) def tearDown(self): @@ -82,8 +83,11 @@ def _check_aggregation_min(self, min_count, expected_density): self.assertEqual(res.returncode, 0) out = res.stdout.decode().split('\n')[2:] self.assertEqual('labels: 1', out[0]) - self.assertEqual('objects: 46960', out[1]) - self.assertEqual(f'density: {expected_density}', out[2]) + split = out[2].split() + self.assertEqual(out[2], split[0] + ' ' + split[1]) + density = float(split[1]) + self.assertIn(out[1], ['objects: 47633', 'objects: 46960']) + self.assertLess(abs(density - expected_density * 46960 / 47633), 10**-6) self.assertEqual(f'representation: {self.anno_repr}', out[3]) def test_aggregate_columns(self): @@ -104,9 +108,11 @@ def _check_aggregation_min_max_value(self, min_count, max_value, expected_densit self.assertEqual(res.returncode, 0) out = res.stdout.decode().split('\n')[2:] self.assertEqual('labels: 1', out[0]) - self.assertEqual('objects: 46960', out[1]) - self.assertEqual(f'density: {expected_density}', out[2]) - self.assertEqual(f'representation: {self.anno_repr}', out[3]) + split = out[2].split() + self.assertEqual(out[2], split[0] + ' ' + split[1]) + density = float(split[1]) + self.assertIn(out[1], ['objects: 47633', 'objects: 46960']) + self.assertLess(abs(density - expected_density * 46960 / 47633), 10**-6) def test_aggregate_columns_filtered(self): self._check_aggregation_min_max_value(0, 0, 0) From 94e9f901d278c44ce6bc45eb62f4fd304342a3c8 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Tue, 15 Oct 2024 19:06:46 +0200 Subject: [PATCH 27/29] Fix integration tests + return dict in _get_stats --- metagraph/integration_tests/base.py | 13 +- metagraph/integration_tests/test_align.py | 108 ++++----- metagraph/integration_tests/test_annotate.py | 199 ++++++++-------- metagraph/integration_tests/test_build.py | 160 ++++++------- .../integration_tests/test_build_weighted.py | 165 +++++++------- metagraph/integration_tests/test_clean.py | 214 ++++++++---------- metagraph/integration_tests/test_query.py | 132 +++++------ .../integration_tests/test_transform_anno.py | 68 +++--- metagraph/src/cli/stats.cpp | 2 +- 9 files changed, 505 insertions(+), 556 deletions(-) diff --git a/metagraph/integration_tests/base.py b/metagraph/integration_tests/base.py index 015b7dcf41..36b0b532e2 100644 --- a/metagraph/integration_tests/base.py +++ b/metagraph/integration_tests/base.py @@ -37,10 +37,19 @@ def setUpClass(cls): def _get_stats(graph_path): stats_command = METAGRAPH + ' stats ' + graph_path + ' --mmap' res = subprocess.run(stats_command.split(), stdout=PIPE, stderr=PIPE) - assert(res.returncode == 0) + if res.returncode != 0: + raise AssertionError(f"Command '{stats_command}' failed with return code {res.returncode} and error: {res.stderr.decode()}") stats_command = METAGRAPH + ' stats ' + graph_path + MMAP_FLAG res = subprocess.run(stats_command.split(), stdout=PIPE, stderr=PIPE) - return res + parsed = dict() + parsed['returncode'] = res.returncode + res = res.stdout.decode().split('\n')[2:] + for line in res: + if ': ' in line: + x, y = map(str.strip, line.split(':', 1)) + assert(x not in parsed or parsed[x] == y) + parsed[x] = y + return parsed @staticmethod def _build_graph(input, output, k, repr, mode='basic', extra_params=''): diff --git a/metagraph/integration_tests/test_align.py b/metagraph/integration_tests/test_align.py index 2e3bcb0d83..f4b4c79fa6 100644 --- a/metagraph/integration_tests/test_align.py +++ b/metagraph/integration_tests/test_align.py @@ -35,11 +35,10 @@ def test_simple_align_all_graphs(self, representation): k=11, repr=representation, extra_params="--mask-dummy") - res = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) - params_str = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 16438', params_str[1]) - self.assertEqual('mode: basic', params_str[2]) + params = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) + self.assertEqual('11', params['k']) + self.assertEqual('16438', params['nodes (k)']) + self.assertEqual('basic', params['mode']) stats_command = '{exe} align --align-only-forwards -i {graph} --align-min-exact-match 0.0 {reads}'.format( exe=METAGRAPH, @@ -68,11 +67,10 @@ def test_simple_align_map_all_graphs(self, representation): k=11, repr=representation, extra_params="--mask-dummy") - res = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) - params_str = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 16438', params_str[1]) - self.assertEqual('mode: basic', params_str[2]) + params = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) + self.assertEqual('11', params['k']) + self.assertEqual('16438', params['nodes (k)']) + self.assertEqual('basic', params['mode']) stats_command = '{exe} align -i {graph} --map --count-kmers {reads}'.format( exe=METAGRAPH, @@ -99,11 +97,10 @@ def test_simple_align_map_all_graphs_subk(self, representation): k=11, repr=representation, extra_params="--mask-dummy") - res = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) - params_str = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 16438', params_str[1]) - self.assertEqual('mode: basic', params_str[2]) + params = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) + self.assertEqual('11', params['k']) + self.assertEqual('16438', params['nodes (k)']) + self.assertEqual('basic', params['mode']) stats_command = '{exe} align -i {graph} --map --count-kmers --align-length 10 {reads}'.format( exe=METAGRAPH, @@ -134,11 +131,10 @@ def test_simple_align_map_canonical_all_graphs(self, representation): k=11, repr=representation, mode='canonical', extra_params="--mask-dummy") - res = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) - params_str = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 32782', params_str[1]) - self.assertEqual('mode: canonical', params_str[2]) + params = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) + self.assertEqual('11', params['k']) + self.assertEqual('32782', params['nodes (k)']) + self.assertEqual('canonical', params['mode']) stats_command = '{exe} align -i {graph} --map --count-kmers {reads}'.format( exe=METAGRAPH, @@ -165,11 +161,10 @@ def test_simple_align_json_all_graphs(self, representation): k=11, repr=representation, extra_params="--mask-dummy") - res = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) - params_str = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 16438', params_str[1]) - self.assertEqual('mode: basic', params_str[2]) + params = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) + self.assertEqual('11', params['k']) + self.assertEqual('16438', params['nodes (k)']) + self.assertEqual('basic', params['mode']) stats_command = '{exe} align --align-only-forwards -i {graph} --align-min-exact-match 0.0 {reads}'.format( exe=METAGRAPH, @@ -189,11 +184,10 @@ def test_simple_align_fwd_rev_comp_all_graphs(self, representation): k=11, repr=representation, extra_params="--mask-dummy") - res = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) - params_str = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 16438', params_str[1]) - self.assertEqual('mode: basic', params_str[2]) + params = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) + self.assertEqual('11', params['k']) + self.assertEqual('16438', params['nodes (k)']) + self.assertEqual('basic', params['mode']) stats_command = '{exe} align -i {graph} --align-min-exact-match 0.0 {reads}'.format( exe=METAGRAPH, @@ -222,11 +216,10 @@ def test_simple_align_canonical_all_graphs(self, representation): k=11, repr=representation, mode='canonical', extra_params="--mask-dummy") - res = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) - params_str = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 32782', params_str[1]) - self.assertEqual('mode: canonical', params_str[2]) + params = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) + self.assertEqual('11', params['k']) + self.assertEqual('32782', params['nodes (k)']) + self.assertEqual('canonical', params['mode']) stats_command = '{exe} align -i {graph} --align-min-exact-match 0.0 {reads}'.format( exe=METAGRAPH, @@ -256,11 +249,10 @@ def test_simple_align_canonical_subk_succinct(self, representation): k=11, repr=representation, mode='canonical', extra_params="--mask-dummy") - res = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) - params_str = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 32782', params_str[1]) - self.assertEqual('mode: canonical', params_str[2]) + params = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) + self.assertEqual('11', params['k']) + self.assertEqual('32782', params['nodes (k)']) + self.assertEqual('canonical', params['mode']) stats_command = '{exe} align -i {graph} --align-min-exact-match 0.0 --align-min-seed-length 10 {reads}'.format( exe=METAGRAPH, @@ -286,11 +278,10 @@ def test_simple_align_primary_all_graphs(self, representation): k=11, repr=representation, mode='primary', extra_params="--mask-dummy") - res = self._get_stats(self.tempdir.name + '/genome.MT.primary' + graph_file_extension[representation]) - params_str = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 16391', params_str[1]) - self.assertEqual('mode: primary', params_str[2]) + params = self._get_stats(self.tempdir.name + '/genome.MT.primary' + graph_file_extension[representation]) + self.assertEqual('11', params['k']) + self.assertEqual('16391', params['nodes (k)']) + self.assertEqual('primary', params['mode']) stats_command = '{exe} align -i {graph} --align-min-exact-match 0.0 {reads}'.format( exe=METAGRAPH, @@ -320,11 +311,10 @@ def test_simple_align_primary_subk_succinct(self, representation): k=11, repr=representation, mode='primary', extra_params="--mask-dummy") - res = self._get_stats(self.tempdir.name + '/genome.MT.primary' + graph_file_extension[representation]) - params_str = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 16391', params_str[1]) - self.assertEqual('mode: primary', params_str[2]) + params = self._get_stats(self.tempdir.name + '/genome.MT.primary' + graph_file_extension[representation]) + self.assertEqual('11', params['k']) + self.assertEqual('16391', params['nodes (k)']) + self.assertEqual('primary', params['mode']) stats_command = '{exe} align -i {graph} --align-min-exact-match 0.0 --align-min-seed-length 10 {reads}'.format( exe=METAGRAPH, @@ -349,11 +339,10 @@ def test_simple_align_fwd_rev_comp_json_all_graphs(self, representation): output=self.tempdir.name + '/genome.MT', k=11, repr=representation) - res = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) - params_str = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 16461', params_str[1]) - self.assertEqual('mode: basic', params_str[2]) + params = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) + self.assertEqual('11', params['k']) + self.assertEqual('16461', params['nodes (k)']) + self.assertEqual('basic', params['mode']) stats_command = '{exe} align --json -i {graph} --align-min-exact-match 0.0 {reads}'.format( exe=METAGRAPH, @@ -375,11 +364,10 @@ def test_simple_align_edit_distance_all_graphs(self, representation): output=self.tempdir.name + '/genome.MT', k=11, repr=representation) - res = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) - params_str = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 16461', params_str[1]) - self.assertEqual('mode: basic', params_str[2]) + params = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) + self.assertEqual('11', params['k']) + self.assertEqual('16461', params['nodes (k)']) + self.assertEqual('basic', params['mode']) stats_command = '{exe} align --json --align-edit-distance -i {graph} --align-min-exact-match 0.0 {reads}'.format( exe=METAGRAPH, diff --git a/metagraph/integration_tests/test_annotate.py b/metagraph/integration_tests/test_annotate.py index 4cabf26cdc..e7c7707333 100644 --- a/metagraph/integration_tests/test_annotate.py +++ b/metagraph/integration_tests/test_annotate.py @@ -44,12 +44,11 @@ def test_simple_all_graphs(self, graph_repr): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[graph_repr]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 46960', out[1]) - self.assertEqual('mode: basic', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[graph_repr]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('20', stats_graph['k']) + self.assertEqual('46960', stats_graph['nodes (k)']) + self.assertEqual('basic', stats_graph['mode']) for anno_repr in ['row', 'column']: # build annotation @@ -63,14 +62,15 @@ def test_simple_all_graphs(self, graph_repr): self.assertEqual(res.returncode, 0) # check annotation - res = self._get_stats(f'-a {self.tempdir.name}/annotation{anno_file_extension[anno_repr]}') - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('labels: 100', out[0]) - self.assertIn((out[1], out[2]), [ - ('objects: 47633', 'density: 0.0182458'), # DBGSuccinct with dummy nodes - ('objects: 46960', 'density: 0.0185072')]) - self.assertEqual('representation: ' + anno_repr, out[3]) + stats_annotation = self._get_stats('-a ' + self.tempdir.name + '/annotation' + anno_file_extension[anno_repr]) + self.assertEqual(stats_annotation['returncode'], 0) + self.assertEqual('100', stats_annotation['labels']) + self.assertEqual(stats_graph['max index (k)'], stats_annotation['objects']) + self.assertAlmostEqual( + 0.0185072 * (int(stats_graph['nodes (k)']) / int(stats_graph['max index (k)'])), + float(stats_annotation['density']), + places=6) + self.assertEqual(anno_repr, stats_annotation['representation']) # TODO: add 'hashstr' once the canonical mode is implemented for it @parameterized.expand(['succinct', 'bitmap', 'hash']) # , 'hashstr']: @@ -89,12 +89,11 @@ def test_simple_all_graphs_canonical(self, graph_repr): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[graph_repr]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 91584', out[1]) - self.assertEqual('mode: canonical', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[graph_repr]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('20', stats_graph['k']) + self.assertEqual('91584', stats_graph['nodes (k)']) + self.assertEqual('canonical', stats_graph['mode']) for anno_repr in ['row', 'column']: # build annotation @@ -107,13 +106,15 @@ def test_simple_all_graphs_canonical(self, graph_repr): self.assertEqual(res.returncode, 0) # check annotation - res = self._get_stats(f'-a {self.tempdir.name}/annotation{anno_file_extension[anno_repr]}') - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('labels: 100', out[0]) - self.assertEqual('objects: 91584', out[1]) - self.assertEqual('density: 0.00948888', out[2]) - self.assertEqual('representation: ' + anno_repr, out[3]) + stats_annotation = self._get_stats('-a ' + self.tempdir.name + '/annotation' + anno_file_extension[anno_repr]) + self.assertEqual(stats_annotation['returncode'], 0) + self.assertEqual('100', stats_annotation['labels']) + self.assertEqual(stats_graph['max index (k)'], stats_annotation['objects']) + self.assertAlmostEqual( + 0.00948888 * (int(stats_graph['nodes (k)']) / int(stats_graph['max index (k)'])), + float(stats_annotation['density']), + places=6) + self.assertEqual(anno_repr, stats_annotation['representation']) @parameterized.expand(GRAPH_TYPES) def test_simple_all_graphs_from_kmc(self, graph_repr): @@ -129,12 +130,11 @@ def test_simple_all_graphs_from_kmc(self, graph_repr): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[graph_repr]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', out[0]) - self.assertEqual('nodes (k): 469983', out[1]) - self.assertEqual('mode: basic', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[graph_repr]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('11', stats_graph['k']) + self.assertEqual('469983', stats_graph['nodes (k)']) + self.assertEqual('basic', stats_graph['mode']) for anno_repr in ['row', 'column']: # build annotation @@ -147,14 +147,15 @@ def test_simple_all_graphs_from_kmc(self, graph_repr): self.assertEqual(res.returncode, 0) # check annotation - res = self._get_stats(f'-a {self.tempdir.name}/annotation{anno_file_extension[anno_repr]}') - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('labels: 1', out[0]) - self.assertIn((out[1], out[2]), [ - ('objects: 471169', 'density: 0.997483'), # DBGSuccinct with dummy nodes - ('objects: 469983', 'density: 1')]) - self.assertEqual('representation: ' + anno_repr, out[3]) + stats_annotation = self._get_stats('-a ' + self.tempdir.name + '/annotation' + anno_file_extension[anno_repr]) + self.assertEqual(stats_annotation['returncode'], 0) + self.assertEqual('1', stats_annotation['labels']) + self.assertEqual(stats_graph['max index (k)'], stats_annotation['objects']) + self.assertAlmostEqual( + 1 * (int(stats_graph['nodes (k)']) / int(stats_graph['max index (k)'])), + float(stats_annotation['density']), + places=6) + self.assertEqual(anno_repr, stats_annotation['representation']) @parameterized.expand(GRAPH_TYPES) def test_simple_all_graphs_from_kmc_both(self, graph_repr): @@ -170,12 +171,11 @@ def test_simple_all_graphs_from_kmc_both(self, graph_repr): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[graph_repr]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', out[0]) - self.assertEqual('nodes (k): 802920', out[1]) - self.assertEqual('mode: basic', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[graph_repr]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('11', stats_graph['k']) + self.assertEqual('802920', stats_graph['nodes (k)']) + self.assertEqual('basic', stats_graph['mode']) for anno_repr in ['row', 'column']: # build annotation @@ -188,14 +188,15 @@ def test_simple_all_graphs_from_kmc_both(self, graph_repr): self.assertEqual(res.returncode, 0) # check annotation - res = self._get_stats(f'-a {self.tempdir.name}/annotation_single{anno_file_extension[anno_repr]}') - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('labels: 1', out[0]) - self.assertIn((out[1], out[2]), [ - ('objects: 804179', 'density: 0.584426'), # DBGSuccinct with dummy nodes - ('objects: 802920', 'density: 0.585342')]) - self.assertEqual('representation: ' + anno_repr, out[3]) + stats_annotation = self._get_stats('-a ' + self.tempdir.name + '/annotation_single' + anno_file_extension[anno_repr]) + self.assertEqual(stats_annotation['returncode'], 0) + self.assertEqual('1', stats_annotation['labels']) + self.assertEqual(stats_graph['max index (k)'], stats_annotation['objects']) + self.assertAlmostEqual( + 0.585342 * (int(stats_graph['nodes (k)']) / int(stats_graph['max index (k)'])), + float(stats_annotation['density']), + places=6) + self.assertEqual(anno_repr, stats_annotation['representation']) # both strands annotate_command = f'{METAGRAPH} annotate --anno-label LabelName -p {NUM_THREADS} \ @@ -207,14 +208,15 @@ def test_simple_all_graphs_from_kmc_both(self, graph_repr): self.assertEqual(res.returncode, 0) # check annotation - res = self._get_stats(f'-a {self.tempdir.name}/annotation_both{anno_file_extension[anno_repr]}') - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('labels: 1', out[0]) - self.assertIn((out[1], out[2]), [ - ('objects: 804179', 'density: 0.998434'), # DBGSuccinct with dummy nodes - ('objects: 802920', 'density: 1')]) - self.assertEqual('representation: ' + anno_repr, out[3]) + stats_annotation = self._get_stats('-a ' + self.tempdir.name + '/annotation_both' + anno_file_extension[anno_repr]) + self.assertEqual(stats_annotation['returncode'], 0) + self.assertEqual('1', stats_annotation['labels']) + self.assertEqual(stats_graph['max index (k)'], stats_annotation['objects']) + self.assertAlmostEqual( + 1 * (int(stats_graph['nodes (k)']) / int(stats_graph['max index (k)'])), + float(stats_annotation['density']), + places=6) + self.assertEqual(anno_repr, stats_annotation['representation']) # TODO: add 'hashstr' once the canonical mode is implemented for it @parameterized.expand(['succinct', 'bitmap', 'hash']) # , 'hashstr']: @@ -232,12 +234,11 @@ def test_simple_all_graphs_from_kmc_both_canonical(self, graph_repr): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[graph_repr]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', out[0]) - self.assertEqual('nodes (k): 802920', out[1]) - self.assertEqual('mode: canonical', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[graph_repr]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('11', stats_graph['k']) + self.assertEqual('802920', stats_graph['nodes (k)']) + self.assertEqual('canonical', stats_graph['mode']) for anno_repr in ['row', 'column']: # build annotation @@ -250,13 +251,15 @@ def test_simple_all_graphs_from_kmc_both_canonical(self, graph_repr): self.assertEqual(res.returncode, 0) # check annotation - res = self._get_stats(f'-a {self.tempdir.name}/annotation_single{anno_file_extension[anno_repr]}') - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('labels: 1', out[0]) - self.assertEqual('objects: 802920', out[1]) - self.assertEqual('density: 0.5', out[2]) - self.assertEqual('representation: ' + anno_repr, out[3]) + stats_annotation = self._get_stats('-a ' + self.tempdir.name + '/annotation_single' + anno_file_extension[anno_repr]) + self.assertEqual(stats_annotation['returncode'], 0) + self.assertEqual('1', stats_annotation['labels']) + self.assertEqual(stats_graph['max index (k)'], stats_annotation['objects']) + self.assertAlmostEqual( + 0.5 * (int(stats_graph['nodes (k)']) / int(stats_graph['max index (k)'])), + float(stats_annotation['density']), + places=6) + self.assertEqual(anno_repr, stats_annotation['representation']) # both strands annotate_command = f'{METAGRAPH} annotate --anno-label LabelName -p {NUM_THREADS} \ @@ -268,13 +271,15 @@ def test_simple_all_graphs_from_kmc_both_canonical(self, graph_repr): self.assertEqual(res.returncode, 0) # check annotation - res = self._get_stats(f'-a {self.tempdir.name}/annotation_both{anno_file_extension[anno_repr]}') - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('labels: 1', out[0]) - self.assertEqual('objects: 802920', out[1]) - self.assertEqual('density: 0.5', out[2]) - self.assertEqual('representation: ' + anno_repr, out[3]) + stats_annotation = self._get_stats('-a ' + self.tempdir.name + '/annotation_both' + anno_file_extension[anno_repr]) + self.assertEqual(stats_annotation['returncode'], 0) + self.assertEqual('1', stats_annotation['labels']) + self.assertEqual(stats_graph['max index (k)'], stats_annotation['objects']) + self.assertAlmostEqual( + 0.5 * (int(stats_graph['nodes (k)']) / int(stats_graph['max index (k)'])), + float(stats_annotation['density']), + places=6) + self.assertEqual(anno_repr, stats_annotation['representation']) def test_annotate_with_disk_swap(self): graph_repr = 'succinct' @@ -292,12 +297,11 @@ def test_annotate_with_disk_swap(self): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[graph_repr]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 46960', out[1]) - self.assertEqual('mode: basic', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[graph_repr]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('20', stats_graph['k']) + self.assertEqual('46960', stats_graph['nodes (k)']) + self.assertEqual('basic', stats_graph['mode']) # build annotation annotate_command = f'{METAGRAPH} annotate --anno-header \ @@ -310,14 +314,15 @@ def test_annotate_with_disk_swap(self): self.assertEqual(res.returncode, 0) # check annotation - res = self._get_stats(f'-a {self.tempdir.name}/annotation{anno_file_extension[anno_repr]}') - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('labels: 100', out[0]) - self.assertIn((out[1], out[2]), [ - ('objects: 47633', 'density: 0.0182458'), # DBGSuccinct with dummy nodes - ('objects: 46960', 'density: 0.0185072')]) - self.assertEqual('representation: ' + anno_repr, out[3]) + stats_annotation = self._get_stats('-a ' + f'{self.tempdir.name}/annotation{anno_file_extension[anno_repr]}') + self.assertEqual(stats_annotation['returncode'], 0) + self.assertEqual('100', stats_annotation['labels']) + self.assertEqual(stats_graph['max index (k)'], stats_annotation['objects']) + self.assertAlmostEqual( + 0.0185072 * (int(stats_graph['nodes (k)']) / int(stats_graph['max index (k)'])), + float(stats_annotation['density']), + places=6) + self.assertEqual(anno_repr, stats_annotation['representation']) @parameterized.expand(GRAPH_TYPES) def test_annotate_coordinates(self, graph_repr): diff --git a/metagraph/integration_tests/test_build.py b/metagraph/integration_tests/test_build.py index 7ded5fb30e..367d1e10c0 100644 --- a/metagraph/integration_tests/test_build.py +++ b/metagraph/integration_tests/test_build.py @@ -50,12 +50,11 @@ def test_simple_all_graphs(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 591997', out[1]) - self.assertEqual('mode: basic', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('20', stats_graph['k']) + self.assertEqual('591997', stats_graph['nodes (k)']) + self.assertEqual('basic', stats_graph['mode']) @parameterized.expand(succinct_states) def test_build_succinct_inplace(self, state): @@ -67,13 +66,12 @@ def test_build_succinct_inplace(self, state): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension['succinct']) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 597931', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('state: ' + state, out[8]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension['succinct']) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('20', stats_graph['k']) + self.assertEqual('597931', stats_graph['nodes (k)']) + self.assertEqual('basic', stats_graph['mode']) + self.assertEqual(state, stats_graph['state']) @parameterized.expand(['succinct']) def test_simple_bloom_graph(self, build): @@ -90,12 +88,11 @@ def test_simple_bloom_graph(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 591997', out[1]) - self.assertEqual('mode: basic', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('20', stats_graph['k']) + self.assertEqual('591997', stats_graph['nodes (k)']) + self.assertEqual('basic', stats_graph['mode']) convert_command = '{exe} transform -o {outfile} --initialize-bloom {bloom_param} {input}'.format( exe=METAGRAPH, @@ -136,12 +133,11 @@ def test_simple_all_graphs_canonical(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 1159851', out[1]) - self.assertEqual('mode: canonical', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('20', stats_graph['k']) + self.assertEqual('1159851', stats_graph['nodes (k)']) + self.assertEqual('canonical', stats_graph['mode']) @parameterized.expand(BUILDS) def test_build_tiny_k(self, build): @@ -157,12 +153,11 @@ def test_build_tiny_k(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 2', out[0]) - self.assertEqual('nodes (k): 16', out[1]) - self.assertEqual('mode: basic', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('2', stats_graph['k']) + self.assertEqual('16', stats_graph['nodes (k)']) + self.assertEqual('basic', stats_graph['mode']) # TODO: add 'hashstr' once the canonical mode is implemented for it @parameterized.expand([repr for repr in BUILDS if repr != 'hashstr']) @@ -180,12 +175,11 @@ def test_build_tiny_k_canonical(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 2', out[0]) - self.assertEqual('nodes (k): 16', out[1]) - self.assertEqual('mode: canonical', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('2', stats_graph['k']) + self.assertEqual('16', stats_graph['nodes (k)']) + self.assertEqual('canonical', stats_graph['mode']) @parameterized.expand(BUILDS) def test_build_tiny_k_parallel(self, build): @@ -199,12 +193,11 @@ def test_build_tiny_k_parallel(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 2', out[0]) - self.assertEqual('nodes (k): 16', out[1]) - self.assertEqual('mode: basic', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('2', stats_graph['k']) + self.assertEqual('16', stats_graph['nodes (k)']) + self.assertEqual('basic', stats_graph['mode']) # TODO: add 'hashstr' once the canonical mode is implemented for it @parameterized.expand([repr for repr in BUILDS if repr != 'hashstr']) @@ -221,12 +214,11 @@ def test_build_tiny_k_parallel_canonical(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 2', out[0]) - self.assertEqual('nodes (k): 16', out[1]) - self.assertEqual('mode: canonical', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('2', stats_graph['k']) + self.assertEqual('16', stats_graph['nodes (k)']) + self.assertEqual('canonical', stats_graph['mode']) @parameterized.expand(BUILDS) def test_build_from_kmc(self, build): @@ -243,12 +235,11 @@ def test_build_from_kmc(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', out[0]) - self.assertEqual('nodes (k): 469983', out[1]) - self.assertEqual('mode: basic', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('11', stats_graph['k']) + self.assertEqual('469983', stats_graph['nodes (k)']) + self.assertEqual('basic', stats_graph['mode']) @parameterized.expand(BUILDS) def test_build_from_kmc_both(self, build): @@ -265,12 +256,11 @@ def test_build_from_kmc_both(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', out[0]) - self.assertEqual('nodes (k): 802920', out[1]) - self.assertEqual('mode: basic', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('11', stats_graph['k']) + self.assertEqual('802920', stats_graph['nodes (k)']) + self.assertEqual('basic', stats_graph['mode']) @parameterized.expand([repr for repr in BUILDS if repr != 'hashstr']) @unittest.skipIf(PROTEIN_MODE, "No canonical mode for Protein alphabets") @@ -289,12 +279,11 @@ def test_build_from_kmc_canonical(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', out[0]) - self.assertEqual('nodes (k): 802920', out[1]) - self.assertEqual('mode: canonical', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('11', stats_graph['k']) + self.assertEqual('802920', stats_graph['nodes (k)']) + self.assertEqual('canonical', stats_graph['mode']) @parameterized.expand([repr for repr in BUILDS if repr != 'hashstr']) @unittest.skipIf(PROTEIN_MODE, "No canonical mode for Protein alphabets") @@ -313,12 +302,11 @@ def test_build_from_kmc_both_canonical(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', out[0]) - self.assertEqual('nodes (k): 802920', out[1]) - self.assertEqual('mode: canonical', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('11', stats_graph['k']) + self.assertEqual('802920', stats_graph['nodes (k)']) + self.assertEqual('canonical', stats_graph['mode']) @parameterized.expand(['succinct', 'succinct_disk']) @unittest.skipUnless(DNA_MODE, "Need to adapt suffixes for other alphabets") @@ -352,13 +340,12 @@ def test_build_chunks_from_kmc(self, build): self.assertEqual(res.returncode, 0) # Check graph - res = self._get_stats(self.tempdir.name + '/graph_from_chunks' - + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', out[0]) - self.assertEqual('nodes (k): 469983', out[1]) - self.assertEqual('mode: basic', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph_from_chunks' + + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('11', stats_graph['k']) + self.assertEqual('469983', stats_graph['nodes (k)']) + self.assertEqual('basic', stats_graph['mode']) @parameterized.expand(['succinct', 'succinct_disk']) @unittest.skipUnless(DNA_MODE, "Need to adapt suffixes for other alphabets") @@ -392,13 +379,12 @@ def test_build_chunks_from_kmc_canonical(self, build): self.assertEqual(res.returncode, 0) # Check graph - res = self._get_stats(self.tempdir.name + '/graph_from_chunks' - + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', out[0]) - self.assertEqual('nodes (k): 802920', out[1]) - self.assertEqual('mode: canonical', out[2]) + stats_graph = self._get_stats(self.tempdir.name + '/graph_from_chunks' + + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual('11', stats_graph['k']) + self.assertEqual('802920', stats_graph['nodes (k)']) + self.assertEqual('canonical', stats_graph['mode']) if __name__ == '__main__': diff --git a/metagraph/integration_tests/test_build_weighted.py b/metagraph/integration_tests/test_build_weighted.py index 6c176cffe7..35f50aeb05 100644 --- a/metagraph/integration_tests/test_build_weighted.py +++ b/metagraph/integration_tests/test_build_weighted.py @@ -50,14 +50,13 @@ def test_simple_all_graphs(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 591997', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 591997', out[3]) - self.assertEqual('avg weight: 2.48587', out[4]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual(stats_graph['k'], '20') + self.assertEqual(stats_graph['nodes (k)'], '591997') + self.assertEqual(stats_graph['mode'], 'basic') + self.assertEqual(stats_graph['nnz weights'], '591997') + self.assertEqual(stats_graph['avg weight'], '2.48587') @parameterized.expand([repr for repr in BUILDS if not (repr == 'bitmap' and PROTEIN_MODE)]) def test_simple_all_graphs_contigs(self, build): @@ -88,14 +87,13 @@ def test_simple_all_graphs_contigs(self, build): res = subprocess.run([command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 591997', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 591997', out[3]) - self.assertEqual('avg weight: 2.48587', out[4]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual(stats_graph['k'], '20') + self.assertEqual(stats_graph['nodes (k)'], '591997') + self.assertEqual(stats_graph['mode'], 'basic') + self.assertEqual(stats_graph['nnz weights'], '591997') + self.assertEqual(stats_graph['avg weight'], '2.48587') # TODO: add 'hashstr' once the canonical mode is implemented for it @parameterized.expand([repr for repr in BUILDS if repr != 'hashstr']) @@ -115,14 +113,13 @@ def test_simple_all_graphs_canonical(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 1159851', out[1]) - self.assertEqual('mode: canonical', out[2]) - self.assertEqual('nnz weights: 1159851', out[3]) - self.assertEqual('avg weight: 2.53761', out[4]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual(stats_graph['k'], '20') + self.assertEqual(stats_graph['nodes (k)'], '1159851') + self.assertEqual(stats_graph['mode'], 'canonical') + self.assertEqual(stats_graph['nnz weights'], '1159851') + self.assertEqual(stats_graph['avg weight'], '2.53761') @parameterized.expand(BUILDS) def test_build_tiny_k(self, build): @@ -138,14 +135,13 @@ def test_build_tiny_k(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 2', out[0]) - self.assertEqual('nodes (k): 16', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 16', out[3]) - self.assertEqual('avg weight: 255', out[4]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual(stats_graph['k'], '2') + self.assertEqual(stats_graph['nodes (k)'], '16') + self.assertEqual(stats_graph['mode'], 'basic') + self.assertEqual(stats_graph['nnz weights'], '16') + self.assertEqual(stats_graph['avg weight'], '255') # TODO: add 'hashstr' once the canonical mode is implemented for it @parameterized.expand([repr for repr in BUILDS if repr != 'hashstr']) @@ -164,14 +160,13 @@ def test_build_tiny_k_canonical(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 2', out[0]) - self.assertEqual('nodes (k): 16', out[1]) - self.assertEqual('mode: canonical', out[2]) - self.assertEqual('nnz weights: 16', out[3]) - self.assertEqual('avg weight: 255', out[4]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual(stats_graph['k'], '2') + self.assertEqual(stats_graph['nodes (k)'], '16') + self.assertEqual(stats_graph['mode'], 'canonical') + self.assertEqual(stats_graph['nnz weights'], '16') + self.assertEqual(stats_graph['avg weight'], '255') @parameterized.expand(BUILDS) def test_build_from_kmc(self, build): @@ -189,14 +184,13 @@ def test_build_from_kmc(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', out[0]) - self.assertEqual('nodes (k): 469983', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 469983', out[3]) - self.assertEqual('avg weight: 3.15029', out[4]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual(stats_graph['k'], '11') + self.assertEqual(stats_graph['nodes (k)'], '469983') + self.assertEqual(stats_graph['mode'], 'basic') + self.assertEqual(stats_graph['nnz weights'], '469983') + self.assertEqual(stats_graph['avg weight'], '3.15029') @parameterized.expand(BUILDS) def test_build_from_kmc_both(self, build): @@ -214,14 +208,13 @@ def test_build_from_kmc_both(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', out[0]) - self.assertEqual('nodes (k): 802920', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 802920', out[3]) - self.assertEqual('avg weight: 3.68754', out[4]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual(stats_graph['k'], '11') + self.assertEqual(stats_graph['nodes (k)'], '802920') + self.assertEqual(stats_graph['mode'], 'basic') + self.assertEqual(stats_graph['nnz weights'], '802920') + self.assertEqual(stats_graph['avg weight'], '3.68754') # TODO: add 'hashstr' once the canonical mode is implemented for it @parameterized.expand([repr for repr in BUILDS if repr != 'hashstr']) @@ -241,14 +234,13 @@ def test_build_from_kmc_canonical(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', out[0]) - self.assertEqual('nodes (k): 802920', out[1]) - self.assertEqual('mode: canonical', out[2]) - self.assertEqual('nnz weights: 802920', out[3]) - self.assertEqual('avg weight: 3.68754', out[4]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual(stats_graph['k'], '11') + self.assertEqual(stats_graph['nodes (k)'], '802920') + self.assertEqual(stats_graph['mode'], 'canonical') + self.assertEqual(stats_graph['nnz weights'], '802920') + self.assertEqual(stats_graph['avg weight'], '3.68754') # TODO: add 'hashstr' once the canonical mode is implemented for it @parameterized.expand([repr for repr in BUILDS if repr != 'hashstr']) @@ -268,14 +260,13 @@ def test_build_from_kmc_both_canonical(self, build): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 11', out[0]) - self.assertEqual('nodes (k): 802920', out[1]) - self.assertEqual('mode: canonical', out[2]) - self.assertEqual('nnz weights: 802920', out[3]) - self.assertEqual('avg weight: 3.68754', out[4]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual(stats_graph['k'], '11') + self.assertEqual(stats_graph['nodes (k)'], '802920') + self.assertEqual(stats_graph['mode'], 'canonical') + self.assertEqual(stats_graph['nnz weights'], '802920') + self.assertEqual(stats_graph['avg weight'], '3.68754') @parameterized.expand( itertools.product(BUILDS, @@ -306,14 +297,13 @@ def test_kmer_count_width(self, build, width_result): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 4', out[0]) - self.assertEqual('nodes (k): 256', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 256', out[3]) - self.assertEqual('avg weight: {}'.format(avg_count_expected), out[4]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual(stats_graph['k'], '4') + self.assertEqual(stats_graph['nodes (k)'], '256') + self.assertEqual(stats_graph['mode'], 'basic') + self.assertEqual(stats_graph['nnz weights'], '256') + self.assertEqual(stats_graph['avg weight'], str(avg_count_expected)) @parameterized.expand(itertools.chain( itertools.product(BUILDS, @@ -366,14 +356,13 @@ def test_kmer_count_width_large(self, build, k_width_result): res = subprocess.run([construct_command], shell=True) self.assertEqual(res.returncode, 0) - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: {}'.format(k), out[0]) - self.assertEqual('nodes (k): 2', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 2', out[3]) - self.assertEqual('avg weight: {}'.format(avg_count_expected), out[4]) + stats_graph = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual(stats_graph['returncode'], 0) + self.assertEqual(stats_graph['k'], str(k)) + self.assertEqual(stats_graph['nodes (k)'], '2') + self.assertEqual(stats_graph['mode'], 'basic') + self.assertEqual(stats_graph['nnz weights'], '2') + self.assertEqual(stats_graph['avg weight'], str(avg_count_expected)) if __name__ == '__main__': diff --git a/metagraph/integration_tests/test_clean.py b/metagraph/integration_tests/test_clean.py index 070b396a17..c86e614483 100644 --- a/metagraph/integration_tests/test_clean.py +++ b/metagraph/integration_tests/test_clean.py @@ -35,13 +35,12 @@ def test_no_cleaning_contigs(self, representation): k=20, repr=representation, extra_params="--mask-dummy --count-kmers") - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 591997', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 591997', out[3]) - self.assertEqual('avg weight: 2.48587', out[4]) + stats = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual('20', stats['k']) + self.assertEqual('591997', stats['nodes (k)']) + self.assertEqual('basic', stats['mode']) + self.assertEqual('591997', stats['nnz weights']) + self.assertEqual('2.48587', stats['avg weight']) clean_fasta = self.tempdir.name + '/contigs.fasta.gz' self._clean(self.tempdir.name + '/graph' + graph_file_extension[representation], @@ -53,13 +52,12 @@ def test_no_cleaning_contigs(self, representation): k=20, repr=representation, extra_params="--mask-dummy --count-kmers") - res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 591997', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 591997', out[3]) - self.assertEqual('avg weight: 2.48587', out[4]) + stats = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) + self.assertEqual('20', stats['k']) + self.assertEqual('591997', stats['nodes (k)']) + self.assertEqual('basic', stats['mode']) + self.assertEqual('591997', stats['nnz weights']) + self.assertEqual('2.48587', stats['avg weight']) @parameterized.expand([repr for repr in GRAPH_TYPES if not (repr == 'bitmap' and PROTEIN_MODE)]) def test_no_cleaning_contigs_2bit_counts(self, representation): @@ -69,13 +67,12 @@ def test_no_cleaning_contigs_2bit_counts(self, representation): k=20, repr=representation, extra_params="--mask-dummy --count-kmers --count-width 2") - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 591997', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 591997', out[3]) - self.assertEqual('avg weight: 1.73589', out[4]) + stats = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual('20', stats['k']) + self.assertEqual('591997', stats['nodes (k)']) + self.assertEqual('basic', stats['mode']) + self.assertEqual('591997', stats['nnz weights']) + self.assertEqual('1.73589', stats['avg weight']) clean_fasta = self.tempdir.name + '/contigs.fasta.gz' self._clean(self.tempdir.name + '/graph' + graph_file_extension[representation], @@ -87,13 +84,12 @@ def test_no_cleaning_contigs_2bit_counts(self, representation): k=20, repr=representation, extra_params="--mask-dummy --count-kmers") - res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 591997', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 591997', out[3]) - self.assertEqual('avg weight: 1.73589', out[4]) + stats = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) + self.assertEqual('20', stats['k']) + self.assertEqual('591997', stats['nodes (k)']) + self.assertEqual('basic', stats['mode']) + self.assertEqual('591997', stats['nnz weights']) + self.assertEqual('1.73589', stats['avg weight']) @parameterized.expand([repr for repr in GRAPH_TYPES if not (repr == 'bitmap' and PROTEIN_MODE)]) def test_clean_prune_tips_no_counts(self, representation): @@ -113,11 +109,10 @@ def test_clean_prune_tips_no_counts(self, representation): k=20, repr=representation, extra_params="--mask-dummy") - res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 589774', out[1]) - self.assertEqual('mode: basic', out[2]) + stats = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) + self.assertEqual('20', stats['k']) + self.assertEqual('589774', stats['nodes (k)']) + self.assertEqual('basic', stats['mode']) @parameterized.expand([repr for repr in GRAPH_TYPES if not (repr == 'bitmap' and PROTEIN_MODE)]) def test_clean_prune_tips(self, representation): @@ -137,13 +132,12 @@ def test_clean_prune_tips(self, representation): k=20, repr=representation, extra_params="--mask-dummy --count-kmers") - res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 589774', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 589774', out[3]) - self.assertEqual('avg weight: 2.49001', out[4]) + stats = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) + self.assertEqual('20', stats['k']) + self.assertEqual('589774', stats['nodes (k)']) + self.assertEqual('basic', stats['mode']) + self.assertEqual('589774', stats['nnz weights']) + self.assertEqual('2.49001', stats['avg weight']) @parameterized.expand([repr for repr in GRAPH_TYPES if not (repr == 'bitmap' and PROTEIN_MODE)]) def test_cleaning_threshold_fixed(self, representation): @@ -163,14 +157,12 @@ def test_cleaning_threshold_fixed(self, representation): k=20, repr=representation, extra_params="--mask-dummy --count-kmers") - res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 167395', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 167395', out[3]) - self.assertEqual('avg weight: 5.52732', out[4]) - + stats = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) + self.assertEqual('20', stats['k']) + self.assertEqual('167395', stats['nodes (k)']) + self.assertEqual('basic', stats['mode']) + self.assertEqual('167395', stats['nnz weights']) + self.assertEqual('5.52732', stats['avg weight']) @parameterized.expand([repr for repr in GRAPH_TYPES if not (repr == 'bitmap' and PROTEIN_MODE)]) def test_cleaning_prune_tips_threshold_fixed(self, representation): @@ -189,13 +181,12 @@ def test_cleaning_prune_tips_threshold_fixed(self, representation): k=20, repr=representation, extra_params="--mask-dummy --count-kmers") - res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 20', out[0]) - self.assertEqual('nodes (k): 167224', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 167224', out[3]) - self.assertEqual('avg weight: 5.52757', out[4]) + stats = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) + self.assertEqual('20', stats['k']) + self.assertEqual('167224', stats['nodes (k)']) + self.assertEqual('basic', stats['mode']) + self.assertEqual('167224', stats['nnz weights']) + self.assertEqual('5.52757', stats['avg weight']) @unittest.skipIf(PROTEIN_MODE, "No canonical mode for Protein alphabets") @@ -212,13 +203,12 @@ def test_no_cleaning_contigs(self, representation): k=31, repr=representation, mode='canonical', extra_params="--mask-dummy --count-kmers") - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 31', out[0]) - self.assertEqual('nodes (k): 1185814', out[1]) - self.assertEqual('mode: canonical', out[2]) - self.assertEqual('nnz weights: 1185814', out[3]) - self.assertEqual('avg weight: 2.4635', out[4]) + stats = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual('31', stats['k']) + self.assertEqual('1185814', stats['nodes (k)']) + self.assertEqual('canonical', stats['mode']) + self.assertEqual('1185814', stats['nnz weights']) + self.assertEqual('2.4635', stats['avg weight']) clean_fasta = self.tempdir.name + '/contigs.fasta.gz' self._clean(self.tempdir.name + '/graph' + graph_file_extension[representation], @@ -230,13 +220,12 @@ def test_no_cleaning_contigs(self, representation): k=31, repr=representation, mode='canonical', extra_params="--mask-dummy --count-kmers") - res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 31', out[0]) - self.assertEqual('nodes (k): 1185814', out[1]) - self.assertEqual('mode: canonical', out[2]) - self.assertEqual('nnz weights: 1185814', out[3]) - self.assertEqual('avg weight: 2.4635', out[4]) + stats = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) + self.assertEqual('31', stats['k']) + self.assertEqual('1185814', stats['nodes (k)']) + self.assertEqual('canonical', stats['mode']) + self.assertEqual('1185814', stats['nnz weights']) + self.assertEqual('2.4635', stats['avg weight']) # TODO: add 'hashstr' once the canonical mode is implemented for it @parameterized.expand(['succinct', 'bitmap', 'hash']) # , 'hashstr']: @@ -247,13 +236,12 @@ def test_no_cleaning_contigs_2bit_counts(self, representation): k=31, repr=representation, mode='canonical', extra_params="--mask-dummy --count-kmers --count-width 2") - res = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 31', out[0]) - self.assertEqual('nodes (k): 1185814', out[1]) - self.assertEqual('mode: canonical', out[2]) - self.assertEqual('nnz weights: 1185814', out[3]) - self.assertEqual('avg weight: 1.72792', out[4]) + stats = self._get_stats(self.tempdir.name + '/graph' + graph_file_extension[representation]) + self.assertEqual('31', stats['k']) + self.assertEqual('1185814', stats['nodes (k)']) + self.assertEqual('canonical', stats['mode']) + self.assertEqual('1185814', stats['nnz weights']) + self.assertEqual('1.72792', stats['avg weight']) clean_fasta = self.tempdir.name + '/contigs.fasta.gz' self._clean(self.tempdir.name + '/graph' + graph_file_extension[representation], @@ -265,13 +253,12 @@ def test_no_cleaning_contigs_2bit_counts(self, representation): k=31, repr=representation, mode='canonical', extra_params="--mask-dummy --count-kmers") - res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 31', out[0]) - self.assertEqual('nodes (k): 1185814', out[1]) - self.assertEqual('mode: canonical', out[2]) - self.assertEqual('nnz weights: 1185814', out[3]) - self.assertEqual('avg weight: 1.72792', out[4]) + stats = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) + self.assertEqual('31', stats['k']) + self.assertEqual('1185814', stats['nodes (k)']) + self.assertEqual('canonical', stats['mode']) + self.assertEqual('1185814', stats['nnz weights']) + self.assertEqual('1.72792', stats['avg weight']) @parameterized.expand(['succinct', 'bitmap', 'hash']) # , 'hashstr']: def test_clean_prune_tips_no_counts(self, representation): @@ -291,11 +278,10 @@ def test_clean_prune_tips_no_counts(self, representation): k=31, repr=representation, mode='canonical', extra_params="--mask-dummy") - res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 31', out[0]) - self.assertEqual('nodes (k): 1180802', out[1]) - self.assertEqual('mode: canonical', out[2]) + stats = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) + self.assertEqual('31', stats['k']) + self.assertEqual('1180802', stats['nodes (k)']) + self.assertEqual('canonical', stats['mode']) @parameterized.expand(['succinct', 'bitmap', 'hash']) # , 'hashstr']: def test_clean_prune_tips(self, representation): @@ -315,13 +301,12 @@ def test_clean_prune_tips(self, representation): k=31, repr=representation, mode='canonical', extra_params="--mask-dummy --count-kmers") - res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 31', out[0]) - self.assertEqual('nodes (k): 1180802', out[1]) - self.assertEqual('mode: canonical', out[2]) - self.assertEqual('nnz weights: 1180802', out[3]) - self.assertEqual('avg weight: 2.46882', out[4]) + stats = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) + self.assertEqual('31', stats['k']) + self.assertEqual('1180802', stats['nodes (k)']) + self.assertEqual('canonical', stats['mode']) + self.assertEqual('1180802', stats['nnz weights']) + self.assertEqual('2.46882', stats['avg weight']) @parameterized.expand(GRAPH_TYPES) def test_cleaning_threshold_fixed_both_strands(self, representation): @@ -342,13 +327,12 @@ def test_cleaning_threshold_fixed_both_strands(self, representation): k=31, repr=representation, extra_params="--mask-dummy --count-kmers") - res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 31', out[0]) - self.assertEqual('nodes (k): 331452', out[1]) - self.assertEqual('mode: basic', out[2]) - self.assertEqual('nnz weights: 331452', out[3]) - self.assertEqual('avg weight: 5.52692', out[4]) + stats = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) + self.assertEqual('31', stats['k']) + self.assertEqual('331452', stats['nodes (k)']) + self.assertEqual('basic', stats['mode']) + self.assertEqual('331452', stats['nnz weights']) + self.assertEqual('5.52692', stats['avg weight']) @parameterized.expand(['succinct', 'bitmap', 'hash']) # , 'hashstr']: def test_cleaning_threshold_fixed(self, representation): @@ -368,13 +352,12 @@ def test_cleaning_threshold_fixed(self, representation): k=31, repr=representation, mode='canonical', extra_params="--mask-dummy --count-kmers") - res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 31', out[0]) - self.assertEqual('nodes (k): 331452', out[1]) - self.assertEqual('mode: canonical', out[2]) - self.assertEqual('nnz weights: 331452', out[3]) - self.assertEqual('avg weight: 5.52692', out[4]) + stats = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) + self.assertEqual('31', stats['k']) + self.assertEqual('331452', stats['nodes (k)']) + self.assertEqual('canonical', stats['mode']) + self.assertEqual('331452', stats['nnz weights']) + self.assertEqual('5.52692', stats['avg weight']) @parameterized.expand(['succinct', 'bitmap', 'hash']) # , 'hashstr']: def test_cleaning_prune_tips_threshold_fixed(self, representation): @@ -394,13 +377,12 @@ def test_cleaning_prune_tips_threshold_fixed(self, representation): k=31, repr=representation, mode='canonical', extra_params="--mask-dummy --count-kmers") - res = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('k: 31', out[0]) - self.assertEqual('nodes (k): 331266', out[1]) - self.assertEqual('mode: canonical', out[2]) - self.assertEqual('nnz weights: 331266', out[3]) - self.assertEqual('avg weight: 5.52728', out[4]) + stats = self._get_stats(self.tempdir.name + '/graph_clean' + graph_file_extension[representation]) + self.assertEqual('31', stats['k']) + self.assertEqual('331266', stats['nodes (k)']) + self.assertEqual('canonical', stats['mode']) + self.assertEqual('331266', stats['nnz weights']) + self.assertEqual('5.52728', stats['avg weight']) if __name__ == '__main__': diff --git a/metagraph/integration_tests/test_query.py b/metagraph/integration_tests/test_query.py index 917356e3ac..b8dfcab0c8 100644 --- a/metagraph/integration_tests/test_query.py +++ b/metagraph/integration_tests/test_query.py @@ -86,13 +86,12 @@ def setUpClass(cls): 20, cls.graph_repr, 'basic', '--mask-dummy' if cls.mask_dummy else '') - res = cls._get_stats(f'{cls.tempdir.name}/graph{graph_file_extension[cls.graph_repr]}') - assert(res.returncode == 0) - out = res.stdout.decode().split('\n')[2:] - assert('k: 20' == out[0]) + stats_graph = cls._get_stats(f'{cls.tempdir.name}/graph{graph_file_extension[cls.graph_repr]}') + assert(stats_graph['returncode'] == 0) + assert('20' == stats_graph['k']) if cls.graph_repr != 'succinct' or cls.mask_dummy: - assert('nodes (k): 46960' == out[1]) - assert('mode: basic' == out[2]) + assert('46960' == stats_graph['nodes (k)']) + assert('basic' == stats_graph['mode']) if cls.with_bloom: convert_command = f'{METAGRAPH} transform -o {cls.tempdir.name}/graph \ @@ -122,17 +121,16 @@ def check_suffix(anno_repr, suffix): ) # check annotation - res = cls._get_stats(f'-a {cls.tempdir.name}/annotation{anno_file_extension[cls.anno_repr]}') - assert(res.returncode == 0) - out = res.stdout.decode().split('\n')[2:] - assert('labels: 100' == out[0]) + stats_annotation = cls._get_stats(f'-a {cls.tempdir.name}/annotation{anno_file_extension[cls.anno_repr]}') + assert(stats_annotation['returncode'] == 0) + assert('100' == stats_annotation['labels']) if cls.graph_repr != 'hashfast' and (cls.graph_repr != 'succinct' or cls.mask_dummy): - assert(out[1] in ['objects: 47633', 'objects: 46960']) + assert(stats_graph['max index (k)'] == stats_annotation['objects']) if cls.anno_repr.endswith('_noswap'): cls.anno_repr = cls.anno_repr[:-len('_noswap')] - assert(f'representation: {cls.anno_repr}' == out[3]) + assert(cls.anno_repr == stats_annotation['representation']) def test_query(self): query_command = '{exe} query --batch-size 0 -i {graph} -a {annotation} --min-kmers-fraction-label 1.0 {input}'.format( @@ -574,12 +572,11 @@ def setUpClass(cls): cls._build_graph(cls.fasta_graph, cls.tempdir.name + '/graph', 5, cls.graph_repr, 'basic', '--mask-dummy') - res = cls._get_stats(f'{cls.tempdir.name}/graph{graph_file_extension[cls.graph_repr]}') - assert(res.returncode == 0) - out = res.stdout.decode().split('\n')[2:] - assert('k: 5' == out[0]) - assert('nodes (k): 12' == out[1]) - assert('mode: basic' == out[2]) + stats_graph = cls._get_stats(f'{cls.tempdir.name}/graph{graph_file_extension[cls.graph_repr]}') + assert(stats_graph['returncode'] == 0) + assert(stats_graph['k'] == '5') + assert(stats_graph['nodes (k)'] == '12') + assert(stats_graph['mode'] == 'basic') def check_suffix(anno_repr, suffix): match = anno_repr.endswith(suffix) @@ -597,16 +594,15 @@ def check_suffix(anno_repr, suffix): separate, no_fork_opt, no_anchor_opt) # check annotation - res = cls._get_stats(f'-a {cls.tempdir.name}/annotation{anno_file_extension[cls.anno_repr]}') - assert(res.returncode == 0) - out = res.stdout.decode().split('\n')[2:] - assert('labels: 3' == out[0]) - assert(out[1] in ['objects: 18', 'objects: 12']) + stats_annotation = cls._get_stats(f'-a {cls.tempdir.name}/annotation{anno_file_extension[cls.anno_repr]}') + assert(stats_annotation['returncode'] == 0) + assert(stats_annotation['labels'] == '3') + assert(stats_annotation['objects'] == stats_graph['max index (k)']) if cls.anno_repr.endswith('_noswap'): cls.anno_repr = cls.anno_repr[:-len('_noswap')] - assert(f'representation: {cls.anno_repr}' == out[3]) + assert(cls.anno_repr == stats_annotation['representation']) def test_query_coordinates(self): if not self.anno_repr.endswith('_coord'): @@ -655,13 +651,12 @@ def setUpClass(cls): 20, cls.graph_repr, 'basic', '--mask-dummy' if cls.mask_dummy else '') - res = cls._get_stats(cls.tempdir.name + '/graph' + graph_file_extension[cls.graph_repr]) - assert(res.returncode == 0) - out = res.stdout.decode().split('\n')[2:] - assert('k: 20' == out[0]) + stats_graph = cls._get_stats(cls.tempdir.name + '/graph' + graph_file_extension[cls.graph_repr]) + assert(stats_graph['returncode'] == 0) + assert(stats_graph['k'] == '20') if cls.graph_repr != 'succinct' or cls.mask_dummy: - assert('nodes (k): 46960' == out[1]) - assert('mode: basic' == out[2]) + assert(stats_graph['nodes (k)'] == '46960') + assert(stats_graph['mode'] == 'basic') if cls.with_bloom: convert_command = f'{METAGRAPH} transform -o {cls.tempdir.name}/graph \ @@ -692,17 +687,16 @@ def check_suffix(anno_repr, suffix): ) # check annotation - res = cls._get_stats(f'-a {cls.tempdir.name}/annotation{anno_file_extension[cls.anno_repr]}') - assert(res.returncode == 0) - out = res.stdout.decode().split('\n')[2:] - assert('labels: 1' == out[0]) + stats_annotation = cls._get_stats(f'-a {cls.tempdir.name}/annotation{anno_file_extension[cls.anno_repr]}') + assert(stats_annotation['returncode'] == 0) + assert(stats_annotation['labels'] == '1') if cls.graph_repr != 'hashfast' and (cls.graph_repr != 'succinct' or cls.mask_dummy): - assert(out[1] in ['objects: 47633', 'objects: 46960']) + assert(stats_annotation['objects'] == stats_graph['max index (k)']) if cls.anno_repr.endswith('_noswap'): cls.anno_repr = cls.anno_repr[:-len('_noswap')] - assert('representation: ' + cls.anno_repr == out[3]) + assert(cls.anno_repr == stats_annotation['representation']) def test_query(self): query_command = f'{METAGRAPH} query --batch-size 0 \ @@ -788,13 +782,12 @@ def setUpClass(cls): cls._build_graph((cls.fasta_file_1, cls.fasta_file_2), cls.tempdir.name + '/graph', cls.k, cls.graph_repr, 'basic', '--mask-dummy' if cls.mask_dummy else '') - res = cls._get_stats(f'{cls.tempdir.name}/graph{graph_file_extension[cls.graph_repr]}') - assert(res.returncode == 0) - out = res.stdout.decode().split('\n')[2:] - assert('k: 3' == out[0]) + stats_graph = cls._get_stats(f'{cls.tempdir.name}/graph{graph_file_extension[cls.graph_repr]}') + assert(stats_graph['returncode'] == 0) + assert(stats_graph['k'] == '3') if cls.graph_repr != 'succinct' or cls.mask_dummy: - assert('nodes (k): 12' == out[1]) - assert('mode: basic' == out[2]) + assert(stats_graph['nodes (k)'] == '12') + assert(stats_graph['mode'] == 'basic') if cls.with_bloom: convert_command = f'{METAGRAPH} transform -o {cls.tempdir.name}/graph \ @@ -812,13 +805,12 @@ def setUpClass(cls): ) # check annotation - res = cls._get_stats(f'-a {cls.tempdir.name}/annotation{anno_file_extension[cls.anno_repr]}') - assert(res.returncode == 0) - out = res.stdout.decode().split('\n')[2:] - assert('labels: 2' == out[0]) + stats_annotation = cls._get_stats(f'-a {cls.tempdir.name}/annotation{anno_file_extension[cls.anno_repr]}') + assert(stats_annotation['returncode'] == 0) + assert(stats_annotation['labels'] == '2') if cls.graph_repr != 'hashfast' and (cls.graph_repr != 'succinct' or cls.mask_dummy): - assert('objects: 12' == out[1]) - assert('representation: ' + cls.anno_repr == out[3]) + assert(stats_annotation['objects'] == stats_graph['max index (k)']) + assert(stats_annotation['representation'] == cls.anno_repr) cls.queries = [ 'AAA', @@ -968,13 +960,12 @@ def setUpClass(cls): 20, cls.graph_repr, 'canonical', '--mask-dummy' if cls.mask_dummy else '') - res = cls._get_stats(f'{cls.tempdir.name}/graph{graph_file_extension[cls.graph_repr]}') - assert(res.returncode == 0) - out = res.stdout.decode().split('\n')[2:] - assert('k: 20' == out[0]) + stats_graph = cls._get_stats(f'{cls.tempdir.name}/graph{graph_file_extension[cls.graph_repr]}') + assert(stats_graph['returncode'] == 0) + assert(stats_graph['k'] == '20') if cls.graph_repr != 'succinct' or cls.mask_dummy: - assert('nodes (k): 91584' == out[1]) - assert('mode: canonical' == out[2]) + assert(stats_graph['nodes (k)'] == '91584') + assert(stats_graph['mode'] == 'canonical') if cls.with_bloom: convert_command = f'{METAGRAPH} transform -o {cls.tempdir.name}/graph \ @@ -991,17 +982,16 @@ def setUpClass(cls): ) # check annotation - res = cls._get_stats(f'-a {cls.tempdir.name}/annotation{anno_file_extension[cls.anno_repr]}') - assert(res.returncode == 0) - out = res.stdout.decode().split('\n')[2:] - assert('labels: 100' == out[0]) + stats_annotation = cls._get_stats(f'-a {cls.tempdir.name}/annotation{anno_file_extension[cls.anno_repr]}') + assert(stats_annotation['returncode'] == 0) + assert(stats_annotation['labels'] == '100') if cls.graph_repr != 'hashfast' and (cls.graph_repr != 'succinct' or cls.mask_dummy): - assert('objects: 91584' == out[1]) + assert(stats_annotation['objects'] == stats_graph['max index (k)']) if cls.anno_repr.endswith('_noswap'): cls.anno_repr = cls.anno_repr[:-len('_noswap')] - assert('representation: ' + cls.anno_repr == out[3]) + assert(cls.anno_repr == stats_annotation['representation']) def test_query(self): query_command = '{exe} query --batch-size 0 -i {graph} -a {annotation} --min-kmers-fraction-label 1.0 {input}'.format( @@ -1135,13 +1125,12 @@ def setUpClass(cls): 20, cls.graph_repr, 'primary', '--mask-dummy' if cls.mask_dummy else '') - res = cls._get_stats(f'{cls.tempdir.name}/graph{graph_file_extension[cls.graph_repr]}') - assert(res.returncode == 0) - out = res.stdout.decode().split('\n')[2:] - assert('k: 20' == out[0]) + stats_graph = cls._get_stats(f'{cls.tempdir.name}/graph{graph_file_extension[cls.graph_repr]}') + assert(stats_graph['returncode'] == 0) + assert(stats_graph['k'] == '20') if cls.graph_repr != 'succinct' or cls.mask_dummy: - assert('nodes (k): 45792' == out[1]) - assert('mode: primary' == out[2]) + assert(stats_graph['nodes (k)'] == '45792') + assert(stats_graph['mode'] == 'primary') if cls.with_bloom: convert_command = f'{METAGRAPH} transform -o {cls.tempdir.name}/graph \ @@ -1158,17 +1147,16 @@ def setUpClass(cls): ) # check annotation - res = cls._get_stats(f'-a {cls.tempdir.name}/annotation{anno_file_extension[cls.anno_repr]}') - assert(res.returncode == 0) - out = res.stdout.decode().split('\n')[2:] - assert('labels: 100' == out[0]) + stats_annotation = cls._get_stats(f'-a {cls.tempdir.name}/annotation{anno_file_extension[cls.anno_repr]}') + assert(stats_annotation['returncode'] == 0) + assert(stats_annotation['labels'] == '100') if cls.graph_repr != 'hashfast' and (cls.graph_repr != 'succinct' or cls.mask_dummy): - assert('objects: 45792' == out[1]) + assert(stats_annotation['objects'] == stats_graph['max index (k)']) if cls.anno_repr.endswith('_noswap'): cls.anno_repr = cls.anno_repr[:-len('_noswap')] - assert('representation: ' + cls.anno_repr == out[3]) + assert(cls.anno_repr == stats_annotation['representation']) def test_query(self): query_command = '{exe} query --batch-size 0 -i {graph} -a {annotation} --min-kmers-fraction-label 1.0 {input}'.format( diff --git a/metagraph/integration_tests/test_transform_anno.py b/metagraph/integration_tests/test_transform_anno.py index 3961c98bbf..6d76826b09 100644 --- a/metagraph/integration_tests/test_transform_anno.py +++ b/metagraph/integration_tests/test_transform_anno.py @@ -30,12 +30,14 @@ def setUpClass(cls): cls.tempdir.name + '/graph', 20, cls.graph_repr, 'basic', '--mask-dummy') - res = cls._get_stats(f'{cls.tempdir.name}/graph{graph_file_extension[cls.graph_repr]}') - assert(res.returncode == 0) - out = res.stdout.decode().split('\n')[2:] - assert('k: 20' == out[0]) - assert('nodes (k): 46960' == out[1]) - assert('mode: basic' == out[2]) + stats_graph = cls._get_stats(f'{cls.tempdir.name}/graph{graph_file_extension[cls.graph_repr]}') + assert(stats_graph['returncode'] == 0) + assert(stats_graph['k'] == '20') + assert(stats_graph['nodes (k)'] == '46960') + assert(stats_graph['mode'] == 'basic') + + cls.num_nodes = stats_graph['nodes (k)'] + cls.max_index = stats_graph['max index (k)'] cls._annotate_graph( TEST_DATA_DIR + '/transcripts_100.fa', @@ -52,14 +54,15 @@ def setUp(self): self.annotation = f'annotation{anno_file_extension[self.anno_repr]}'; # check annotation - res = self._get_stats(f'-a {self.annotation}') - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('labels: 100', out[0]) - self.assertIn((out[1], out[2]), [ - ('objects: 47633', 'density: 0.0182458'), # DBGSuccinct with dummy nodes - ('objects: 46960', 'density: 0.0185072')]) - self.assertEqual(f'representation: {self.anno_repr}', out[3]) + stats_annotation = self._get_stats(f'-a {self.annotation}') + self.assertEqual(stats_annotation['returncode'], 0) + self.assertEqual(stats_annotation['labels'], '100') + self.assertEqual(stats_annotation['objects'], self.max_index) + self.assertAlmostEqual( + float(stats_annotation['density']), + 0.0185072 * int(self.num_nodes) / int(self.max_index), + places=6) + self.assertEqual(stats_annotation['representation'], self.anno_repr) def tearDown(self): os.chdir(self.old_cwd) @@ -79,16 +82,15 @@ def _check_aggregation_min(self, min_count, expected_density): res = subprocess.run(command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) - res = self._get_stats(f'-a aggregated{anno_file_extension[self.anno_repr]}') - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('labels: 1', out[0]) - split = out[2].split() - self.assertEqual(out[2], split[0] + ' ' + split[1]) - density = float(split[1]) - self.assertIn(out[1], ['objects: 47633', 'objects: 46960']) - self.assertLess(abs(density - expected_density * 46960 / 47633), 10**-6) - self.assertEqual(f'representation: {self.anno_repr}', out[3]) + stats_annotation = self._get_stats(f'-a aggregated{anno_file_extension[self.anno_repr]}') + self.assertEqual(stats_annotation['returncode'], 0) + self.assertEqual(stats_annotation['labels'], '1') + self.assertEqual(stats_annotation['objects'], self.max_index) + self.assertAlmostEqual( + float(stats_annotation['density']), + float(expected_density) * int(self.num_nodes) / int(self.max_index), + places=5) + self.assertEqual(stats_annotation['representation'], self.anno_repr) def test_aggregate_columns(self): self._check_aggregation_min(0, 1) @@ -104,15 +106,15 @@ def _check_aggregation_min_max_value(self, min_count, max_value, expected_densit res = subprocess.run(command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) - res = self._get_stats(f'-a aggregated{anno_file_extension[self.anno_repr]}') - self.assertEqual(res.returncode, 0) - out = res.stdout.decode().split('\n')[2:] - self.assertEqual('labels: 1', out[0]) - split = out[2].split() - self.assertEqual(out[2], split[0] + ' ' + split[1]) - density = float(split[1]) - self.assertIn(out[1], ['objects: 47633', 'objects: 46960']) - self.assertLess(abs(density - expected_density * 46960 / 47633), 10**-6) + stats_annotation = self._get_stats(f'-a aggregated{anno_file_extension[self.anno_repr]}') + self.assertEqual(stats_annotation['returncode'], 0) + self.assertEqual(stats_annotation['labels'], '1') + self.assertEqual(stats_annotation['objects'], self.max_index) + self.assertAlmostEqual( + float(stats_annotation['density']), + float(expected_density) * int(self.num_nodes) / int(self.max_index), + places=5) + self.assertEqual(stats_annotation['representation'], self.anno_repr) def test_aggregate_columns_filtered(self): self._check_aggregation_min_max_value(0, 0, 0) diff --git a/metagraph/src/cli/stats.cpp b/metagraph/src/cli/stats.cpp index f4a2cee9c6..c29ef07c09 100644 --- a/metagraph/src/cli/stats.cpp +++ b/metagraph/src/cli/stats.cpp @@ -76,6 +76,7 @@ void print_stats(const graph::DeBruijnGraph &graph, bool print_counts_hist) { std::cout << "====================== GRAPH STATS =====================" << std::endl; std::cout << "k: " << graph.get_k() << std::endl; std::cout << "nodes (k): " << graph.num_nodes() << std::endl; + std::cout << "max index (k): " << graph.max_index() << std::endl; std::cout << "mode: " << Config::graphmode_to_string(graph.get_mode()) << std::endl; if (auto weights = graph.get_extension()) { @@ -143,7 +144,6 @@ void print_stats(const graph::DeBruijnGraph &graph, bool print_counts_hist) { std::cout << std::endl; } } - std::cout << "========================================================" << std::endl; } From 3bc714dd99eb054a7752721388b77f693324b827 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 4 Nov 2024 20:17:26 +0100 Subject: [PATCH 28/29] Apply review suggestions --- metagraph/src/annotation/row_diff_builder.cpp | 2 +- .../representation/succinct/dbg_succinct.hpp | 4 +- .../annotation/row_diff/test_row_diff.cpp | 76 ++++++++++++------- .../tests/annotation/test_converters.cpp | 40 +++++----- 4 files changed, 73 insertions(+), 49 deletions(-) diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index 7c19ba98c0..9060a33391 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -354,7 +354,7 @@ node_index row_diff_successor(const graph::DeBruijnGraph &graph, } else { node_index succ = graph::DeBruijnGraph::npos; graph.adjacent_outgoing_nodes(node, [&](node_index adjacent_node) { - if(rd_succ[adjacent_node]) { + if (rd_succ[adjacent_node]) { succ = adjacent_node; } }); diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp index 77a603b150..8e92510ba9 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.hpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.hpp @@ -180,8 +180,8 @@ class DBGSuccinct : public DeBruijnGraph { bool is_valid(node_index node) const; node_index validate_edge(node_index node) const; - node_index select_node(uint64_t boss_index) const; - uint64_t rank_node(node_index kmer_index) const; + node_index select_node(uint64_t rank) const; + uint64_t rank_node(node_index node) const; void initialize_bloom_filter_from_fpr(double false_positive_rate, uint32_t max_num_hash_functions = -1); diff --git a/metagraph/tests/annotation/row_diff/test_row_diff.cpp b/metagraph/tests/annotation/row_diff/test_row_diff.cpp index 4eb7b2e0fb..5bf9d0416b 100644 --- a/metagraph/tests/annotation/row_diff/test_row_diff.cpp +++ b/metagraph/tests/annotation/row_diff/test_row_diff.cpp @@ -17,6 +17,10 @@ using ::testing::_; using mtg::annot::matrix::RowDiff; using mtg::annot::matrix::ColumnMajor; +static auto graph_to_anno_index(graph::DeBruijnGraph::node_index node) { + return graph::AnnotatedDBG::graph_to_anno_index(node); +} + typedef RowDiff::anchor_bv_type anchor_bv_type; TEST(RowDiff, Empty) { @@ -197,10 +201,10 @@ TEST(RowDiff, GetAnnotationMasked) { cols_concrete[0].resize(graph.max_index() + 1); cols_concrete[1].resize(graph.max_index() + 1); graph.call_nodes([&](auto i) { - auto rank = graph.rank_node(i) - 1; - bterminal[i - 1] = bterminal_masked[rank]; - cols_concrete[0][i - 1] = cols_masked[0][rank]; - cols_concrete[1][i - 1] = cols_masked[1][rank]; + auto rank = graph_to_anno_index(graph.rank_node(i)); + bterminal[graph_to_anno_index(i)] = bterminal_masked[rank]; + cols_concrete[0][graph_to_anno_index(i)] = cols_masked[0][rank]; + cols_concrete[1][graph_to_anno_index(i)] = cols_masked[1][rank]; }); anchor_bv_type terminal(bterminal); utils::TempFile fterm_temp; @@ -216,30 +220,37 @@ TEST(RowDiff, GetAnnotationMasked) { RowDiff annot(&graph, std::move(mat)); annot.load_anchor(fterm_temp.name()); - EXPECT_EQ("CTAG", graph.get_node_sequence(graph.select_node(1))); - ASSERT_THAT(annot.get_rows({graph.select_node(1) - 1})[0], ElementsAre(0, 1)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(1))})[0], + ElementsAre(0, 1)); EXPECT_EQ("AGCT", graph.get_node_sequence(graph.select_node(2))); - ASSERT_THAT(annot.get_rows({graph.select_node(2) - 1})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(2))})[0], + ElementsAre(1)); EXPECT_EQ("CTCT", graph.get_node_sequence(graph.select_node(3))); - ASSERT_THAT(annot.get_rows({graph.select_node(3) - 1})[0], ElementsAre(0)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(3))})[0], + ElementsAre(0)); EXPECT_EQ("TAGC", graph.get_node_sequence(graph.select_node(4))); - ASSERT_THAT(annot.get_rows({graph.select_node(4) - 1})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(4))})[0], + ElementsAre(1)); EXPECT_EQ("ACTA", graph.get_node_sequence(graph.select_node(5))); - ASSERT_THAT(annot.get_rows({graph.select_node(5) - 1})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(5))})[0], + ElementsAre(1)); EXPECT_EQ("ACTC", graph.get_node_sequence(graph.select_node(6))); - ASSERT_THAT(annot.get_rows({graph.select_node(6) - 1})[0], ElementsAre(0)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(6))})[0], + ElementsAre(0)); EXPECT_EQ("GCTA", graph.get_node_sequence(graph.select_node(7))); - ASSERT_THAT(annot.get_rows({graph.select_node(7) - 1})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(7))})[0], + ElementsAre(1)); EXPECT_EQ("TCTA", graph.get_node_sequence(graph.select_node(8))); - ASSERT_THAT(annot.get_rows({graph.select_node(8) - 1})[0], ElementsAre(0)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(8))})[0], + ElementsAre(0)); } /** @@ -321,10 +332,10 @@ TEST(RowDiff, GetAnnotationBifurcationMasked) { cols_concrete[0].resize(graph.max_index() + 1); cols_concrete[1].resize(graph.max_index() + 1); graph.call_nodes([&](auto i) { - auto rank = graph.rank_node(i) - 1; - bterminal[i - 1] = bterminal_masked[rank]; - cols_concrete[0][i - 1] = cols_masked[0][rank]; - cols_concrete[1][i - 1] = cols_masked[1][rank]; + auto rank = graph_to_anno_index(graph.rank_node(i)); + bterminal[graph_to_anno_index(i)] = bterminal_masked[rank]; + cols_concrete[0][graph_to_anno_index(i)] = cols_masked[0][rank]; + cols_concrete[1][graph_to_anno_index(i)] = cols_masked[1][rank]; }); anchor_bv_type terminal(bterminal); utils::TempFile fterm_temp; @@ -345,34 +356,43 @@ TEST(RowDiff, GetAnnotationBifurcationMasked) { annot.load_anchor(fterm_temp.name()); EXPECT_EQ("CTAG", graph.get_node_sequence(graph.select_node(1))); - ASSERT_THAT(annot.get_rows({graph.select_node(1) - 1})[0], ElementsAre(0, 1)); - + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(1))})[0], + ElementsAre(0, 1)); EXPECT_EQ("CTAT", graph.get_node_sequence(graph.select_node(2))); - ASSERT_THAT(annot.get_rows({graph.select_node(2) - 1})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(2))})[0], + ElementsAre(1)); EXPECT_EQ("TACT", graph.get_node_sequence(graph.select_node(3))); - ASSERT_THAT(annot.get_rows({graph.select_node(3) - 1})[0], ElementsAre(0)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(3))})[0], + ElementsAre(0)); EXPECT_EQ("AGCT", graph.get_node_sequence(graph.select_node(4))); - ASSERT_THAT(annot.get_rows({graph.select_node(4) - 1})[0], ElementsAre(0, 1)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(4))})[0], + ElementsAre(0, 1)); EXPECT_EQ("CTCT", graph.get_node_sequence(graph.select_node(5))); - ASSERT_THAT(annot.get_rows({graph.select_node(5) - 1})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(5))})[0], + ElementsAre(1)); EXPECT_EQ("TAGC", graph.get_node_sequence(graph.select_node(6))); - ASSERT_THAT(annot.get_rows({graph.select_node(6) - 1})[0], ElementsAre(0, 1)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(6))})[0], + ElementsAre(0, 1)); EXPECT_EQ("ACTA", graph.get_node_sequence(graph.select_node(7))); - ASSERT_THAT(annot.get_rows({graph.select_node(7) - 1})[0], ElementsAre(0)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(7))})[0], + ElementsAre(0)); EXPECT_EQ("ACTC", graph.get_node_sequence(graph.select_node(8))); - ASSERT_THAT(annot.get_rows({graph.select_node(8) - 1})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(8))})[0], + ElementsAre(1)); EXPECT_EQ("GCTA", graph.get_node_sequence(graph.select_node(9))); - ASSERT_THAT(annot.get_rows({graph.select_node(9) - 1})[0], ElementsAre(0, 1)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(9))})[0], + ElementsAre(0, 1)); EXPECT_EQ("TCTA", graph.get_node_sequence(graph.select_node(10))); - ASSERT_THAT(annot.get_rows({graph.select_node(10) - 1})[0], ElementsAre(1)); + ASSERT_THAT(annot.get_rows({graph_to_anno_index(graph.select_node(10))})[0], + ElementsAre(1)); } } // namespace diff --git a/metagraph/tests/annotation/test_converters.cpp b/metagraph/tests/annotation/test_converters.cpp index 85b2a2d5dc..20900179aa 100644 --- a/metagraph/tests/annotation/test_converters.cpp +++ b/metagraph/tests/annotation/test_converters.cpp @@ -19,6 +19,13 @@ using namespace mtg; using namespace mtg::annot; using namespace ::testing; +static auto graph_to_anno_index(graph::DeBruijnGraph::node_index node) { + return graph::AnnotatedDBG::graph_to_anno_index(node); +} +static auto anno_to_graph_index(graph::AnnotatedDBG::row_index row) { + return graph::AnnotatedDBG::anno_to_graph_index(row); +} + const std::string test_data_dir = "../tests/data"; const std::string test_dump_basename = test_data_dir + "/dump_test"; const std::string test_dump_basename_row_compressed_merge = test_dump_basename + "_row_compressed_merge"; @@ -210,8 +217,11 @@ TEST(RowDiff, succ) { sdsl::int_vector_buffer succ(succ_file, std::ios::in); ASSERT_EQ(expected_succ.size(), succ.size()); - for (uint32_t i = 0; i < expected_succ.size(); ++i) { - EXPECT_EQ(expected_succ[i] + 1, graph->rank_node(succ[i] + 1)) << max_depth << " " << i; + for (uint32_t i = 0; i < succ.size(); ++i) { + EXPECT_EQ( + anno_to_graph_index(expected_succ[i]), + graph->rank_node(anno_to_graph_index(succ[i])) + ) << max_depth << " " << i; } sdsl::int_vector_buffer<1> succ_boundary(succ_boundary_file, std::ios::in); @@ -223,7 +233,10 @@ TEST(RowDiff, succ) { sdsl::int_vector_buffer pred(pred_file, std::ios::in); EXPECT_EQ(expected_pred.size(), pred.size()); for (uint32_t i = 0; i < pred.size(); ++i) { - EXPECT_EQ(expected_pred[i] + 1, graph->rank_node(pred[i] + 1)) << max_depth << " " << i; + EXPECT_EQ( + anno_to_graph_index(expected_pred[i]), + graph->rank_node(anno_to_graph_index(pred[i])) + ) << max_depth << " " << i; } sdsl::int_vector_buffer<1> pred_boundary(pred_boundary_file, std::ios::in); @@ -388,12 +401,10 @@ void test_row_diff(uint32_t k, ColumnCompressed initial_annotation(graph->max_index()); std::unordered_set all_labels; - uint32_t anno_idx = 0; graph->call_nodes([&](uint32_t node_idx) { - const std::vector &labels = annotations[anno_idx]; - initial_annotation.add_labels({node_idx - 1}, labels); + const auto &labels = annotations[graph_to_anno_index(graph->rank_node(node_idx))]; + initial_annotation.add_labels({graph_to_anno_index(node_idx)}, labels); std::for_each(labels.begin(), labels.end(), [&](auto l) { all_labels.insert(l); }); - ++anno_idx; }); initial_annotation.serialize(annot_fname); @@ -410,16 +421,11 @@ void test_row_diff(uint32_t k, ASSERT_EQ(all_labels.size(), annotator.num_labels()); ASSERT_EQ(graph->max_index(), annotator.num_objects()); - anno_idx = 0; graph->call_nodes([&](uint32_t node_idx) { - ASSERT_THAT(annotator.get_labels(node_idx - 1), - UnorderedElementsAreArray(annotations[anno_idx])); - ++anno_idx; + ASSERT_THAT(annotator.get_labels(graph_to_anno_index(node_idx)), + UnorderedElementsAreArray(annotations[graph_to_anno_index(graph->rank_node(node_idx))])); }); - for (uint32_t anno_idx = 0; anno_idx < graph->max_index(); ++anno_idx) { - } - std::filesystem::remove_all(dst_dir); } @@ -444,12 +450,10 @@ void test_row_diff_separate_columns(uint32_t k, graph->serialize(graph_fname); std::map> col_annotations; - uint32_t anno_idx = 0; graph->call_nodes([&](auto node_idx) { - for (const auto &label : annotations[anno_idx]) { - col_annotations[label].push_back(node_idx - 1); + for (const auto &label : annotations[graph_to_anno_index(graph->rank_node(node_idx))]) { + col_annotations[label].push_back(graph_to_anno_index(node_idx)); } - ++anno_idx; }); for (const auto& [label, indices] : col_annotations) { From 3a87651e08919720a75619537bf4a79b637a678e Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 4 Nov 2024 20:50:20 +0100 Subject: [PATCH 29/29] Use valid_edges_->call_ones --- .../src/graph/representation/succinct/dbg_succinct.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp index bf46dbb865..a700360aef 100644 --- a/metagraph/src/graph/representation/succinct/dbg_succinct.cpp +++ b/metagraph/src/graph/representation/succinct/dbg_succinct.cpp @@ -194,6 +194,16 @@ void DBGSuccinct::adjacent_incoming_nodes(node_index node, void DBGSuccinct::call_nodes(const std::function &callback, const std::function &terminate) const { + if (valid_edges_) { + try { + valid_edges_->call_ones([&](uint64_t i) { + callback(i); + if (terminate()) + throw early_term(); + }); + } catch (early_term&) {} + return; + } for (node_index i = 1; i <= max_index() && !terminate(); ++i) { if (is_valid(i)) { callback(i);