diff --git a/include/multiseries/multiseries_record.hpp b/include/multiseries/multiseries_record.hpp index 69c18a5..7260805 100644 --- a/include/multiseries/multiseries_record.hpp +++ b/include/multiseries/multiseries_record.hpp @@ -142,22 +142,6 @@ class basic_record_store { return m_series.size() - 1; } - /// \brief Returns the series data of a record - /// \param series_name The name of the series - /// \param record_id The record ID - /// \return The series data - /// If the series data does not exist, it throws a runtime error. - template - const auto get(const std::string_view series_name, - const record_id_type record_id) const { - priv_series_type_check(); - auto itr = priv_find_series(series_name); - if (itr == m_series.end()) { - throw std::runtime_error("Series not found: " + std::string(series_name)); - } - return priv_get_series_data(itr->container, record_id); - } - /// \brief Returns the series data of a record /// \param series_index The index of the series /// \param record_id The record ID @@ -229,21 +213,6 @@ class basic_record_store { return to_return_record; } - /// \brief Returns if a series data of a record is None (does not exist) - bool is_none(const std::string_view series_name, - const record_id_type record_id) const { - auto itr = priv_find_series(series_name); - if (itr == m_series.end()) { - return true; - } - - return !std::visit( - [&record_id](const auto &container) { - return container.contains(record_id); - }, - itr->container); - } - /// \brief Returns if a series data of a record is None (does not exist) bool is_none(const series_index_type series_index, const record_id_type record_id) const { @@ -259,21 +228,6 @@ class basic_record_store { /// \brief Set a series data of a record (row) /// TODO: explore the use of templated variadic variants. (See Roger) - /// template - /// void foo(std::variant) {} - - template - void set(const std::string_view series_name, const record_id_type record_id, - T value) { - priv_series_type_check(); - auto itr = priv_find_series(series_name); - if (itr == m_series.end()) { - throw std::runtime_error("Series not found: " + std::string(series_name)); - } - - priv_set_series_data(*itr, record_id, value); - } - template void set(const series_index_type series_index, const record_id_type record_id, T value) { @@ -331,25 +285,6 @@ class basic_record_store { itr->container); } - /// \brief Loop over all records of a series, skipping None values. - /// series_func_t: [](int record_id, auto single_series_value) {} - template - void for_all(const std::string_view series_name, - series_func_t series_func) const { - auto itr = priv_find_series(series_name); - if (itr == m_series.end()) { - throw std::runtime_error("Series not found: " + std::string(series_name)); - } - - const auto &container = - priv_get_series_container(itr->container); - for (size_t i = 0; i < m_record_status.size(); ++i) { - if (m_record_status[i] && container.contains(i)) { - series_func(i, container.at(i)); - } - } - } - template void for_all(const series_index_type series_index, series_func_t series_func) const { @@ -358,7 +293,7 @@ class basic_record_store { } const auto &container = - priv_get_series_container(m_series[series_index]); + priv_get_series_container(m_series[series_index].container); for (size_t i = 0; i < m_record_status.size(); ++i) { if (m_record_status[i] && container.contains(i)) { series_func(i, container.at(i)); @@ -475,25 +410,6 @@ class basic_record_store { return series_names; } - /// \brief Remove a single data - bool remove(const std::string_view series_name, - const record_id_type record_id) { - auto itr = priv_find_series(series_name); - if (itr == m_series.end()) { - return false; - } - - bool to_return = false; - std::visit( - [&record_id, &to_return](auto &container) { - if (container.contains(record_id)) { - to_return = container.erase(record_id); - } - }, - itr->container); - return to_return; - } - /// \brief Remove a single data bool remove(const series_index_type series_index, const record_id_type record_id) { diff --git a/src/examples/multiseries/demo_series_container.cpp b/src/examples/multiseries/demo_series_container.cpp index a556905..22abcb8 100644 --- a/src/examples/multiseries/demo_series_container.cpp +++ b/src/examples/multiseries/demo_series_container.cpp @@ -27,9 +27,7 @@ using namespace multiseries; -void parse_option(int argc, - char *argv[], - std::filesystem::path &metall_path, +void parse_option(int argc, char *argv[], std::filesystem::path &metall_path, size_t &num_records) { int opt_char; while ((opt_char = getopt(argc, argv, "d:n:")) != -1) { @@ -44,14 +42,13 @@ void parse_option(int argc, } } -using record_store_type = basic_record_store >; +using record_store_type = + basic_record_store >; using string_store_type = record_store_type::string_store_type; -template +template void run_bench(const std::filesystem::path &metall_path, - const size_t num_records, - const container_kind kind, + const size_t num_records, const container_kind kind, generator_type &&generator) { metall::manager manager(metall::create_only, metall_path); @@ -63,46 +60,39 @@ void run_bench(const std::filesystem::path &metall_path, record_store->add_series("data", kind); for (int64_t i = 0; i < num_records; ++i) { const auto record_id = record_store->add_record(); - record_store->set("data", record_id, generator()); + const auto demo_id = record_store->find_series("data"); + record_store->set(demo_id, record_id, generator()); } - std::cout << "Total #of records: " << record_store->num_records() << - std::endl; + std::cout << "Total #of records: " << record_store->num_records() + << std::endl; std::cout << "#of unique strings: " << string_store->size() << std::endl; std::cout << get_dir_usage(metall_path) << std::endl; } int main(int argc, char **argv) { std::filesystem::path metall_path{"./metall_data"}; - size_t num_records = 1'000'000; + size_t num_records = 1'000'000; parse_option(argc, argv, metall_path, num_records); std::cout << "Ingest bool values" << std::endl; std::cout << "Dense container" << std::endl; - run_bench(metall_path, - num_records, - container_kind::dense, + run_bench(metall_path, num_records, container_kind::dense, []() { return bool(std::rand() % 2); }); std::cout << "Sparse container" << std::endl; - run_bench(metall_path, - num_records, - container_kind::sparse, + run_bench(metall_path, num_records, container_kind::sparse, []() { return bool(std::rand() % 2); }); std::cout << "----------" << std::endl; std::cout << "Ingest int64_t values" << std::endl; std::cout << "Dense container" << std::endl; - run_bench(metall_path, - num_records, - container_kind::dense, + run_bench(metall_path, num_records, container_kind::dense, []() { return std::rand(); }); std::cout << "Sparse container" << std::endl; - run_bench(metall_path, - num_records, - container_kind::sparse, + run_bench(metall_path, num_records, container_kind::sparse, []() { return std::rand(); }); std::cout << "----------" << std::endl; @@ -111,19 +101,13 @@ int main(int argc, char **argv) { boost::uuids::random_generator gen; std::cout << "Sample UUID: " << boost::uuids::to_string(gen()) << std::endl; std::cout << "Dense container" << std::endl; - run_bench(metall_path, - num_records, - container_kind::dense, - [&gen]() -> std::string { - return boost::uuids::to_string(gen()); - }); + run_bench( + metall_path, num_records, container_kind::dense, + [&gen]() -> std::string { return boost::uuids::to_string(gen()); }); std::cout << "Sparse container" << std::endl; - run_bench(metall_path, - num_records, - container_kind::sparse, - [&gen]() -> std::string { - return boost::uuids::to_string(gen()); - }); + run_bench( + metall_path, num_records, container_kind::sparse, + [&gen]() -> std::string { return boost::uuids::to_string(gen()); }); return 0; } diff --git a/src/examples/multiseries/find_max_single.cpp b/src/examples/multiseries/find_max_single.cpp index 4ee84a2..50f3cbd 100644 --- a/src/examples/multiseries/find_max_single.cpp +++ b/src/examples/multiseries/find_max_single.cpp @@ -23,7 +23,7 @@ #include using record_store_type = - multiseries::basic_record_store>; + multiseries::basic_record_store>; using string_store_type = record_store_type::string_store_type; struct option { @@ -68,10 +68,10 @@ void show_usage(std::ostream &os) { auto start_timer() { return std::chrono::high_resolution_clock::now(); } auto get_elapsed_time_seconds( - const std::chrono::time_point &start) { + const std::chrono::time_point &start) { const auto end = std::chrono::high_resolution_clock::now(); return std::chrono::duration_cast>(end - start) - .count(); + .count(); } int main(int argc, char *argv[]) { @@ -91,7 +91,7 @@ int main(int argc, char *argv[]) { metall::manager manager(metall::open_read_only, opt.metall_path); auto *record_store = - manager.find(metall::unique_instance).first; + manager.find(metall::unique_instance).first; if (!record_store) { std::cerr << "Failed to find record store in " + opt.metall_path.string() << std::endl; @@ -110,8 +110,9 @@ int main(int argc, char *argv[]) { using value_type = int64_t; std::cout << "Value type is: " << typeid(value_type).name() << std::endl; - value_type max_value = std::numeric_limits::min(); - record_store->for_all(series_name, + value_type max_value = std::numeric_limits::min(); + auto series_idx = record_store->find_series(series_name); + record_store->for_all(series_idx, [&](const auto, const auto value) { max_value = std::max(max_value, value); }); diff --git a/src/examples/multiseries/ingest_parquet.cpp b/src/examples/multiseries/ingest_parquet.cpp index 6ca4798..650dcc8 100644 --- a/src/examples/multiseries/ingest_parquet.cpp +++ b/src/examples/multiseries/ingest_parquet.cpp @@ -26,8 +26,8 @@ #include "utils.hpp" -using record_store_type = multiseries::basic_record_store< - metall::manager::allocator_type >; +using record_store_type = + multiseries::basic_record_store >; using string_store_type = record_store_type::string_store_type; struct option { @@ -87,13 +87,13 @@ int main(int argc, char **argv) { ygm::utility::timer setup_timer; metall::utility::metall_mpi_adaptor mpi_adaptor( - metall::create_only, opt.metall_path, comm.get_mpi_comm()); + metall::create_only, opt.metall_path, comm.get_mpi_comm()); auto &manager = mpi_adaptor.get_local_manager(); auto *string_store = manager.construct( - metall::unique_instance)(manager.get_allocator()); + metall::unique_instance)(manager.get_allocator()); auto *record_store = manager.construct( - metall::unique_instance)(string_store, manager.get_allocator()); + metall::unique_instance)(string_store, manager.get_allocator()); ygm::io::parquet_parser parquetp(comm, {opt.input_path}); const auto &schema = parquetp.get_schema(); @@ -129,35 +129,35 @@ int main(int argc, char **argv) { continue; // Leave the field empty for None/NaN values } - const auto &name = schema[i].name; + auto name_idx = record_store->find_series(schema[i].name); std::visit( - [&record_store, &record_id, &name, &opt](auto &&field) { - using T = std::decay_t; - if constexpr (std::is_same_v || - std::is_same_v) { - record_store->set(name, record_id, field); - if (opt.profile) { - total_ingested_bytes += sizeof(T); - } - } else if constexpr (std::is_same_v || - std::is_same_v) { - record_store->set(name, record_id, field); - if (opt.profile) { - total_ingested_bytes += sizeof(T); - } - } else if constexpr (std::is_same_v) { - record_store->set(name, record_id, field); - - if (opt.profile) { - total_ingested_str_size += field.size(); - total_ingested_bytes += field.size(); // Assume ASCII - ++total_num_strs; - } - } else { - throw std::runtime_error("Unsupported type"); + [&record_store, &record_id, name_idx, &opt](auto &&field) { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v) { + record_store->set(name_idx, record_id, field); + if (opt.profile) { + total_ingested_bytes += sizeof(T); } - }, - std::move(field)); + } else if constexpr (std::is_same_v || + std::is_same_v) { + record_store->set(name_idx, record_id, field); + if (opt.profile) { + total_ingested_bytes += sizeof(T); + } + } else if constexpr (std::is_same_v) { + record_store->set(name_idx, record_id, field); + + if (opt.profile) { + total_ingested_str_size += field.size(); + total_ingested_bytes += field.size(); // Assume ASCII + ++total_num_strs; + } + } else { + throw std::runtime_error("Unsupported type"); + } + }, + std::move(field)); } }); comm.barrier(); @@ -177,7 +177,7 @@ int main(int argc, char **argv) { comm.cout0() << "Series name, Load factor" << std::endl; for (const auto &s : schema) { const auto ave_load_factor = - ygm::sum(record_store->load_factor(s.name), comm) / comm.size(); + ygm::sum(record_store->load_factor(s.name), comm) / comm.size(); comm.cout0() << " " << s.name << ", " << ave_load_factor << std::endl; } diff --git a/src/libmetalldata/metall_graph.cpp b/src/libmetalldata/metall_graph.cpp index dc36dd7..9c19441 100644 --- a/src/libmetalldata/metall_graph.cpp +++ b/src/libmetalldata/metall_graph.cpp @@ -255,8 +255,6 @@ metall_graph::return_code metall_graph::ingest_parquet_edges( } } - auto metall_edges = m_pedges; - size_t local_num_edges = 0; size_t local_num_nodes = m_pnode_to_idx->size(); static metall_graph* sthis = nullptr; @@ -264,9 +262,9 @@ metall_graph::return_code metall_graph::ingest_parquet_edges( parquetp.for_all( parquet_cols, [&](const std::vector& row) { - auto rec = metall_edges->add_record(); + auto rec = m_pedges->add_record(); // first, set the directedness. - metall_edges->set(DIR_COL.unqualified(), rec, directed); + m_pedges->set(m_dir_col_idx, rec, directed); for (size_t i = 0; i < parquet_cols.size(); ++i) { auto parquet_ser = parquet_cols[i]; @@ -277,7 +275,9 @@ metall_graph::return_code metall_graph::ingest_parquet_edges( auto parquet_val = row[i]; - auto metall_ser = parquet_to_metall[parquet_ser]; + auto metall_ser = parquet_to_metall[parquet_ser]; + series_index_type metall_ser_idx = + m_pedges->find_series(metall_ser.unqualified()); auto add_val = [&](const auto& val) { using T = std::decay_t; @@ -286,17 +286,13 @@ metall_graph::return_code metall_graph::ingest_parquet_edges( if constexpr (std::is_same_v) { // do nothing } else if constexpr (std::is_same_v) { - metall_edges->set(metall_ser.unqualified(), rec, - static_cast(val)); + m_pedges->set(metall_ser_idx, rec, static_cast(val)); } else if constexpr (std::is_same_v) { - metall_edges->set(metall_ser.unqualified(), rec, - static_cast(val)); + m_pedges->set(metall_ser_idx, rec, static_cast(val)); } else if constexpr (std::is_same_v) { - metall_edges->set(metall_ser.unqualified(), rec, - static_cast(val)); + m_pedges->set(metall_ser_idx, rec, static_cast(val)); } else if constexpr (std::is_same_v) { - metall_edges->set(metall_ser.unqualified(), rec, - std::string_view(val)); + m_pedges->set(metall_ser_idx, rec, std::string_view(val)); // if this is u or v, add to the distributed nodeset. if (metall_ser == U_COL || metall_ser == V_COL) { int owner = m_partitioner.owner(val); @@ -308,7 +304,7 @@ metall_graph::return_code metall_graph::ingest_parquet_edges( val); } } else { - metall_edges->set(metall_ser.unqualified(), rec, val); + m_pedges->set(metall_ser_idx, rec, val); }; ++local_num_edges; }; @@ -350,13 +346,13 @@ metall_graph::return_code metall_graph::priv_in_out_degree( using record_id_type = record_store_type::record_id_type; metall_graph::return_code to_return; - series_name degcol, otherdegcol; + series_index_type degcol, otherdegcol; if (outdeg) { - degcol = U_COL; - otherdegcol = V_COL; + degcol = m_u_col_idx; + otherdegcol = m_v_col_idx; } else { - degcol = V_COL; - otherdegcol = U_COL; + degcol = m_v_col_idx; + otherdegcol = m_u_col_idx; } if (!name.is_node_series()) { @@ -375,15 +371,14 @@ metall_graph::return_code metall_graph::priv_in_out_degree( [&](record_id_type id) { // Note: clangd may report a false positive error on the next line // The code compiles and runs correctly - std::string_view edge_name = - m_pedges->get(degcol.unqualified(), id); + std::string_view edge_name = m_pedges->get(degcol, id); degrees.async_insert(std::string(edge_name)); // for undirected edges, add the reverse. - bool is_directed = m_pedges->get(DIR_COL.unqualified(), id); + bool is_directed = m_pedges->get(m_dir_col_idx, id); if (!is_directed) { auto reverseedge_name = - m_pedges->get(otherdegcol.unqualified(), id); + m_pedges->get(otherdegcol, id); degrees.async_insert(std::string(reverseedge_name)); } }, @@ -437,13 +432,13 @@ metall_graph::return_code metall_graph::degrees( // Note: clangd may report a false positive error on the next line // The code compiles and runs correctly auto in_edge_name = - std::string(m_pedges->get(V_COL.unqualified(), id)); + std::string(m_pedges->get(m_v_col_idx, id)); auto out_edge_name = - std::string(m_pedges->get(U_COL.unqualified(), id)); + std::string(m_pedges->get(m_u_col_idx, id)); indegrees.async_insert(in_edge_name); outdegrees.async_insert(out_edge_name); - bool is_directed = m_pedges->get(DIR_COL.unqualified(), id); + bool is_directed = m_pedges->get(m_dir_col_idx, id); if (!is_directed) { indegrees.async_insert(out_edge_name); outdegrees.async_insert(in_edge_name); @@ -462,8 +457,7 @@ metall_graph::return_code metall_graph::degrees( // create a node_local map of record id to node value. std::map node_to_id{}; m_pnodes->for_all_rows([&](record_id_type id) { - std::string_view node = - m_pnodes->get(NODE_COL.unqualified(), id); + std::string_view node = m_pnodes->get(m_node_col_idx, id); node_to_id[std::string(node)] = id; }); diff --git a/src/libmetalldata/metall_graph_assign.cpp b/src/libmetalldata/metall_graph_assign.cpp index 9eb44be..f4d0219 100644 --- a/src/libmetalldata/metall_graph_assign.cpp +++ b/src/libmetalldata/metall_graph_assign.cpp @@ -1,9 +1,6 @@ #include -// using data_types = -// std::variant; - namespace metalldata { metall_graph::return_code metall_graph::assign( @@ -21,7 +18,7 @@ metall_graph::return_code metall_graph::assign( auto pedges_ = m_pedges; bool assigned_ok = true; std::visit( - [&assigned_ok, &name, &pedges_](const auto& v) { + [&assigned_ok, &name, pedges_](const auto& v) { using T = std::decay_t; if constexpr (std::is_same_v) { // do nothing @@ -43,19 +40,19 @@ metall_graph::return_code metall_graph::assign( to_return.error = "Invalid type for value; aborting"; return to_return; } - auto wrapper = [&val, &pedges_, &name](record_id_type record_id) { + series_index_type name_idx = m_pedges->find_series(name.unqualified()); + auto wrapper = [&val, pedges_, name_idx](record_id_type record_id) { std::visit( - [&pedges_, &name, record_id](const auto& v) { + [pedges_, name_idx, record_id](const auto& v) { using T = std::decay_t; if constexpr (std::is_same_v) { // Skip monostate } else if constexpr (std::is_same_v) { - pedges_->set(name.unqualified(), record_id, std::string_view(v)); + pedges_->set(name_idx, record_id, std::string_view(v)); } else if constexpr (std::is_same_v) { - pedges_->set(name.unqualified(), record_id, - static_cast(v)); + pedges_->set(name_idx, record_id, static_cast(v)); } else { - pedges_->set(name.unqualified(), record_id, v); + pedges_->set(name_idx, record_id, v); } }, val); @@ -65,7 +62,7 @@ metall_graph::return_code metall_graph::assign( auto pnodes_ = m_pnodes; bool assigned_ok = true; std::visit( - [&assigned_ok, &name, &pnodes_](const auto& v) { + [&assigned_ok, &name, pnodes_](const auto& v) { using T = std::decay_t; if constexpr (std::is_same_v) { // do nothing @@ -87,20 +84,19 @@ metall_graph::return_code metall_graph::assign( to_return.error = "Invalid type for value; aborting"; return to_return; } - - auto wrapper = [&val, &pnodes_, &name](record_id_type record_id) { + series_index_type name_idx = m_pnodes->find_series(name.unqualified()); + auto wrapper = [&val, pnodes_, name_idx](record_id_type record_id) { std::visit( - [&pnodes_, &name, record_id](const auto& v) { + [pnodes_, name_idx, record_id](const auto& v) { using T = std::decay_t; if constexpr (std::is_same_v) { // Skip monostate } else if constexpr (std::is_same_v) { - pnodes_->set(name.unqualified(), record_id, std::string_view(v)); + pnodes_->set(name_idx, record_id, std::string_view(v)); } else if constexpr (std::is_same_v) { - pnodes_->set(name.unqualified(), record_id, - static_cast(v)); + pnodes_->set(name_idx, record_id, static_cast(v)); } else { - pnodes_->set(name.unqualified(), record_id, v); + pnodes_->set(name_idx, record_id, v); } }, val); diff --git a/test/multiseries/test_multiseries_record.cpp b/test/multiseries/test_multiseries_record.cpp index 149295b..9646b9b 100644 --- a/test/multiseries/test_multiseries_record.cpp +++ b/test/multiseries/test_multiseries_record.cpp @@ -55,10 +55,11 @@ TEST(MultiSeriesTest, GetValues) { // Use series names to retrieve values for (size_t i = 0; i < cities.size(); ++i) { - EXPECT_EQ(store.get("name", i), names[i]); - EXPECT_EQ(store.get("age", i), ages[i]); - EXPECT_EQ(store.get("city", i), cities[i]); - EXPECT_EQ(store.get("flag", i), flags[i]); + EXPECT_EQ(store.get(series_indices["name"], i), names[i]); + EXPECT_EQ(store.get(series_indices["age"], i), ages[i]); + EXPECT_EQ(store.get(series_indices["city"], i), + cities[i]); + EXPECT_EQ(store.get(series_indices["flag"], i), flags[i]); } } @@ -100,13 +101,15 @@ TEST(MultiSeriesTest, IsNone) { record_store::string_store_type string_store; record_store store(&string_store); - EXPECT_TRUE(store.is_none("name", 0)); + auto name_id = store.find_series("name"); + EXPECT_TRUE(store.is_none(name_id, 0)); store.add_series("name"); - EXPECT_TRUE(store.is_none("name", 0)); + name_id = store.find_series("name"); + EXPECT_TRUE(store.is_none(name_id, 0)); store.add_record(); - EXPECT_TRUE(store.is_none("name", 0)); - store.set("name", 0, "Alice"); - EXPECT_FALSE(store.is_none("name", 0)); + EXPECT_TRUE(store.is_none(name_id, 0)); + store.set(name_id, 0, "Alice"); + EXPECT_FALSE(store.is_none(name_id, 0)); } // remove_data @@ -114,15 +117,15 @@ TEST(MultiSeriesTest, RemoveData) { record_store::string_store_type string_store; record_store store(&string_store); - store.add_series("name"); - EXPECT_FALSE(store.remove("name", 0)); + auto name_idx = store.add_series("name"); + EXPECT_FALSE(store.remove(name_idx, 0)); store.add_record(); - EXPECT_FALSE(store.remove("name", 0)); + EXPECT_FALSE(store.remove(name_idx, 0)); - store.set("name", 0, "Alice"); - EXPECT_TRUE(store.remove("name", 0)); - EXPECT_TRUE(store.is_none("name", 0)); + store.set(name_idx, 0, "Alice"); + EXPECT_TRUE(store.remove(name_idx, 0)); + EXPECT_TRUE(store.is_none(name_idx, 0)); } TEST(MultiSeriesTest, SeriesTypeChecks) {