diff --git a/include/metalldata/impl/metall_graph_priv_for_all.ipp b/include/metalldata/impl/metall_graph_priv_for_all.ipp index bd09c8f..6a7a4d8 100644 --- a/include/metalldata/impl/metall_graph_priv_for_all.ipp +++ b/include/metalldata/impl/metall_graph_priv_for_all.ipp @@ -1,5 +1,6 @@ #pragma once #include +#include namespace metalldata { // The following for_all functions take a function that @@ -127,17 +128,10 @@ void metall_graph::priv_for_all_nodes( }, where); - std::unordered_map node_to_id; - auto node_col_idx = m_pnodes->find_series(NODE_COL.unqualified()); - m_pnodes->for_all_rows([&](record_id_type rid) { - auto name = m_pnodes->get(node_col_idx, rid); - - node_to_id[std::string(name)] = rid; - }); - for (const auto& node : nodeset) { - // throw an exception if the node is not in our node dataframe. - func(node_to_id.at(node)); + auto opsa = priv_local_node_find(node); + YGM_ASSERT_RELEASE(opsa.has_value()); + func(opsa.value()); } } } diff --git a/include/metalldata/impl/metall_graph_set_node_column.ipp b/include/metalldata/impl/metall_graph_set_node_column.ipp index b8aa114..d5d49a3 100644 --- a/include/metalldata/impl/metall_graph_set_node_column.ipp +++ b/include/metalldata/impl/metall_graph_set_node_column.ipp @@ -22,25 +22,17 @@ metall_graph::return_code metall_graph::set_node_column( using record_id_type = record_store_type::record_id_type; using val_type = typename T::mapped_type; - // create a node_local map of record id to node value. - std::map node_to_id{}; - m_pnodes->for_all_rows([&](record_id_type id) { - std::string_view node = - m_pnodes->get(NODE_COL.unqualified(), id); - node_to_id[std::string(node)] = id; - }); - - // create series and store index so we don't have to keep looking it up. + // create series auto nodecol_idx = m_pnodes->add_series(nodecol_name.unqualified()); size_t invalid_nodes = 0; - for (const auto& [k, v] : collection) { - if (!node_to_id.contains(k)) { + for (const auto& [node_name, value] : collection) { + auto opsv = priv_local_node_find(node_name); + if (!opsv.has_value()) { ++invalid_nodes; continue; } - auto node_idx = node_to_id.at(k); - m_pnodes->set(nodecol_idx, node_idx, v); + m_pnodes->set(nodecol_idx, opsv.value(), value); } if (invalid_nodes > 0) { diff --git a/include/metalldata/metall_graph.hpp b/include/metalldata/metall_graph.hpp index bc3e648..d7afeff 100644 --- a/include/metalldata/metall_graph.hpp +++ b/include/metalldata/metall_graph.hpp @@ -16,12 +16,15 @@ #include #include #include +#include #include #include #include +#include #include #include #include +#include namespace bjsn = boost::json; @@ -42,11 +45,23 @@ namespace metalldata { class metall_graph { private: + /// multiseries record store are the dataframes using record_store_type = multiseries::basic_record_store>; - using string_store_type = record_store_type::string_store_type; + using record_id_type = record_store_type::record_id_type; + using series_index_type = record_store_type::series_index_type; - using record_id_type = record_store_type::record_id_type; + /// string table deduplicates strings + using string_store_type = record_store_type::string_store_type; + using string_table_accessor = compact_string::string_accessor; + + /// hash table to index local node's record ids + using local_vertex_map_type = metall::container::unordered_map< + string_table_accessor, record_id_type, + compact_string::string_accessor_hasher, + std::equal_to, + metall::manager::allocator_type< + std::pair>>; public: // TODO: Rationalize these data types to correspond better with JSONLogic and @@ -503,6 +518,15 @@ class metall_graph { record_store_type* m_pnodes = nullptr; /// Dataframe for directed edges record_store_type* m_pedges = nullptr; + /// Map from vertex string to local record index + local_vertex_map_type* m_pnode_to_idx = nullptr; + /// String store + string_store_type* m_pstring_store = nullptr; + + series_index_type m_u_col_idx; + series_index_type m_v_col_idx; + series_index_type m_dir_col_idx; + series_index_type m_node_col_idx; size_t local_num_nodes() const { return m_pnodes->num_records(); }; size_t local_num_edges() const { return m_pedges->num_records(); }; @@ -532,6 +556,33 @@ class metall_graph { template return_code set_node_column(series_name nodecol_name, const T& collection); + record_id_type priv_local_node_find_or_insert(std::string_view id) { + YGM_ASSERT_RELEASE(m_partitioner.owner(id) == m_comm.rank()); + auto v_in_ss = compact_string::add_string(id, *m_pstring_store); + if (!m_pnode_to_idx->contains(v_in_ss)) { + auto ridx = m_pnodes->add_record(); + m_pnodes->set(m_node_col_idx, ridx, id); + m_pnode_to_idx->insert_or_assign(v_in_ss, ridx); + return ridx; + } + return m_pnode_to_idx->at(v_in_ss); + } + + std::optional priv_local_node_find( + std::string_view id) const { + YGM_ASSERT_RELEASE(m_partitioner.owner(id) == m_comm.rank()); + auto ret = compact_string::find_string(id, *m_pstring_store); + if (ret) { + return m_pnode_to_idx->at(ret.value()); + } + return {}; + } + + // Using YGM's default partitioner to assign node owner + ygm::container::detail::hash_partitioner< + ygm::container::detail::hash> + m_partitioner; + }; // class metall_graph } // namespace metalldata diff --git a/include/multiseries/multiseries_record.hpp b/include/multiseries/multiseries_record.hpp index d30763a..69c18a5 100644 --- a/include/multiseries/multiseries_record.hpp +++ b/include/multiseries/multiseries_record.hpp @@ -659,8 +659,7 @@ class basic_record_store { const record_id_type record_id, const series_type &value) { if constexpr (std::is_same_v) { - auto accessor = - cstr::add_string(value.data(), value.size(), *m_string_store); + auto accessor = cstr::add_string(value, *m_string_store); priv_get_series_container(series.container)[record_id] = accessor; } else { diff --git a/include/string_table/string_accessor.hpp b/include/string_table/string_accessor.hpp index 4b4bc24..916486a 100644 --- a/include/string_table/string_accessor.hpp +++ b/include/string_table/string_accessor.hpp @@ -11,28 +11,28 @@ #include #include #include +#include namespace compact_string { - /// \brief Provides a way to access a string stored in a string store. /// If a string is short, it stores the string in the object itself. /// If a string is long, it stores the pointer to the string in the object. /// Can take only strings allocated by allocate_string_embedding_length(), /// however, w/o the length prefix. class string_accessor { -public: + public: using size_type = std::size_t; using char_type = char; - using offset_t = std::ptrdiff_t; + using offset_t = std::ptrdiff_t; -private: + private: using self_type = string_accessor; static constexpr size_t k_num_blocks = sizeof(offset_t); static constexpr size_t k_short_str_max_length = - k_num_blocks - 2; // -1 for '\0' and -1 for metadata + k_num_blocks - 2; // -1 for '\0' and -1 for metadata -public: + public: string_accessor() = default; /// \brief Construct a string accessor from a pointer to string. @@ -62,7 +62,7 @@ class string_accessor { // as offset must be recalculated. priv_set_long_str_pointer(other.priv_to_long_str_pointer()); } - other.m_entier_block = 0; // clear the data + other.m_entier_block = 0; // clear the data } string_accessor &operator=(const string_accessor &other) { @@ -83,7 +83,7 @@ class string_accessor { } else { priv_set_long_str_pointer(other.priv_to_long_str_pointer()); } - other.m_entier_block = 0; // clear the data + other.m_entier_block = 0; // clear the data return *this; } @@ -129,7 +129,24 @@ class string_accessor { } } -private: + friend bool operator==(const string_accessor &lhs, + const string_accessor &rhs) { + if (lhs.length() != rhs.length()) { + return false; + } + + if (lhs.is_short()) { + return std::char_traits::compare(lhs.c_str(), rhs.c_str(), + lhs.length()) == 0; + } + + // If the string is long, the same string is stored only once in the + // string store, so comparing c_str(), which returns chars*, is + // sufficient. + return lhs.c_str() == rhs.c_str(); + } + + private: bool priv_get_long_flag() const { return m_blocks[k_num_blocks - 1] & 0x1; } void priv_set_long_str_pointer(const char_type *const str) { @@ -138,7 +155,7 @@ class string_accessor { bool is_negative = false; if (off < 0) { - off = -off; + off = -off; is_negative = true; } if (uint64_t(off) > (1ULL << 55)) { @@ -156,9 +173,9 @@ class string_accessor { // Finally set the metadata uint8_t metadata = 0; if (is_negative) { - metadata |= 0x2; // set the negative bit + metadata |= 0x2; // set the negative bit } - metadata |= 0x1; // set the long string bit + metadata |= 0x1; // set the long string bit m_blocks[k_num_blocks - 1] = metadata; } @@ -174,7 +191,7 @@ class string_accessor { off = -off; } auto addr = - reinterpret_cast(const_cast(this)) + off; + reinterpret_cast(const_cast(this)) + off; return reinterpret_cast(addr); } @@ -217,4 +234,11 @@ class string_accessor { "sizeof(offset_ptr_t) != sizeof(uint64_t)"); }; }; -} // namespace compact_string + +struct string_accessor_hasher { + std::size_t operator()(const string_accessor &str) const { + return boost::hash_range(str.c_str(), str.c_str() + str.length()); + } +}; + +} // namespace compact_string diff --git a/include/string_table/string_store.hpp b/include/string_table/string_store.hpp index 6f59736..cc70ca6 100644 --- a/include/string_table/string_store.hpp +++ b/include/string_table/string_store.hpp @@ -30,11 +30,11 @@ namespace csdtl { /// \param alloc The allocator. /// \return A pointer to an allocated buffer. The buffer contains the length of /// the string followed by the string data. -template +template char *allocate_string_embedding_length(const std::string_view &str, - const allocator_type &alloc) { - using char_allocator = typename std::allocator_traits< - allocator_type>::template rebind_alloc; + const allocator_type &alloc) { + using char_allocator = + typename std::allocator_traits::template rebind_alloc; static_assert( std::is_same_v::value_type, char>, @@ -42,236 +42,244 @@ char *allocate_string_embedding_length(const std::string_view &str, char_allocator char_alloc(alloc); char *buf = - std::to_address(char_alloc.allocate(sizeof(size_type) + str.size() + 1)); + std::to_address(char_alloc.allocate(sizeof(size_type) + str.size() + 1)); auto *size_buf = reinterpret_cast(buf); - size_buf[0] = str.size(); + size_buf[0] = str.size(); auto *str_buf = &buf[sizeof(size_type)]; - std::char_traits::copy(str_buf, - std::to_address(str.data()), + std::char_traits::copy(str_buf, std::to_address(str.data()), str.size()); std::char_traits::assign(str_buf[str.size()], '\0'); return buf; } -} // namespace csdtl +} // namespace csdtl -template > +template > class string_store { - private: - using self_type = string_store; + private: + using self_type = string_store; - template - using other_allocator = + template + using other_allocator = typename std::allocator_traits::template rebind_alloc; - template - using other_scoped_allocator = + template + using other_scoped_allocator = std::scoped_allocator_adaptor >; - public: - using allocator_type = other_allocator; - using size_type = std::size_t; + public: + using allocator_type = other_allocator; + using size_type = std::size_t; - private: - // Internal pointer types to store that could be offset pointers - using voild_pointer = + private: + // Internal pointer types to store that could be offset pointers + using voild_pointer = typename std::allocator_traits::void_pointer; - using internal_const_char_pointer = + using internal_const_char_pointer = std::pointer_traits::template rebind; - class str_holder { - public: - str_holder() = default; - - /// \brief Construct a string holder from a pointer to the string data. - /// \param str A pointer to the string data. This pointer must point to the - /// length data followed by actual string data. - str_holder(const char *const str) : m_ptr(str) { - } - - str_holder(const str_holder &) = delete; - str_holder(str_holder &&) noexcept = default; - str_holder &operator=(const str_holder &) = delete; - str_holder &operator=(str_holder &&) noexcept = default; - - ~str_holder() = default; - - const char *str() const { - static_assert(sizeof(char) == 1, "char must be one byte"); - return std::to_address(&m_ptr[sizeof(size_type)]); - } - - size_type length() const { - // First entry is the length - return reinterpret_cast(std::to_address(m_ptr))[0]; - } - - // equal operator - bool operator==(const str_holder &other) const { - if (length() != other.length()) { - return false; - } - return std::char_traits::compare(std::to_address(str()), - std::to_address(other.str()), - length()) == 0; - } - - // not equal operator - bool operator!=(const str_holder &other) const { - return !(*this == other); - } - - private: - internal_const_char_pointer m_ptr; - }; - struct str_holder_equal { - bool operator()(const str_holder &left, - const std::string_view &right) const { - if (left.length() != right.length()) { - return false; - } - return std::char_traits::compare(std::to_address(left.str()), - right.data(), - right.length()) == 0; - } - bool operator()(const std::string_view &right, - const str_holder &left) const { - return operator()(left, right); - } - }; + class str_holder { + public: + str_holder() = default; - // Hash function for basic_string_view - struct set_hasher { - using is_transparent = void; + /// \brief Construct a string holder from a pointer to the string data. + /// \param str A pointer to the string data. This pointer must point to the + /// length data followed by actual string data. + str_holder(const char *const str) : m_ptr(str) {} - std::size_t operator()(const str_holder &str) const { - return boost::hash_range(str.str(), str.str() + str.length()); - } - std::size_t operator()(const std::string_view &str) const { - return boost::hash_range(str.data(), str.data() + str.length()); - } - }; + str_holder(const str_holder &) = delete; + str_holder(str_holder &&) noexcept = default; + str_holder &operator=(const str_holder &) = delete; + str_holder &operator=(str_holder &&) noexcept = default; - struct set_equal { - using is_transparent = void; - bool operator()(const str_holder &left, const str_holder &right) const { - return left == right; - } - bool operator()(const str_holder &left, - const std::string_view &right) const { - return str_holder_equal()(left, right); - } + ~str_holder() = default; + + const char *str() const { + static_assert(sizeof(char) == 1, "char must be one byte"); + return std::to_address(&m_ptr[sizeof(size_type)]); + } + + size_type length() const { + // First entry is the length + return reinterpret_cast(std::to_address(m_ptr))[0]; + } - bool operator()(const std::string_view &left, - const str_holder &right) const { - return str_holder_equal()(right, left); + // equal operator + bool operator==(const str_holder &other) const { + if (length() != other.length()) { + return false; } + return std::char_traits::compare(std::to_address(str()), + std::to_address(other.str()), + length()) == 0; + } - bool operator()(const std::string_view &left, - const std::string_view &right) const { - return left == right; + // not equal operator + bool operator!=(const str_holder &other) const { return !(*this == other); } + + private: + internal_const_char_pointer m_ptr; + }; + struct str_holder_equal { + bool operator()(const str_holder &left, + const std::string_view &right) const { + if (left.length() != right.length()) { + return false; } - }; - - using set_allocator_type = other_scoped_allocator; - using set_type = boost::unordered_flat_set< - str_holder, set_hasher, set_equal, - set_allocator_type>; - - public: - /// \brief Get the length of the string - /// \param str A pointer to actual string data (not the address that points to - /// the length data). - static constexpr size_t str_length(const char *const str) { - const auto *ptr = reinterpret_cast(str); - return *(ptr - 1); + return std::char_traits::compare(std::to_address(left.str()), + right.data(), right.length()) == 0; + } + bool operator()(const std::string_view &right, + const str_holder &left) const { + return operator()(left, right); } + }; - string_store() = default; + // Hash function for basic_string_view + struct set_hasher { + using is_transparent = void; - explicit string_store(allocator_type allocator) : m_set(allocator) { + std::size_t operator()(const str_holder &str) const { + return boost::hash_range(str.str(), str.str() + str.length()); + } + std::size_t operator()(const std::string_view &str) const { + return boost::hash_range(str.data(), str.data() + str.length()); } + }; - string_store(const string_store &) = delete; - string_store(string_store &&) noexcept = default; - string_store &operator=(const string_store &) = delete; - string_store &operator=(string_store &&) noexcept = default; + struct set_equal { + using is_transparent = void; + bool operator()(const str_holder &left, const str_holder &right) const { + return left == right; + } + bool operator()(const str_holder &left, + const std::string_view &right) const { + return str_holder_equal()(left, right); + } - ~string_store() noexcept = default; + bool operator()(const std::string_view &left, + const str_holder &right) const { + return str_holder_equal()(right, left); + } - /// Return the pointer to the string data if the string is found in the store. - const char *find_or_add(std::string_view str) { - auto itr = m_set.find(str); - if (itr != m_set.end()) { - // Found in the store - return std::to_address(itr->str()); - } - // Not found, add it - char *len_str_buf = priv_allocate_string(str); - auto ret = m_set.emplace(len_str_buf); - assert(ret.second); // must be inserted - const auto &str_holder = *(ret.first); - assert(str_holder.length() == str.length()); - assert(std::string_view(str_holder.str(), str_holder.length()) == - str.data()); - return std::to_address(str_holder.str()); + bool operator()(const std::string_view &left, + const std::string_view &right) const { + return left == right; } + }; + + using set_allocator_type = other_scoped_allocator; + using set_type = boost::unordered_flat_set; + + public: + /// \brief Get the length of the string + /// \param str A pointer to actual string data (not the address that points to + /// the length data). + static constexpr size_t str_length(const char *const str) { + const auto *ptr = reinterpret_cast(str); + return *(ptr - 1); + } - const char *find(std::string_view str) { - auto itr = m_set.find(str); - if (itr == m_set.end()) { - return nullptr; - } + string_store() = default; + + explicit string_store(allocator_type allocator) : m_set(allocator) {} + + string_store(const string_store &) = delete; + string_store(string_store &&) noexcept = default; + string_store &operator=(const string_store &) = delete; + string_store &operator=(string_store &&) noexcept = default; + + ~string_store() noexcept = default; + + /// Return the pointer to the string data if the string is found in the store. + const char *find_or_add(std::string_view str) { + auto itr = m_set.find(str); + if (itr != m_set.end()) { + // Found in the store return std::to_address(itr->str()); } + // Not found, add it + char *len_str_buf = priv_allocate_string(str); + auto ret = m_set.emplace(len_str_buf); + assert(ret.second); // must be inserted + const auto &str_holder = *(ret.first); + assert(str_holder.length() == str.length()); + assert(std::string_view(str_holder.str(), str_holder.length()) == + str.data()); + return std::to_address(str_holder.str()); + } + + const char *find(std::string_view str) const { + auto itr = m_set.find(str); + if (itr == m_set.end()) { + return nullptr; + } + return std::to_address(itr->str()); + } - std::size_t size() const { return m_set.size(); } + std::size_t size() const { return m_set.size(); } - typename set_type::const_iterator begin() const { return m_set.begin(); } - typename set_type::const_iterator end() const { return m_set.end(); } + typename set_type::const_iterator begin() const { return m_set.begin(); } + typename set_type::const_iterator end() const { return m_set.end(); } - void clear() { - for (auto &item : m_set) { - priv_deallocate_string(item); - } - m_set.clear(); + void clear() { + for (auto &item : m_set) { + priv_deallocate_string(item); } + m_set.clear(); + } - allocator_type get_allocator() const { return m_set.get_allocator(); } + allocator_type get_allocator() const { return m_set.get_allocator(); } - private: - char *priv_allocate_string(const std::string_view &str) { - return csdtl::allocate_string_embedding_length( - str, - m_set.get_allocator()); - } + private: + char *priv_allocate_string(const std::string_view &str) { + return csdtl::allocate_string_embedding_length( + str, m_set.get_allocator()); + } - void priv_deallocate_string(const std::string_view &str) { - static_assert( - std::is_same_v< - typename std::allocator_traits::value_type, char>, - "allocator_type::value_type must be the same as char"); + void priv_deallocate_string(const std::string_view &str) { + static_assert( + std::is_same_v::value_type, + char>, + "allocator_type::value_type must be the same as char"); - std::allocator_traits::deallocate( - m_set.get_allocator(), - const_cast(str.data()), - sizeof(size_type) + str.size() + 1); - } + std::allocator_traits::deallocate( + m_set.get_allocator(), const_cast(str.data()), + sizeof(size_type) + str.size() + 1); + } - set_type m_set; + set_type m_set; }; /// \brief Helper function to add a string to the string store -template -string_accessor add_string(const char *const str, - const std::size_t len, - string_store &store) { +template +string_accessor add_string(std::string_view sv, string_store &store) { + // If string is short, store it in the holder + // otherwise, store it in the store and store the pointer in the holder + if (sv.length() <= string_accessor::short_str_max_length()) { + return string_accessor(sv.data(), sv.length()); + } + return string_accessor(store.find_or_add(sv.data()), sv.length()); +} + +/// \brief Helper function to find a string to the string store. +/// \warning This only truly 'finds' long strings because short strings +/// are encoded directly in the string_accessor. +template +std::optional find_string(std::string_view sv, + const string_store &store) { // If string is short, store it in the holder // otherwise, store it in the store and store the pointer in the holder - if (len <= string_accessor::short_str_max_length()) { - return string_accessor(str, len); + if (sv.length() <= string_accessor::short_str_max_length()) { + return string_accessor(sv.data(), sv.length()); } - return string_accessor(store.find_or_add(str), len); + const char *ptr = store.find(sv); + if (ptr) { + return string_accessor(ptr, sv.length()); + } + return {}; } -} // namespace compact_string + +} // namespace compact_string diff --git a/include/string_table/string_vector.hpp b/include/string_table/string_vector.hpp index e4c79f0..940e12d 100644 --- a/include/string_table/string_vector.hpp +++ b/include/string_table/string_vector.hpp @@ -10,27 +10,28 @@ #include namespace compact_string { -template > class vector { -private: +template > +class vector { + private: using self_type = vector; template using other_allocator = - typename std::allocator_traits::template rebind_alloc; + typename std::allocator_traits::template rebind_alloc; template using other_scoped_allocator = - std::scoped_allocator_adaptor>; + std::scoped_allocator_adaptor>; using pointer_type = typename std::allocator_traits::pointer; using internal_value_type = string_accessor; using internal_vector_type = - boost::container::vector>; + boost::container::vector>; -public: - using char_type = char; + public: + using char_type = char; using allocator_type = Alloc; using const_iterator = typename internal_vector_type::const_iterator; @@ -41,11 +42,11 @@ template > class vector { explicit vector(string_store_type *const string_table, const Alloc &alloc) : m_vector(alloc), string_table(string_table) {} - vector(const vector &) = default; - vector(vector &&) noexcept = default; - vector &operator=(const vector &) = default; + vector(const vector &) = default; + vector(vector &&) noexcept = default; + vector &operator=(const vector &) = default; vector &operator=(vector &&) noexcept = default; - ~vector() = default; + ~vector() = default; std::string_view operator[](const size_t i) const { return m_vector[i].to_view(); @@ -57,13 +58,9 @@ template > class vector { m_vector.push_back(add_string(str.data(), str.length(), *string_table)); } - void push_back(const char_type *data, const std::size_t length) { - m_vector.push_back(add_string(data, length, *string_table)); - } - /// FIXME: this is a temporary solution to update data void assign(std::string_view str, const size_t i) { - m_vector[i] = add_string(str.data(), str.length(), *string_table); + m_vector[i] = add_string(str, *string_table); } size_t size() const { return m_vector.size(); } @@ -77,12 +74,12 @@ template > class vector { allocator_type get_allocator() const { return m_vector.get_allocator(); } -private: + private: using string_store_pointer_type = typename std::pointer_traits< - pointer_type>::template rebind; + pointer_type>::template rebind; - internal_vector_type m_vector{}; + internal_vector_type m_vector{}; string_store_pointer_type string_table{nullptr}; }; -} // namespace compact_string \ No newline at end of file +} // namespace compact_string \ No newline at end of file diff --git a/src/libmetalldata/metall_graph.cpp b/src/libmetalldata/metall_graph.cpp index 53609ad..dc36dd7 100644 --- a/src/libmetalldata/metall_graph.cpp +++ b/src/libmetalldata/metall_graph.cpp @@ -23,6 +23,8 @@ #include #include #include +#include "metall/tags.hpp" +#include "ygm/utility/assert.hpp" namespace metalldata { @@ -36,7 +38,7 @@ namespace metalldata { metall_graph::metall_graph(ygm::comm& comm, std::string_view path, bool overwrite) - : m_comm(comm), m_metall_path(path) { + : m_comm(comm), m_metall_path(path), m_partitioner(m_comm) { // // Check if metall store already exists and overwrite if requested // There are three states: @@ -54,17 +56,20 @@ metall_graph::metall_graph(ygm::comm& comm, std::string_view path, metall::create_only, m_metall_path, m_comm.get_mpi_comm()); auto& manager = m_pmetall_mpi->get_local_manager(); - auto* string_store = manager.construct( + m_pstring_store = manager.construct( metall::unique_instance)(manager.get_allocator()); m_pnodes = manager.construct("nodes")( - string_store, manager.get_allocator()); + m_pstring_store, manager.get_allocator()); m_pedges = manager.construct("edges")( - string_store, manager.get_allocator()); + m_pstring_store, manager.get_allocator()); + m_pnode_to_idx = manager.construct("nodeindex")( + manager.get_allocator()); // add the default series for the indices. add_series(NODE_COL); add_series(U_COL); add_series(V_COL); + add_series(DIR_COL); } else { // open existing comm.barrier(); @@ -72,25 +77,36 @@ metall_graph::metall_graph(ygm::comm& comm, std::string_view path, metall::open_only, m_metall_path, m_comm.get_mpi_comm()); auto& manager = m_pmetall_mpi->get_local_manager(); - m_pnodes = manager.find("nodes").first; - m_pedges = manager.find("edges").first; + m_pstring_store = + manager.find(metall::unique_instance).first; + m_pnodes = manager.find("nodes").first; + m_pedges = manager.find("edges").first; + m_pnode_to_idx = manager.find("nodeindex").first; if (!m_pnodes || !m_pedges) { m_comm.cerr0( "Error: Failed to find required data structures in metall store"); delete m_pmetall_mpi; - m_pmetall_mpi = nullptr; - m_pnodes = nullptr; - m_pedges = nullptr; + m_pmetall_mpi = nullptr; + m_pstring_store = nullptr; + m_pnodes = nullptr; + m_pedges = nullptr; + m_pnode_to_idx = nullptr; } } + ///\todo Instead of hard crashing, need a nicer fail, maybe .good() method YGM_ASSERT_RELEASE(has_node_series(NODE_COL)); YGM_ASSERT_RELEASE(has_edge_series(U_COL)); YGM_ASSERT_RELEASE(has_edge_series(V_COL)); + YGM_ASSERT_RELEASE(has_edge_series(DIR_COL)); // - // Open metall store + // Find required column names + m_u_col_idx = m_pedges->find_series(U_COL.unqualified()); + m_v_col_idx = m_pedges->find_series(V_COL.unqualified()); + m_dir_col_idx = m_pedges->find_series(DIR_COL.unqualified()); + m_node_col_idx = m_pnodes->find_series(NODE_COL.unqualified()); } metall_graph::~metall_graph() { @@ -98,8 +114,10 @@ metall_graph::~metall_graph() { m_comm.barrier(); // We don't free these because they are persistent in the metall store - m_pnodes = nullptr; - m_pedges = nullptr; + m_pstring_store = nullptr; + m_pnodes = nullptr; + m_pedges = nullptr; + m_pnode_to_idx = nullptr; // Destroy the metall manager delete m_pmetall_mpi; @@ -147,8 +165,8 @@ metall_graph::return_code metall_graph::ingest_parquet_edges( std::set metaset; if (meta.has_value()) { - auto& v = meta.value(); - metaset = {v.begin(), v.end()}; + auto& v = meta.value(); + metaset = {v.begin(), v.end()}; } else { for (const auto& col : parquet_cols) { if (col != col_u && col != col_v) { @@ -158,9 +176,6 @@ metall_graph::return_code metall_graph::ingest_parquet_edges( } } - ygm::container::set nodeset(m_comm); - std::unordered_set existing_localnodes{}; - for (const auto& name : RESERVED_COLUMN_NAMES) { if (metaset.contains(name)) { to_return.error = @@ -242,12 +257,10 @@ metall_graph::return_code metall_graph::ingest_parquet_edges( auto metall_edges = m_pedges; - // auto _U_COL = strip_selector(U_COL); - // auto _V_COL = strip_selector(V_COL); - // auto _DIR_COL = strip_selector(DIR_COL); - // for each row, set the metall data. - - size_t local_num_edges = 0; + size_t local_num_edges = 0; + size_t local_num_nodes = m_pnode_to_idx->size(); + static metall_graph* sthis = nullptr; + sthis = this; parquetp.for_all( parquet_cols, [&](const std::vector& row) { @@ -286,7 +299,13 @@ metall_graph::return_code metall_graph::ingest_parquet_edges( std::string_view(val)); // if this is u or v, add to the distributed nodeset. if (metall_ser == U_COL || metall_ser == V_COL) { - nodeset.async_insert(val); + int owner = m_partitioner.owner(val); + m_comm.async( + owner, + [](const std::string& s) { + sthis->priv_local_node_find_or_insert(s); + }, + val); } } else { metall_edges->set(metall_ser.unqualified(), rec, val); @@ -297,24 +316,11 @@ metall_graph::return_code metall_graph::ingest_parquet_edges( } // for loop }); // for_all - // go through the local possible nodes to add and if we don't - // have them, then add to the graph's m_pnodes. This starts with - // a barrier so we don't need an explicit one beforehand. - - size_t local_num_nodes = existing_localnodes.size(); - for (const auto& v : nodeset) { - if (!existing_localnodes.contains(v)) { - auto rec = m_pnodes->add_record(); - m_pnodes->set(NODE_COL.unqualified(), rec, std::string_view(v)); - existing_localnodes.emplace(v); - } - }; - local_num_nodes = existing_localnodes.size() - local_num_nodes; - + m_comm.barrier(); to_return.return_info["num_edges_ingested"] = ygm::sum(local_num_edges, m_comm); to_return.return_info["num_new_nodes_ingested"] = - ygm::sum(local_num_nodes, m_comm); + ygm::sum(m_pnode_to_idx->size() - local_num_edges, m_comm); return to_return; } @@ -388,28 +394,7 @@ metall_graph::return_code metall_graph::priv_in_out_degree( // explicit here. m_comm.barrier(); - // TODO: we want to abstract this to set_node_column because this is a - // common operation. Make this a private function inside metall_graph. - - // create a node_local map of node value to record ids. - std::map node_to_id{}; - m_pnodes->for_all_rows([&](record_id_type id) { - std::string_view node = - m_pnodes->get(NODE_COL.unqualified(), id); - node_to_id[std::string(node)] = id; - }); - - // create series and store index so we don't have to keep looking it up. - auto deg_idx = m_pnodes->add_series(name.unqualified()); - - // add the values to the degrees series. We are taking advantage of the fact - // that the node information is local from the degrees shared counting set - // because it uses the same partitioning scheme as we used when we added the - // nodes in ingest. - for (const auto& [k, v] : degrees) { - auto rec_idx = node_to_id.at(k); - m_pnodes->set(deg_idx, rec_idx, v); - } + set_node_column(name, degrees); return to_return; } diff --git a/test/string_table/test_string_store.cpp b/test/string_table/test_string_store.cpp index 48a583b..0a3e9e8 100644 --- a/test/string_table/test_string_store.cpp +++ b/test/string_table/test_string_store.cpp @@ -10,7 +10,7 @@ #include using store_type = - compact_string::string_store>; + compact_string::string_store>; using char_pointer = metall::offset_ptr; // Demonstrate some basic assertions. @@ -18,16 +18,16 @@ TEST(StringTableTest, Basic) { { metall::manager manager(metall::create_only, "/tmp/metall-test"); auto *store = manager.construct(metall::unique_instance)( - manager.get_allocator()); + manager.get_allocator()); auto *ptr0 = manager.construct("ptr0")(); - *ptr0 = store->find_or_add("key0"); + *ptr0 = store->find_or_add("key0"); EXPECT_EQ(store->find("key0"), *ptr0); - EXPECT_EQ(store->find_or_add("key0"), *ptr0); // No duplicate insert + EXPECT_EQ(store->find_or_add("key0"), *ptr0); // No duplicate insert auto *ptr1 = manager.construct("ptr1")(); - *ptr1 = store->find_or_add("key1"); + *ptr1 = store->find_or_add("key1"); EXPECT_STREQ(store->find("key0"), ptr0->get()); EXPECT_STREQ(store->find("key1"), ptr1->get()); @@ -56,12 +56,11 @@ TEST(StringTableTest, AddString) { { metall::manager manager(metall::create_only, "/tmp/metall-test"); auto *store = manager.construct(metall::unique_instance)( - manager.get_allocator()); + manager.get_allocator()); for (int len = 0; len < 100; ++len) { std::string str(len, 'a'); - auto accessor = - compact_string::add_string(str.c_str(), str.length(), *store); + auto accessor = compact_string::add_string(str, *store); EXPECT_EQ(accessor.length(), len); EXPECT_STREQ(accessor.c_str(), str.c_str()); }