Skip to content

Commit 31a195c

Browse files
committed
add multi threaded graph construction +publish
1 parent 8ad5880 commit 31a195c

File tree

5 files changed

+255
-86
lines changed

5 files changed

+255
-86
lines changed

cpp/benchmark/src/deglib_build_bench.cpp

+45-36
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ void optimze_graph(const std::string initial_graph_file, const std::string graph
5050
* Load the data repository and create a dynamic exploration graph with it.
5151
* Store the graph in the graph file.
5252
*/
53-
void create_graph(const std::string repository_file, const DataStreamType data_stream_type, const std::string graph_file, deglib::Metric metric, deglib::builder::LID lid, const uint8_t d, const uint8_t k_ext, const float eps_ext, const uint8_t k_opt, const float eps_opt, const uint8_t i_opt) {
53+
void create_graph(const std::string repository_file, const DataStreamType data_stream_type, const std::string graph_file, deglib::Metric metric, deglib::builder::LID lid, const uint8_t d, const uint8_t k_ext, const float eps_ext, const uint8_t k_opt, const float eps_opt, const uint8_t i_opt, const uint32_t thread_count) {
5454

5555
auto rnd = std::mt19937(7); // default 7
5656
const uint32_t swap_tries = 0; // additional swap tries between the next graph extension
@@ -74,6 +74,8 @@ void create_graph(const std::string repository_file, const DataStreamType data_s
7474
// create a graph builder to add vertices to the new graph and improve its edges
7575
fmt::print("Start graph builder \n");
7676
auto builder = deglib::builder::EvenRegularGraphBuilder(graph, rnd, lid, k_ext, eps_ext, k_opt, eps_opt, i_opt, swap_tries, additional_swap_tries);
77+
builder.setBatchSize(10000);
78+
builder.setThreadCount(thread_count);
7779

7880
// provide all features to the graph builder at once. In an online system this will be called multiple times
7981
auto base_size = uint32_t(repository.size());
@@ -109,29 +111,30 @@ void create_graph(const std::string repository_file, const DataStreamType data_s
109111
fmt::print("Actual memory usage: {} Mb, Max memory usage: {} Mb after setup graph builder\n", getCurrentRSS() / 1000000, getPeakRSS() / 1000000);
110112

111113
// check the integrity of the graph during the graph build process
112-
const auto log_after = 100000;
114+
const auto log_after = 100;
113115

114116
fmt::print("Start building \n");
115117
auto start = std::chrono::steady_clock::now();
116118
uint64_t duration_ms = 0;
117119
const auto improvement_callback = [&](deglib::builder::BuilderStatus& status) {
118120
const auto size = graph.size();
119121

120-
if(status.step % log_after == 0 || size == base_size) {
121-
duration_ms += uint32_t(std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count());
122-
auto avg_edge_weight = deglib::analysis::calc_avg_edge_weight(graph, scale);
123-
auto weight_histogram_sorted = deglib::analysis::calc_edge_weight_histogram(graph, true, scale);
124-
auto weight_histogram = deglib::analysis::calc_edge_weight_histogram(graph, false, scale);
125-
auto valid_weights = deglib::analysis::check_graph_weights(graph) && deglib::analysis::check_graph_regularity(graph, uint32_t(size), true);
126-
auto connected = deglib::analysis::check_graph_connectivity(graph);
127-
auto duration = duration_ms / 1000;
128-
auto currRSS = getCurrentRSS() / 1000000;
129-
auto peakRSS = getPeakRSS() / 1000000;
130-
fmt::print("{:7} vertices, {:5}s, {:8} / {:8} improv, Q: {:4.2f} -> Sorted:{:.1f}, InOrder:{:.1f}, {} connected & {}, RSS {} & peakRSS {}\n",
131-
size, duration, status.improved, status.tries, avg_edge_weight, fmt::join(weight_histogram_sorted, " "), fmt::join(weight_histogram, " "), connected ? "" : "not", valid_weights ? "valid" : "invalid", currRSS, peakRSS);
132-
start = std::chrono::steady_clock::now();
133-
}
134-
else if(status.step % (log_after/10) == 0) {
122+
// if(status.step % log_after == 0 || size == base_size) {
123+
// duration_ms += uint32_t(std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count());
124+
// auto avg_edge_weight = deglib::analysis::calc_avg_edge_weight(graph, scale);
125+
// auto weight_histogram_sorted = deglib::analysis::calc_edge_weight_histogram(graph, true, scale);
126+
// auto weight_histogram = deglib::analysis::calc_edge_weight_histogram(graph, false, scale);
127+
// auto valid_weights = deglib::analysis::check_graph_weights(graph) && deglib::analysis::check_graph_regularity(graph, uint32_t(size), true);
128+
// auto connected = deglib::analysis::check_graph_connectivity(graph);
129+
// auto duration = duration_ms / 1000;
130+
// auto currRSS = getCurrentRSS() / 1000000;
131+
// auto peakRSS = getPeakRSS() / 1000000;
132+
// fmt::print("{:7} vertices, {:5}s, {:8} / {:8} improv, Q: {:4.2f} -> Sorted:{:.1f}, InOrder:{:.1f}, {} connected & {}, RSS {} & peakRSS {}\n",
133+
// size, duration, status.improved, status.tries, avg_edge_weight, fmt::join(weight_histogram_sorted, " "), fmt::join(weight_histogram, " "), connected ? "" : "not", valid_weights ? "valid" : "invalid", currRSS, peakRSS);
134+
// start = std::chrono::steady_clock::now();
135+
// }
136+
// else
137+
if(status.step % (log_after/10) == 0) {
135138
duration_ms += uint32_t(std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count());
136139
auto avg_edge_weight = deglib::analysis::calc_avg_edge_weight(graph, scale);
137140
auto connected = deglib::analysis::check_graph_connectivity(graph);
@@ -261,31 +264,37 @@ int main() {
261264

262265
// ------------------------------- SIFT1M -----------------------------------------
263266
const auto data_stream_type = DataStreamType::AddAll;
264-
// const auto repository_file = (data_path / "laion2B" / "laion2B-en-clip768v2-n=300K.fvecs").string();
265-
// const auto query_file = (data_path / "laion2B" / "public-queries-2024-laion2B-en-clip768v2-n=10k.fvecs").string();
266-
const auto repository_file = (data_path / "laion2B" / "laion2B-en-clip768v2-n=300K_512byte.u8vecs").string();
267-
const auto query_file = (data_path / "laion2B" / "public-queries-2024-laion2B-en-clip768v2-n=10k_512byte.u8vecs").string();
268-
const auto gt_file = (data_path / "laion2B" / "gold-standard-dbsize=300K--public-queries-2024-laion2B-en-clip768v2-n=10k.ivecs").string();
269-
const auto graph_file = (data_path / "deg" / "768D_L2_K30_AddK60Eps0.1_schemeD_t1_512byte.deg").string();
270-
const auto opt_graph_file = (data_path / "deg" / "768D_L2_K30_AddK60Eps0.1_schemeD_t1_512byte_200kAll.deg").string();
271-
const auto mrng_graph_file = (data_path / "deg" / "768D_L2_K30_AddK60Eps0.1_schemeD_t1_512byte_200kAll_removedNonMRNG.deg").string();
267+
// const auto repository_file = (data_path / "laion2B" / "laion2B-en-clip768v2-n=300K.fvecs").string(); // 300K 768float
268+
// const auto repository_file = (data_path / "laion2B" / "laion2B-en-clip768v2-n=300K_512float.fvecs").string(); // 300K 512float
269+
// const auto repository_file = (data_path / "laion2B" / "laion2B-en-clip768v2-n=300K_512byte.u8vecs").string(); // 300K 512uint8
270+
const auto repository_file = (data_path / "laion2B" / "laion2B-en-clip768v2-n=10M_512byte.u8vecs").string(); // 10M 512uint8
271+
272+
// const auto query_file = (data_path / "laion2B" / "public-queries-2024-laion2B-en-clip768v2-n=10k.fvecs").string(); // 768float
273+
// const auto query_file = (data_path / "laion2B" / "public-queries-2024-laion2B-en-clip768v2-n=10k_512float.fvecs").string(); // 512float
274+
const auto query_file = (data_path / "laion2B" / "public-queries-2024-laion2B-en-clip768v2-n=10k_512byte.u8vecs").string(); // 512uint8
275+
276+
// const auto gt_file = (data_path / "laion2B" / "gold-standard-dbsize=300K--public-queries-2024-laion2B-en-clip768v2-n=10k.ivecs").string(); // 300K
277+
const auto gt_file = (data_path / "laion2B" / "gold-standard-dbsize=10M--public-queries-2024-laion2B-en-clip768v2-n=10k.ivecs").string(); // 10M
278+
const auto graph_file = (data_path / "deg" / "10m" / "768D_L2_K30_AddK60Eps0.1_schemeD_t12_512byte.deg").string();
279+
const auto opt_graph_file = (data_path / "deg" / "10m" / "768D_L2_K30_AddK60Eps0.1_schemeD_t1_512byte_200kAll.deg").string();
280+
const auto mrng_graph_file = (data_path / "deg" / "10m" / "768D_L2_K30_AddK60Eps0.1_schemeD_t1_512byte_removedNonMRNG.deg").string();
272281
const auto lid = deglib::builder::LID::Low; // low=schemeD, high=schemeC
273282
const deglib::Metric metric = deglib::Metric::L2_Uint8;
274283

275-
// if(std::filesystem::exists(graph_file.c_str()) == false)
276-
// create_graph(repository_file, data_stream_type, graph_file, metric, lid, 30, 60, 0.1f, 30, 0.001f, 5); // d, k_ext, eps_ext, k_opt, eps_opt, i_opt
277-
// test_graph(query_file, gt_file, graph_file, 1, 30); // repeat_test, k
284+
if(std::filesystem::exists(graph_file.c_str()) == false)
285+
create_graph(repository_file, data_stream_type, graph_file, metric, lid, 30, 60, 0.1f, 30, 0.001f, 5, 12); // d, k_ext, eps_ext, k_opt, eps_opt, i_opt, thread_count
286+
test_graph(query_file, gt_file, graph_file, 1, 30); // repeat_test, k
278287

279-
if(std::filesystem::exists(opt_graph_file.c_str()) == false)
280-
optimze_graph(graph_file, opt_graph_file, 30, 0.001f, 5, 200000); // k_opt, eps_opt, i_opt, iteration
281-
test_graph(query_file, gt_file, opt_graph_file, 1, 30); // repeat_test, k
288+
// if(std::filesystem::exists(opt_graph_file.c_str()) == false)
289+
// optimze_graph(graph_file, opt_graph_file, 30, 0.001f, 5, 200000); // k_opt, eps_opt, i_opt, iteration
290+
// test_graph(query_file, gt_file, opt_graph_file, 1, 30); // repeat_test, k
282291

283-
if(std::filesystem::exists(mrng_graph_file.c_str()) == false) {
284-
// remove_non_mrng_edges(graph_file, mrng_graph_file);
285-
remove_non_mrng_edges(opt_graph_file, mrng_graph_file);
292+
// if(std::filesystem::exists(mrng_graph_file.c_str()) == false) {
293+
// remove_non_mrng_edges(graph_file, mrng_graph_file);
294+
// remove_non_mrng_edges(opt_graph_file, mrng_graph_file);
286295
// change_features(graph_file, repository_file, metric, opt_graph_file);
287-
}
288-
test_graph(query_file, gt_file, mrng_graph_file, 1, 30); // repeat_test, k
296+
// }
297+
// test_graph(query_file, gt_file, mrng_graph_file, 1, 30); // repeat_test, k
289298

290299

291300
// // ------------------------------- GLOVE -----------------------------------------

0 commit comments

Comments
 (0)