From 83b351383ad8e332cd08b4be905dabdc5f8fcd82 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Sat, 20 Feb 2021 01:46:14 -0500 Subject: [PATCH 1/5] Adding sampling to DBSCAN --- examples/dbscan/dbscan.cpp | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/examples/dbscan/dbscan.cpp b/examples/dbscan/dbscan.cpp index 37a3b4c29..7b83ff71e 100644 --- a/examples/dbscan/dbscan.cpp +++ b/examples/dbscan/dbscan.cpp @@ -298,6 +298,7 @@ int main(int argc, char *argv[]) int cluster_min_size; int core_min_size; int max_num_points; + int num_samples; bpo::options_description desc("Allowed options"); // clang-format off @@ -310,6 +311,7 @@ int main(int argc, char *argv[]) ( "cluster-min-size", bpo::value(&cluster_min_size)->default_value(2), "minimum cluster size") ( "core-min-size", bpo::value(&core_min_size)->default_value(2), "DBSCAN min_pts") ( "verify", bpo::bool_switch(&verify)->default_value(false), "verify connected components") + ( "samples", bpo::value(&num_samples)->default_value(-1), "number of samples" ) ( "print-dbscan-timers", bpo::bool_switch(&print_dbscan_timers)->default_value(false), "print dbscan timers") ( "output-sizes-and-centers", bpo::bool_switch(&print_sizes_centers)->default_value(false), "print cluster sizes and centers") ; @@ -330,13 +332,31 @@ int main(int argc, char *argv[]) printf("cluster min size : %d\n", cluster_min_size); printf("filename : %s [%s, max_pts = %d]\n", filename.c_str(), (binary ? "binary" : "text"), max_num_points); + printf("samples : %d\n", num_samples); printf("verify : %s\n", (verify ? "true" : "false")); printf("print timers : %s\n", (print_dbscan_timers ? "true" : "false")); printf("output centers : %s\n", (print_sizes_centers ? "true" : "false")); // read in data - auto const primitives = vec2view( - parsePoints(filename, binary, max_num_points), "primitives"); + std::vector data = + parsePoints(filename, binary, max_num_points); + if (num_samples > 0 && num_samples < (int)data.size()) + { + std::vector sampled_data(num_samples); + + // Knuth algorithm + auto const N = (int)data.size(); + auto const M = num_samples; + for (int in = 0, im = 0; in < N && im < M; ++in) + { + int rn = N - in; + int rm = M - im; + if (rand() % rn < rm) + sampled_data[im++] = data[in + 1]; + } + data = sampled_data; + } + auto const primitives = vec2view(data, "primitives"); ExecutionSpace exec_space; From bf01c63be7d320a5d19da119bd5f536f89005659 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Sat, 20 Feb 2021 12:02:20 -0500 Subject: [PATCH 2/5] Allow labels output --- examples/dbscan/dbscan.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/examples/dbscan/dbscan.cpp b/examples/dbscan/dbscan.cpp index 7b83ff71e..45530027a 100644 --- a/examples/dbscan/dbscan.cpp +++ b/examples/dbscan/dbscan.cpp @@ -84,6 +84,21 @@ std::vector parsePoints(std::string const &filename, return v; } +template +void writeLabelsData(std::string const &filename, + Kokkos::View labels) +{ + std::ofstream out(filename, std::ofstream::binary); + ARBORX_ASSERT(out.good()); + + auto labels_host = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, labels); + + int n = labels_host.size(); + out.write((char *)&n, sizeof(int)); + out.write((char *)labels_host.data(), sizeof(int) * n); +} + template auto vec2view(std::vector const &in, std::string const &label = "") { @@ -299,6 +314,7 @@ int main(int argc, char *argv[]) int core_min_size; int max_num_points; int num_samples; + std::string filename_labels; bpo::options_description desc("Allowed options"); // clang-format off @@ -312,6 +328,7 @@ int main(int argc, char *argv[]) ( "core-min-size", bpo::value(&core_min_size)->default_value(2), "DBSCAN min_pts") ( "verify", bpo::bool_switch(&verify)->default_value(false), "verify connected components") ( "samples", bpo::value(&num_samples)->default_value(-1), "number of samples" ) + ( "labels", bpo::value(&filename_labels)->default_value(""), "clutering results output" ) ( "print-dbscan-timers", bpo::bool_switch(&print_dbscan_timers)->default_value(false), "print dbscan timers") ( "output-sizes-and-centers", bpo::bool_switch(&print_sizes_centers)->default_value(false), "print cluster sizes and centers") ; @@ -332,6 +349,7 @@ int main(int argc, char *argv[]) printf("cluster min size : %d\n", cluster_min_size); printf("filename : %s [%s, max_pts = %d]\n", filename.c_str(), (binary ? "binary" : "text"), max_num_points); + printf("filename [labels] : %s [binary]\n", filename_labels.c_str()); printf("samples : %d\n", num_samples); printf("verify : %s\n", (verify ? "true" : "false")); printf("print timers : %s\n", (print_dbscan_timers ? "true" : "false")); @@ -401,6 +419,9 @@ int main(int argc, char *argv[]) printf("Verification %s\n", (passed ? "passed" : "failed")); } + if (filename_labels != "") + writeLabelsData(filename_labels, labels); + if (print_sizes_centers) printClusterSizesAndCenters(exec_space, primitives, cluster_indices, cluster_offset); From 52c5388d7794ed4cfa268dccea1a2e3a6be7eb65 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Fri, 12 Mar 2021 18:37:27 -0500 Subject: [PATCH 3/5] Move sampling into a separate function --- examples/dbscan/dbscan.cpp | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/examples/dbscan/dbscan.cpp b/examples/dbscan/dbscan.cpp index 45530027a..e8086cc87 100644 --- a/examples/dbscan/dbscan.cpp +++ b/examples/dbscan/dbscan.cpp @@ -84,6 +84,24 @@ std::vector parsePoints(std::string const &filename, return v; } +std::vector sampleData(std::vector &data, + int num_samples) +{ + std::vector sampled_data(num_samples); + + // Knuth algorithm + auto const N = (int)data.size(); + auto const M = num_samples; + for (int in = 0, im = 0; in < N && im < M; ++in) + { + int rn = N - in; + int rm = M - im; + if (rand() % rn < rm) + sampled_data[im++] = data[in + 1]; + } + return sampled_data; +} + template void writeLabelsData(std::string const &filename, Kokkos::View labels) @@ -359,21 +377,7 @@ int main(int argc, char *argv[]) std::vector data = parsePoints(filename, binary, max_num_points); if (num_samples > 0 && num_samples < (int)data.size()) - { - std::vector sampled_data(num_samples); - - // Knuth algorithm - auto const N = (int)data.size(); - auto const M = num_samples; - for (int in = 0, im = 0; in < N && im < M; ++in) - { - int rn = N - in; - int rm = M - im; - if (rand() % rn < rm) - sampled_data[im++] = data[in + 1]; - } - data = sampled_data; - } + data = sampleData(data, num_samples); auto const primitives = vec2view(data, "primitives"); ExecutionSpace exec_space; From f221e361a4b5e828a67184bb41bc3523244b9ba5 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Sat, 13 Mar 2021 11:11:53 -0500 Subject: [PATCH 4/5] Couple readability fixes Co-authored-by: Damien L-G --- examples/dbscan/dbscan.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/dbscan/dbscan.cpp b/examples/dbscan/dbscan.cpp index e8086cc87..e63c4cadb 100644 --- a/examples/dbscan/dbscan.cpp +++ b/examples/dbscan/dbscan.cpp @@ -84,7 +84,7 @@ std::vector parsePoints(std::string const &filename, return v; } -std::vector sampleData(std::vector &data, +std::vector sampleData(std::vector const &data, int num_samples) { std::vector sampled_data(num_samples); @@ -423,7 +423,7 @@ int main(int argc, char *argv[]) printf("Verification %s\n", (passed ? "passed" : "failed")); } - if (filename_labels != "") + if (!filename_labels.empty()) writeLabelsData(filename_labels, labels); if (print_sizes_centers) From f16863e6188a9cab1fc068823b41980923978e65 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Sat, 13 Mar 2021 11:12:45 -0500 Subject: [PATCH 5/5] Fix a bug in sampling The first value of the data was never used. Co-authored-by: Damien L-G --- examples/dbscan/dbscan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/dbscan/dbscan.cpp b/examples/dbscan/dbscan.cpp index e63c4cadb..cd2607337 100644 --- a/examples/dbscan/dbscan.cpp +++ b/examples/dbscan/dbscan.cpp @@ -97,7 +97,7 @@ std::vector sampleData(std::vector const &data, int rn = N - in; int rm = M - im; if (rand() % rn < rm) - sampled_data[im++] = data[in + 1]; + sampled_data[im++] = data[in]; } return sampled_data; }