Skip to content

Commit b36659a

Browse files
authored
GH-45591: [C++][Acero] Refine hash join benchmark and remove openmp from the project (#45593)
### Rationale for this change See #45591. ### What changes are included in this PR? 1. Replace the usage of OpenMP with Arrow-native multi-threading primitives; 2. Remove all occurrences of OpenMP from the project; 3. Support stats for build-side rows in the hash join benchmark, and update certain benchmarks. ### Are these changes tested? Manually tested. ### Are there any user-facing changes? Removed a public CMake option (`ARROW_BUILD_OPENMP_BENCHMARKS`), but I think it shouldn't affect users. * GitHub Issue: #45591 Authored-by: Rossi Sun <zanmato1984@gmail.com> Signed-off-by: Rossi Sun <zanmato1984@gmail.com>
1 parent 5e5fb07 commit b36659a

8 files changed

Lines changed: 34 additions & 52 deletions

File tree

ci/scripts/cpp_build.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,6 @@ else
147147
-DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \
148148
-DARROW_BUILD_EXAMPLES=${ARROW_BUILD_EXAMPLES:-OFF} \
149149
-DARROW_BUILD_INTEGRATION=${ARROW_BUILD_INTEGRATION:-OFF} \
150-
-DARROW_BUILD_OPENMP_BENCHMARKS=${ARROW_BUILD_OPENMP_BENCHMARKS:-OFF} \
151150
-DARROW_BUILD_SHARED=${ARROW_BUILD_SHARED:-ON} \
152151
-DARROW_BUILD_STATIC=${ARROW_BUILD_STATIC:-ON} \
153152
-DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS:-OFF} \

cpp/CMakePresets.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
"cacheVariables": {
4242
"ARROW_BUILD_BENCHMARKS": "ON",
4343
"ARROW_BUILD_BENCHMARKS_REFERENCE": "ON",
44-
"ARROW_BUILD_OPENMP_BENCHMARKS": "ON",
4544
"ARROW_BUILD_DETAILED_BENCHMARKS": "OFF",
4645
"CMAKE_BUILD_TYPE": "RelWithDebInfo"
4746
}

cpp/cmake_modules/DefineOptions.cmake

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -243,9 +243,6 @@ takes precedence over ccache if a storage backend is configured" ON)
243243
define_option(ARROW_BUILD_BENCHMARKS_REFERENCE
244244
"Build the Arrow micro reference benchmarks" OFF)
245245

246-
define_option(ARROW_BUILD_OPENMP_BENCHMARKS
247-
"Build the Arrow benchmarks that rely on OpenMP" OFF)
248-
249246
define_option(ARROW_BUILD_DETAILED_BENCHMARKS
250247
"Build benchmarks that do a longer exploration of performance" OFF)
251248

cpp/src/arrow/acero/CMakeLists.txt

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -221,36 +221,21 @@ if(ARROW_BUILD_BENCHMARKS)
221221

222222
add_arrow_acero_benchmark(aggregate_benchmark SOURCES aggregate_benchmark.cc)
223223

224-
if(ARROW_BUILD_OPENMP_BENCHMARKS)
225-
find_package(OpenMP REQUIRED)
226-
add_arrow_acero_benchmark(hash_join_benchmark
227-
EXTRA_LINK_LIBS
228-
OpenMP::OpenMP_CXX
229-
SOURCES
230-
hash_join_benchmark.cc)
231-
if(MSVC)
232-
target_compile_options(arrow-compute-hash-join-benchmark
233-
PRIVATE "-openmp:experimental -openmp:llvm")
234-
endif()
235-
endif()
224+
add_arrow_acero_benchmark(hash_join_benchmark SOURCES hash_join_benchmark.cc)
236225

237226
if(ARROW_BUILD_STATIC)
238227
target_link_libraries(arrow-acero-expression-benchmark PUBLIC arrow_acero_static)
239228
target_link_libraries(arrow-acero-filter-benchmark PUBLIC arrow_acero_static)
240229
target_link_libraries(arrow-acero-project-benchmark PUBLIC arrow_acero_static)
241230
target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_static)
242231
target_link_libraries(arrow-acero-tpch-benchmark PUBLIC arrow_acero_static)
243-
if(ARROW_BUILD_OPENMP_BENCHMARKS)
244-
target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_static)
245-
endif()
232+
target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_static)
246233
else()
247234
target_link_libraries(arrow-acero-expression-benchmark PUBLIC arrow_acero_shared)
248235
target_link_libraries(arrow-acero-filter-benchmark PUBLIC arrow_acero_shared)
249236
target_link_libraries(arrow-acero-project-benchmark PUBLIC arrow_acero_shared)
250237
target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_shared)
251238
target_link_libraries(arrow-acero-tpch-benchmark PUBLIC arrow_acero_shared)
252-
if(ARROW_BUILD_OPENMP_BENCHMARKS)
253-
target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_shared)
254-
endif()
239+
target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_shared)
255240
endif()
256241
endif()

cpp/src/arrow/acero/accumulation_queue.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ using arrow::compute::ExecBatch;
3434

3535
/// \brief A container that accumulates batches until they are ready to
3636
/// be processed.
37-
class AccumulationQueue {
37+
class ARROW_ACERO_EXPORT AccumulationQueue {
3838
public:
3939
AccumulationQueue() : row_count_(0) {}
4040
~AccumulationQueue() = default;

cpp/src/arrow/acero/hash_join.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ namespace acero {
3737

3838
using util::AccumulationQueue;
3939

40-
class HashJoinImpl {
40+
class ARROW_ACERO_EXPORT HashJoinImpl {
4141
public:
4242
using OutputBatchCallback = std::function<Status(int64_t, ExecBatch)>;
4343
using BuildFinishedCallback = std::function<Status(size_t)>;

cpp/src/arrow/acero/hash_join_benchmark.cc

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@
3232
#include <cstdio>
3333
#include <memory>
3434

35-
#include <omp.h>
36-
3735
namespace arrow {
3836
namespace acero {
3937
struct BenchmarkSettings {
@@ -56,6 +54,8 @@ struct BenchmarkSettings {
5654
int var_length_max = 20; // Maximum length of any var length types
5755

5856
Expression residual_filter = literal(true);
57+
58+
bool stats_probe_rows = true;
5959
};
6060

6161
class JoinBenchmark {
@@ -128,6 +128,7 @@ class JoinBenchmark {
128128
for (ExecBatch& batch : r_batches_with_schema.batches)
129129
r_batches_.InsertBatch(std::move(batch));
130130

131+
stats_.num_build_rows = settings.num_build_batches * settings.batch_size;
131132
stats_.num_probe_rows = settings.num_probe_batches * settings.batch_size;
132133

133134
schema_mgr_ = std::make_unique<HashJoinSchema>();
@@ -141,14 +142,9 @@ class JoinBenchmark {
141142
join_ = *HashJoinImpl::MakeSwiss();
142143
}
143144

144-
omp_set_num_threads(settings.num_threads);
145-
auto schedule_callback = [](std::function<Status(size_t)> func) -> Status {
146-
#pragma omp task
147-
{ DCHECK_OK(func(omp_get_thread_num())); }
148-
return Status::OK();
149-
};
150-
151145
scheduler_ = TaskScheduler::Make();
146+
thread_pool_ = arrow::internal::GetCpuThreadPool();
147+
DCHECK_OK(thread_pool_->SetCapacity(settings.num_threads));
152148
DCHECK_OK(ctx_.Init(nullptr));
153149

154150
auto register_task_group_callback = [&](std::function<Status(size_t, int64_t)> task,
@@ -157,15 +153,15 @@ class JoinBenchmark {
157153
};
158154

159155
auto start_task_group_callback = [&](int task_group_id, int64_t num_tasks) {
160-
return scheduler_->StartTaskGroup(omp_get_thread_num(), task_group_id, num_tasks);
156+
return scheduler_->StartTaskGroup(/*thread_id=*/0, task_group_id, num_tasks);
161157
};
162158

163159
DCHECK_OK(join_->Init(
164160
&ctx_, settings.join_type, settings.num_threads, &(schema_mgr_->proj_maps[0]),
165161
&(schema_mgr_->proj_maps[1]), std::move(key_cmp), settings.residual_filter,
166162
std::move(register_task_group_callback), std::move(start_task_group_callback),
167163
[](int64_t, ExecBatch) { return Status::OK(); },
168-
[](int64_t) { return Status::OK(); }));
164+
[&](int64_t) { return Status::OK(); }));
169165

170166
task_group_probe_ = scheduler_->RegisterTaskGroup(
171167
[this](size_t thread_index, int64_t task_id) -> Status {
@@ -178,25 +174,27 @@ class JoinBenchmark {
178174
scheduler_->RegisterEnd();
179175

180176
DCHECK_OK(scheduler_->StartScheduling(
181-
0 /*thread index*/, std::move(schedule_callback),
182-
static_cast<int>(2 * settings.num_threads) /*concurrent tasks*/,
183-
settings.num_threads == 1));
177+
/*thread_id=*/0,
178+
[&](std::function<Status(size_t)> task) -> Status {
179+
return thread_pool_->Spawn([&, task]() { DCHECK_OK(task(thread_indexer_())); });
180+
},
181+
thread_pool_->GetCapacity(), settings.num_threads == 1));
184182
}
185183

186184
void RunJoin() {
187-
#pragma omp parallel
188-
{
189-
int tid = omp_get_thread_num();
190-
#pragma omp single
191-
DCHECK_OK(
192-
join_->BuildHashTable(tid, std::move(r_batches_), [this](size_t thread_index) {
193-
return scheduler_->StartTaskGroup(thread_index, task_group_probe_,
194-
l_batches_.batch_count());
195-
}));
196-
}
185+
DCHECK_OK(join_->BuildHashTable(
186+
/*thread_id=*/0, std::move(r_batches_), [this](size_t thread_index) {
187+
return scheduler_->StartTaskGroup(thread_index, task_group_probe_,
188+
l_batches_.batch_count());
189+
}));
190+
191+
thread_pool_->WaitForIdle();
197192
}
198193

199194
std::unique_ptr<TaskScheduler> scheduler_;
195+
ThreadIndexer thread_indexer_;
196+
arrow::internal::ThreadPool* thread_pool_;
197+
200198
AccumulationQueue l_batches_;
201199
AccumulationQueue r_batches_;
202200
std::unique_ptr<HashJoinSchema> schema_mgr_;
@@ -205,6 +203,7 @@ class JoinBenchmark {
205203
int task_group_probe_;
206204

207205
struct {
206+
uint64_t num_build_rows;
208207
uint64_t num_probe_rows;
209208
} stats_;
210209
};
@@ -219,11 +218,13 @@ static void HashJoinBasicBenchmarkImpl(benchmark::State& st,
219218
st.ResumeTiming();
220219
bm.RunJoin();
221220
st.PauseTiming();
222-
total_rows += bm.stats_.num_probe_rows;
221+
total_rows += (settings.stats_probe_rows ? bm.stats_.num_probe_rows
222+
: bm.stats_.num_build_rows);
223223
}
224224
st.ResumeTiming();
225225
}
226-
st.counters["rows/sec"] = benchmark::Counter(total_rows, benchmark::Counter::kIsRate);
226+
st.counters["rows/sec"] =
227+
benchmark::Counter(static_cast<double>(total_rows), benchmark::Counter::kIsRate);
227228
}
228229

229230
template <typename... Args>
@@ -302,6 +303,7 @@ static void BM_HashJoinBasic_BuildParallelism(benchmark::State& st) {
302303
settings.num_threads = static_cast<int>(st.range(0));
303304
settings.num_build_batches = static_cast<int>(st.range(1));
304305
settings.num_probe_batches = settings.num_threads;
306+
settings.stats_probe_rows = false;
305307

306308
HashJoinBasicBenchmarkImpl(st, settings);
307309
}

cpp/src/arrow/acero/swiss_join_internal.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ class RowArrayAccessor {
175175
// Read operations (row comparison, column decoding)
176176
// can be called by multiple threads concurrently.
177177
//
178-
struct RowArray {
178+
struct ARROW_ACERO_EXPORT RowArray {
179179
RowArray() : is_initialized_(false), hardware_flags_(0) {}
180180

181181
Status InitIfNeeded(MemoryPool* pool, int64_t hardware_flags, const ExecBatch& batch);

0 commit comments

Comments
 (0)