Skip to content

Commit d0f45cf

Browse files
committed
GH-48897: [C++] Add benchmark for CountSetBits
1 parent 421a475 commit d0f45cf

File tree

2 files changed

+19
-4
lines changed

2 files changed

+19
-4
lines changed

cpp/src/arrow/util/bit_util_benchmark.cc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,17 @@ static void SetBitsTo(benchmark::State& state) {
439439
state.SetBytesProcessed(state.iterations() * nbytes);
440440
}
441441

442+
// Benchmarks internal::CountSetBits on a buffer of state.range(0) bytes
// obtained from CreateRandomBuffer. Each iteration counts from bit offset 0
// across every bit of the buffer; throughput is reported in bytes processed.
static void CountSetBits(benchmark::State& state) {
  const int64_t buffer_size = state.range(0);
  const int64_t num_bits = buffer_size * 8;
  const std::shared_ptr<Buffer> bitmap = CreateRandomBuffer(buffer_size);

  for (auto _ : state) {
    auto set_bits = internal::CountSetBits(bitmap->data(), /*bit_offset=*/0, num_bits);
    // Keep the result observable so the call is not eliminated as dead code.
    benchmark::DoNotOptimize(set_bits);
  }
  state.SetBytesProcessed(state.iterations() * buffer_size);
}
452+
442453
template <int64_t OffsetSrc, int64_t OffsetDest = 0>
443454
static void CopyBitmap(benchmark::State& state) { // NOLINT non-const reference
444455
const int64_t buffer_size = state.range(0);
@@ -519,6 +530,7 @@ BENCHMARK(ReverseSetBitRunReader)->Apply(SetBitRunReaderPercentageArg);
519530
BENCHMARK(VisitBits)->Arg(kBufferSize);
520531
BENCHMARK(VisitBitsUnrolled)->Arg(kBufferSize);
521532
BENCHMARK(SetBitsTo)->Arg(2)->Arg(1 << 4)->Arg(1 << 10)->Arg(1 << 17);
533+
BENCHMARK(CountSetBits)->Arg(1 << 4)->Arg(1 << 10)->Arg(1 << 17);
522534

523535
#ifdef ARROW_WITH_BENCHMARKS_REFERENCE
524536
static void ReferenceNaiveBitmapWriter(benchmark::State& state) {

cpp/src/arrow/util/bitmap_ops.cc

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#include "arrow/util/bitmap_ops.h"
1919

20+
#include <array>
2021
#include <cstdint>
2122
#include <cstring>
2223
#include <functional>
@@ -55,13 +56,15 @@ int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) {
5556
constexpr int64_t kCountUnrollFactor = 4;
5657
const int64_t words_rounded =
5758
bit_util::RoundDown(p.aligned_words, kCountUnrollFactor);
58-
int64_t count_unroll[kCountUnrollFactor] = {0};
59+
std::array<int64_t, kCountUnrollFactor> count_unroll{};
5960

6061
// Unroll the loop for better performance
6162
for (int64_t i = 0; i < words_rounded; i += kCountUnrollFactor) {
62-
for (int64_t k = 0; k < kCountUnrollFactor; k++) {
63-
count_unroll[k] += bit_util::PopCount(u64_data[k]);
64-
}
63+
// (hand-unrolled as some gcc versions would unnest a nested `for` loop)
64+
count_unroll[0] += bit_util::PopCount(u64_data[0]);
65+
count_unroll[1] += bit_util::PopCount(u64_data[1]);
66+
count_unroll[2] += bit_util::PopCount(u64_data[2]);
67+
count_unroll[3] += bit_util::PopCount(u64_data[3]);
6568
u64_data += kCountUnrollFactor;
6669
}
6770
for (int64_t k = 0; k < kCountUnrollFactor; k++) {

0 commit comments

Comments
 (0)