Skip to content

Commit dd5438e

Browse files
authored
add gbench bandwidth stats (#298)
1 parent 66a2237 commit dd5438e

File tree

12 files changed

+124
-25
lines changed

12 files changed

+124
-25
lines changed

benchmarks/README.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,11 @@ When running::
2323

2424
To verify that you are using 48 cores, run ``top`` in another
2525
window. The stream process should show 4800 in the ``%CPU`` column.
26+
27+
Google Benchmarks
28+
=================
29+
30+
Micro-benchmark framework for measuring primitive operations. See
31+
`gbench README`_.
32+
33+
.. _`gbench README`: gbench/README.rst

benchmarks/gbench/README.rst

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
.. SPDX-FileCopyrightText: Intel Corporation
2+
..
3+
.. SPDX-License-Identifier: BSD-3-Clause
4+
5+
==============
6+
Google Bench
7+
==============
8+
9+
We use Google Benchmark for micro-benchmarks.
10+
11+
Sample Commands
12+
===============
13+
14+
Run all benchmarks with 2 ranks::
15+
16+
mpirun -n 2 ./mhp-bench --benchmark_time_unit=ms --benchmark_counters_tabular=true
17+
18+
Run 2D stencil algorithms::
19+
20+
mpirun -n 2 ./mhp-bench --benchmark_time_unit=ms --benchmark_counters_tabular=true --benchmark_filter=Stencil2D
21+
22+
Run distributed ranges algorithms::
23+
24+
mpirun -n 2 ./mhp-bench --benchmark_time_unit=ms --benchmark_counters_tabular=true --benchmark_filter=.*DR
25+
26+
Show google bench options::
27+
28+
./mhp-bench --help
29+
30+
Show our custom options::
31+
32+
./mhp-bench --drhelp
33+
34+
See `user guide`_ for more information on Google Benchmark.
35+
36+
.. _`user guide`: https://github.com/google/benchmark/blob/main/docs/user_guide.md#custom-counters
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// SPDX-FileCopyrightText: Intel Corporation
2+
//
3+
// SPDX-License-Identifier: BSD-3-Clause
4+
#pragma once
5+
6+
#include "cxxopts.hpp"
7+
#include <benchmark/benchmark.h>
8+
#include <fmt/core.h>
9+
#include <fmt/ranges.h>
10+
#include <vendor/source_location/source_location.hpp>
11+
12+
extern std::size_t comm_rank;
13+
extern std::size_t comm_size;
14+
15+
extern std::size_t default_vector_size;
16+
extern std::size_t default_repetitions;
17+
18+
// Publishes a user counter named "Memory" on `state`, built from `bytes`.
//
// state: the benchmark state whose counter map receives the entry.
// bytes: byte count supplied by the caller. The call sites in this change
//        pass repetitions * element count * sizeof(T), scaled by a small
//        factor (2 or 3) for copy/zip-style benchmarks.
//
// NOTE(review): per the Google Benchmark user guide, kIsIterationInvariantRate
// multiplies the value by the iteration count and divides by elapsed time,
// and kIs1024 selects 1024-based unit prefixes when printing, so `bytes` is
// presumably meant to be the traffic of ONE benchmark iteration. Confirm
// against the library version in use.
inline void memory_bandwidth(benchmark::State &state, std::size_t bytes) {
19+
state.counters["Memory"] =
20+
benchmark::Counter(bytes, benchmark::Counter::kIsIterationInvariantRate,
21+
benchmark::Counter::kIs1024);
22+
}

benchmarks/gbench/common/distributed_vector.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ static void Fill_DR(benchmark::State &state) {
2020
xhp::fill(vec, 0);
2121
}
2222
}
23+
memory_bandwidth(state,
24+
default_repetitions * default_vector_size * sizeof(T));
2325
}
2426

2527
BENCHMARK(Fill_DR);
@@ -31,6 +33,8 @@ static void Fill_Local(benchmark::State &state) {
3133
rng::fill(vec, 0);
3234
}
3335
}
36+
memory_bandwidth(state,
37+
default_repetitions * default_vector_size * sizeof(T));
3438
}
3539

3640
BENCHMARK(Fill_Local);
@@ -46,6 +50,8 @@ static void Copy_DR(benchmark::State &state) {
4650
xhp::copy(src, dst.begin());
4751
}
4852
}
53+
memory_bandwidth(state,
54+
2 * default_repetitions * default_vector_size * sizeof(T));
4955
}
5056

5157
BENCHMARK(Copy_DR);
@@ -59,6 +65,8 @@ static void Copy_Local(benchmark::State &state) {
5965
rng::copy(src, dst.begin());
6066
}
6167
}
68+
memory_bandwidth(state,
69+
2 * default_repetitions * default_vector_size * sizeof(T));
6270
}
6371

6472
BENCHMARK(Copy_Local);
@@ -71,6 +79,8 @@ static void Reduce_DR(benchmark::State &state) {
7179
benchmark::DoNotOptimize(res);
7280
}
7381
}
82+
memory_bandwidth(state,
83+
default_repetitions * default_vector_size * sizeof(T));
7484
}
7585

7686
BENCHMARK(Reduce_DR);
@@ -83,6 +93,8 @@ static void Reduce_Local(benchmark::State &state) {
8393
benchmark::DoNotOptimize(res);
8494
}
8595
}
96+
memory_bandwidth(state,
97+
default_repetitions * default_vector_size * sizeof(T));
8698
}
8799

88100
BENCHMARK(Reduce_Local);
@@ -98,6 +110,8 @@ static void Reduce_DPL(benchmark::State &state) {
98110
benchmark::DoNotOptimize(res);
99111
}
100112
}
113+
memory_bandwidth(state,
114+
default_repetitions * default_vector_size * sizeof(T));
101115
}
102116

103117
BENCHMARK(Reduce_DPL);
@@ -111,6 +125,8 @@ static void TransformIdentity_DR(benchmark::State &state) {
111125
xhp::transform(src, dst.begin(), std::identity());
112126
}
113127
}
128+
memory_bandwidth(state,
129+
2 * default_repetitions * default_vector_size * sizeof(T));
114130
}
115131

116132
BENCHMARK(TransformIdentity_DR);
@@ -123,6 +139,8 @@ static void TransformIdentity_Local(benchmark::State &state) {
123139
rng::transform(src, dst.begin(), std::identity());
124140
}
125141
}
142+
memory_bandwidth(state,
143+
2 * default_repetitions * default_vector_size * sizeof(T));
126144
}
127145

128146
BENCHMARK(TransformIdentity_Local);
@@ -143,6 +161,8 @@ static void Mul_DR(benchmark::State &state) {
143161
xhp::transform(xhp::views::zip(a, b), c.begin(), mul);
144162
}
145163
}
164+
memory_bandwidth(state,
165+
3 * default_repetitions * default_vector_size * sizeof(T));
146166
}
147167

148168
BENCHMARK(Mul_DR);
@@ -159,6 +179,8 @@ static void Mul_Local(benchmark::State &state) {
159179
c.begin(), mul);
160180
}
161181
}
182+
memory_bandwidth(state,
183+
3 * default_repetitions * default_vector_size * sizeof(T));
162184
}
163185

164186
BENCHMARK(Mul_Local);

benchmarks/gbench/common/dot_product.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ static void DotProduct_ZipReduce_DR(benchmark::State &state) {
4444
}
4545
}
4646
check_dp(res);
47+
memory_bandwidth(state,
48+
2 * default_repetitions * default_vector_size * sizeof(T));
4749
}
4850

4951
BENCHMARK(DotProduct_ZipReduce_DR);
@@ -65,6 +67,8 @@ static void DotProduct_ZipReduce_Std(benchmark::State &state) {
6567
}
6668
}
6769
check_dp(res);
70+
memory_bandwidth(state,
71+
2 * default_repetitions * default_vector_size * sizeof(T));
6872
}
6973

7074
BENCHMARK(DotProduct_ZipReduce_Std);
@@ -83,6 +87,8 @@ static void DotProduct_TransformReduce_Std(benchmark::State &state) {
8387
}
8488
}
8589
check_dp(res);
90+
memory_bandwidth(state,
91+
2 * default_repetitions * default_vector_size * sizeof(T));
8692
}
8793

8894
BENCHMARK(DotProduct_TransformReduce_Std);
@@ -93,13 +99,17 @@ static void DotProduct_Loop_Std(benchmark::State &state) {
9399
T res = 0;
94100

95101
for (auto _ : state) {
96-
res = 0;
97-
for (std::size_t i = 0; i < default_vector_size; i++) {
98-
res += a[i] * b[i];
102+
for (std::size_t rep = 0; rep < default_repetitions; rep++) {
103+
res = 0;
104+
for (std::size_t i = 0; i < default_vector_size; i++) {
105+
res += a[i] * b[i];
106+
}
107+
benchmark::DoNotOptimize(res);
99108
}
100-
benchmark::DoNotOptimize(res);
101109
}
102110
check_dp(res);
111+
memory_bandwidth(state,
112+
2 * default_repetitions * default_vector_size * sizeof(T));
103113
}
104114

105115
BENCHMARK(DotProduct_Loop_Std);
@@ -126,6 +136,8 @@ static void DotProduct_TransformReduce_DPL(benchmark::State &state) {
126136
}
127137
}
128138
check_dp(res);
139+
memory_bandwidth(state,
140+
2 * default_repetitions * default_vector_size * sizeof(T));
129141
}
130142

131143
BENCHMARK(DotProduct_TransformReduce_DPL);

benchmarks/gbench/mhp/rooted.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ static void CopyDist2Local_DR(benchmark::State &state) {
1414
xhp::copy(0, src, dst.begin());
1515
}
1616
}
17+
memory_bandwidth(state,
18+
2 * default_repetitions * default_vector_size * sizeof(T));
1719
}
1820

1921
BENCHMARK(CopyDist2Local_DR);
@@ -26,6 +28,8 @@ static void CopyLocal2Dist_DR(benchmark::State &state) {
2628
xhp::copy(0, src, dst.begin());
2729
}
2830
}
31+
memory_bandwidth(state,
32+
2 * default_repetitions * default_vector_size * sizeof(T));
2933
}
3034

3135
BENCHMARK(CopyLocal2Dist_DR);

benchmarks/gbench/mhp/stencil_1d.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ static void Stencil1D_Slide_Std(benchmark::State &state) {
3636
std::swap(out_curr, out_prev);
3737
}
3838
}
39+
memory_bandwidth(state, 2 * stencil_steps * default_vector_size * sizeof(T));
3940
}
4041

4142
BENCHMARK(Stencil1D_Slide_Std);
@@ -58,6 +59,7 @@ static void Stencil1D_Subrange_Std(benchmark::State &state) {
5859
std::swap(in, out);
5960
}
6061
}
62+
memory_bandwidth(state, 2 * stencil_steps * default_vector_size * sizeof(T));
6163
}
6264

6365
BENCHMARK(Stencil1D_Subrange_Std);
@@ -77,6 +79,7 @@ static void Stencil1D_Subrange_DR(benchmark::State &state) {
7779
std::swap(in, out);
7880
}
7981
}
82+
memory_bandwidth(state, 2 * stencil_steps * default_vector_size * sizeof(T));
8083
}
8184

8285
BENCHMARK(Stencil1D_Subrange_DR);
@@ -99,6 +102,7 @@ static void Stencil1D_Subrange_DPL(benchmark::State &state) {
99102
std::swap(in, out);
100103
}
101104
}
105+
memory_bandwidth(state, 2 * stencil_steps * default_vector_size * sizeof(T));
102106
}
103107

104108
BENCHMARK(Stencil1D_Subrange_DPL);

benchmarks/gbench/mhp/stencil_2d.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ static void Stencil2D_Loop_Serial(benchmark::State &state) {
143143

144144
checker.check(rng::span(in, rows * cols));
145145
}
146+
memory_bandwidth(state, 2 * stencil_steps * rows * cols * sizeof(T));
146147
}
147148

148149
BENCHMARK(Stencil2D_Loop_Serial);
@@ -190,6 +191,7 @@ static void Stencil2D_ForeachStdArray_DR(benchmark::State &state) {
190191
}
191192
checker.check_array(stencil_steps % 2 ? b : a);
192193
}
194+
memory_bandwidth(state, 2 * stencil_steps * rows * cols * sizeof(T));
193195
}
194196

195197
BENCHMARK(Stencil2D_ForeachStdArray_DR);
@@ -228,6 +230,7 @@ static void Stencil2D_NocollectiveCPU_DR(benchmark::State &state) {
228230
}
229231
checker.check(stencil_steps % 2 ? b : a);
230232
}
233+
memory_bandwidth(state, 2 * stencil_steps * rows * cols * sizeof(T));
231234
}
232235

233236
BENCHMARK(Stencil2D_NocollectiveCPU_DR);
@@ -315,6 +318,7 @@ static void Stencil2D_Basic_SYCL(benchmark::State &state) {
315318
}
316319
checker.check_device(q, in);
317320
}
321+
memory_bandwidth(state, 2 * stencil_steps * rows * cols * sizeof(T));
318322
}
319323

320324
BENCHMARK(Stencil2D_Basic_SYCL);

benchmarks/gbench/mhp/xhp-bench.hpp

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,15 @@
33
// SPDX-License-Identifier: BSD-3-Clause
44
#pragma once
55

6-
#include "cxxopts.hpp"
7-
#include <benchmark/benchmark.h>
8-
#include <fmt/core.h>
9-
#include <fmt/ranges.h>
10-
#include <vendor/source_location/source_location.hpp>
6+
#include "../common/common-bench.hpp"
117

128
#include "dr/mhp.hpp"
139

1410
namespace xhp = dr::mhp;
1511

16-
extern std::size_t default_vector_size;
17-
extern std::size_t default_repetitions;
12+
#define BENCH_MHP
13+
1814
extern std::size_t stencil_steps;
1915
extern std::size_t num_rows;
2016
extern std::size_t num_columns;
2117
extern bool check_results;
22-
23-
#define BENCH_MHP

benchmarks/gbench/shp/shp-bench.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
std::size_t default_vector_size;
88
std::size_t default_repetitions;
99

10+
std::size_t comm_rank = 0;
11+
std::size_t comm_size = 1;
12+
1013
cxxopts::ParseResult options;
1114

1215
int main(int argc, char *argv[]) {

benchmarks/gbench/shp/xhp-bench.hpp

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,10 @@
33
// SPDX-License-Identifier: BSD-3-Clause
44
#pragma once
55

6-
#include "cxxopts.hpp"
7-
#include <benchmark/benchmark.h>
8-
#include <fmt/core.h>
9-
#include <fmt/ranges.h>
10-
#include <vendor/source_location/source_location.hpp>
6+
#include "../common/common-bench.hpp"
117

128
#include "dr/shp.hpp"
139

1410
namespace xhp = dr::shp;
1511

16-
extern std::size_t default_vector_size;
17-
extern std::size_t default_repetitions;
18-
19-
inline std::size_t comm_rank = 0;
20-
inline std::size_t comm_size = 1;
21-
2212
#define BENCH_SHP

examples/mhp/dot_product_benchmark.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ void time_summary(auto &durations, auto &sum) {
9191

9292
fmt::print("Median duration: {} ms\n", median_duration * 1000);
9393
fmt::print("Memory bandwidth: {:.6} MB/s\n",
94-
n * 3 * sizeof(T) / (median_duration * 1000 * 1000));
94+
n * 2 * sizeof(T) / (median_duration * 1000 * 1000));
9595
}
9696

9797
void stats(auto &durations, auto &sum, auto v_serial, auto &x_local,

0 commit comments

Comments
 (0)