|
| 1 | +#include <algorithm> |
| 2 | +#include <array> |
| 3 | +#include <cstdint> |
| 4 | +#include <limits> |
| 5 | +#include <memory> |
| 6 | +#include <optional> |
| 7 | +#include <cstdio> |
| 8 | + |
| 9 | +// This is a simple Conway's Game of Life implementation |
| 10 | +// that is constexpr friendly and can work as a benchmark |
| 11 | +// for parallel computation models in C++ |
| 12 | +// |
| 13 | +// Notes I learned along the way while learning AdaptiveCpp |
| 14 | +// |
| 15 | +// AMD GPU Install notes: |
| 16 | +// * AMD focuses on LTS ubuntu releases, if you have a different release, |
| 17 | +// expect a little pain |
| 18 | +// * I had good luck installing the AMDGPU Installer option here: |
| 19 | +// https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#amdgpu-ubuntu |
| 20 | +// * The rocm-gdb package would not install on my OS because of some outdated |
| 21 | +// dependencies |
| 22 | +// * The amdgpu-install tool will set up the apt repositories that you need |
| 23 | +// * If your OS is fully supported, just install the top-level package |
| 24 | +// * Honestly, I just kept installing random ROCm packages until I got |
| 25 | +// things working. In the end I think that was everything except for |
| 26 | +// the gdb package (rocm-gdb), which I could not install because of |
| 27 | +// the outdated dependencies mentioned above |
| 28 | +// |
| 29 | +// After You've Installed ROCm |
| 30 | +// * add yourself to the render group |
| 31 | +// * consider rebooting probably |
| 32 | +// * run `rocminfo` and make sure it sees your GPUs |
| 33 | +// |
| 34 | +// Other GPUs: |
| 35 | +// * I have no input here |
| 36 | +// |
| 37 | +// Use the "automatic installation script" to install llvm >= 14 |
| 38 | +// * https://apt.llvm.org/ |
| 39 | +// * You probably want to install "all" |
| 40 | +// ```sh |
| 41 | +// wget https://apt.llvm.org/llvm.sh |
| 42 | +// chmod +x llvm.sh |
| 43 | +// sudo ./llvm.sh <version number> all |
| 44 | +// ``` |
| 45 | +// |
| 46 | +// Now Build And Install AdaptiveCpp |
| 47 | +// * https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/installing.md#a-standard-installation |
| 48 | +// * Run `acpp-info` and make sure you get output similar to what `rocminfo` |
| 49 | +// gave you |
| 50 | +// |
| 51 | +// Install nvtop to monitor GPU usage and make sure this is doing what you want. |
| 52 | +// |
| 53 | +// To Compare with GCC |
| 54 | +// * install libtbb-dev |
| 55 | +// |
| 56 | +// Theoretically you are ready to go now?! |
| 57 | +// |
| 58 | +// |
| 59 | +// To compile with all optimizations and parallel std lib support enabled: |
| 60 | +// |
| 61 | +// ```sh |
| 62 | +// # AdaptiveCpp |
| 63 | +// acpp -std=c++23 ./game_of_life.cpp -O3 -march=native --acpp-stdpar |
| 64 | +// |
| 65 | +// # gcc/clang. Without TBB installed/linked it silently falls back to single-threaded |
| 66 | +// g++ -std=c++23 ./game_of_life.cpp -O3 -march=native -ltbb |
| 67 | +// clang++ -std=c++23 ./game_of_life.cpp -O3 -march=native -ltbb |
| 68 | +// |
| 69 | +// # Depending on clang version you might need to add -fexperimental-library |
| 70 | +// ``` |
| 71 | +// |
| 72 | +// Run, watch nvtop, htop, run with /usr/bin/time to see total CPU utilization, |
| 73 | +// etc and see how it scales on your platform |
| 74 | + |
// Floored modulo. Unlike the built-in `%`, whose result takes the sign of the
// dividend, this yields a value in [0, divisor) for a positive divisor even
// when the dividend is negative. That is what lets board coordinates wrap
// around the edges.
template <typename Dividend, typename Divisor>
[[nodiscard]] constexpr auto floor_modulo(Dividend dividend, Divisor divisor) {
  // May be negative when `dividend` is negative.
  const auto remainder = dividend % divisor;
  // Shift into the positive range, then reduce again for the case where
  // `remainder` was already non-negative.
  return (remainder + divisor) % divisor;
}
| 79 | + |
// This is probably unnecessary, but the min_int utilities exist to make the
// `Point` type as compact as possible, so that we only use int16 if that's
// all we need, for example.
//
// min_int<value>() returns a value-initialized object of the narrowest signed
// fixed-width integer type (int8/int16/int32/int64) whose maximum is at least
// `value`. Only the return *type* is interesting; see min_int_t below.
template <std::size_t value> constexpr auto min_int() noexcept {
  // No signed 64-bit type can represent values above INT64_MAX, so refuse
  // them at compile time instead of silently handing back a type that is too
  // small.
  static_assert(value <= static_cast<std::uint64_t>(
                             std::numeric_limits<std::int64_t>::max()),
                "min_int: value does not fit in any signed 64-bit integer");

  // The casts keep every comparison unsigned-vs-unsigned.
  if constexpr (value <= static_cast<std::uint64_t>(
                             std::numeric_limits<std::int8_t>::max())) {
    return std::int8_t{};
  } else if constexpr (value <= static_cast<std::uint64_t>(
                                    std::numeric_limits<std::int16_t>::max())) {
    return std::int16_t{};
  } else if constexpr (value <= static_cast<std::uint64_t>(
                                    std::numeric_limits<std::int32_t>::max())) {
    return std::int32_t{};
  } else {
    return std::int64_t{};
  }
}

// The narrowest signed integer type able to hold `value`.
template <std::size_t value> using min_int_t = decltype(min_int<value>());
| 96 | + |
| 97 | +// templated on size mostly to give the compiler extra hints |
| 98 | +// about the code, so it knows what it can unroll, etc. |
| 99 | +template <std::size_t Width, std::size_t Height> struct GameBoard { |
| 100 | + // These are the properly sized things necessary to hold coordinates |
| 101 | + // that work with this particular size of board |
| 102 | + using x_index_t = min_int_t<Width>; |
| 103 | + using y_index_t = min_int_t<Height>; |
| 104 | + |
| 105 | + static constexpr x_index_t width = Width; |
| 106 | + static constexpr y_index_t height = Height; |
| 107 | + |
| 108 | + std::array<bool, Width * Height> data; |
| 109 | + |
| 110 | + struct Point { |
| 111 | + x_index_t x; |
| 112 | + y_index_t y; |
| 113 | + [[nodiscard]] constexpr Point operator+(Point rhs) const { |
| 114 | + return Point{static_cast<x_index_t>(x + rhs.x), |
| 115 | + static_cast<y_index_t>(y + rhs.y)}; |
| 116 | + } |
| 117 | + }; |
| 118 | + |
| 119 | + // The 8 relative positions for neighbors for a given point |
| 120 | + constexpr static std::array<Point, 8> neighbors{ |
| 121 | + Point{-1, -1}, Point{0, -1}, Point{1, -1}, Point{-1, 0}, |
| 122 | + Point{1, 0}, Point{-1, 1}, Point{0, 1}, Point{1, 1}}; |
| 123 | + |
| 124 | + // Takes the input point, wraps it vertically/horizontally and takes |
| 125 | + // the new location and maps that to the linear address of the point |
| 126 | + // in the underlying array |
| 127 | + [[nodiscard]] constexpr static std::size_t index(Point p) { |
| 128 | + return static_cast<std::size_t>(floor_modulo(p.y, height) * width + |
| 129 | + floor_modulo(p.x, width)); |
| 130 | + } |
| 131 | + |
| 132 | + [[nodiscard]] constexpr bool operator[](Point p) const noexcept { |
| 133 | + return data[index(p)]; |
| 134 | + } |
| 135 | + |
| 136 | + constexpr void set(Point p) noexcept { data[index(p)] = true; } |
| 137 | + |
| 138 | + [[nodiscard]] constexpr std::size_t count_neighbors(Point p) const { |
| 139 | + return static_cast<std::size_t>( |
| 140 | + std::count_if(neighbors.begin(), neighbors.end(), |
| 141 | + [&](auto offset) { return (*this)[p + offset]; })); |
| 142 | + } |
| 143 | + |
| 144 | + // Pre-compute all of the Point coordinates that exist in this particular |
| 145 | + // gameboard. We use this later to iterate over every location in the |
| 146 | + // gameboard. |
| 147 | + [[nodiscard]] static auto make_indexes() { |
| 148 | + auto result = std::make_unique<std::array<Point, Width * Height>>(); |
| 149 | + |
| 150 | + std::size_t output_index = 0; |
| 151 | + |
| 152 | + for (y_index_t y = 0; y < height; ++y) { |
| 153 | + for (x_index_t x = 0; x < width; ++x) { |
| 154 | + (*result)[output_index] = Point{x, y}; |
| 155 | + ++output_index; |
| 156 | + } |
| 157 | + } |
| 158 | + return result; |
| 159 | + }; |
| 160 | + |
| 161 | + // https://en.wikipedia.org/wiki/Conway's_Game_of_Life#Examples_of_patterns |
| 162 | + |
| 163 | + // Add a glider at a given location on the game board |
| 164 | + constexpr void add_glider(Point p) { |
| 165 | + set(p); |
| 166 | + set(p + Point{1, 1}); |
| 167 | + set(p + Point{2, 1}); |
| 168 | + set(p + Point{0, 2}); |
| 169 | + set(p + Point{1, 2}); |
| 170 | + } |
| 171 | +}; |
| 172 | + |
// Computes one generation of the Game of Life.
//
// For each point in `indices`, in order, the next state of that cell is
// derived from `input` and written to successive elements of `output.data`,
// starting at its beginning. `input` is never modified, so `input` and
// `output` are expected to be two distinct boards.
//
//   input   - board holding the current generation
//   output  - board whose `data` receives the next generation
//   indices - range of points to evaluate; make_indexes() produces them in
//             the same order as `data`
template <typename BoardType, typename Indices>
constexpr void iterate_board(const BoardType &input, BoardType &output,
                             Indices &indices) {
  const auto next_state = [&input](const auto &point) -> bool {
    const auto neighbor_count = input.count_neighbors(point);

    // Conway's rules in one expression:
    //  * exactly 3 neighbors -> set (birth, or survival of a set cell)
    //  * exactly 2 neighbors -> keeps its current state
    //  * anything else       -> unset (under- or over-population)
    return neighbor_count == 3 || (neighbor_count == 2 && input[point]);
  };

  std::transform(indices.begin(), indices.end(), output.data.begin(),
                 next_state);
}
| 202 | + |
| 203 | + |
// Writes the board to stdout as text: one line per row, '*' for a set cell
// and ' ' for an unset one, with a newline after every row.
template <typename BoardType> auto print_board(const BoardType &board) {
  using Point = typename BoardType::Point;

  for (int row = 0; row < board.height; ++row) {
    for (int column = 0; column < board.width; ++column) {
      putchar(board[Point(column, row)] ? '*' : ' ');
    }
    putchar('\n');
  }
}
| 216 | + |
| 217 | +template <std::size_t Width, std::size_t Height, std::size_t Iterations> |
| 218 | +void run_board() { |
| 219 | + using board_type = GameBoard<Width, Height>; |
| 220 | + |
| 221 | + // I would consider putting these on the stack, but the GPU engine |
| 222 | + // requires pointers that it knows how to work with. With AdaptiveCpp |
| 223 | + // it swaps out malloc and owns these pointers in a way that can be used |
| 224 | + // with the GPU automagically |
| 225 | + |
| 226 | + auto board1 = std::make_unique<board_type>(); |
| 227 | + board1->add_glider(typename board_type::Point(1, 3)); |
| 228 | + board1->add_glider(typename board_type::Point(10, 1)); |
| 229 | + auto board2 = std::make_unique<board_type>(); |
| 230 | + |
| 231 | + const auto indices = board_type::make_indexes(); |
| 232 | + |
| 233 | + { |
| 234 | + for (int i = 0; i < Iterations; ++i) { |
| 235 | + // just swapping buffers back and forth |
| 236 | + iterate_board(*board1, *board2, *indices); |
| 237 | + std::swap(board1, board2); |
| 238 | + } |
| 239 | + } |
| 240 | + |
| 241 | + // this exists solely to make sure the compiler doesn't optimize out the |
| 242 | + // actual work |
| 243 | + if ((*board1)[typename board_type::Point(0, 0)]) { |
| 244 | + puts("0,0 is Set!"); |
| 245 | + } else { |
| 246 | + puts("0,0 is Not Set!"); |
| 247 | + } |
| 248 | +} |
| 249 | + |
| 250 | +int main() { |
| 251 | + run_board<10, 10, 5'000'000>(); |
| 252 | + run_board<100, 10, 500'000>(); |
| 253 | + run_board<100, 100, 50'000>(); |
| 254 | + run_board<100, 1000, 5'000>(); |
| 255 | + run_board<1000, 1000, 500>(); |
| 256 | + run_board<10000, 1000, 50>(); |
| 257 | + run_board<10000, 10000, 5>(); |
| 258 | +} |
0 commit comments