Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
WarningsAsErrors: '*'
HeaderFilterRegex: '.'
FormatStyle: file
TransformLambdas: none

CheckOptions:
- { key: readability-identifier-naming.ClassCase , value: lower_case }
Expand Down
7 changes: 5 additions & 2 deletions .gitlab-ci-correaa.yml
Original file line number Diff line number Diff line change
Expand Up @@ -283,16 +283,19 @@ clang++-18 mull-18: # mull-18 crashes
- large-memory-space
- non-shared
interruptible: true
allow_failure: false
allow_failure: true
script:
- apt-get -qq update
- apt-get -qq install --no-install-recommends -y --quiet gnupg curl wget ca-certificates clang-18 cmake libclang-rt-18-dev make libboost-timer-dev libboost-serialization-dev > /dev/null
- apt-get -qq install --no-install-recommends -y --quiet gnupg curl wget ca-certificates clang-18 cmake file libclang-rt-18-dev make libboost-timer-dev libboost-serialization-dev > /dev/null
- curl -1sLf 'https://dl.cloudsmith.io/public/mull-project/mull-stable/setup.deb.sh' | bash
- apt-get -qq update
- apt search mull
- apt-get -qq install --no-install-recommends -y --quiet mull-18 libclang-cpp18
- mkdir build && cd build
- clang++-18 --version
- which mull-runner-18
- ls -all `which mull-runner-18`
- file `which mull-runner-18`
- mull-runner-18 --version
- CXX=clang++-18 cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="-O3 -DNDEBUG -fpass-plugin=/usr/lib/mull-ir-frontend-18 -g -grecord-command-line"
- cmake --build . --parallel 8 || cmake --build . --parallel 1 --verbose
Expand Down
86 changes: 44 additions & 42 deletions include/boost/multi/adaptors/cufft.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ static auto cuda_get_error_enum(cufftResult error) -> char const* {
}

#define cufftSafeCall(err) implcufftSafeCall(err, __FILE__, __LINE__)
inline void implcufftSafeCall(cufftResult err, const char* file, const int line) {
inline void implcufftSafeCall(cufftResult err, char const* file, int const line) {
if(CUFFT_SUCCESS != err) {
std::cerr << "CUFFT error in file " << file << ", line " << line << "\nerror " << err << ": " << cuda_get_error_enum(err) << "\n";
// fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n", __FILE__, __LINE__, err,
Expand Down Expand Up @@ -97,7 +97,7 @@ class plan {
Alloc alloc_;
::size_t workSize_ = 0;
void* workArea_{};
cufftHandle h_{}; // TODO(correaa) put this in a unique_ptr
std::array<cufftHandle, DD> hs_{}; // TODO(correaa) put this in a unique_ptr
std::array<std::pair<bool, cufft_iodim64>, DD + 1> which_iodims_{};
int first_howmany_{};

Expand All @@ -120,7 +120,7 @@ class plan {
: alloc_{std::move(other.alloc_)},
workSize_{std::exchange(other.workSize_, {})},
workArea_{std::exchange(other.workArea_, {})},
h_{std::exchange(other.h_, {})},
hs_{std::exchange(other.hs_, {})},
which_iodims_{std::exchange(other.which_iodims_, {})},
first_howmany_{std::exchange(other.first_howmany_, {})} {
// other.used_ = true; // moved-from object cannot be used
Expand All @@ -141,26 +141,30 @@ class plan {
auto const istride_tuple = in.strides();
auto const ostride_tuple = out.strides();

using boost::multi::detail::get;
auto which_iodims = std::apply([](auto... elems) {
return std::array<std::pair<bool, cufft_iodim64>, sizeof...(elems) + 1>{
// TODO(correaa) added one element to avoid problem with gcc 13 static analysis (out-of-bounds)
std::pair<bool, cufft_iodim64>{
get<0>(elems),
cufft_iodim64{get<1>(elems), get<2>(elems), get<3>(elems)}
}
...,
std::pair<bool, cufft_iodim64>{}
};
},
boost::multi::detail::tuple_zip(which, sizes_tuple, istride_tuple, ostride_tuple));
using std::get; // boost::multi::detail::get;
auto which_iodims = std::apply(
[](auto... elems) {
return std::array /*<std::pair<bool, cufft_iodim64>, sizeof...(elems) + 1>*/ {
// TODO(correaa) added one element to avoid problem with gcc 13 static analysis (out-of-bounds)
std::pair /*<bool, cufft_iodim64>*/ {
get<0>(elems),
cufft_iodim64{get<1>(elems), get<2>(elems), get<3>(elems)}
}
...,
std::pair<bool, cufft_iodim64>{}
};
},
boost::multi::detail::tuple_zip(which, sizes_tuple, istride_tuple, ostride_tuple)
);

std::stable_sort(which_iodims.begin(), which_iodims.end() - 1, [](auto const& alpha, auto const& omega) { return get<1>(alpha).is > get<1>(omega).is; });
auto const part = std::stable_partition(which_iodims.begin(), which_iodims.end() - 1, [](auto elem) { return get<0>(elem); });

auto const part = std::stable_partition(which_iodims.begin(), which_iodims.end() - 1, [](auto elem) { return std::get<0>(elem); });
std::stable_sort(which_iodims.begin(), part, [](auto const& alpha, auto const& omega) { return get<1>(alpha).os > get<1>(omega).os; });
std::stable_sort(part, which_iodims.end() - 1, [](auto const& alpha, auto const& omega) { return get<1>(alpha).os > get<1>(omega).os; });

std::array<cufft_iodim64, D> dims{};
auto const dims_end = std::transform(which_iodims.begin(), part, dims.begin(), [](auto elem) { return elem.second; });

auto const dims_end = std::transform(which_iodims.begin(), part, dims.begin(), [](auto elem) { return elem.second; });

// std::array<cufftw_iodim64, D> howmany_dims{};
// auto const howmany_dims_end = std::transform(part, which_iodims.end() -1, howmany_dims.begin(), [](auto elem) {return elem.second;});
Expand Down Expand Up @@ -189,6 +193,8 @@ class plan {
assert(ostrides[idx - 1] >= ostrides[idx]);
assert(ostrides[idx - 1] % ostrides[idx] == 0);
onembed[idx] = ostrides[idx - 1] / ostrides[idx];

assert(istrides[idx - 1] >= istrides[idx]);
assert(istrides[idx - 1] % istrides[idx] == 0);
inembed[idx] = istrides[idx - 1] / istrides[idx];
}
Expand Down Expand Up @@ -217,7 +223,7 @@ class plan {
if constexpr(std::is_same_v<Alloc, void*>) {
assert(dims_end - dims.begin() < 4); // cufft cannot do 4D FFT
cufftSafeCall(::cufftPlanMany(
/*cufftHandle *plan*/ &h_,
/*cufftHandle *plan*/ &hs_[0],
/*int rank*/ dims_end - dims.begin(),
/*int *n*/ ion.data(),
/*int *inembed*/ inembed.data(),
Expand All @@ -230,10 +236,10 @@ class plan {
/*int batch*/ 1 // BATCH
));
} else {
cufftSafeCall(cufftCreate(&h_));
cufftSafeCall(cufftSetAutoAllocation(h_, false));
cufftSafeCall(cufftCreate(&hs_[0]));
cufftSafeCall(cufftSetAutoAllocation(hs_[0], false));
cufftSafeCall(cufftMakePlanMany(
/*cufftHandle *plan*/ h_,
/*cufftHandle *plan*/ hs_[0],
/*int rank*/ dims_end - dims.begin(),
/*int *n*/ ion.data(),
/*int *inembed*/ inembed.data(),
Expand All @@ -246,14 +252,14 @@ class plan {
/*int batch*/ 1, // BATCH
/*size_t **/ &workSize_
));
cufftSafeCall(cufftGetSize(h_, &workSize_));
cufftSafeCall(cufftGetSize(hs_[0], &workSize_));
workArea_ = ::thrust::raw_pointer_cast(alloc_.allocate(workSize_));
static_assert(sizeof(Alloc) == 1000);
// auto s = cudaMalloc(&workArea_, workSize_);
// if(s != cudaSuccess) {throw std::runtime_error{"L212"};}
cufftSafeCall(cufftSetWorkArea(h_, workArea_));
cufftSafeCall(cufftSetWorkArea(hs_[0], workArea_));
}
if(!h_) {
if(!hs_[0]) {
throw std::runtime_error{"cufftPlanMany null"};
}
return;
Expand All @@ -264,7 +270,7 @@ class plan {
if(first_howmany_ <= D - 1) {
if constexpr(std::is_same_v<Alloc, void*>) { // NOLINT(bugprone-branch-clone) workaround bug in DeepSource
cufftSafeCall(::cufftPlanMany(
/*cufftHandle *plan*/ &h_,
/*cufftHandle *plan*/ &hs_[0],
/*int rank*/ dims_end - dims.begin(),
/*int *n*/ ion.data(),
/*int *inembed*/ inembed.data(),
Expand All @@ -277,10 +283,10 @@ class plan {
/*int batch*/ which_iodims_[first_howmany_].second.n
));
} else {
cufftSafeCall(cufftCreate(&h_));
cufftSafeCall(cufftSetAutoAllocation(h_, false));
cufftSafeCall(cufftCreate(&hs_[0]));
cufftSafeCall(cufftSetAutoAllocation(hs_[0], false));
cufftSafeCall(cufftMakePlanMany(
/*cufftHandle *plan*/ h_,
/*cufftHandle *plan*/ hs_[0],
/*int rank*/ dims_end - dims.begin(),
/*int *n*/ ion.data(),
/*int *inembed*/ inembed.data(),
Expand All @@ -293,11 +299,11 @@ class plan {
/*int batch*/ which_iodims_[first_howmany_].second.n,
/*size_t **/ &workSize_
));
cufftSafeCall(cufftGetSize(h_, &workSize_));
cufftSafeCall(cufftGetSize(hs_[0], &workSize_));
workArea_ = ::thrust::raw_pointer_cast(alloc_.allocate(workSize_));
cufftSafeCall(cufftSetWorkArea(h_, workArea_));
cufftSafeCall(cufftSetWorkArea(hs_[0], workArea_));
}
if(!h_) {
if(!hs_[0]) {
throw std::runtime_error{"cufftPlanMany null"};
}
++first_howmany_;
Expand All @@ -310,7 +316,7 @@ class plan {
template<typename = void>
void ExecZ2Z_(complex_type const* idata, complex_type* odata, int direction) const {
// used_ = true;
cufftSafeCall(cufftExecZ2Z(h_, const_cast<complex_type*>(idata), odata, direction)); // NOLINT(cppcoreguidelines-pro-type-const-cast) wrap legacy interface
cufftSafeCall(cufftExecZ2Z(hs_[0], const_cast<complex_type*>(idata), odata, direction)); // NOLINT(cppcoreguidelines-pro-type-const-cast) wrap legacy interface
// cudaDeviceSynchronize();
}

Expand All @@ -333,7 +339,7 @@ class plan {

for(int idx = 0; idx != which_iodims_[first_howmany_].second.n; ++idx) { // NOLINT(altera-unroll-loops,altera-id-dependent-backward-branch)
cufftExecZ2Z(
h_,
hs_[0],
const_cast<complex_type*>(reinterpret_cast<complex_type const*>(::thrust::raw_pointer_cast(idata + idx * which_iodims_[first_howmany_].second.is))), // NOLINT(cppcoreguidelines-pro-type-const-cast,cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
reinterpret_cast<complex_type*>(::thrust::raw_pointer_cast(odata + idx * which_iodims_[first_howmany_].second.os)), // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
direction
Expand All @@ -354,7 +360,7 @@ class plan {
for(int idx = 0; idx != which_iodims_[first_howmany_].second.n; ++idx) { // NOLINT(altera-unroll-loops,altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) use an algorithm
for(int jdx = 0; jdx != which_iodims_[first_howmany_ + 1].second.n; ++jdx) { // NOLINT(altera-unroll-loops,altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) use an algorithm
cufftExecZ2Z(
h_,
hs_[0],
const_cast<complex_type*>(reinterpret_cast<complex_type const*>(::thrust::raw_pointer_cast(idata + idx * which_iodims_[first_howmany_].second.is + jdx * which_iodims_[first_howmany_ + 1].second.is))), // NOLINT(cppcoreguidelines-pro-type-const-cast,cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
reinterpret_cast<complex_type*>(::thrust::raw_pointer_cast(odata + idx * which_iodims_[first_howmany_].second.os + jdx * which_iodims_[first_howmany_ + 1].second.os)), // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
direction
Expand Down Expand Up @@ -397,13 +403,9 @@ class plan {
alloc_.deallocate(typename std::allocator_traits<Alloc>::pointer(reinterpret_cast<char*>(workArea_)), workSize_);
} // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
}
if(h_ != 0) {
cufftSafeCall(cufftDestroy(h_));
if(hs_[0] != 0) {
cufftSafeCall(cufftDestroy(hs_[0]));
}
// if(!used_) {
// std::cerr <<"Warning: cufft plan was never used\n";
// std::terminate();
// }
}

using size_type = int;
Expand Down
2 changes: 1 addition & 1 deletion include/boost/multi/adaptors/cufft/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ include(CTest)
include_directories(${CMAKE_BINARY_DIR})

# file(GLOB TEST_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
set(TEST_SRCS cufft.cpp)
set(TEST_SRCS cases.cpp cufft.cpp)

foreach(TEST_FILE ${TEST_SRCS})
set(TEST_EXE "${TEST_FILE}.x")
Expand Down
65 changes: 65 additions & 0 deletions include/boost/multi/adaptors/cufft/test/cases.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Copyright 2020-2025 Alfredo A. Correa
// Distributed under the Boost Software License, Version 1.0.
// https://www.boost.org/LICENSE_1_0.txt

#include <boost/core/lightweight_test.hpp>

#include <boost/multi/adaptors/fftw.hpp>
#include <boost/multi/array.hpp>

#include <boost/multi/adaptors/cufft.hpp>

#if(!(defined(__HIP_PLATFORM_AMD__) || defined(__HIP_PLATFORM_NVIDIA__))) && (!defined(__HIPCC__))
#include <boost/multi/adaptors/cufft.hpp>
#else
#include <boost/multi/adaptors/hipfft.hpp>
#endif

#include <boost/multi/adaptors/fft.hpp>
#include <boost/multi/adaptors/thrust.hpp>

#include <thrust/complex.h>
#include <thrust/transform_reduce.h>

#include <random>

namespace multi = boost::multi;

using complex = thrust::complex<double>;

template<>
constexpr bool multi::force_element_trivial_default_construction<thrust::complex<double>> = true;

auto main() -> int try {
complex const I{0.0, 1.0}; // NOLINT(readability-identifier-length)

auto in_cpu = multi::array<complex, 4>({20, 30, 50, 70});

std::generate(
in_cpu.elements().begin(), in_cpu.elements().end(),
[dist = std::normal_distribution<>{}, gen = std::mt19937(std::random_device{}())] () mutable { return dist(gen); }
);

auto const in_gpu = multi::thrust::cuda::array<complex, 4>(in_cpu);

auto const& in_cpu_view = in_cpu.transposed().rotated().transposed().unrotated().transposed();
auto const& in_gpu_view = in_gpu.transposed().rotated().transposed().unrotated().transposed();

auto out_gpu = multi::thrust::cuda::array<complex, 4>(in_gpu_view.extensions());
auto out_cpu = multi::array<complex, 4>(out_gpu.extensions());

assert(in_gpu_view.extensions() == out_gpu.extensions());

multi::fftw::dft_forward({true, true, true, false}, in_cpu_view, out_cpu);

// multi::cufft::plan<4>({true, true, true, false}, in_gpu_view.layout(), out_gpu.layout())
// .execute(in_gpu_view.base(), out_gpu.base(), multi::cufft::forward);

// std::cout << out_cpu[2][3][4][5] << ' ' << out_gpu[2][3][4][5] << std::endl;
// BOOST_TEST( thrust::abs(out_cpu[2][3][4][5] - static_cast<complex>(out_gpu[2][3][4][5])) < 1e-6 );

return boost::report_errors();
} catch(...) {
throw;
return 1;
}
2 changes: 2 additions & 0 deletions include/boost/multi/detail/layout.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ class f_extensions_t {
}

public:
static constexpr dimensionality_type dimensionality = D;
constexpr static dimensionality_type rank_v = D;

using difference_type = typename extensions_t<D>::difference_type;
Expand Down Expand Up @@ -219,6 +220,7 @@ class f_extensions_t {
};

constexpr auto elements() const { return elements_t{xs_.elements(), proj_}; }
constexpr auto num_elements() const { return xs_.num_elements(); }
};

template<dimensionality_type D>
Expand Down
7 changes: 7 additions & 0 deletions test/extensions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ auto main() -> int { // NOLINT(bugprone-exception-escape,readability-function-c
{
auto x1d = multi::extensions_t<1>(3);

BOOST_TEST( multi::extensions_t<1>(3) == multi::extensions_t(3) );

auto it = x1d.elements().begin();
BOOST_TEST( get<0>(*it) == 0 );

Expand Down Expand Up @@ -128,6 +130,8 @@ auto main() -> int { // NOLINT(bugprone-exception-escape,readability-function-c
{
multi::extensions_t<2> const x2d({4, 3});

BOOST_TEST( multi::extensions_t<2>(4, 3) == multi::extensions_t(4, 3) );

auto ll = [](auto xx, auto yy) {
return xx + yy;
};
Expand Down Expand Up @@ -375,6 +379,8 @@ auto main() -> int { // NOLINT(bugprone-exception-escape,readability-function-c
{
multi::extensions_t<3> const xs{3, 4, 5};

BOOST_TEST(( multi::extensions_t<3>{3, 4, 5} == multi::extensions_t(3, 4, 5) ));

BOOST_TEST( xs.sub() == multi::extensions_t<2>(4, 5) );
static_assert(std::is_same_v<decltype(xs[1][1][1]), multi::extensions_t<3>::element>);
}
Expand Down Expand Up @@ -402,6 +408,7 @@ auto main() -> int { // NOLINT(bugprone-exception-escape,readability-function-c

auto const& values = [](auto ii, auto jj) { return ii + jj; } ^ arr.extensions();

BOOST_TEST( values.dimensionality == 2 );
BOOST_TEST( values.extensions() == arr.extensions() );
BOOST_TEST( *values.elements().begin() == 0 );
BOOST_TEST( values.elements().begin() < values.elements().end() );
Expand Down