From cd54fe8f1a92d08302fd1a608a8d3187fb2891d6 Mon Sep 17 00:00:00 2001 From: Alfredo Correa Date: Thu, 24 Jul 2025 18:17:53 -0700 Subject: [PATCH 1/2] handle D-2 case --- include/boost/multi/adaptors/cufft.hpp | 44 +++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/include/boost/multi/adaptors/cufft.hpp b/include/boost/multi/adaptors/cufft.hpp index 3305a9f48..71e4660bf 100644 --- a/include/boost/multi/adaptors/cufft.hpp +++ b/include/boost/multi/adaptors/cufft.hpp @@ -247,7 +247,7 @@ class plan { std::sort(which_iodims_.begin() + first_howmany_, which_iodims_.begin() + D, [](auto const& alpha, auto const& omega){ return get<1>(alpha).n > get<1>(omega).n; }); - if(first_howmany_ <= D - 1) { + if(first_howmany_ == D - 1) { if constexpr(std::is_same_v) { // NOLINT(bugprone-branch-clone) workaround bug in DeepSource cufftSafeCall(::cufftPlanMany( /*cufftHandle *plan*/ &h_, @@ -287,6 +287,48 @@ class plan { ++first_howmany_; return; } + + if(first_howmany_ <= D - 2) { + if constexpr(std::is_same_v) { // NOLINT(bugprone-branch-clone) workaround bug in DeepSource + cufftSafeCall(::cufftPlanMany( + /*cufftHandle *plan*/ &h_, + /*int rank*/ dims_end - dims.begin(), + /*int *n*/ ion.data(), + /*int *inembed*/ inembed.data(), + /*int istride*/ istride, + /*int idist*/ which_iodims_[first_howmany_].second.is, + /*int *onembed*/ onembed.data(), + /*int ostride*/ ostride, + /*int odist*/ which_iodims_[first_howmany_].second.os, + /*cufftType type*/ CUFFT_Z2Z, + /*int batch*/ which_iodims_[first_howmany_].second.n + )); + } else { + cufftSafeCall(cufftCreate(&h_)); + cufftSafeCall(cufftSetAutoAllocation(h_, false)); + cufftSafeCall(cufftMakePlanMany( + /*cufftHandle *plan*/ h_, + /*int rank*/ dims_end - dims.begin(), + /*int *n*/ ion.data(), + /*int *inembed*/ inembed.data(), + /*int istride*/ istride, + /*int idist*/ which_iodims_[first_howmany_].second.is, + /*int *onembed*/ onembed.data(), + /*int ostride*/ ostride, + /*int odist*/ which_iodims_[first_howmany_].second.os, + /*cufftType type*/ CUFFT_Z2Z, + /*int batch*/ which_iodims_[first_howmany_].second.n, + /*size_t **/ &workSize_ + )); + cufftSafeCall(cufftGetSize(h_, &workSize_)); + workArea_ = ::thrust::raw_pointer_cast(alloc_.allocate(workSize_)); + cufftSafeCall(cufftSetWorkArea(h_, workArea_)); + } + if(!h_) { throw std::runtime_error{"cufftPlanMany null"}; } + ++first_howmany_; + return; + } + // throw std::runtime_error{"cufft not implemented yet"}; } From cd22a65d2a5b22f17e2744e9f9874322d9d5737e Mon Sep 17 00:00:00 2001 From: Alfredo Correa Date: Fri, 25 Jul 2025 13:42:31 -0700 Subject: [PATCH 2/2] test multiplan --- include/boost/multi/adaptors/cufft.hpp | 103 ++++++++++++++----------- 1 file changed, 57 insertions(+), 46 deletions(-) diff --git a/include/boost/multi/adaptors/cufft.hpp b/include/boost/multi/adaptors/cufft.hpp index 71e4660bf..6e15fac78 100644 --- a/include/boost/multi/adaptors/cufft.hpp +++ b/include/boost/multi/adaptors/cufft.hpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -289,42 +290,57 @@ class plan { } if(first_howmany_ <= D - 2) { - if constexpr(std::is_same_v) { // NOLINT(bugprone-branch-clone) workaround bug in DeepSource - cufftSafeCall(::cufftPlanMany( - /*cufftHandle *plan*/ &h_, - /*int rank*/ dims_end - dims.begin(), - /*int *n*/ ion.data(), - /*int *inembed*/ inembed.data(), - /*int istride*/ istride, - /*int idist*/ which_iodims_[first_howmany_].second.is, - /*int *onembed*/ onembed.data(), - /*int ostride*/ ostride, - /*int odist*/ which_iodims_[first_howmany_].second.os, - /*cufftType type*/ CUFFT_Z2Z, - /*int batch*/ which_iodims_[first_howmany_].second.n - )); - } else { - cufftSafeCall(cufftCreate(&h_)); - cufftSafeCall(cufftSetAutoAllocation(h_, false)); - cufftSafeCall(cufftMakePlanMany( - /*cufftHandle *plan*/ h_, - /*int rank*/ dims_end - dims.begin(), - /*int *n*/ ion.data(), - /*int *inembed*/ inembed.data(), - /*int istride*/ istride, - /*int idist*/ which_iodims_[first_howmany_].second.is, - /*int *onembed*/ onembed.data(), - /*int ostride*/ ostride, - /*int odist*/ which_iodims_[first_howmany_].second.os, - /*cufftType type*/ CUFFT_Z2Z, - /*int batch*/ which_iodims_[first_howmany_].second.n, - /*size_t **/ &workSize_ - )); - cufftSafeCall(cufftGetSize(h_, &workSize_)); - workArea_ = ::thrust::raw_pointer_cast(alloc_.allocate(workSize_)); - cufftSafeCall(cufftSetWorkArea(h_, workArea_)); + + int nstreams = which_iodims_[first_howmany_].second.n; + std::vector streams(nstreams); + for(auto& s : streams) { + cudaStreamCreate(&s) == cudaSuccess ?0:throw std::runtime_error{"Failed to create CUDA stream"}; + } + std::vector plans(nstreams); + + std::vector<::size_t> worksizes(nstreams); + std::vector workareas(nstreams); + + for(int idx = 0; idx != nstreams; ++idx) { + if constexpr(std::is_same_v) { // NOLINT(bugprone-branch-clone) workaround bug in DeepSource + std::terminate(); + cufftSafeCall(::cufftPlanMany( + /*cufftHandle *plan*/ &plans[idx], + /*int rank*/ dims_end - dims.begin(), + /*int *n*/ ion.data(), + /*int *inembed*/ inembed.data(), + /*int istride*/ istride, + /*int idist*/ which_iodims_[first_howmany_].second.is, + /*int *onembed*/ onembed.data(), + /*int ostride*/ ostride, + /*int odist*/ which_iodims_[first_howmany_].second.os, + /*cufftType type*/ CUFFT_Z2Z, + /*int batch*/ which_iodims_[first_howmany_].second.n + )); + } else { + std::terminate(); + cufftSafeCall(cufftCreate(&plans[idx])); + cufftSafeCall(cufftSetAutoAllocation(plans[idx], false)); + cufftSafeCall(cufftMakePlanMany( + /*cufftHandle *plan*/ plans[idx], + /*int rank*/ dims_end - dims.begin(), + /*int *n*/ ion.data(), + /*int *inembed*/ inembed.data(), + /*int istride*/ istride, + /*int idist*/ which_iodims_[first_howmany_].second.is, + /*int *onembed*/ onembed.data(), + /*int ostride*/ ostride, + /*int odist*/ which_iodims_[first_howmany_].second.os, + /*cufftType type*/ CUFFT_Z2Z, + /*int batch*/ which_iodims_[first_howmany_].second.n, + /*size_t **/ &workSize_ + )); + cufftSafeCall(cufftGetSize(plans[idx], &worksizes[idx])); + workareas[idx] = ::thrust::raw_pointer_cast(alloc_.allocate(worksizes[idx])); + cufftSafeCall(cufftSetWorkArea(plans[idx], workareas[idx])); + } + if(!plans[idx]) { throw std::runtime_error{"cufftPlanMany null"}; } } - if(!h_) { throw std::runtime_error{"cufftPlanMany null"}; } ++first_howmany_; return; } @@ -335,7 +351,7 @@ class plan { private: template - void ExecZ2Z_(complex_type const* idata, complex_type* odata, int direction) const { + void ExecZ2Z_(complex_type const* idata, complex_type* odata, int direction) { // used_ = true; cufftSafeCall(cufftExecZ2Z(h_, const_cast(idata), odata, direction)); // NOLINT(cppcoreguidelines-pro-type-const-cast) wrap legacy interface // cudaDeviceSynchronize(); @@ -343,7 +359,7 @@ class plan { public: template - auto execute(IPtr idata, OPtr odata, int direction) const + auto execute(IPtr idata, OPtr odata, int direction) -> decltype((void)( reinterpret_cast(::thrust::raw_pointer_cast(idata)), reinterpret_cast(::thrust::raw_pointer_cast(odata)) @@ -372,6 +388,7 @@ class plan { if(idata == odata) {throw std::runtime_error{"complicated inplace 2"};} for(int idx = 0; idx != which_iodims_[first_howmany_].second.n; ++idx) { // NOLINT(altera-unroll-loops,altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) use an algorithm for(int jdx = 0; jdx != which_iodims_[first_howmany_ + 1].second.n; ++jdx) { // NOLINT(altera-unroll-loops,altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) use an algorithm + throw std::runtime_error{"complicated loop"}; cufftExecZ2Z( h_, const_cast(reinterpret_cast(::thrust::raw_pointer_cast(idata + idx*which_iodims_[first_howmany_].second.is + jdx*which_iodims_[first_howmany_ + 1].second.is))), // NOLINT(cppcoreguidelines-pro-type-const-cast,cppcoreguidelines-pro-type-reinterpret-cast) legacy interface @@ -385,17 +402,11 @@ class plan { throw std::runtime_error{"error2"}; } - template - void execute_forward(IPtr idata, OPtr odata) { // TODO(correaa) make const - execute(idata, odata, cufft::forward); - } - template - void execute_backward(IPtr idata, OPtr odata) { // TODO(correaa) make const - execute(idata, odata, cufft::backward); - } + template void execute_forward(IPtr idata, OPtr odata) { execute(idata, odata, cufft::forward); } + template void execute_backward(IPtr idata, OPtr odata) { execute(idata, odata, cufft::backward); } template - void operator()(IPtr idata, OPtr odata, int direction) const { + void operator()(IPtr idata, OPtr odata, int direction) { // used_ = true; ExecZ2Z_(reinterpret_cast(::thrust::raw_pointer_cast(idata)), reinterpret_cast(::thrust::raw_pointer_cast(odata)), direction); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) legacy interface }