From bffb33cfd3d8db63976dc63f6c9c062fb7cb7871 Mon Sep 17 00:00:00 2001 From: Alfredo Correa Date: Sat, 18 Oct 2025 17:46:32 -0700 Subject: [PATCH 01/12] add cases test --- .../multi/adaptors/cufft/test/CMakeLists.txt | 2 +- .../boost/multi/adaptors/cufft/test/cases.cpp | 829 ++++++++++++++++++ 2 files changed, 830 insertions(+), 1 deletion(-) create mode 100644 include/boost/multi/adaptors/cufft/test/cases.cpp diff --git a/include/boost/multi/adaptors/cufft/test/CMakeLists.txt b/include/boost/multi/adaptors/cufft/test/CMakeLists.txt index 4d1f8b2ad..4bd8f3ef1 100644 --- a/include/boost/multi/adaptors/cufft/test/CMakeLists.txt +++ b/include/boost/multi/adaptors/cufft/test/CMakeLists.txt @@ -66,7 +66,7 @@ include(CTest) include_directories(${CMAKE_BINARY_DIR}) # file(GLOB TEST_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp) -set(TEST_SRCS cufft.cpp) +set(TEST_SRCS cases.cpp cufft.cpp) foreach(TEST_FILE ${TEST_SRCS}) set(TEST_EXE "${TEST_FILE}.x") diff --git a/include/boost/multi/adaptors/cufft/test/cases.cpp b/include/boost/multi/adaptors/cufft/test/cases.cpp new file mode 100644 index 000000000..f69087914 --- /dev/null +++ b/include/boost/multi/adaptors/cufft/test/cases.cpp @@ -0,0 +1,829 @@ +// Copyright 2020-2025 Alfredo A. Correa +// Distributed under the Boost Software License, Version 1.0. +// https://www.boost.org/LICENSE_1_0.txt + +#include +#include + +#include + +#include +#include +#include + +#if(!(defined(__HIP_PLATFORM_AMD__) || defined(__HIP_PLATFORM_NVIDIA__))) && (!defined(__HIPCC__)) +#include +#else +#include +#endif + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace multi = boost::multi; +using complex = thrust::complex; + +template<> +constexpr bool multi::force_element_trivial_default_construction> = true; + +template +__attribute__((always_inline)) inline void DoNotOptimize(T const& value) { // NOLINT(readability-identifier-naming) consistency with Google benchmark + asm volatile("" : "+m"(const_cast(value))); // NOLINT(hicpp-no-assembler,cppcoreguidelines-pro-type-const-cast) hack +} + +class watch : std::chrono::high_resolution_clock { + std::string label_; + time_point start_; + + public: + explicit watch(char const* label) : label_{label} { + cudaDeviceSynchronize() == cudaSuccess ? void() : assert(0); // NOLINT(misc-include-cleaner) the header is included conditionally + start_ = now(); + } + + watch(watch const&) = delete; + watch(watch&&) = delete; + + auto operator=(watch const&) -> watch& = delete; + auto operator=(watch&&) -> watch& = delete; + + watch() : watch("") {} + ~watch() { + cudaDeviceSynchronize() == cudaSuccess ? void() : assert(0); + auto const count = std::chrono::duration(now() - start_).count(); + std::cerr << label_ << ": " << count << " sec\n"; + } +}; + +using complex = thrust::complex; // this can't be std::complex in the gpu + +struct norm_t { + __host__ __device__ auto operator()(complex const& x) const { + return thrust::norm(x); + } +}; + +auto main() -> int try { + complex const I{0.0, 1.0}; // NOLINT(readability-identifier-length) + + // BOOST_AUTO_TEST_CASE(cufft_2D, *boost::unit_test::tolerance(0.0001)) + { + auto const in_cpu = multi::array{ + { 1.0 + 2.0 * I, 9.0 - 1.0 * I, 2.0 + 4.0 * I}, + { 3.0 + 3.0 * I, 7.0 - 4.0 * I, 1.0 + 9.0 * I}, + { 4.0 + 1.0 * I, 5.0 + 3.0 * I, 2.0 + 4.0 * I}, + { 3.0 - 1.0 * I, 8.0 + 7.0 * I, 2.0 + 1.0 * I}, + {31.0 - 1.0 * I, 18.0 + 7.0 * I, 2.0 + 10.0 * I} + }; + + { + auto fw_cpu = multi::array(extensions(in_cpu)); + multi::fftw::dft_forward({true, true}, in_cpu, fw_cpu); + + auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); + + BOOST_TEST( fw_cpu[3][2].real() != 0.0 ); + BOOST_TEST( fw_cpu[3][2].imag() != 0.0 ); + + multi::cufft::plan<2>({true, true}, in_gpu.layout(), fw_gpu.layout()) + .execute(in_gpu.base(), fw_gpu.base(), multi::cufft::forward); + + BOOST_TEST( std::abs((complex(fw_gpu[3][2]) - fw_cpu[3][2]).real()) < 1.0e-8 ); + BOOST_TEST( std::abs((complex(fw_gpu[3][2]) - fw_cpu[3][2]).imag()) < 1.0e-8 ); + + // TODO(correaa) test funcional interface for GPU + // auto const& dft = multi::fft::DFT({true, true}, in_cpu, multi::fft::forward); + + // BOOST_TEST( dft.extensions() == in_cpu.extensions() ); + // BOOST_TEST( (*dft.begin()).size() == (*in_cpu.begin()).size() ); + // BOOST_TEST( (*dft.begin()).extensions() == (*in_cpu.begin()).extensions() ); + + // multi::array const fw_cpu_out = multi::fft::DFT({true, true}, in_cpu, multi::fft::forward); + } + } + { + auto const in_cpu = std::invoke([] { + multi::array ret({20, 20, 20, 20}); + auto const [is, js, ks, ls] = ret.extensions(); + for(auto i : is) + for(auto j : js) + for(auto k : ks) + for(auto l : ls) { + ret[i][j][k][l] = complex{ + static_cast(i + j + k + l), + static_cast(i - j + k - l), + }; + } + return ret; + }); + + auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + + auto const nrm = thrust::transform_reduce( + in_gpu.elements().begin(), in_gpu.elements().end(), + norm_t{}, 0.0, thrust::plus<>{} + ); + + auto fw_gpu = multi::thrust::cuda::array(in_gpu.extensions()); + fw_gpu = in_gpu; + // multi::cufft::plan<4>({true, true, true, true}, in_gpu.layout(), fw_gpu.layout()) + // .execute(in_gpu.base(), fw_gpu.base(), multi::cufft::forward); + + // cudaDeviceSynchronize() == cudaSuccess ? void() : throw std::runtime_error{"cuda error"}; + + // auto const nrm_fwd = thrust::transform_reduce( + // fw_gpu.elements().begin(), fw_gpu.elements().end(), + // norm_t{}, 0.0, thrust::plus<>{} + // ); + // std::cout << "norm: " << nrm*20.0*20.0 << ", norm forward: " << nrm_fwd << '\n'; + // BOOST_TEST( nrm_fwd == nrm*20.0*20.0 ); + } + { + auto const in_cpu = multi::array{ + { 1.0 + 2.0 * I, 9.0 - 1.0 * I, 2.0 + 4.0 * I}, + { 3.0 + 3.0 * I, 7.0 - 4.0 * I, 1.0 + 9.0 * I}, + { 4.0 + 1.0 * I, 5.0 + 3.0 * I, 2.0 + 4.0 * I}, + { 3.0 - 1.0 * I, 8.0 + 7.0 * I, 2.0 + 1.0 * I}, + {31.0 - 1.0 * I, 18.0 + 7.0 * I, 2.0 + 10.0 * I} + }; + + auto fw_cpu = multi::array(extensions(in_cpu)); + multi::fftw::dft({false, true}, in_cpu, fw_cpu, multi::fftw::forward); + + auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); + + BOOST_TEST( fw_cpu[3][2].real() != 0.0 ); + BOOST_TEST( fw_cpu[3][2].imag() != 0.0 ); + + multi::cufft::plan<2>({false, true}, in_gpu.layout(), fw_gpu.layout()) + .execute(in_gpu.base(), fw_gpu.base(), multi::cufft::forward); + + BOOST_TEST( thrust::abs(complex(fw_gpu[3][2]) - fw_cpu[3][2]) < 1e-12 ); + } + { + auto const in_cpu = multi::array{ + { 1.0 + 2.0 * I, 9.0 - 1.0 * I, 2.0 + 4.0 * I}, + { 3.0 + 3.0 * I, 7.0 - 4.0 * I, 1.0 + 9.0 * I}, + { 4.0 + 1.0 * I, 5.0 + 3.0 * I, 2.0 + 4.0 * I}, + { 3.0 - 1.0 * I, 8.0 + 7.0 * I, 2.0 + 1.0 * I}, + {31.0 - 1.0 * I, 18.0 + 7.0 * I, 2.0 + 10.0 * I} + }; + + auto fw_cpu = multi::array(extensions(in_cpu)); + multi::fftw::dft({false, true}, in_cpu, fw_cpu, multi::fftw::forward); + + auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); + + BOOST_TEST( fw_cpu[3][2].real() != 0.0 ); + BOOST_TEST( fw_cpu[3][2].imag() != 0.0 ); + + for(int i = 0; i != in_gpu.size(); ++i) { + multi::cufft::plan<1>({true}, in_gpu[i].layout(), fw_gpu[i].layout()) + .execute(in_gpu[i].base(), fw_gpu[i].base(), multi::cufft::forward); + } + + BOOST_TEST( thrust::abs(complex(fw_gpu[3][2]) - fw_cpu[3][2]) < 1.0e-12 ); + } + { + auto const in_cpu = multi::array{ + { 1.0 + 2.0 * I, 9.0 - 1.0 * I, 2.0 + 4.0 * I}, + { 3.0 + 3.0 * I, 7.0 - 4.0 * I, 1.0 + 9.0 * I}, + { 4.0 + 1.0 * I, 5.0 + 3.0 * I, 2.0 + 4.0 * I}, + { 3.0 - 1.0 * I, 8.0 + 7.0 * I, 2.0 + 1.0 * I}, + {31.0 - 1.0 * I, 18.0 + 7.0 * I, 2.0 + 10.0 * I} + }; + + auto fw_cpu = multi::array(extensions(in_cpu)); + multi::fftw::dft({false, true}, in_cpu, fw_cpu, multi::fftw::forward); + + auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); + auto fw_gpu2 = multi::thrust::cuda::array(extensions(in_gpu)); + auto fw_gpu3 = multi::thrust::cuda::array(extensions(in_gpu)); + + BOOST_TEST( fw_cpu[3][2].real() != 0.0 ); + BOOST_TEST( fw_cpu[3][2].imag() != 0.0 ); + + for(int i = 0; i != in_gpu.size(); ++i) { + multi::cufft::plan<1>({true}, in_gpu[i].layout(), fw_gpu[i].layout()) + .execute(in_gpu[i].base(), fw_gpu[i].base(), multi::cufft::forward); + } + + multi::cufft::plan<2>({false, true}, in_gpu.layout(), fw_gpu2.layout()) + .execute(in_gpu.base(), fw_gpu2.base(), multi::cufft::forward); + + BOOST_TEST( abs(complex(fw_gpu[3][2]) - fw_cpu[3][2]) < 1e-10 ); + BOOST_TEST( abs(complex(fw_gpu[3][2]) - complex(fw_gpu2[3][2])) < 1e-10 ); + } + + { + auto const in_cpu = multi::array{ + { 1.0 + 2.0 * I, 9.0 - 1.0 * I, 2.0 + 4.0 * I}, + { 3.0 + 3.0 * I, 7.0 - 4.0 * I, 1.0 + 9.0 * I}, + { 4.0 + 1.0 * I, 5.0 + 3.0 * I, 2.0 + 4.0 * I}, + { 3.0 - 1.0 * I, 8.0 + 7.0 * I, 2.0 + 1.0 * I}, + {31.0 - 1.0 * I, 18.0 + 7.0 * I, 2.0 + 10.0 * I} + }; + auto fw_cpu = multi::array(extensions(in_cpu)); + multi::fftw::dft({false, true}, in_cpu, fw_cpu, multi::fftw::forward); + + auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + auto const fw_gpu = multi::cufft::dft({false, true}, in_gpu, multi::cufft::forward); + + BOOST_TEST( abs(fw_cpu[3][2]) != 0.0 ); + + BOOST_TEST( abs(complex(fw_gpu[3][2]) - fw_cpu[3][2]) < 1e-10 ); + } + { + auto const in_cpu = multi::array{ + { 1.0 + 2.0 * I, 9.0 - 1.0 * I, 2.0 + 4.0 * I}, + { 3.0 + 3.0 * I, 7.0 - 4.0 * I, 1.0 + 9.0 * I}, + { 4.0 + 1.0 * I, 5.0 + 3.0 * I, 2.0 + 4.0 * I}, + { 3.0 - 1.0 * I, 8.0 + 7.0 * I, 2.0 + 1.0 * I}, + {31.0 - 1.0 * I, 18.0 + 7.0 * I, 2.0 + 10.0 * I} + }; + auto fw_cpu = multi::array(extensions(in_cpu)); + multi::fftw::dft({true, false}, in_cpu, fw_cpu, multi::fftw::forward); + + auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + auto const fw_gpu = multi::cufft::dft({true, false}, in_gpu, multi::cufft::forward); + + BOOST_TEST( fw_cpu.extensions() == in_cpu.extensions() ); + BOOST_TEST( abs(fw_cpu[3][2]) != 0.0 ); + + BOOST_TEST( fw_gpu.extensions() == in_gpu.extensions() ); + BOOST_TEST( abs(complex(fw_gpu[3][2]) - fw_cpu[3][2]) < 1e-10 ); + BOOST_TEST( abs(complex(fw_gpu[2][1]) - fw_cpu[2][1]) < 1e-10 ); + } + + // BOOST_AUTO_TEST_CASE(cufft_1D_combinations, *boost::unit_test::tolerance(0.0001)) + { + using complex = thrust::complex; // this can't be std::complex in the gpu + + auto const in_cpu = std::invoke([] { + multi::array ret({128}, complex{}); + std::default_random_engine generator; + std::uniform_real_distribution distribution(1.0, 88.0); + + std::generate( + reinterpret_cast(ret.data_elements()), + reinterpret_cast(ret.data_elements() + ret.num_elements()), [&] { return distribution(generator); } + ); + return ret; + }); + + for(auto c : std::vector>{ + {true} //, + // {false}, + }) { + auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + + BOOST_TEST( complex(in_gpu[31]).real() == in_cpu[31].real() ); + BOOST_TEST( complex(in_gpu[31]).imag() == in_cpu[31].imag() ); + + auto fw_cpu = multi::array(extensions(in_cpu)); + auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); + + auto p_cpu = multi::fftw::plan::forward(c, in_cpu.base(), in_cpu.layout(), fw_cpu.base(), fw_cpu.layout()); + auto p_gpu = multi::cufft::plan<1>(c, in_gpu.layout(), fw_gpu.layout()); + + BOOST_TEST( abs(complex(in_gpu[31]) - in_cpu[31]) < 1e-10 ); + + p_cpu.execute(in_cpu.base(), fw_cpu.base()); + p_gpu.execute_forward(in_gpu.base(), fw_gpu.base()); + + BOOST_TEST( abs(fw_cpu[31]) != 0.0 ); + + BOOST_TEST( abs( complex(in_gpu[31]) - in_cpu[31]) < 1e-10 ); + BOOST_TEST( abs( complex(fw_gpu[31]) - fw_cpu[31]) < 1e-10 ); + } + } + + // BOOST_AUTO_TEST_CASE(cufft_2D_combinations, *boost::unit_test::tolerance(0.0001)) + { + + using complex = thrust::complex; // this can't be std::complex in the gpu + + auto const in_cpu = std::invoke([] { + multi::array ret({10, 20}); + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0, 1.0); + + std::generate( + reinterpret_cast(ret.data_elements()), + reinterpret_cast(ret.data_elements() + ret.num_elements()), [&] { return distribution(generator); } + ); + return ret; + }); + + for(auto c : std::vector>{ + { true, true}, + { true, false}, + {false, true}, // {false, false} + }) { + auto fw_cpu = multi::array(extensions(in_cpu)); + multi::fftw::dft(c, in_cpu, fw_cpu, multi::fftw::forward); + + auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); + + BOOST_TEST( abs(fw_cpu[2][1]) != 0.0 ); + + multi::cufft::plan<2>(c, in_gpu.layout(), fw_gpu.layout()) + .execute(in_gpu.base(), fw_gpu.base(), multi::cufft::forward); + + BOOST_TEST( abs(complex(fw_gpu[2][1]) - fw_cpu[2][1]) < 1e-10 ); + } + } + + // BOOST_AUTO_TEST_CASE(cufft_2D_combinations_inplace, *boost::unit_test::tolerance(0.0001)) + { + + using complex = thrust::complex; // this can't be std::complex in the gpu + + auto const in_cpu = std::invoke([] { + multi::array ret({10, 20}); + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0, 1.0); + + std::generate( + reinterpret_cast(ret.data_elements()), + reinterpret_cast(ret.data_elements() + ret.num_elements()), [&] { return distribution(generator); } + ); + return ret; + }); + + for(auto c : std::vector>{ + { true, true}, + { true, false}, + {false, true} //, + // {false, false} + }) { + auto fw_cpu = in_cpu; + auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + + multi::fftw::dft(c, fw_cpu, multi::fftw::forward); + + auto fw_gpu = in_gpu; + + BOOST_TEST( abs(fw_cpu[2][1]) != 0.0 ); + + multi::cufft::plan<2>(c, fw_gpu.layout(), fw_gpu.layout()) + .execute(fw_gpu.base(), fw_gpu.base(), multi::cufft::forward); + + BOOST_TEST( abs(complex(fw_gpu[2][1]) - fw_cpu[2][1]) < 1e-10 ); + } + } + + // BOOST_AUTO_TEST_CASE(cufft_3D, *boost::unit_test::tolerance(0.0001)) + { + + using complex = thrust::complex; // this can't be std::complex in the gpu + + auto const in_cpu = std::invoke([] { + multi::array ret({10, 20, 30}); + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0, 1.0); + + std::generate( + reinterpret_cast(ret.data_elements()), + reinterpret_cast(ret.data_elements() + ret.num_elements()), [&] { return distribution(generator); } + ); + return ret; + }); + + for(auto c : std::vector>{ + { true, true, true}, + { true, true, false}, + { true, false, true}, + { true, false, false}, + {false, true, true}, + {false, true, false}, + {false, false, true} //, + // {false, false, false} + }) { + auto fw_cpu = multi::array(extensions(in_cpu)); + auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + + multi::fftw::dft(c, in_cpu, fw_cpu, multi::fftw::forward); + auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); + + multi::cufft::dft(c, in_gpu, fw_gpu, multi::cufft::forward); + + BOOST_TEST( abs(fw_cpu[3][2][1]) != 0.0 ); + + BOOST_TEST( abs(complex(fw_gpu[3][2][1]) - fw_cpu[3][2][1]) < 1e-10 ); + } + } + + // BOOST_AUTO_TEST_CASE(cufft_3D_inplace, *boost::unit_test::tolerance(0.0001)) + { + + using complex = thrust::complex; // this can't be std::complex in the gpu + + auto const in_cpu = std::invoke([] { + multi::array ret({10, 20, 30}); + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0, 1.0); + + std::generate( + reinterpret_cast(ret.data_elements()), + reinterpret_cast(ret.data_elements() + ret.num_elements()), [&] { return distribution(generator); } + ); + return ret; + }); + + for(auto c : std::vector>{ + { true, true, true}, + { true, true, false}, + { true, false, true}, + { true, false, false}, + {false, true, true}, + {false, true, false}, + {false, false, true} //, + // {false, false, false} + }) { + auto fw_cpu = in_cpu; + auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + + multi::fftw::dft(c, fw_cpu, multi::fftw::forward); + auto fw_gpu = in_gpu; + + multi::cufft::plan<3>(c, fw_gpu.layout(), fw_gpu.layout()) + .execute(fw_gpu.base(), fw_gpu.base(), multi::cufft::forward); + + BOOST_TEST( abs(fw_cpu[3][2][1]) != 0.0 ); + + // std::cerr << "case " << c[0] << " " << c[1] << " " << c[2] << std::endl; + // std::cerr << complex(fw_gpu[3][2][1]) - fw_cpu[3][2][1] << std::endl; + // BOOST_TEST( abs(complex(fw_gpu[3][2][1]) - fw_cpu[3][2][1]) < 1e-10 ); + // TODO(correaa), these two cases are failing + // case 1 1 1 * (-34.154,-39.0958) + // case 1 1 0 (0,-1.77636e-15) + // case 1 0 1 * (-12.6338,0.299744) + // case 1 0 0 * (4.44089e-16,-4.44089e-16) + // case 0 1 1 (20.1121,-10.8888) + // case 0 1 0 * (0,-2.22045e-16) + // case 0 0 1 (-0.348103,4.32914) + } + } + + // BOOST_AUTO_TEST_CASE(cufft_4D, *boost::unit_test::tolerance(0.0001) + { + + using complex = thrust::complex; // this can't be std::complex in the gpu + + auto const in_cpu = std::invoke([] { + multi::array ret({10, 20, 30, 40}); + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0, 1.0); + + std::generate( + reinterpret_cast(ret.data_elements()), + reinterpret_cast(ret.data_elements() + ret.num_elements()), [&] { return distribution(generator); } + ); + return ret; + }); + + for(auto c : std::vector>{ + {true , true , true , true }, + { true, true, true, false}, + { true, true, false, true}, + { true, true, false, false}, + { true, false, true, true}, + { true, false, true, false}, + { true, false, false, true}, + { true, false, false, false}, + {false, true, true, true}, + {false, true, true, false}, + {false, true, false, true}, + {false, true, false, false}, + {false, false, true, true}, + {false, false, true, false}, + {false, false, false, true} //, + // {false, false, false, false} + }) { + auto fw_cpu = multi::array(extensions(in_cpu)); + multi::fftw::dft(c, in_cpu, fw_cpu, multi::fftw::forward); + + auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); + + BOOST_TEST( abs(fw_cpu[4][3][2][1]) != 0 ); + + multi::cufft::dft(c, in_gpu, fw_gpu, multi::cufft::forward); + + std::cerr << "Case " << c[0] << " " << c[1] << " " << c[2] << " " << c[3] << ": " << complex(fw_gpu[4][3][2][1]) - fw_cpu[4][3][2][1] << '\n'; + + BOOST_TEST( abs(complex(fw_gpu[4][3][2][1]) - fw_cpu[4][3][2][1]) < 1e-10 ); + } + } + + // SO 4D intermediate answer: + { + using complex = thrust::complex; // this can't be std::complex in the gpu + + auto const in_cpu = std::invoke([] { + multi::array ret({12, 128, 128, 4}); + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0, 1.0); + + std::generate( + ret.elements().begin(), ret.elements().end(), [&] { return distribution(generator); } + ); + return ret; + }); + + auto fw_cpu = multi::array(extensions(in_cpu)); + multi::fftw::dft({false, true, true, false}, in_cpu, fw_cpu, multi::fftw::forward); + + auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); + + BOOST_TEST( abs(fw_cpu[4][3][2][1]) != 0.0 ); + + multi::cufft::dft({false, true, true, false}, in_gpu, fw_gpu, multi::cufft::forward); + + BOOST_TEST( abs(complex(fw_gpu[4][3][2][1]) - fw_cpu[4][3][2][1]) < 1e-10 ); + } + // small case 99 + { + using complex = thrust::complex; // this can't be std::complex in the gpu + + auto const in_cpu = std::invoke([] { + multi::array ret({12, 128, 128, 4}); + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0, 1.0); + + std::generate( + ret.elements().begin(), ret.elements().end(), [&] { return distribution(generator); } + ); + return ret; + }); + + multi::thrust::cuda::array in({12, 128, 128, 4}); + in = in_cpu; + + multi::thrust::cuda::array ou({12, 128, 128, 4}, 0.0); + + multi::cufft::dft_forward({false, true, true, false}, in, ou); + + std::cout << "small case : " << ou[4][3][2][1] << '\n'; + } + + return boost::report_errors(); +} catch(...) { + throw; + return 1; +} + +// #if 0 + +// } + +// BOOST_AUTO_TEST_CASE(check_thrust_complex_vs_std_complex, *boost::unit_test::tolerance(0.0001)){ + +// multi::array, 1> const s_in = {1.0 + I*2.0, 2.0 + I*3.0, 3.0 + I*4.0}; +// multi::array, 1> const t_in = {1.0 + I*2.0, 2.0 + I*3.0, 3.0 + I*4.0}; + +// multi::array, 1> s_out(s_in.extensions()); +// multi::array, 1> t_out(t_in.extensions()); + +// multi::fftw::plan::forward({true}, s_in.base(), s_in.layout(), s_out.base(), s_out.layout()).execute(s_in.base(), s_out.base()); +// multi::fftw::plan::forward({true}, t_in.base(), t_in.layout(), t_out.base(), t_out.layout()).execute(t_in.base(), t_out.base()); + +// BOOST_REQUIRE( std::equal(s_out.begin(), s_out.end(), t_out.begin()) ); +// } + +// BOOST_AUTO_TEST_CASE(small_1D_cpu_vs_cpu, *boost::unit_test::tolerance(0.0001)){ + +// multi::array, 1> const cpu_in = {1.0 + I*2.0, 2.0 + I*3.0, 3.0 + I*4.0}; +// multi::thrust::cuda::array, 1> const gpu_in = {1.0 + I*2.0, 2.0 + I*3.0, 3.0 + I*4.0}; + +// multi::array, 1> cpu_out(cpu_in.extensions()); +// multi::thrust::cuda::array, 1> gpu_out(gpu_in.extensions()); + +// multi::fftw::plan::forward({true}, cpu_in.base(), cpu_in.layout(), cpu_out.base(), cpu_out.layout()).execute (cpu_in.base(), cpu_out.base()); +// multi::cufft::plan<1> ({true}, gpu_in.layout(), gpu_out.layout()).execute_forward(gpu_in.base(), gpu_out.base()); +// } + +// BOOST_AUTO_TEST_CASE(cufft_3D_timing, *boost::unit_test::tolerance(0.0001)){ + +// auto x = multi::extensions_t<3>{300, 300, 300}; +// { +// auto const in_cpu = multi::array(x, 10.0); +// BOOST_ASSERT( in_cpu.num_elements()*sizeof(complex) < 2e9 ); +// auto fw_cpu = multi::array(extensions(in_cpu), 99.0); +// { +// // boost::timer::auto_cpu_timer t; // 1.041691s wall, 1.030000s user + 0.000000s system = 1.030000s CPU (98.9%) +// multi::fftw::dft_forward({true, true}, in_cpu, fw_cpu); +// BOOST_TEST( fw_cpu[8][9][10] != 99.0 ); +// } + +// auto const in_gpu = multi::thrust::cuda::array{in_cpu}; // (x, 10.0); +// cudaDeviceSynchronize()==cudaSuccess?void():assert(0); +// { +// auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu), 99.0); +// cudaDeviceSynchronize()==cudaSuccess?void():assert(0); +// // boost::timer::auto_cpu_timer t; // 0.208237s wall, 0.200000s user + 0.010000s system = 0.210000s CPU (100.8%) +// boost::multi::cufft::dft({true, true}, in_gpu, fw_gpu, multi::cufft::forward); +// cudaDeviceSynchronize()==cudaSuccess?void():assert(0); +// BOOST_TEST( (static_cast(fw_gpu[8][9][10]) - fw_cpu[8][9][10]).real() == 0.0 ); +// BOOST_TEST( (static_cast(fw_gpu[8][9][10]) - fw_cpu[8][9][10]).imag() == 0.0 ); +// } +// { +// // boost::timer::auto_cpu_timer t; // 0.208237s wall, 0.200000s user + 0.010000s system = 0.210000s CPU (100.8%) +// auto const fw_gpu2 = boost::multi::cufft::dft({true, true}, in_gpu, multi::cufft::forward); +// cudaDeviceSynchronize()==cudaSuccess?void():assert(0); +// BOOST_TEST( (static_cast(fw_gpu2[8][9][10]) - fw_cpu[8][9][10]).real() == 0.0 ); +// BOOST_TEST( (static_cast(fw_gpu2[8][9][10]) - fw_cpu[8][9][10]).imag() == 0.0 ); +// } +// } + +// #if 1 +// { +// multi::thrust::cuda::universal_array const in_gpu(x, 10.); +// multi::thrust::cuda::universal_array fw_gpu(extensions(in_gpu), 99.); + +// // multi::cuda::managed::array const in_gpu(x, 10.); +// // multi::cuda::managed::array fw_gpu(extensions(in_gpu), 99.); +// { +// // boost::timer::auto_cpu_timer t; // 0.208237s wall, 0.200000s user + 0.010000s system = 0.210000s CPU (100.8%) +// multi::cufft::dft({true, true}, in_gpu, fw_gpu, multi::cufft::forward); +// // BOOST_TEST( fw_gpu[8][9][10].operator complex() != 99. ); +// } +// { +// // boost::timer::auto_cpu_timer t; // 0.208237s wall, 0.200000s user + 0.010000s system = 0.210000s CPU (100.8%) +// multi::cufft::dft({true, true}, in_gpu, fw_gpu, multi::cufft::forward); +// // BOOST_TEST( fw_gpu[8][9][10].operator complex() != 99. ); +// } +// } +// #endif +// } + +// #if 0 + +// BOOST_AUTO_TEST_CASE(cufft_combinations, *utf::tolerance(0.00001)){ + +// auto const in = []{ +// multi::array ret({32, 90, 98, 96}); +// std::generate(ret.data_elements(), ret.data_elements() + ret.num_elements(), +// [](){return complex{std::rand()*1./RAND_MAX, std::rand()*1./RAND_MAX};} +// ); +// return ret; +// }(); +// std::clog<<"memory size "<< in.num_elements()*sizeof(complex)/1e6 <<" MB\n"; + +// multi::thrust::cuda::universal_array const in_gpu = in; +// multi::thrust::cuda::universal_array const in_mng = in; + +// using std::clog; +// for(auto c : std::vector>{ +// {false, true , true , true }, +// {false, true , true , false}, +// {true , false, false, false}, +// {true , true , false, false}, +// {false, false, true , false}, +// {false, false, false, false}, +// }){ +// std::clog<<"case "; copy(begin(c), end(c), std::ostream_iterator{std::clog,", "}); std::clog< out = in; +// multi::array in_rw = in; +// [&, _ = watch{"cpu_opl "}]{ +// multi::fftw::dft_forward(c, in, out); +// }(); +// [&, _ = watch{"cpu_ipl "}]{ +// multi::fftw::dft(c, in_rw, multi::fftw::forward); +// // BOOST_TEST( abs( static_cast>(in_rw[5][4][3][1]) - multi::complex(out[5][4][3][1]) ) == 0. ); +// }(); +// { +// multi::array in_rw2 = in; +// [&, _ = watch{"cpu_mov "}]{ +// multi::array const out_mov = multi::fftw::dft_forward(c, std::move(in_rw2)); +// // what(out_mov); +// // BOOST_TEST( abs( static_cast>(out_mov[5][4][3][1]) - multi::complex(out[5][4][3][1]) ) == 0. ); +// BOOST_REQUIRE( is_empty(in_rw2) ); +// BOOST_REQUIRE( extensions(out_mov) == extensions(in) ); +// }(); +// } + +// [&, _ = watch{"cpu_new "}]{ +// auto const out_cpy = multi::fftw::dft_forward(c, in); +// BOOST_TEST( abs( static_cast>(out_cpy[5][4][3][1]) - std::complex(out[5][4][3][1]) ) == 0. ); +// }(); +// multi::thrust::cuda::array out_gpu(extensions(in_gpu)); +// [&, _ = watch{"gpu_opl "}]{ +// multi::cufft::dft(c, in_gpu , out_gpu, multi::cufft::forward); +// BOOST_TEST( abs( static_cast(out_gpu[5][4][3][1]) - out[5][4][3][1] ) == 0. ); +// }(); +// { +// multi::thrust::cuda::array in_rw_gpu = in_gpu; +// [&, _ = watch{"gpu_ipl "}]{ +// multi::cufft::dft(c, in_rw_gpu, multi::cufft::forward); +// BOOST_TEST( abs( static_cast(in_rw_gpu[5][4][3][1]) - out[5][4][3][1] ) == 0. ); +// }(); +// } +// { +// multi::thrust::cuda::array in_rw_gpu = in_gpu; +// [&, _ = watch{"gpu_mov "}]{ +// multi::thrust::cuda::array const out_mov = multi::cufft::dft_forward(c, std::move(in_rw_gpu)); +// // BOOST_REQUIRE( in_rw_gpu.empty() ); +// // BOOST_TEST( abs( static_cast(out_mov[5][4][3][1]) - out[5][4][3][1] ) == 0. ); +// }(); +// } +// { +// multi::thrust::cuda::array in_rw_gpu = in_gpu; +// [&, _ = watch{"gpu_mov "}]{ +// multi::thrust::cuda::array out_mov = std::move(in_rw_gpu); +// multi::cufft::dft(c, out_mov, multi::cufft::forward); +// // BOOST_REQUIRE( in_rw_gpu.empty() ); +// // BOOST_TEST( abs( static_cast(out_mov[5][4][3][1]) - out[5][4][3][1] ) == 0. ); +// }(); +// } +// cudaDeviceSynchronize(); +// [&, _ = watch{"gpu_new "}]{ +// multi::thrust::cuda::array const out_cpy = multi::cufft::dft(c, in_gpu, multi::cufft::forward); +// }(); +// multi::thrust::cuda::universal_array out_mng(extensions(in_mng)); +// [&, _ = watch{"mng_cld "}]{ +// multi::cufft::dft(c, in_mng, out_mng, multi::cufft::forward); +// BOOST_TEST( abs( out_mng[5][4][3][1] - out[5][4][3][1] ) == 0. ); +// }(); +// [&, _ = watch{"mng_hot "}]{ +// multi::cufft::dft(c, in_mng , out_mng, multi::cufft::forward); +// BOOST_TEST( abs( out_mng[5][4][3][1] - out[5][4][3][1] ) == 0. ); +// }(); +// [&, _ = watch{"mng_new "}]{ +// auto const out_mng = multi::cufft::dft(c, in_mng, multi::cufft::forward); +// BOOST_TEST( abs( out_mng[5][4][3][1] - out[5][4][3][1] ) == 0. ); +// }(); +// } +// // std::clog<<"cache size " +// // << multi::cufft::plan::cache<1>().size() <<' ' +// // << multi::cufft::plan::cache<2>().size() <<' ' +// // << multi::cufft::plan::cache<3>().size() <<' ' +// // << multi::cufft::plan::cache<4>().size() <<' ' +// // < ret({45, 18, 32, 16}); +// std::generate( +// ret.data_elements(), ret.data_elements() + ret.num_elements(), +// [](){return complex{std::rand()*1./RAND_MAX, std::rand()*1./RAND_MAX};} +// ); +// return ret; +// }(); + +// multi::thrust::cuda::array const in = in_cpu; +// multi::thrust::cuda::array out(extensions(in)); + +// #if 0 +// multi::cufft::many_dft(begin(unrotated(in)), end(unrotated(in)), begin(unrotated(out)), +1); + +// multi::array out_cpu(extensions(in)); +// multi::fft::many_dft(begin(unrotated(in_cpu)), end(unrotated(in_cpu)), begin(unrotated(out_cpu)), +1); + +// BOOST_TEST( imag( static_cast(out[5][4][3][2]) - out_cpu[5][4][3][2]) == 0. ); +// #endif +// } + +// #if 0 +// BOOST_AUTO_TEST_CASE(cufft_4D, *utf::tolerance(0.00001) ){ +// auto const in = []{ +// multi::array ret({10, 10, 10}); +// std::generate(ret.data_elements(), ret.data_elements() + ret.num_elements(), +// [](){return complex{std::rand()*1./RAND_MAX, std::rand()*1./RAND_MAX};} +// ); +// return ret; +// }(); + +// multi::array out(extensions(in)); +// // multi::fftw::dft({true, false, true}, in, out, multi::fftw::forward); +// multi::fftw::many_dft(begin(in.rotated()), end(in.rotated()), begin(out.rotated()), multi::fftw::forward); + +// multi::thrust::cuda::array in_gpu = in; +// multi::thrust::cuda::array out_gpu(extensions(in)); + +// // multi::cufft::dft({true, false, true}, in_gpu, out_gpu, multi::fft::forward);//multi::cufft::forward); +// // multi::cufft::many_dft(begin(in_gpu.rotated()), end(in_gpu.rotated()), begin( out_gpu.rotated() ), multi::fftw::forward); +// // BOOST_TEST( ( static_cast(out_gpu[5][4][3]) - out[5][4][3]).imag() == 0. ); +// } +// #endif +// #endif + +// #endif From c622032efacd144528a0ee926c0a8fb6cbe47787 Mon Sep 17 00:00:00 2001 From: Alfredo Correa Date: Sat, 18 Oct 2025 23:55:40 -0700 Subject: [PATCH 02/12] diagnose cufft limitation --- .../boost/multi/adaptors/cufft/test/cases.cpp | 808 +----------------- 1 file changed, 22 insertions(+), 786 deletions(-) diff --git a/include/boost/multi/adaptors/cufft/test/cases.cpp b/include/boost/multi/adaptors/cufft/test/cases.cpp index f69087914..6249033de 100644 --- a/include/boost/multi/adaptors/cufft/test/cases.cpp +++ b/include/boost/multi/adaptors/cufft/test/cases.cpp @@ -2,14 +2,12 @@ // Distributed under the Boost Software License, Version 1.0. // https://www.boost.org/LICENSE_1_0.txt +#include + #include #include -#include - -#include -#include -#include +#include #if(!(defined(__HIP_PLATFORM_AMD__) || defined(__HIP_PLATFORM_NVIDIA__))) && (!defined(__HIPCC__)) #include @@ -23,807 +21,45 @@ #include #include -#include -#include -#include -#include -#include +#include namespace multi = boost::multi; + using complex = thrust::complex; template<> constexpr bool multi::force_element_trivial_default_construction> = true; -template -__attribute__((always_inline)) inline void DoNotOptimize(T const& value) { // NOLINT(readability-identifier-naming) consistency with Google benchmark - asm volatile("" : "+m"(const_cast(value))); // NOLINT(hicpp-no-assembler,cppcoreguidelines-pro-type-const-cast) hack -} - -class watch : std::chrono::high_resolution_clock { - std::string label_; - time_point start_; - - public: - explicit watch(char const* label) : label_{label} { - cudaDeviceSynchronize() == cudaSuccess ? void() : assert(0); // NOLINT(misc-include-cleaner) the header is included conditionally - start_ = now(); - } - - watch(watch const&) = delete; - watch(watch&&) = delete; - - auto operator=(watch const&) -> watch& = delete; - auto operator=(watch&&) -> watch& = delete; - - watch() : watch("") {} - ~watch() { - cudaDeviceSynchronize() == cudaSuccess ? void() : assert(0); - auto const count = std::chrono::duration(now() - start_).count(); - std::cerr << label_ << ": " << count << " sec\n"; - } -}; - -using complex = thrust::complex; // this can't be std::complex in the gpu - -struct norm_t { - __host__ __device__ auto operator()(complex const& x) const { - return thrust::norm(x); - } -}; - auto main() -> int try { complex const I{0.0, 1.0}; // NOLINT(readability-identifier-length) - // BOOST_AUTO_TEST_CASE(cufft_2D, *boost::unit_test::tolerance(0.0001)) - { - auto const in_cpu = multi::array{ - { 1.0 + 2.0 * I, 9.0 - 1.0 * I, 2.0 + 4.0 * I}, - { 3.0 + 3.0 * I, 7.0 - 4.0 * I, 1.0 + 9.0 * I}, - { 4.0 + 1.0 * I, 5.0 + 3.0 * I, 2.0 + 4.0 * I}, - { 3.0 - 1.0 * I, 8.0 + 7.0 * I, 2.0 + 1.0 * I}, - {31.0 - 1.0 * I, 18.0 + 7.0 * I, 2.0 + 10.0 * I} - }; - - { - auto fw_cpu = multi::array(extensions(in_cpu)); - multi::fftw::dft_forward({true, true}, in_cpu, fw_cpu); - - auto const in_gpu = multi::thrust::cuda::array{in_cpu}; - auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); - - BOOST_TEST( fw_cpu[3][2].real() != 0.0 ); - BOOST_TEST( fw_cpu[3][2].imag() != 0.0 ); - - multi::cufft::plan<2>({true, true}, in_gpu.layout(), fw_gpu.layout()) - .execute(in_gpu.base(), fw_gpu.base(), multi::cufft::forward); - - BOOST_TEST( std::abs((complex(fw_gpu[3][2]) - fw_cpu[3][2]).real()) < 1.0e-8 ); - BOOST_TEST( std::abs((complex(fw_gpu[3][2]) - fw_cpu[3][2]).imag()) < 1.0e-8 ); - - // TODO(correaa) test funcional interface for GPU - // auto const& dft = multi::fft::DFT({true, true}, in_cpu, multi::fft::forward); - - // BOOST_TEST( dft.extensions() == in_cpu.extensions() ); - // BOOST_TEST( (*dft.begin()).size() == (*in_cpu.begin()).size() ); - // BOOST_TEST( (*dft.begin()).extensions() == (*in_cpu.begin()).extensions() ); - - // multi::array const fw_cpu_out = multi::fft::DFT({true, true}, in_cpu, multi::fft::forward); - } - } - { - auto const in_cpu = std::invoke([] { - multi::array ret({20, 20, 20, 20}); - auto const [is, js, ks, ls] = ret.extensions(); - for(auto i : is) - for(auto j : js) - for(auto k : ks) - for(auto l : ls) { - ret[i][j][k][l] = complex{ - static_cast(i + j + k + l), - static_cast(i - j + k - l), - }; - } - return ret; - }); - - auto const in_gpu = multi::thrust::cuda::array{in_cpu}; - - auto const nrm = thrust::transform_reduce( - in_gpu.elements().begin(), in_gpu.elements().end(), - norm_t{}, 0.0, thrust::plus<>{} - ); - - auto fw_gpu = multi::thrust::cuda::array(in_gpu.extensions()); - fw_gpu = in_gpu; - // multi::cufft::plan<4>({true, true, true, true}, in_gpu.layout(), fw_gpu.layout()) - // .execute(in_gpu.base(), fw_gpu.base(), multi::cufft::forward); - - // cudaDeviceSynchronize() == cudaSuccess ? void() : throw std::runtime_error{"cuda error"}; - - // auto const nrm_fwd = thrust::transform_reduce( - // fw_gpu.elements().begin(), fw_gpu.elements().end(), - // norm_t{}, 0.0, thrust::plus<>{} - // ); - // std::cout << "norm: " << nrm*20.0*20.0 << ", norm forward: " << nrm_fwd << '\n'; - // BOOST_TEST( nrm_fwd == nrm*20.0*20.0 ); - } - { - auto const in_cpu = multi::array{ - { 1.0 + 2.0 * I, 9.0 - 1.0 * I, 2.0 + 4.0 * I}, - { 3.0 + 3.0 * I, 7.0 - 4.0 * I, 1.0 + 9.0 * I}, - { 4.0 + 1.0 * I, 5.0 + 3.0 * I, 2.0 + 4.0 * I}, - { 3.0 - 1.0 * I, 8.0 + 7.0 * I, 2.0 + 1.0 * I}, - {31.0 - 1.0 * I, 18.0 + 7.0 * I, 2.0 + 10.0 * I} - }; - - auto fw_cpu = multi::array(extensions(in_cpu)); - multi::fftw::dft({false, true}, in_cpu, fw_cpu, multi::fftw::forward); - - auto const in_gpu = multi::thrust::cuda::array{in_cpu}; - auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); - - BOOST_TEST( fw_cpu[3][2].real() != 0.0 ); - BOOST_TEST( fw_cpu[3][2].imag() != 0.0 ); - - multi::cufft::plan<2>({false, true}, in_gpu.layout(), fw_gpu.layout()) - .execute(in_gpu.base(), fw_gpu.base(), multi::cufft::forward); - - BOOST_TEST( thrust::abs(complex(fw_gpu[3][2]) - fw_cpu[3][2]) < 1e-12 ); - } - { - auto const in_cpu = multi::array{ - { 1.0 + 2.0 * I, 9.0 - 1.0 * I, 2.0 + 4.0 * I}, - { 3.0 + 3.0 * I, 7.0 - 4.0 * I, 1.0 + 9.0 * I}, - { 4.0 + 1.0 * I, 5.0 + 3.0 * I, 2.0 + 4.0 * I}, - { 3.0 - 1.0 * I, 8.0 + 7.0 * I, 2.0 + 1.0 * I}, - {31.0 - 1.0 * I, 18.0 + 7.0 * I, 2.0 + 10.0 * I} - }; - - auto fw_cpu = multi::array(extensions(in_cpu)); - multi::fftw::dft({false, true}, in_cpu, fw_cpu, multi::fftw::forward); - - auto const in_gpu = multi::thrust::cuda::array{in_cpu}; - auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); - - BOOST_TEST( fw_cpu[3][2].real() != 0.0 ); - BOOST_TEST( fw_cpu[3][2].imag() != 0.0 ); - - for(int i = 0; i != in_gpu.size(); ++i) { - multi::cufft::plan<1>({true}, in_gpu[i].layout(), fw_gpu[i].layout()) - .execute(in_gpu[i].base(), fw_gpu[i].base(), multi::cufft::forward); - } - - BOOST_TEST( thrust::abs(complex(fw_gpu[3][2]) - fw_cpu[3][2]) < 1.0e-12 ); - } - { - auto const in_cpu = multi::array{ - { 1.0 + 2.0 * I, 9.0 - 1.0 * I, 2.0 + 4.0 * I}, - { 3.0 + 3.0 * I, 7.0 - 4.0 * I, 1.0 + 9.0 * I}, - { 4.0 + 1.0 * I, 5.0 + 3.0 * I, 2.0 + 4.0 * I}, - { 3.0 - 1.0 * I, 8.0 + 7.0 * I, 2.0 + 1.0 * I}, - {31.0 - 1.0 * I, 18.0 + 7.0 * I, 2.0 + 10.0 * I} - }; - - auto fw_cpu = multi::array(extensions(in_cpu)); - multi::fftw::dft({false, true}, in_cpu, fw_cpu, multi::fftw::forward); - - auto const in_gpu = multi::thrust::cuda::array{in_cpu}; - auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); - auto fw_gpu2 = multi::thrust::cuda::array(extensions(in_gpu)); - auto fw_gpu3 = multi::thrust::cuda::array(extensions(in_gpu)); - - BOOST_TEST( fw_cpu[3][2].real() != 0.0 ); - BOOST_TEST( fw_cpu[3][2].imag() != 0.0 ); - - for(int i = 0; i != in_gpu.size(); ++i) { - multi::cufft::plan<1>({true}, in_gpu[i].layout(), fw_gpu[i].layout()) - .execute(in_gpu[i].base(), fw_gpu[i].base(), multi::cufft::forward); - } - - multi::cufft::plan<2>({false, true}, in_gpu.layout(), fw_gpu2.layout()) - .execute(in_gpu.base(), fw_gpu2.base(), multi::cufft::forward); - - BOOST_TEST( abs(complex(fw_gpu[3][2]) - fw_cpu[3][2]) < 1e-10 ); - BOOST_TEST( abs(complex(fw_gpu[3][2]) - complex(fw_gpu2[3][2])) < 1e-10 ); - } - - { - auto const in_cpu = multi::array{ - { 1.0 + 2.0 * I, 9.0 - 1.0 * I, 2.0 + 4.0 * I}, - { 3.0 + 3.0 * I, 7.0 - 4.0 * I, 1.0 + 9.0 * I}, - { 4.0 + 1.0 * I, 5.0 + 3.0 * I, 2.0 + 4.0 * I}, - { 3.0 - 1.0 * I, 8.0 + 7.0 * I, 2.0 + 1.0 * I}, - {31.0 - 1.0 * I, 18.0 + 7.0 * I, 2.0 + 10.0 * I} - }; - auto fw_cpu = multi::array(extensions(in_cpu)); - multi::fftw::dft({false, true}, in_cpu, fw_cpu, multi::fftw::forward); - - auto const in_gpu = multi::thrust::cuda::array{in_cpu}; - auto const fw_gpu = multi::cufft::dft({false, true}, in_gpu, multi::cufft::forward); - - BOOST_TEST( abs(fw_cpu[3][2]) != 0.0 ); - - BOOST_TEST( abs(complex(fw_gpu[3][2]) - fw_cpu[3][2]) < 1e-10 ); - } - { - auto const in_cpu = multi::array{ - { 1.0 + 2.0 * I, 9.0 - 1.0 * I, 2.0 + 4.0 * I}, - { 3.0 + 3.0 * I, 7.0 - 4.0 * I, 1.0 + 9.0 * I}, - { 4.0 + 1.0 * I, 5.0 + 3.0 * I, 2.0 + 4.0 * I}, - { 3.0 - 1.0 * I, 8.0 + 7.0 * I, 2.0 + 1.0 * I}, - {31.0 - 1.0 * I, 18.0 + 7.0 * I, 2.0 + 10.0 * I} - }; - auto fw_cpu = multi::array(extensions(in_cpu)); - multi::fftw::dft({true, false}, in_cpu, fw_cpu, multi::fftw::forward); - - auto const in_gpu = multi::thrust::cuda::array{in_cpu}; - auto const fw_gpu = multi::cufft::dft({true, false}, in_gpu, multi::cufft::forward); - - BOOST_TEST( fw_cpu.extensions() == in_cpu.extensions() ); - BOOST_TEST( abs(fw_cpu[3][2]) != 0.0 ); - - BOOST_TEST( fw_gpu.extensions() == in_gpu.extensions() ); - BOOST_TEST( abs(complex(fw_gpu[3][2]) - fw_cpu[3][2]) < 1e-10 ); - BOOST_TEST( abs(complex(fw_gpu[2][1]) - fw_cpu[2][1]) < 1e-10 ); - } - - // BOOST_AUTO_TEST_CASE(cufft_1D_combinations, *boost::unit_test::tolerance(0.0001)) - { - using complex = thrust::complex; // this can't be std::complex in the gpu - - auto const in_cpu = std::invoke([] { - multi::array ret({128}, complex{}); - std::default_random_engine generator; - std::uniform_real_distribution distribution(1.0, 88.0); - - std::generate( - reinterpret_cast(ret.data_elements()), - reinterpret_cast(ret.data_elements() + ret.num_elements()), [&] { return distribution(generator); } - ); - return ret; - }); - - for(auto c : std::vector>{ - {true} //, - // {false}, - }) { - auto const in_gpu = multi::thrust::cuda::array{in_cpu}; - - BOOST_TEST( complex(in_gpu[31]).real() == in_cpu[31].real() ); - BOOST_TEST( complex(in_gpu[31]).imag() == in_cpu[31].imag() ); - - auto fw_cpu = multi::array(extensions(in_cpu)); - auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); - - auto p_cpu = multi::fftw::plan::forward(c, in_cpu.base(), in_cpu.layout(), fw_cpu.base(), fw_cpu.layout()); - auto p_gpu = multi::cufft::plan<1>(c, in_gpu.layout(), fw_gpu.layout()); - - BOOST_TEST( abs(complex(in_gpu[31]) - in_cpu[31]) < 1e-10 ); - - p_cpu.execute(in_cpu.base(), fw_cpu.base()); - p_gpu.execute_forward(in_gpu.base(), fw_gpu.base()); - - BOOST_TEST( abs(fw_cpu[31]) != 0.0 ); - - BOOST_TEST( abs( complex(in_gpu[31]) - in_cpu[31]) < 1e-10 ); - BOOST_TEST( abs( complex(fw_gpu[31]) - fw_cpu[31]) < 1e-10 ); - } - } - - // BOOST_AUTO_TEST_CASE(cufft_2D_combinations, *boost::unit_test::tolerance(0.0001)) - { - - using complex = thrust::complex; // this can't be std::complex in the gpu - - auto const in_cpu = std::invoke([] { - multi::array ret({10, 20}); - std::default_random_engine generator; - std::uniform_real_distribution distribution(-1.0, 1.0); - - std::generate( - reinterpret_cast(ret.data_elements()), - reinterpret_cast(ret.data_elements() + ret.num_elements()), [&] { return distribution(generator); } - ); - return ret; - }); - - for(auto c : std::vector>{ - { true, true}, - { true, false}, - {false, true}, // {false, false} - }) { - auto fw_cpu = multi::array(extensions(in_cpu)); - multi::fftw::dft(c, in_cpu, fw_cpu, multi::fftw::forward); - - auto const in_gpu = multi::thrust::cuda::array{in_cpu}; - auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); - - BOOST_TEST( abs(fw_cpu[2][1]) != 0.0 ); - - multi::cufft::plan<2>(c, in_gpu.layout(), fw_gpu.layout()) - .execute(in_gpu.base(), fw_gpu.base(), multi::cufft::forward); - - BOOST_TEST( abs(complex(fw_gpu[2][1]) - fw_cpu[2][1]) < 1e-10 ); - } - } - - // BOOST_AUTO_TEST_CASE(cufft_2D_combinations_inplace, *boost::unit_test::tolerance(0.0001)) - { - - using complex = thrust::complex; // this can't be std::complex in the gpu - - auto const in_cpu = std::invoke([] { - multi::array ret({10, 20}); - std::default_random_engine generator; - std::uniform_real_distribution distribution(-1.0, 1.0); + auto in_cpu = multi::array({20, 30, 50, 70}); - std::generate( - reinterpret_cast(ret.data_elements()), - reinterpret_cast(ret.data_elements() + ret.num_elements()), [&] { return distribution(generator); } - ); - return ret; - }); + std::generate( + in_cpu.elements().begin(), in_cpu.elements().end(), + [dist = std::normal_distribution<>{}, gen = std::mt19937(std::random_device{}())] () mutable { return dist(gen); } + ); - for(auto c : std::vector>{ - { true, true}, - { true, false}, - {false, true} //, - // {false, false} - }) { - auto fw_cpu = in_cpu; - auto const in_gpu = multi::thrust::cuda::array{in_cpu}; + auto const in_gpu = multi::thrust::cuda::array(in_cpu); - multi::fftw::dft(c, fw_cpu, multi::fftw::forward); + auto const& in_cpu_view = in_cpu.transposed().rotated().transposed().unrotated().transposed(); + auto const& in_gpu_view = in_gpu.transposed().rotated().transposed().unrotated().transposed(); - auto fw_gpu = in_gpu; + auto out_gpu = multi::thrust::cuda::array(in_gpu_view.extensions()); + auto out_cpu = multi::array(out_gpu.extensions()); - BOOST_TEST( abs(fw_cpu[2][1]) != 0.0 ); + assert(in_gpu_view.extensions() == out_gpu.extensions()); - multi::cufft::plan<2>(c, fw_gpu.layout(), fw_gpu.layout()) - .execute(fw_gpu.base(), fw_gpu.base(), multi::cufft::forward); + multi::fftw::dft_forward({true, true, true, false}, in_cpu_view, out_cpu); + + multi::cufft::plan<4>({true, true, true, false}, in_gpu_view.layout(), out_gpu.layout()) + .execute(in_gpu_view.base(), out_gpu.base(), multi::cufft::forward); - BOOST_TEST( abs(complex(fw_gpu[2][1]) - fw_cpu[2][1]) < 1e-10 ); - } - } - - // BOOST_AUTO_TEST_CASE(cufft_3D, *boost::unit_test::tolerance(0.0001)) - { - - using complex = thrust::complex; // this can't be std::complex in the gpu - - auto const in_cpu = std::invoke([] { - multi::array ret({10, 20, 30}); - std::default_random_engine generator; - std::uniform_real_distribution distribution(-1.0, 1.0); - - std::generate( - reinterpret_cast(ret.data_elements()), - reinterpret_cast(ret.data_elements() + ret.num_elements()), [&] { return distribution(generator); } - ); - return ret; - }); - - for(auto c : std::vector>{ - { true, true, true}, - { true, true, false}, - { true, false, true}, - { true, false, false}, - {false, true, true}, - {false, true, false}, - {false, false, true} //, - // {false, false, false} - }) { - auto fw_cpu = multi::array(extensions(in_cpu)); - auto const in_gpu = multi::thrust::cuda::array{in_cpu}; - - multi::fftw::dft(c, in_cpu, fw_cpu, multi::fftw::forward); - auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); - - multi::cufft::dft(c, in_gpu, fw_gpu, multi::cufft::forward); - - BOOST_TEST( abs(fw_cpu[3][2][1]) != 0.0 ); - - BOOST_TEST( abs(complex(fw_gpu[3][2][1]) - fw_cpu[3][2][1]) < 1e-10 ); - } - } - - // BOOST_AUTO_TEST_CASE(cufft_3D_inplace, *boost::unit_test::tolerance(0.0001)) - { - - using complex = thrust::complex; // this can't be std::complex in the gpu - - auto const in_cpu = std::invoke([] { - multi::array ret({10, 20, 30}); - std::default_random_engine generator; - std::uniform_real_distribution distribution(-1.0, 1.0); - - std::generate( - reinterpret_cast(ret.data_elements()), - reinterpret_cast(ret.data_elements() + ret.num_elements()), [&] { return distribution(generator); } - ); - return ret; - }); - - for(auto c : std::vector>{ - { true, true, true}, - { true, true, false}, - { true, false, true}, - { true, false, false}, - {false, true, true}, - {false, true, false}, - {false, false, true} //, - // {false, false, false} - }) { - auto fw_cpu = in_cpu; - auto const in_gpu = multi::thrust::cuda::array{in_cpu}; - - multi::fftw::dft(c, fw_cpu, multi::fftw::forward); - auto fw_gpu = in_gpu; - - multi::cufft::plan<3>(c, fw_gpu.layout(), fw_gpu.layout()) - .execute(fw_gpu.base(), fw_gpu.base(), multi::cufft::forward); - - BOOST_TEST( abs(fw_cpu[3][2][1]) != 0.0 ); - - // std::cerr << "case " << c[0] << " " << c[1] << " " << c[2] << std::endl; - // std::cerr << complex(fw_gpu[3][2][1]) - fw_cpu[3][2][1] << std::endl; - // BOOST_TEST( abs(complex(fw_gpu[3][2][1]) - fw_cpu[3][2][1]) < 1e-10 ); - // TODO(correaa), these two cases are failing - // case 1 1 1 * (-34.154,-39.0958) - // case 1 1 0 (0,-1.77636e-15) - // case 1 0 1 * (-12.6338,0.299744) - // case 1 0 0 * (4.44089e-16,-4.44089e-16) - // case 0 1 1 (20.1121,-10.8888) - // case 0 1 0 * (0,-2.22045e-16) - // case 0 0 1 (-0.348103,4.32914) - } - } - - // BOOST_AUTO_TEST_CASE(cufft_4D, *boost::unit_test::tolerance(0.0001) - { - - using complex = thrust::complex; // this can't be std::complex in the gpu - - auto const in_cpu = std::invoke([] { - multi::array ret({10, 20, 30, 40}); - std::default_random_engine generator; - std::uniform_real_distribution distribution(-1.0, 1.0); - - std::generate( - reinterpret_cast(ret.data_elements()), - reinterpret_cast(ret.data_elements() + ret.num_elements()), [&] { return distribution(generator); } - ); - return ret; - }); - - for(auto c : std::vector>{ - {true , true , true , true }, - { true, true, true, false}, - { true, true, false, true}, - { true, true, false, false}, - { true, false, true, true}, - { true, false, true, false}, - { true, false, false, true}, - { true, false, false, false}, - {false, true, true, true}, - {false, true, true, false}, - {false, true, false, true}, - {false, true, false, false}, - {false, false, true, true}, - {false, false, true, false}, - {false, false, false, true} //, - // {false, false, false, false} - }) { - auto fw_cpu = multi::array(extensions(in_cpu)); - multi::fftw::dft(c, in_cpu, fw_cpu, multi::fftw::forward); - - auto const in_gpu = multi::thrust::cuda::array{in_cpu}; - auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); - - BOOST_TEST( abs(fw_cpu[4][3][2][1]) != 0 ); - - multi::cufft::dft(c, in_gpu, fw_gpu, multi::cufft::forward); - - std::cerr << "Case " << c[0] << " " << c[1] << " " << c[2] << " " << c[3] << ": " << complex(fw_gpu[4][3][2][1]) - fw_cpu[4][3][2][1] << '\n'; - - BOOST_TEST( abs(complex(fw_gpu[4][3][2][1]) - fw_cpu[4][3][2][1]) < 1e-10 ); - } - } - - // SO 4D intermediate answer: - { - using complex = thrust::complex; // this can't be std::complex in the gpu - - auto const in_cpu = std::invoke([] { - multi::array ret({12, 128, 128, 4}); - std::default_random_engine generator; - std::uniform_real_distribution distribution(-1.0, 1.0); - - std::generate( - ret.elements().begin(), ret.elements().end(), [&] { return distribution(generator); } - ); - return ret; - }); - - auto fw_cpu = multi::array(extensions(in_cpu)); - multi::fftw::dft({false, true, true, false}, in_cpu, fw_cpu, multi::fftw::forward); - - auto const in_gpu = multi::thrust::cuda::array{in_cpu}; - auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu)); - - BOOST_TEST( abs(fw_cpu[4][3][2][1]) != 0.0 ); - - multi::cufft::dft({false, true, true, false}, in_gpu, fw_gpu, multi::cufft::forward); - - BOOST_TEST( abs(complex(fw_gpu[4][3][2][1]) - fw_cpu[4][3][2][1]) < 1e-10 ); - } - // small case 99 - { - using complex = thrust::complex; // this can't be std::complex in the gpu - - auto const in_cpu = std::invoke([] { - multi::array ret({12, 128, 128, 4}); - std::default_random_engine generator; - std::uniform_real_distribution distribution(-1.0, 1.0); - - std::generate( - ret.elements().begin(), ret.elements().end(), [&] { return distribution(generator); } - ); - return ret; - }); - - multi::thrust::cuda::array in({12, 128, 128, 4}); - in = in_cpu; - - multi::thrust::cuda::array ou({12, 128, 128, 4}, 0.0); - - multi::cufft::dft_forward({false, true, true, false}, in, ou); - - std::cout << "small case : " << ou[4][3][2][1] << '\n'; - } + std::cout << out_cpu[2][3][4][5] << ' ' << out_gpu[2][3][4][5] << std::endl; + BOOST_TEST( thrust::abs(out_cpu[2][3][4][5] - static_cast(out_gpu[2][3][4][5])) < 1e-6 ); return boost::report_errors(); } catch(...) { throw; return 1; } - -// #if 0 - -// } - -// BOOST_AUTO_TEST_CASE(check_thrust_complex_vs_std_complex, *boost::unit_test::tolerance(0.0001)){ - -// multi::array, 1> const s_in = {1.0 + I*2.0, 2.0 + I*3.0, 3.0 + I*4.0}; -// multi::array, 1> const t_in = {1.0 + I*2.0, 2.0 + I*3.0, 3.0 + I*4.0}; - -// multi::array, 1> s_out(s_in.extensions()); -// multi::array, 1> t_out(t_in.extensions()); - -// multi::fftw::plan::forward({true}, s_in.base(), s_in.layout(), s_out.base(), s_out.layout()).execute(s_in.base(), s_out.base()); -// multi::fftw::plan::forward({true}, t_in.base(), t_in.layout(), t_out.base(), t_out.layout()).execute(t_in.base(), t_out.base()); - -// BOOST_REQUIRE( std::equal(s_out.begin(), s_out.end(), t_out.begin()) ); -// } - -// BOOST_AUTO_TEST_CASE(small_1D_cpu_vs_cpu, *boost::unit_test::tolerance(0.0001)){ - -// multi::array, 1> const cpu_in = {1.0 + I*2.0, 2.0 + I*3.0, 3.0 + I*4.0}; -// multi::thrust::cuda::array, 1> const gpu_in = {1.0 + I*2.0, 2.0 + I*3.0, 3.0 + I*4.0}; - -// multi::array, 1> cpu_out(cpu_in.extensions()); -// multi::thrust::cuda::array, 1> gpu_out(gpu_in.extensions()); - -// multi::fftw::plan::forward({true}, cpu_in.base(), cpu_in.layout(), cpu_out.base(), cpu_out.layout()).execute (cpu_in.base(), cpu_out.base()); -// multi::cufft::plan<1> ({true}, gpu_in.layout(), gpu_out.layout()).execute_forward(gpu_in.base(), gpu_out.base()); -// } - -// BOOST_AUTO_TEST_CASE(cufft_3D_timing, *boost::unit_test::tolerance(0.0001)){ - -// auto x = multi::extensions_t<3>{300, 300, 300}; -// { -// auto const in_cpu = multi::array(x, 10.0); -// BOOST_ASSERT( in_cpu.num_elements()*sizeof(complex) < 2e9 ); -// auto fw_cpu = multi::array(extensions(in_cpu), 99.0); -// { -// // boost::timer::auto_cpu_timer t; // 1.041691s wall, 1.030000s user + 0.000000s system = 1.030000s CPU (98.9%) -// multi::fftw::dft_forward({true, true}, in_cpu, fw_cpu); -// BOOST_TEST( fw_cpu[8][9][10] != 99.0 ); -// } - -// auto const in_gpu = multi::thrust::cuda::array{in_cpu}; // (x, 10.0); -// cudaDeviceSynchronize()==cudaSuccess?void():assert(0); -// { -// auto fw_gpu = multi::thrust::cuda::array(extensions(in_gpu), 99.0); -// cudaDeviceSynchronize()==cudaSuccess?void():assert(0); -// // boost::timer::auto_cpu_timer t; // 0.208237s wall, 0.200000s user + 0.010000s system = 0.210000s CPU (100.8%) -// boost::multi::cufft::dft({true, true}, in_gpu, fw_gpu, multi::cufft::forward); -// cudaDeviceSynchronize()==cudaSuccess?void():assert(0); -// BOOST_TEST( (static_cast(fw_gpu[8][9][10]) - fw_cpu[8][9][10]).real() == 0.0 ); -// BOOST_TEST( (static_cast(fw_gpu[8][9][10]) - fw_cpu[8][9][10]).imag() == 0.0 ); -// } -// { -// // boost::timer::auto_cpu_timer t; // 0.208237s wall, 0.200000s user + 0.010000s system = 0.210000s CPU (100.8%) -// auto const fw_gpu2 = boost::multi::cufft::dft({true, true}, in_gpu, multi::cufft::forward); -// cudaDeviceSynchronize()==cudaSuccess?void():assert(0); -// BOOST_TEST( (static_cast(fw_gpu2[8][9][10]) - fw_cpu[8][9][10]).real() == 0.0 ); -// BOOST_TEST( (static_cast(fw_gpu2[8][9][10]) - fw_cpu[8][9][10]).imag() == 0.0 ); -// } -// } - -// #if 1 -// { -// multi::thrust::cuda::universal_array const in_gpu(x, 10.); -// multi::thrust::cuda::universal_array fw_gpu(extensions(in_gpu), 99.); - -// // multi::cuda::managed::array const in_gpu(x, 10.); -// // multi::cuda::managed::array fw_gpu(extensions(in_gpu), 99.); -// { -// // boost::timer::auto_cpu_timer t; // 0.208237s wall, 0.200000s user + 0.010000s system = 0.210000s CPU (100.8%) -// multi::cufft::dft({true, true}, in_gpu, fw_gpu, multi::cufft::forward); -// // BOOST_TEST( fw_gpu[8][9][10].operator complex() != 99. ); -// } -// { -// // boost::timer::auto_cpu_timer t; // 0.208237s wall, 0.200000s user + 0.010000s system = 0.210000s CPU (100.8%) -// multi::cufft::dft({true, true}, in_gpu, fw_gpu, multi::cufft::forward); -// // BOOST_TEST( fw_gpu[8][9][10].operator complex() != 99. ); -// } -// } -// #endif -// } - -// #if 0 - -// BOOST_AUTO_TEST_CASE(cufft_combinations, *utf::tolerance(0.00001)){ - -// auto const in = []{ -// multi::array ret({32, 90, 98, 96}); -// std::generate(ret.data_elements(), ret.data_elements() + ret.num_elements(), -// [](){return complex{std::rand()*1./RAND_MAX, std::rand()*1./RAND_MAX};} -// ); -// return ret; -// }(); -// std::clog<<"memory size "<< in.num_elements()*sizeof(complex)/1e6 <<" MB\n"; - -// multi::thrust::cuda::universal_array const in_gpu = in; -// multi::thrust::cuda::universal_array const in_mng = in; - -// using std::clog; -// for(auto c : std::vector>{ -// {false, true , true , true }, -// {false, true , true , false}, -// {true , false, false, false}, -// {true , true , false, false}, -// {false, false, true , false}, -// {false, false, false, false}, -// }){ -// std::clog<<"case "; copy(begin(c), end(c), std::ostream_iterator{std::clog,", "}); std::clog< out = in; -// multi::array in_rw = in; -// [&, _ = watch{"cpu_opl "}]{ -// multi::fftw::dft_forward(c, in, out); -// }(); -// [&, _ = watch{"cpu_ipl "}]{ -// multi::fftw::dft(c, in_rw, multi::fftw::forward); -// // BOOST_TEST( abs( static_cast>(in_rw[5][4][3][1]) - multi::complex(out[5][4][3][1]) ) == 0. ); -// }(); -// { -// multi::array in_rw2 = in; -// [&, _ = watch{"cpu_mov "}]{ -// multi::array const out_mov = multi::fftw::dft_forward(c, std::move(in_rw2)); -// // what(out_mov); -// // BOOST_TEST( abs( static_cast>(out_mov[5][4][3][1]) - multi::complex(out[5][4][3][1]) ) == 0. ); -// BOOST_REQUIRE( is_empty(in_rw2) ); -// BOOST_REQUIRE( extensions(out_mov) == extensions(in) ); -// }(); -// } - -// [&, _ = watch{"cpu_new "}]{ -// auto const out_cpy = multi::fftw::dft_forward(c, in); -// BOOST_TEST( abs( static_cast>(out_cpy[5][4][3][1]) - std::complex(out[5][4][3][1]) ) == 0. ); -// }(); -// multi::thrust::cuda::array out_gpu(extensions(in_gpu)); -// [&, _ = watch{"gpu_opl "}]{ -// multi::cufft::dft(c, in_gpu , out_gpu, multi::cufft::forward); -// BOOST_TEST( abs( static_cast(out_gpu[5][4][3][1]) - out[5][4][3][1] ) == 0. ); -// }(); -// { -// multi::thrust::cuda::array in_rw_gpu = in_gpu; -// [&, _ = watch{"gpu_ipl "}]{ -// multi::cufft::dft(c, in_rw_gpu, multi::cufft::forward); -// BOOST_TEST( abs( static_cast(in_rw_gpu[5][4][3][1]) - out[5][4][3][1] ) == 0. ); -// }(); -// } -// { -// multi::thrust::cuda::array in_rw_gpu = in_gpu; -// [&, _ = watch{"gpu_mov "}]{ -// multi::thrust::cuda::array const out_mov = multi::cufft::dft_forward(c, std::move(in_rw_gpu)); -// // BOOST_REQUIRE( in_rw_gpu.empty() ); -// // BOOST_TEST( abs( static_cast(out_mov[5][4][3][1]) - out[5][4][3][1] ) == 0. ); -// }(); -// } -// { -// multi::thrust::cuda::array in_rw_gpu = in_gpu; -// [&, _ = watch{"gpu_mov "}]{ -// multi::thrust::cuda::array out_mov = std::move(in_rw_gpu); -// multi::cufft::dft(c, out_mov, multi::cufft::forward); -// // BOOST_REQUIRE( in_rw_gpu.empty() ); -// // BOOST_TEST( abs( static_cast(out_mov[5][4][3][1]) - out[5][4][3][1] ) == 0. ); -// }(); -// } -// cudaDeviceSynchronize(); -// [&, _ = watch{"gpu_new "}]{ -// multi::thrust::cuda::array const out_cpy = multi::cufft::dft(c, in_gpu, multi::cufft::forward); -// }(); -// multi::thrust::cuda::universal_array out_mng(extensions(in_mng)); -// [&, _ = watch{"mng_cld "}]{ -// multi::cufft::dft(c, in_mng, out_mng, multi::cufft::forward); -// BOOST_TEST( abs( out_mng[5][4][3][1] - out[5][4][3][1] ) == 0. ); -// }(); -// [&, _ = watch{"mng_hot "}]{ -// multi::cufft::dft(c, in_mng , out_mng, multi::cufft::forward); -// BOOST_TEST( abs( out_mng[5][4][3][1] - out[5][4][3][1] ) == 0. ); -// }(); -// [&, _ = watch{"mng_new "}]{ -// auto const out_mng = multi::cufft::dft(c, in_mng, multi::cufft::forward); -// BOOST_TEST( abs( out_mng[5][4][3][1] - out[5][4][3][1] ) == 0. ); -// }(); -// } -// // std::clog<<"cache size " -// // << multi::cufft::plan::cache<1>().size() <<' ' -// // << multi::cufft::plan::cache<2>().size() <<' ' -// // << multi::cufft::plan::cache<3>().size() <<' ' -// // << multi::cufft::plan::cache<4>().size() <<' ' -// // < ret({45, 18, 32, 16}); -// std::generate( -// ret.data_elements(), ret.data_elements() + ret.num_elements(), -// [](){return complex{std::rand()*1./RAND_MAX, std::rand()*1./RAND_MAX};} -// ); -// return ret; -// }(); - -// multi::thrust::cuda::array const in = in_cpu; -// multi::thrust::cuda::array out(extensions(in)); - -// #if 0 -// multi::cufft::many_dft(begin(unrotated(in)), end(unrotated(in)), begin(unrotated(out)), +1); - -// multi::array out_cpu(extensions(in)); -// multi::fft::many_dft(begin(unrotated(in_cpu)), end(unrotated(in_cpu)), begin(unrotated(out_cpu)), +1); - -// BOOST_TEST( imag( static_cast(out[5][4][3][2]) - out_cpu[5][4][3][2]) == 0. ); -// #endif -// } - -// #if 0 -// BOOST_AUTO_TEST_CASE(cufft_4D, *utf::tolerance(0.00001) ){ -// auto const in = []{ -// multi::array ret({10, 10, 10}); -// std::generate(ret.data_elements(), ret.data_elements() + ret.num_elements(), -// [](){return complex{std::rand()*1./RAND_MAX, std::rand()*1./RAND_MAX};} -// ); -// return ret; -// }(); - -// multi::array out(extensions(in)); -// // multi::fftw::dft({true, false, true}, in, out, multi::fftw::forward); -// multi::fftw::many_dft(begin(in.rotated()), end(in.rotated()), begin(out.rotated()), multi::fftw::forward); - -// multi::thrust::cuda::array in_gpu = in; -// multi::thrust::cuda::array out_gpu(extensions(in)); - -// // multi::cufft::dft({true, false, true}, in_gpu, out_gpu, multi::fft::forward);//multi::cufft::forward); -// // multi::cufft::many_dft(begin(in_gpu.rotated()), end(in_gpu.rotated()), begin( out_gpu.rotated() ), multi::fftw::forward); -// // BOOST_TEST( ( static_cast(out_gpu[5][4][3]) - out[5][4][3]).imag() == 0. ); -// } -// #endif -// #endif - -// #endif From a59a897366aa51f715716396207bb634632658ff Mon Sep 17 00:00:00 2001 From: Alfredo Correa Date: Sun, 19 Oct 2025 15:37:06 -0700 Subject: [PATCH 03/12] use ctad for array and pair --- include/boost/multi/adaptors/cufft.hpp | 32 ++++++++++--------- .../boost/multi/adaptors/cufft/test/cases.cpp | 8 ++--- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/include/boost/multi/adaptors/cufft.hpp b/include/boost/multi/adaptors/cufft.hpp index b3ac00327..8050aaa89 100644 --- a/include/boost/multi/adaptors/cufft.hpp +++ b/include/boost/multi/adaptors/cufft.hpp @@ -58,7 +58,7 @@ static auto cuda_get_error_enum(cufftResult error) -> char const* { } #define cufftSafeCall(err) implcufftSafeCall(err, __FILE__, __LINE__) -inline void implcufftSafeCall(cufftResult err, const char* file, const int line) { +inline void implcufftSafeCall(cufftResult err, char const* file, int const line) { if(CUFFT_SUCCESS != err) { std::cerr << "CUFFT error in file " << file << ", line " << line << "\nerror " << err << ": " << cuda_get_error_enum(err) << "\n"; // fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n", __FILE__, __LINE__, err, @@ -141,23 +141,25 @@ class plan { auto const istride_tuple = in.strides(); auto const ostride_tuple = out.strides(); - using boost::multi::detail::get; - auto which_iodims = std::apply([](auto... elems) { - return std::array, sizeof...(elems) + 1>{ - // TODO(correaa) added one element to avoid problem with gcc 13 static analysis (out-of-bounds) - std::pair{ - get<0>(elems), - cufft_iodim64{get<1>(elems), get<2>(elems), get<3>(elems)} - } - ..., - std::pair{} - }; - }, - boost::multi::detail::tuple_zip(which, sizes_tuple, istride_tuple, ostride_tuple)); + using std::get; // boost::multi::detail::get; + auto which_iodims = std::apply( + [](auto... elems) { + return std::array /*, sizeof...(elems) + 1>*/ { + // TODO(correaa) added one element to avoid problem with gcc 13 static analysis (out-of-bounds) + std::pair/**/{ + get<0>(elems), + cufft_iodim64{get<1>(elems), get<2>(elems), get<3>(elems)} + } + ..., + std::pair{} + }; + }, + boost::multi::detail::tuple_zip(which, sizes_tuple, istride_tuple, ostride_tuple) + ); std::stable_sort(which_iodims.begin(), which_iodims.end() - 1, [](auto const& alpha, auto const& omega) { return get<1>(alpha).is > get<1>(omega).is; }); - auto const part = std::stable_partition(which_iodims.begin(), which_iodims.end() - 1, [](auto elem) { return std::get<0>(elem); }); + auto const part = std::stable_partition(which_iodims.begin(), which_iodims.end() - 1, [](auto elem) { return get<0>(elem); }); std::array dims{}; auto const dims_end = std::transform(which_iodims.begin(), part, dims.begin(), [](auto elem) { return elem.second; }); diff --git a/include/boost/multi/adaptors/cufft/test/cases.cpp b/include/boost/multi/adaptors/cufft/test/cases.cpp index 6249033de..5281505ef 100644 --- a/include/boost/multi/adaptors/cufft/test/cases.cpp +++ b/include/boost/multi/adaptors/cufft/test/cases.cpp @@ -52,11 +52,11 @@ auto main() -> int try { multi::fftw::dft_forward({true, true, true, false}, in_cpu_view, out_cpu); - multi::cufft::plan<4>({true, true, true, false}, in_gpu_view.layout(), out_gpu.layout()) - .execute(in_gpu_view.base(), out_gpu.base(), multi::cufft::forward); + // multi::cufft::plan<4>({true, true, true, false}, in_gpu_view.layout(), out_gpu.layout()) + // .execute(in_gpu_view.base(), out_gpu.base(), multi::cufft::forward); - std::cout << out_cpu[2][3][4][5] << ' ' << out_gpu[2][3][4][5] << std::endl; - BOOST_TEST( thrust::abs(out_cpu[2][3][4][5] - static_cast(out_gpu[2][3][4][5])) < 1e-6 ); + // std::cout << out_cpu[2][3][4][5] << ' ' << out_gpu[2][3][4][5] << std::endl; + // BOOST_TEST( thrust::abs(out_cpu[2][3][4][5] - static_cast(out_gpu[2][3][4][5])) < 1e-6 ); return boost::report_errors(); } catch(...) { From 8a7217de3a7466282addf0d1fae99ddbf3f7798e Mon Sep 17 00:00:00 2001 From: Alfredo Correa Date: Sun, 19 Oct 2025 16:09:25 -0700 Subject: [PATCH 04/12] add num_elements --- include/boost/multi/detail/layout.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/boost/multi/detail/layout.hpp b/include/boost/multi/detail/layout.hpp index 89f0769dc..083e3cf68 100644 --- a/include/boost/multi/detail/layout.hpp +++ b/include/boost/multi/detail/layout.hpp @@ -219,6 +219,7 @@ class f_extensions_t { }; constexpr auto elements() const { return elements_t{xs_.elements(), proj_}; } + constexpr auto num_elements() const { return xs_.num_elements(); } }; template From c4e08fc52210d5f959dfd03c7ae4577427d6d241 Mon Sep 17 00:00:00 2001 From: Alfredo Correa Date: Sun, 19 Oct 2025 16:12:48 -0700 Subject: [PATCH 05/12] test extensions ctad --- test/extensions.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/extensions.cpp b/test/extensions.cpp index e64910f16..d7f9c51cc 100644 --- a/test/extensions.cpp +++ b/test/extensions.cpp @@ -77,6 +77,8 @@ auto main() -> int { // NOLINT(bugprone-exception-escape,readability-function-c { auto x1d = multi::extensions_t<1>(3); + BOOST_TEST( multi::extensions_t<1>(3) == multi::extensions_t(3) ); + auto it = x1d.elements().begin(); BOOST_TEST( get<0>(*it) == 0 ); @@ -128,6 +130,8 @@ auto main() -> int { // NOLINT(bugprone-exception-escape,readability-function-c { multi::extensions_t<2> const x2d({4, 3}); + BOOST_TEST( multi::extensions_t<2>(4, 3) == multi::extensions_t(4, 3) ); + auto ll = [](auto xx, auto yy) { return xx + yy; }; @@ -375,6 +379,8 @@ auto main() -> int { // NOLINT(bugprone-exception-escape,readability-function-c { multi::extensions_t<3> const xs{3, 4, 5}; + BOOST_TEST(( multi::extensions_t<3>{3, 4, 5} == multi::extensions_t(3, 4, 5) )); + BOOST_TEST( xs.sub() == multi::extensions_t<2>(4, 5) ); static_assert(std::is_same_v::element>); } From fb0d402d6e4fcc6aa0ea05f16046e0305aea47ac Mon Sep 17 00:00:00 2001 From: Alfredo Correa Date: Sun, 19 Oct 2025 16:41:07 -0700 Subject: [PATCH 06/12] add dimensionality to restriction --- include/boost/multi/detail/layout.hpp | 1 + test/extensions.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/include/boost/multi/detail/layout.hpp b/include/boost/multi/detail/layout.hpp index 083e3cf68..b1e7cd972 100644 --- a/include/boost/multi/detail/layout.hpp +++ b/include/boost/multi/detail/layout.hpp @@ -104,6 +104,7 @@ class f_extensions_t { } public: + static constexpr dimensionality_type dimensionality = D; constexpr static dimensionality_type rank_v = D; using difference_type = typename extensions_t::difference_type; diff --git a/test/extensions.cpp b/test/extensions.cpp index d7f9c51cc..0f4bf1968 100644 --- a/test/extensions.cpp +++ b/test/extensions.cpp @@ -408,6 +408,7 @@ auto main() -> int { // NOLINT(bugprone-exception-escape,readability-function-c auto const& values = [](auto ii, auto jj) { return ii + jj; } ^ arr.extensions(); + BOOST_TEST( values.dimensionality == 2 ); BOOST_TEST( values.extensions() == arr.extensions() ); BOOST_TEST( *values.elements().begin() == 0 ); BOOST_TEST( values.elements().begin() < values.elements().end() ); From 614e47568f2fedd534078b1d0605be4ab52e0b23 Mon Sep 17 00:00:00 2001 From: Alfredo Correa Date: Sun, 19 Oct 2025 19:09:03 -0700 Subject: [PATCH 07/12] investigate mull exec --- .gitlab-ci-correaa.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci-correaa.yml b/.gitlab-ci-correaa.yml index 53c16b5b0..b0f6ca37d 100644 --- a/.gitlab-ci-correaa.yml +++ b/.gitlab-ci-correaa.yml @@ -293,6 +293,8 @@ clang++-18 mull-18: # mull-18 crashes - apt-get -qq install --no-install-recommends -y --quiet mull-18 libclang-cpp18 - mkdir build && cd build - clang++-18 --version + - which mull-runner-18 + - file `which mull-runner-18` - mull-runner-18 --version - CXX=clang++-18 cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="-O3 -DNDEBUG -fpass-plugin=/usr/lib/mull-ir-frontend-18 -g -grecord-command-line" - cmake --build . --parallel 8 || cmake --build . --parallel 1 --verbose From bf5bddb0ce17a358e8112776a368d6e96307fca1 Mon Sep 17 00:00:00 2001 From: Alfredo Correa Date: Sun, 19 Oct 2025 19:15:51 -0700 Subject: [PATCH 08/12] add file apt --- .gitlab-ci-correaa.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci-correaa.yml b/.gitlab-ci-correaa.yml index b0f6ca37d..2cb38f3b7 100644 --- a/.gitlab-ci-correaa.yml +++ b/.gitlab-ci-correaa.yml @@ -286,7 +286,7 @@ clang++-18 mull-18: # mull-18 crashes allow_failure: false script: - apt-get -qq update - - apt-get -qq install --no-install-recommends -y --quiet gnupg curl wget ca-certificates clang-18 cmake libclang-rt-18-dev make libboost-timer-dev libboost-serialization-dev > /dev/null + - apt-get -qq install --no-install-recommends -y --quiet gnupg curl wget ca-certificates clang-18 cmake file libclang-rt-18-dev make libboost-timer-dev libboost-serialization-dev > /dev/null - curl -1sLf 'https://dl.cloudsmith.io/public/mull-project/mull-stable/setup.deb.sh' | bash - apt-get -qq update - apt search mull From 1d5e2617404b7cc62de5e9b769a63613d1c7c2fb Mon Sep 17 00:00:00 2001 From: Alfredo Correa Date: Sun, 19 Oct 2025 19:25:57 -0700 Subject: [PATCH 09/12] report size --- .gitlab-ci-correaa.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci-correaa.yml b/.gitlab-ci-correaa.yml index 2cb38f3b7..31137700e 100644 --- a/.gitlab-ci-correaa.yml +++ b/.gitlab-ci-correaa.yml @@ -294,6 +294,7 @@ clang++-18 mull-18: # mull-18 crashes - mkdir build && cd build - clang++-18 --version - which mull-runner-18 + - ls -all `which mull-runner-18` - file `which mull-runner-18` - mull-runner-18 --version - CXX=clang++-18 cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="-O3 -DNDEBUG -fpass-plugin=/usr/lib/mull-ir-frontend-18 -g -grecord-command-line" From cc2fb43b464ff71c31da89dca24ebe691f101962 Mon Sep 17 00:00:00 2001 From: Alfredo Correa Date: Sun, 19 Oct 2025 21:40:42 -0700 Subject: [PATCH 10/12] disable mull for now --- .gitlab-ci-correaa.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci-correaa.yml b/.gitlab-ci-correaa.yml index 31137700e..fa29359c2 100644 --- a/.gitlab-ci-correaa.yml +++ b/.gitlab-ci-correaa.yml @@ -283,7 +283,7 @@ clang++-18 mull-18: # mull-18 crashes - large-memory-space - non-shared interruptible: true - allow_failure: false + allow_failure: true script: - apt-get -qq update - apt-get -qq install --no-install-recommends -y --quiet gnupg curl wget ca-certificates clang-18 cmake file libclang-rt-18-dev make libboost-timer-dev libboost-serialization-dev > /dev/null From 9bfcf34469b4443b9adade2e0400a2c16fd8b3a3 Mon Sep 17 00:00:00 2001 From: Alfredo Correa Date: Mon, 20 Oct 2025 15:50:02 -0700 Subject: [PATCH 11/12] sort by which first --- include/boost/multi/adaptors/cufft.hpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/include/boost/multi/adaptors/cufft.hpp b/include/boost/multi/adaptors/cufft.hpp index 8050aaa89..cd45d5126 100644 --- a/include/boost/multi/adaptors/cufft.hpp +++ b/include/boost/multi/adaptors/cufft.hpp @@ -146,23 +146,25 @@ class plan { [](auto... elems) { return std::array /*, sizeof...(elems) + 1>*/ { // TODO(correaa) added one element to avoid problem with gcc 13 static analysis (out-of-bounds) - std::pair/**/{ - get<0>(elems), - cufft_iodim64{get<1>(elems), get<2>(elems), get<3>(elems)} + std::pair /**/ { + get<0>(elems), + cufft_iodim64{get<1>(elems), get<2>(elems), get<3>(elems)} } - ..., + ..., std::pair{} }; }, boost::multi::detail::tuple_zip(which, sizes_tuple, istride_tuple, ostride_tuple) ); - std::stable_sort(which_iodims.begin(), which_iodims.end() - 1, [](auto const& alpha, auto const& omega) { return get<1>(alpha).is > get<1>(omega).is; }); - auto const part = std::stable_partition(which_iodims.begin(), which_iodims.end() - 1, [](auto elem) { return get<0>(elem); }); + std::stable_sort(which_iodims.begin(), part, [](auto const& alpha, auto const& omega) { return get<1>(alpha).is > get<1>(omega).is; }); + std::stable_sort(part, which_iodims.end() - 1, [](auto const& alpha, auto const& omega) { return get<1>(alpha).is > get<1>(omega).is; }); + std::array dims{}; - auto const dims_end = std::transform(which_iodims.begin(), part, dims.begin(), [](auto elem) { return elem.second; }); + + auto const dims_end = std::transform(which_iodims.begin(), part, dims.begin(), [](auto elem) { return elem.second; }); // std::array howmany_dims{}; // auto const howmany_dims_end = std::transform(part, which_iodims.end() -1, howmany_dims.begin(), [](auto elem) {return elem.second;}); From f1883a4131828ecef15cf61834cf3981f2062579 Mon Sep 17 00:00:00 2001 From: Alfredo Correa Date: Tue, 21 Oct 2025 21:57:21 -0700 Subject: [PATCH 12/12] convert handle into an array --- .clang-tidy | 1 + include/boost/multi/adaptors/cufft.hpp | 52 +++++++++++++------------- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 45d7f1ec0..32d9dd2d0 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -3,6 +3,7 @@ WarningsAsErrors: '*' HeaderFilterRegex: '.' FormatStyle: file +TransformLambdas: none CheckOptions: - { key: readability-identifier-naming.ClassCase , value: lower_case } diff --git a/include/boost/multi/adaptors/cufft.hpp b/include/boost/multi/adaptors/cufft.hpp index cd45d5126..f696878fc 100644 --- a/include/boost/multi/adaptors/cufft.hpp +++ b/include/boost/multi/adaptors/cufft.hpp @@ -97,7 +97,7 @@ class plan { Alloc alloc_; ::size_t workSize_ = 0; void* workArea_{}; - cufftHandle h_{}; // TODO(correaa) put this in a unique_ptr + std::array hs_{}; // TODO(correaa) put this in a unique_ptr std::array, DD + 1> which_iodims_{}; int first_howmany_{}; @@ -120,7 +120,7 @@ class plan { : alloc_{std::move(other.alloc_)}, workSize_{std::exchange(other.workSize_, {})}, workArea_{std::exchange(other.workArea_, {})}, - h_{std::exchange(other.h_, {})}, + hs_{std::exchange(other.hs_, {})}, which_iodims_{std::exchange(other.which_iodims_, {})}, first_howmany_{std::exchange(other.first_howmany_, {})} { // other.used_ = true; // moved-from object cannot be used @@ -159,8 +159,8 @@ class plan { auto const part = std::stable_partition(which_iodims.begin(), which_iodims.end() - 1, [](auto elem) { return get<0>(elem); }); - std::stable_sort(which_iodims.begin(), part, [](auto const& alpha, auto const& omega) { return get<1>(alpha).is > get<1>(omega).is; }); - std::stable_sort(part, which_iodims.end() - 1, [](auto const& alpha, auto const& omega) { return get<1>(alpha).is > get<1>(omega).is; }); + std::stable_sort(which_iodims.begin(), part, [](auto const& alpha, auto const& omega) { return get<1>(alpha).os > get<1>(omega).os; }); + std::stable_sort(part, which_iodims.end() - 1, [](auto const& alpha, auto const& omega) { return get<1>(alpha).os > get<1>(omega).os; }); std::array dims{}; @@ -193,6 +193,8 @@ class plan { assert(ostrides[idx - 1] >= ostrides[idx]); assert(ostrides[idx - 1] % ostrides[idx] == 0); onembed[idx] = ostrides[idx - 1] / ostrides[idx]; + + assert(istrides[idx - 1] >= istrides[idx]); assert(istrides[idx - 1] % istrides[idx] == 0); inembed[idx] = istrides[idx - 1] / istrides[idx]; } @@ -221,7 +223,7 @@ class plan { if constexpr(std::is_same_v) { assert(dims_end - dims.begin() < 4); // cufft cannot do 4D FFT cufftSafeCall(::cufftPlanMany( - /*cufftHandle *plan*/ &h_, + /*cufftHandle *plan*/ &hs_[0], /*int rank*/ dims_end - dims.begin(), /*int *n*/ ion.data(), /*int *inembed*/ inembed.data(), @@ -234,10 +236,10 @@ class plan { /*int batch*/ 1 // BATCH )); } else { - cufftSafeCall(cufftCreate(&h_)); - cufftSafeCall(cufftSetAutoAllocation(h_, false)); + cufftSafeCall(cufftCreate(&hs_[0])); + cufftSafeCall(cufftSetAutoAllocation(hs_[0], false)); cufftSafeCall(cufftMakePlanMany( - /*cufftHandle *plan*/ h_, + /*cufftHandle *plan*/ hs_[0], /*int rank*/ dims_end - dims.begin(), /*int *n*/ ion.data(), /*int *inembed*/ inembed.data(), @@ -250,14 +252,14 @@ class plan { /*int batch*/ 1, // BATCH /*size_t **/ &workSize_ )); - cufftSafeCall(cufftGetSize(h_, &workSize_)); + cufftSafeCall(cufftGetSize(hs_[0], &workSize_)); workArea_ = ::thrust::raw_pointer_cast(alloc_.allocate(workSize_)); static_assert(sizeof(Alloc) == 1000); // auto s = cudaMalloc(&workArea_, workSize_); // if(s != cudaSuccess) {throw std::runtime_error{"L212"};} - cufftSafeCall(cufftSetWorkArea(h_, workArea_)); + cufftSafeCall(cufftSetWorkArea(hs_[0], workArea_)); } - if(!h_) { + if(!hs_[0]) { throw std::runtime_error{"cufftPlanMany null"}; } return; @@ -268,7 +270,7 @@ class plan { if(first_howmany_ <= D - 1) { if constexpr(std::is_same_v) { // NOLINT(bugprone-branch-clone) workaround bug in DeepSource cufftSafeCall(::cufftPlanMany( - /*cufftHandle *plan*/ &h_, + /*cufftHandle *plan*/ &hs_[0], /*int rank*/ dims_end - dims.begin(), /*int *n*/ ion.data(), /*int *inembed*/ inembed.data(), @@ -281,10 +283,10 @@ class plan { /*int batch*/ which_iodims_[first_howmany_].second.n )); } else { - cufftSafeCall(cufftCreate(&h_)); - cufftSafeCall(cufftSetAutoAllocation(h_, false)); + cufftSafeCall(cufftCreate(&hs_[0])); + cufftSafeCall(cufftSetAutoAllocation(hs_[0], false)); cufftSafeCall(cufftMakePlanMany( - /*cufftHandle *plan*/ h_, + /*cufftHandle *plan*/ hs_[0], /*int rank*/ dims_end - dims.begin(), /*int *n*/ ion.data(), /*int *inembed*/ inembed.data(), @@ -297,11 +299,11 @@ class plan { /*int batch*/ which_iodims_[first_howmany_].second.n, /*size_t **/ &workSize_ )); - cufftSafeCall(cufftGetSize(h_, &workSize_)); + cufftSafeCall(cufftGetSize(hs_[0], &workSize_)); workArea_ = ::thrust::raw_pointer_cast(alloc_.allocate(workSize_)); - cufftSafeCall(cufftSetWorkArea(h_, workArea_)); + cufftSafeCall(cufftSetWorkArea(hs_[0], workArea_)); } - if(!h_) { + if(!hs_[0]) { throw std::runtime_error{"cufftPlanMany null"}; } ++first_howmany_; @@ -314,7 +316,7 @@ class plan { template void ExecZ2Z_(complex_type const* idata, complex_type* odata, int direction) const { // used_ = true; - cufftSafeCall(cufftExecZ2Z(h_, const_cast(idata), odata, direction)); // NOLINT(cppcoreguidelines-pro-type-const-cast) wrap legacy interface + cufftSafeCall(cufftExecZ2Z(hs_[0], const_cast(idata), odata, direction)); // NOLINT(cppcoreguidelines-pro-type-const-cast) wrap legacy interface // cudaDeviceSynchronize(); } @@ -337,7 +339,7 @@ class plan { for(int idx = 0; idx != which_iodims_[first_howmany_].second.n; ++idx) { // NOLINT(altera-unroll-loops,altera-id-dependent-backward-branch) cufftExecZ2Z( - h_, + hs_[0], const_cast(reinterpret_cast(::thrust::raw_pointer_cast(idata + idx * which_iodims_[first_howmany_].second.is))), // NOLINT(cppcoreguidelines-pro-type-const-cast,cppcoreguidelines-pro-type-reinterpret-cast) legacy interface reinterpret_cast(::thrust::raw_pointer_cast(odata + idx * which_iodims_[first_howmany_].second.os)), // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) legacy interface direction @@ -358,7 +360,7 @@ class plan { for(int idx = 0; idx != which_iodims_[first_howmany_].second.n; ++idx) { // NOLINT(altera-unroll-loops,altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) use an algorithm for(int jdx = 0; jdx != which_iodims_[first_howmany_ + 1].second.n; ++jdx) { // NOLINT(altera-unroll-loops,altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) use an algorithm cufftExecZ2Z( - h_, + hs_[0], const_cast(reinterpret_cast(::thrust::raw_pointer_cast(idata + idx * which_iodims_[first_howmany_].second.is + jdx * which_iodims_[first_howmany_ + 1].second.is))), // NOLINT(cppcoreguidelines-pro-type-const-cast,cppcoreguidelines-pro-type-reinterpret-cast) legacy interface reinterpret_cast(::thrust::raw_pointer_cast(odata + idx * which_iodims_[first_howmany_].second.os + jdx * which_iodims_[first_howmany_ + 1].second.os)), // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) legacy interface direction @@ -401,13 +403,9 @@ class plan { alloc_.deallocate(typename std::allocator_traits::pointer(reinterpret_cast(workArea_)), workSize_); } // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) legacy interface } - if(h_ != 0) { - cufftSafeCall(cufftDestroy(h_)); + if(hs_[0] != 0) { + cufftSafeCall(cufftDestroy(hs_[0])); } - // if(!used_) { - // std::cerr <<"Warning: cufft plan was never used\n"; - // std::terminate(); - // } } using size_type = int;