correaa · correaa · Oct 19, 2025 · Oct 19, 2025 · Oct 19, 2025 · Oct 19, 2025
diff --git a/.clang-tidy b/.clang-tidy
@@ -3,6 +3,7 @@
 WarningsAsErrors: '*'
 HeaderFilterRegex: '.'
 FormatStyle: file
+TransformLambdas: none
 
 CheckOptions:
   - { key: readability-identifier-naming.ClassCase                         , value: lower_case }

diff --git a/.gitlab-ci-correaa.yml b/.gitlab-ci-correaa.yml
@@ -283,16 +283,19 @@ clang++-18 mull-18:  # mull-18 crashes
     - large-memory-space
     - non-shared
   interruptible: true
-  allow_failure: false
+  allow_failure: true
   script:
     - apt-get -qq update
-    - apt-get -qq install --no-install-recommends -y --quiet gnupg curl wget ca-certificates clang-18 cmake libclang-rt-18-dev make libboost-timer-dev libboost-serialization-dev > /dev/null
+    - apt-get -qq install --no-install-recommends -y --quiet gnupg curl wget ca-certificates clang-18 cmake file libclang-rt-18-dev make libboost-timer-dev libboost-serialization-dev > /dev/null
     - curl -1sLf 'https://dl.cloudsmith.io/public/mull-project/mull-stable/setup.deb.sh' | bash
     - apt-get -qq update
     - apt search mull
     - apt-get -qq install --no-install-recommends -y --quiet mull-18 libclang-cpp18
     - mkdir build && cd build
     - clang++-18 --version
+    - which mull-runner-18
+    - ls -all `which mull-runner-18`
+    - file `which mull-runner-18`
     - mull-runner-18 --version
     - CXX=clang++-18 cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="-O3 -DNDEBUG -fpass-plugin=/usr/lib/mull-ir-frontend-18 -g -grecord-command-line"
     - cmake --build . --parallel 8 || cmake --build . --parallel 1 --verbose

diff --git a/include/boost/multi/adaptors/cufft.hpp b/include/boost/multi/adaptors/cufft.hpp
@@ -58,7 +58,7 @@ static auto cuda_get_error_enum(cufftResult error) -> char const* {
 }
 
 #define cufftSafeCall(err) implcufftSafeCall(err, __FILE__, __LINE__)
-inline void implcufftSafeCall(cufftResult err, const char* file, const int line) {
+inline void implcufftSafeCall(cufftResult err, char const* file, int const line) {
 	if(CUFFT_SUCCESS != err) {
 		std::cerr << "CUFFT error in file " << file << ", line " << line << "\nerror " << err << ": " << cuda_get_error_enum(err) << "\n";
 		// fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n", __FILE__, __LINE__, err,
@@ -97,7 +97,7 @@ class plan {
 	Alloc                                              alloc_;
 	::size_t                                           workSize_ = 0;
 	void*                                              workArea_{};
-	cufftHandle                                        h_{};  // TODO(correaa) put this in a unique_ptr
+	std::array<cufftHandle, DD>                            hs_{};  // TODO(correaa) put this in a unique_ptr
 	std::array<std::pair<bool, cufft_iodim64>, DD + 1> which_iodims_{};
 	int                                                first_howmany_{};
 
@@ -120,7 +120,7 @@ class plan {
 	: alloc_{std::move(other.alloc_)},
 	  workSize_{std::exchange(other.workSize_, {})},
 	  workArea_{std::exchange(other.workArea_, {})},
-	  h_{std::exchange(other.h_, {})},
+	  hs_{std::exchange(other.hs_, {})},
 	  which_iodims_{std::exchange(other.which_iodims_, {})},
 	  first_howmany_{std::exchange(other.first_howmany_, {})} {
 		// other.used_ = true;  // moved-from object cannot be used
@@ -141,26 +141,30 @@ class plan {
 		auto const istride_tuple = in.strides();
 		auto const ostride_tuple = out.strides();
 
-		using boost::multi::detail::get;
-		auto which_iodims = std::apply([](auto... elems) {
-			return std::array<std::pair<bool, cufft_iodim64>, sizeof...(elems) + 1>{
-  // TODO(correaa) added one element to avoid problem with gcc 13 static analysis (out-of-bounds)
-				std::pair<bool, cufft_iodim64>{
-											   get<0>(elems),
-											   cufft_iodim64{get<1>(elems), get<2>(elems), get<3>(elems)}
-				}
-				 ...,
-				std::pair<bool, cufft_iodim64>{}
-			};
-		},
-									   boost::multi::detail::tuple_zip(which, sizes_tuple, istride_tuple, ostride_tuple));
+		using std::get;  // boost::multi::detail::get;
+		auto which_iodims = std::apply(
+			[](auto... elems) {
+				return std::array /*<std::pair<bool, cufft_iodim64>, sizeof...(elems) + 1>*/ {
+					// TODO(correaa) added one element to avoid problem with gcc 13 static analysis (out-of-bounds)
+					std::pair  /*<bool, cufft_iodim64>*/ {
+														  get<0>(elems),
+														  cufft_iodim64{get<1>(elems), get<2>(elems), get<3>(elems)}
+					}
+					   ...,
+					std::pair<bool, cufft_iodim64>{}
+				};
+			},
+			boost::multi::detail::tuple_zip(which, sizes_tuple, istride_tuple, ostride_tuple)
+		);
 
-		std::stable_sort(which_iodims.begin(), which_iodims.end() - 1, [](auto const& alpha, auto const& omega) { return get<1>(alpha).is > get<1>(omega).is; });
+		auto const part = std::stable_partition(which_iodims.begin(), which_iodims.end() - 1, [](auto elem) { return get<0>(elem); });
 
-		auto const part = std::stable_partition(which_iodims.begin(), which_iodims.end() - 1, [](auto elem) { return std::get<0>(elem); });
+		std::stable_sort(which_iodims.begin(), part, [](auto const& alpha, auto const& omega) { return get<1>(alpha).os > get<1>(omega).os; });
+		std::stable_sort(part, which_iodims.end() - 1, [](auto const& alpha, auto const& omega) { return get<1>(alpha).os > get<1>(omega).os; });
 
 		std::array<cufft_iodim64, D> dims{};
-		auto const                   dims_end = std::transform(which_iodims.begin(), part, dims.begin(), [](auto elem) { return elem.second; });
+
+		auto const dims_end = std::transform(which_iodims.begin(), part, dims.begin(), [](auto elem) { return elem.second; });
 
 		// std::array<cufftw_iodim64, D> howmany_dims{};
 		// auto const howmany_dims_end = std::transform(part, which_iodims.end() -1, howmany_dims.begin(), [](auto elem) {return elem.second;});
@@ -189,6 +193,8 @@ class plan {
 			assert(ostrides[idx - 1] >= ostrides[idx]);
 			assert(ostrides[idx - 1] % ostrides[idx] == 0);
 			onembed[idx] = ostrides[idx - 1] / ostrides[idx];
+
+			assert(istrides[idx - 1] >= istrides[idx]);
 			assert(istrides[idx - 1] % istrides[idx] == 0);
 			inembed[idx] = istrides[idx - 1] / istrides[idx];
 		}
@@ -217,7 +223,7 @@ class plan {
 			if constexpr(std::is_same_v<Alloc, void*>) {
 				assert(dims_end - dims.begin() < 4);  // cufft cannot do 4D FFT
 				cufftSafeCall(::cufftPlanMany(
-					/*cufftHandle *plan*/ &h_,
+					/*cufftHandle *plan*/ &hs_[0],
 					/*int rank*/ dims_end - dims.begin(),
 					/*int *n*/ ion.data(),
 					/*int *inembed*/ inembed.data(),
@@ -230,10 +236,10 @@ class plan {
 					/*int batch*/ 1  // BATCH
 				));
 			} else {
-				cufftSafeCall(cufftCreate(&h_));
-				cufftSafeCall(cufftSetAutoAllocation(h_, false));
+				cufftSafeCall(cufftCreate(&hs_[0]));
+				cufftSafeCall(cufftSetAutoAllocation(hs_[0], false));
 				cufftSafeCall(cufftMakePlanMany(
-					/*cufftHandle *plan*/ h_,
+					/*cufftHandle *plan*/ hs_[0],
 					/*int rank*/ dims_end - dims.begin(),
 					/*int *n*/ ion.data(),
 					/*int *inembed*/ inembed.data(),
@@ -246,14 +252,14 @@ class plan {
 					/*int batch*/ 1,  // BATCH
 					/*size_t **/ &workSize_
 				));
-				cufftSafeCall(cufftGetSize(h_, &workSize_));
+				cufftSafeCall(cufftGetSize(hs_[0], &workSize_));
 				workArea_ = ::thrust::raw_pointer_cast(alloc_.allocate(workSize_));
 				static_assert(sizeof(Alloc) == 1000);
 				// auto s = cudaMalloc(&workArea_, workSize_);
 				// if(s != cudaSuccess) {throw std::runtime_error{"L212"};}
-				cufftSafeCall(cufftSetWorkArea(h_, workArea_));
+				cufftSafeCall(cufftSetWorkArea(hs_[0], workArea_));
 			}
-			if(!h_) {
+			if(!hs_[0]) {
 				throw std::runtime_error{"cufftPlanMany null"};
 			}
 			return;
@@ -264,7 +270,7 @@ class plan {
 		if(first_howmany_ <= D - 1) {
 			if constexpr(std::is_same_v<Alloc, void*>) {  // NOLINT(bugprone-branch-clone) workaround bug in DeepSource
 				cufftSafeCall(::cufftPlanMany(
-					/*cufftHandle *plan*/ &h_,
+					/*cufftHandle *plan*/ &hs_[0],
 					/*int rank*/ dims_end - dims.begin(),
 					/*int *n*/ ion.data(),
 					/*int *inembed*/ inembed.data(),
@@ -277,10 +283,10 @@ class plan {
 					/*int batch*/ which_iodims_[first_howmany_].second.n
 				));
 			} else {
-				cufftSafeCall(cufftCreate(&h_));
-				cufftSafeCall(cufftSetAutoAllocation(h_, false));
+				cufftSafeCall(cufftCreate(&hs_[0]));
+				cufftSafeCall(cufftSetAutoAllocation(hs_[0], false));
 				cufftSafeCall(cufftMakePlanMany(
-					/*cufftHandle *plan*/ h_,
+					/*cufftHandle *plan*/ hs_[0],
 					/*int rank*/ dims_end - dims.begin(),
 					/*int *n*/ ion.data(),
 					/*int *inembed*/ inembed.data(),
@@ -293,11 +299,11 @@ class plan {
 					/*int batch*/ which_iodims_[first_howmany_].second.n,
 					/*size_t **/ &workSize_
 				));
-				cufftSafeCall(cufftGetSize(h_, &workSize_));
+				cufftSafeCall(cufftGetSize(hs_[0], &workSize_));
 				workArea_ = ::thrust::raw_pointer_cast(alloc_.allocate(workSize_));
-				cufftSafeCall(cufftSetWorkArea(h_, workArea_));
+				cufftSafeCall(cufftSetWorkArea(hs_[0], workArea_));
 			}
-			if(!h_) {
+			if(!hs_[0]) {
 				throw std::runtime_error{"cufftPlanMany null"};
 			}
 			++first_howmany_;
@@ -310,7 +316,7 @@ class plan {
 	template<typename = void>
 	void ExecZ2Z_(complex_type const* idata, complex_type* odata, int direction) const {
 		// used_ = true;
-		cufftSafeCall(cufftExecZ2Z(h_, const_cast<complex_type*>(idata), odata, direction));  // NOLINT(cppcoreguidelines-pro-type-const-cast) wrap legacy interface
+		cufftSafeCall(cufftExecZ2Z(hs_[0], const_cast<complex_type*>(idata), odata, direction));  // NOLINT(cppcoreguidelines-pro-type-const-cast) wrap legacy interface
 																							  // cudaDeviceSynchronize();
 	}
 
@@ -333,7 +339,7 @@ class plan {
 
 			for(int idx = 0; idx != which_iodims_[first_howmany_].second.n; ++idx) {  // NOLINT(altera-unroll-loops,altera-id-dependent-backward-branch)
 				cufftExecZ2Z(
-					h_,
+					hs_[0],
 					const_cast<complex_type*>(reinterpret_cast<complex_type const*>(::thrust::raw_pointer_cast(idata + idx * which_iodims_[first_howmany_].second.is))),  // NOLINT(cppcoreguidelines-pro-type-const-cast,cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
 					reinterpret_cast<complex_type*>(::thrust::raw_pointer_cast(odata + idx * which_iodims_[first_howmany_].second.os)),                                   // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
 					direction
@@ -354,7 +360,7 @@ class plan {
 			for(int idx = 0; idx != which_iodims_[first_howmany_].second.n; ++idx) {          // NOLINT(altera-unroll-loops,altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) use an algorithm
 				for(int jdx = 0; jdx != which_iodims_[first_howmany_ + 1].second.n; ++jdx) {  // NOLINT(altera-unroll-loops,altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) use an algorithm
 					cufftExecZ2Z(
-						h_,
+						hs_[0],
 						const_cast<complex_type*>(reinterpret_cast<complex_type const*>(::thrust::raw_pointer_cast(idata + idx * which_iodims_[first_howmany_].second.is + jdx * which_iodims_[first_howmany_ + 1].second.is))),  // NOLINT(cppcoreguidelines-pro-type-const-cast,cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
 						reinterpret_cast<complex_type*>(::thrust::raw_pointer_cast(odata + idx * which_iodims_[first_howmany_].second.os + jdx * which_iodims_[first_howmany_ + 1].second.os)),                                   // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
 						direction
@@ -397,13 +403,9 @@ class plan {
 				alloc_.deallocate(typename std::allocator_traits<Alloc>::pointer(reinterpret_cast<char*>(workArea_)), workSize_);
 			}  // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
 		}
-		if(h_ != 0) {
-			cufftSafeCall(cufftDestroy(h_));
+		if(hs_[0] != 0) {
+			cufftSafeCall(cufftDestroy(hs_[0]));
 		}
-		// if(!used_) {
-		//  std::cerr <<"Warning: cufft plan was never used\n";
-		//  std::terminate();
-		// }
 	}
 
 	using size_type  = int;

diff --git a/include/boost/multi/adaptors/cufft/test/CMakeLists.txt b/include/boost/multi/adaptors/cufft/test/CMakeLists.txt
@@ -66,7 +66,7 @@ include(CTest)
 include_directories(${CMAKE_BINARY_DIR})
 
 # file(GLOB TEST_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
-set(TEST_SRCS cufft.cpp)
+set(TEST_SRCS cases.cpp cufft.cpp)
 
 foreach(TEST_FILE ${TEST_SRCS})
 	set(TEST_EXE "${TEST_FILE}.x")

diff --git a/include/boost/multi/adaptors/cufft/test/cases.cpp b/include/boost/multi/adaptors/cufft/test/cases.cpp
@@ -0,0 +1,65 @@
+// Copyright 2020-2025 Alfredo A. Correa
+// Distributed under the Boost Software License, Version 1.0.
+// https://www.boost.org/LICENSE_1_0.txt
+
+#include <boost/core/lightweight_test.hpp>
+
+#include <boost/multi/adaptors/fftw.hpp>
+#include <boost/multi/array.hpp>
+
+#include <boost/multi/adaptors/cufft.hpp>
+
+#if(!(defined(__HIP_PLATFORM_AMD__) || defined(__HIP_PLATFORM_NVIDIA__))) && (!defined(__HIPCC__))
+#include <boost/multi/adaptors/cufft.hpp>
+#else
+#include <boost/multi/adaptors/hipfft.hpp>
+#endif
+
+#include <boost/multi/adaptors/fft.hpp>
+#include <boost/multi/adaptors/thrust.hpp>
+
+#include <thrust/complex.h>
+#include <thrust/transform_reduce.h>
+
+#include <random>
+
+namespace multi = boost::multi;
+
+using complex   = thrust::complex<double>;
+
+template<>
+constexpr bool multi::force_element_trivial_default_construction<thrust::complex<double>> = true;
+
+auto main() -> int try {
+	complex const I{0.0, 1.0};  // NOLINT(readability-identifier-length)
+
+	auto in_cpu = multi::array<complex, 4>({20, 30, 50, 70});
+
+	std::generate(
+		in_cpu.elements().begin(), in_cpu.elements().end(),
+		[dist = std::normal_distribution<>{}, gen = std::mt19937(std::random_device{}())] () mutable { return dist(gen); }
+	);
+
+	auto const in_gpu = multi::thrust::cuda::array<complex, 4>(in_cpu);
+
+	auto const& in_cpu_view = in_cpu.transposed().rotated().transposed().unrotated().transposed();
+	auto const& in_gpu_view = in_gpu.transposed().rotated().transposed().unrotated().transposed();
+
+	auto out_gpu = multi::thrust::cuda::array<complex, 4>(in_gpu_view.extensions());
+	auto out_cpu = multi::array<complex, 4>(out_gpu.extensions());
+
+	assert(in_gpu_view.extensions() == out_gpu.extensions());
+
+	multi::fftw::dft_forward({true, true, true, false}, in_cpu_view, out_cpu);
+
+	// multi::cufft::plan<4>({true, true, true, false}, in_gpu_view.layout(), out_gpu.layout())
+	// 	.execute(in_gpu_view.base(), out_gpu.base(), multi::cufft::forward);
+
+	// std::cout << out_cpu[2][3][4][5] << ' ' << out_gpu[2][3][4][5] << std::endl;
+	// BOOST_TEST( thrust::abs(out_cpu[2][3][4][5] - static_cast<complex>(out_gpu[2][3][4][5])) < 1e-6 );
+
+	return boost::report_errors();
+} catch(...) {
+	throw;
+	return 1;
+}
diff --git a/include/boost/multi/detail/layout.hpp b/include/boost/multi/detail/layout.hpp
@@ -104,6 +104,7 @@ class f_extensions_t {
 	}
 
  public:
+	static constexpr dimensionality_type dimensionality = D;
 	constexpr static dimensionality_type rank_v = D;
 
 	using difference_type = typename extensions_t<D>::difference_type;
@@ -219,6 +220,7 @@ class f_extensions_t {
 	};
 
 	constexpr auto elements() const { return elements_t{xs_.elements(), proj_}; }
+	constexpr auto num_elements() const { return xs_.num_elements(); }
 };
 
 template<dimensionality_type D>

diff --git a/test/extensions.cpp b/test/extensions.cpp
@@ -77,6 +77,8 @@ auto main() -> int {  // NOLINT(bugprone-exception-escape,readability-function-c
 	{
 		auto x1d = multi::extensions_t<1>(3);
 
+		BOOST_TEST( multi::extensions_t<1>(3) == multi::extensions_t(3) );
+
 		auto it = x1d.elements().begin();
 		BOOST_TEST( get<0>(*it) == 0 );
 
@@ -128,6 +130,8 @@ auto main() -> int {  // NOLINT(bugprone-exception-escape,readability-function-c
 	{
 		multi::extensions_t<2> const x2d({4, 3});
 
+		BOOST_TEST( multi::extensions_t<2>(4, 3) == multi::extensions_t(4, 3) );
+
 		auto ll = [](auto xx, auto yy) {
 			return xx + yy;
 		};
@@ -375,6 +379,8 @@ auto main() -> int {  // NOLINT(bugprone-exception-escape,readability-function-c
 	{
 		multi::extensions_t<3> const xs{3, 4, 5};
 
+		BOOST_TEST(( multi::extensions_t<3>{3, 4, 5} == multi::extensions_t(3, 4, 5) ));
+
 		BOOST_TEST( xs.sub() == multi::extensions_t<2>(4, 5) );
 		static_assert(std::is_same_v<decltype(xs[1][1][1]), multi::extensions_t<3>::element>);
 	}
@@ -402,6 +408,7 @@ auto main() -> int {  // NOLINT(bugprone-exception-escape,readability-function-c
 
 		auto const& values = [](auto ii, auto jj) { return ii + jj; } ^ arr.extensions();
 
+		BOOST_TEST( values.dimensionality == 2 );
 		BOOST_TEST( values.extensions() == arr.extensions() );
 		BOOST_TEST( *values.elements().begin() == 0 );
 		BOOST_TEST( values.elements().begin() < values.elements().end() );