NVIDIA · miscco · Jun 12, 2026 · bernhardmgruber · Jun 12, 2026
@@ -122,6 +122,9 @@ struct DeviceCopy
   //!   **[inferred]** Device-accessible random-access input iterator type providing the number of items to be
   //!   copied for each pair of ranges
   //!
+  //! @tparam EnvT
+  //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
+  //!
   //! @param[in] d_temp_storage
   //!   @devicestorage
   //!
@@ -140,29 +143,28 @@ struct DeviceCopy
   //! @param[in] num_ranges
   //!   The total number of range pairs
   //!
-  //! @param[in] stream
-  //!   @rst
-  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
-  //!   @endrst
-  template <typename InputIt, typename OutputIt, typename SizeIteratorT>
+  //! @param[in] env
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
+  template <typename InputIt, typename OutputIt, typename SizeIteratorT, typename EnvT = ::cuda::std::execution::env<>>
   CUB_RUNTIME_FUNCTION static cudaError_t Batched(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIt input_it,
     OutputIt output_it,
     SizeIteratorT sizes,
     ::cuda::std::int64_t num_ranges,
-    cudaStream_t stream = nullptr)
+    const EnvT& env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceCopy::Batched");
 
-    // Integer type large enough to hold any offset in [0, num_thread_blocks_launched), where a safe
-    // upper bound on num_thread_blocks_launched can be assumed to be given by
-    // IDIV_CEIL(num_ranges, 64)
-    using BlockOffsetT = uint32_t;
+    using BlockOffsetT            = uint32_t;
+    using default_policy_selector = detail::batch_memcpy::policy_selector;
 
-    return detail::batch_memcpy::dispatch<CopyAlg::Copy, BlockOffsetT>(
-      d_temp_storage, temp_storage_bytes, input_it, output_it, sizes, num_ranges, stream);
+    return detail::dispatch_with_env_and_tuning<default_policy_selector>(
+      d_temp_storage, temp_storage_bytes, env, [&](auto policy_selector, void* storage, size_t& bytes, auto stream) {
+        return detail::batch_memcpy::dispatch<CopyAlg::Copy, BlockOffsetT>(
+          storage, bytes, input_it, output_it, sizes, num_ranges, stream, policy_selector);
+      });
   }
 
   //! @rst
@@ -224,8 +226,8 @@ struct DeviceCopy
   //! @param[in] env
   //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
   template <typename InputIt, typename OutputIt, typename SizeIteratorT, typename EnvT = ::cuda::std::execution::env<>>
-  [[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
-  Batched(InputIt input_it, OutputIt output_it, SizeIteratorT sizes, ::cuda::std::int64_t num_ranges, EnvT env = {})
+  [[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Batched(
+    InputIt input_it, OutputIt output_it, SizeIteratorT sizes, ::cuda::std::int64_t num_ranges, const EnvT& env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceCopy::Batched");
 
@@ -294,6 +296,9 @@ struct DeviceCopy
   //! @tparam Accessor_Out
   //!   **[inferred]** The accessor type of the destination mdspan
   //!
+  //! @tparam EnvT
+  //!   **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
+  //!
   //! @param[in] d_temp_storage
   //!   @devicestorage
   //!
@@ -306,34 +311,35 @@ struct DeviceCopy
   //! @param[in] mdspan_out
   //!   Destination mdspan where the data will be copied
   //!
-  //! @param[in] stream
-  //!   @rst
-  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
-  //!   @endrst
-  //!
-  //! @returns
-  //!   @rst
-  //!   **cudaSuccess** on success, **cudaErrorInvalidValue** if mdspan extents don't match, or error code on failure
-  //!   @endrst
+  //! @param[in] env
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
   template <typename T_In,
             typename Extents_In,
             typename Layout_In,
             typename Accessor_In,
             typename T_Out,
             typename Extents_Out,
             typename Layout_Out,
-            typename Accessor_Out>
+            typename Accessor_Out,
+            typename EnvT = ::cuda::std::execution::env<>>
   [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
   Copy(void* d_temp_storage,
        size_t& temp_storage_bytes,
        ::cuda::std::mdspan<T_In, Extents_In, Layout_In, Accessor_In> mdspan_in,
        ::cuda::std::mdspan<T_Out, Extents_Out, Layout_Out, Accessor_Out> mdspan_out,
-       ::cudaStream_t stream = nullptr)
+       const EnvT& env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceCopy::Copy");
     _CCCL_ASSERT(mdspan_in.extents() == mdspan_out.extents(), "mdspan extents must be equal");
     _CCCL_ASSERT((mdspan_in.data_handle() != nullptr && mdspan_out.data_handle() != nullptr) || mdspan_in.size() == 0,
                  "mdspan data handle must not be nullptr if the size is not 0");
+
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return ::cudaSuccess;
+    }
+
     // Check for memory overlap between input and output mdspans
     if (mdspan_in.size() != 0)
     {
@@ -344,12 +350,8 @@ struct DeviceCopy
       // TODO(fbusato): replace with __are_ptrs_overlapping
       _CCCL_ASSERT(!(in_end >= out_start && out_end >= in_start), "mdspan memory ranges must not overlap");
     }
-    if (d_temp_storage == nullptr)
-    {
-      temp_storage_bytes = 1;
-      return ::cudaSuccess;
-    }
-    return detail::copy_mdspan::copy(mdspan_in, mdspan_out, stream);
+
+    return detail::copy_mdspan::copy(mdspan_in, mdspan_out, env);
   }
 
   //! @rst
@@ -435,12 +437,14 @@ struct DeviceCopy
   [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
   Copy(::cuda::std::mdspan<T_In, Extents_In, Layout_In, Accessor_In> mdspan_in,
        ::cuda::std::mdspan<T_Out, Extents_Out, Layout_Out, Accessor_Out> mdspan_out,
-       EnvT env = {})
+       const EnvT& env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceCopy::Copy");
     _CCCL_ASSERT(mdspan_in.extents() == mdspan_out.extents(), "mdspan extents must be equal");
     _CCCL_ASSERT((mdspan_in.data_handle() != nullptr && mdspan_out.data_handle() != nullptr) || mdspan_in.size() == 0,
                  "mdspan data handle must not be nullptr if the size is not 0");
+
+    // Check for memory overlap between input and output mdspans
     if (mdspan_in.size() != 0)
     {
       auto in_start  = mdspan_in.data_handle();

@@ -59,7 +59,7 @@ template <typename T_In,
 [[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
 copy(::cuda::std::mdspan<T_In, E_In, L_In, A_In> mdspan_in,
      ::cuda::std::mdspan<T_Out, E_Out, L_Out, A_Out> mdspan_out,
-     EnvT env = {})
+     const EnvT& env = {})
 {
   if (mdspan_in.is_exhaustive() && mdspan_out.is_exhaustive()
       && detail::have_same_strides(mdspan_in.mapping(), mdspan_out.mapping()))

@@ -11,6 +11,7 @@
 
 #include <cuda/devices>
 #include <cuda/iterator>
+#include <cuda/std/execution>
 
 #include <cstdint>
 
@@ -162,18 +163,92 @@ try
     static_cast<std::uint8_t*>(thrust::raw_pointer_cast(d_out.data()))};
   auto d_range_dsts = cuda::transform_iterator(d_range_dst_offsets.begin(), dst_transform_op);
 
-  // Invoke device-side algorithm
-  copy_batched(d_range_srcs, d_range_dsts, d_range_sizes.begin(), num_ranges);
-
-  // Prepare CPU-side result for verification
-  for (range_offset_t i = 0; i < num_ranges; i++)
+  SECTION("With environment")
   {
-    auto out_begin = h_out.begin() + h_dst_offsets[i];
-    auto out_end   = out_begin + h_range_sizes[i];
-    std::fill(out_begin, out_end, static_cast<std::uint8_t>(i));
+    // Invoke device-side algorithm
+    copy_batched(d_range_srcs, d_range_dsts, d_range_sizes.begin(), num_ranges);
+
+    // Prepare CPU-side result for verification
+    for (range_offset_t i = 0; i < num_ranges; i++)
+    {
+      auto out_begin = h_out.begin() + h_dst_offsets[i];
+      auto out_end   = out_begin + h_range_sizes[i];
+      std::fill(out_begin, out_end, static_cast<std::uint8_t>(i));
+    }
+    REQUIRE(d_out == h_out);
   }
 
-  REQUIRE(d_out == h_out);
+  SECTION("With user provided memory and environment")
+  {
+    auto test_copy_batched = [&](const auto& env) {
+      size_t num_bytes = 0;
+      auto error       = cub::DeviceCopy::Batched(
+        static_cast<void*>(nullptr), num_bytes, d_range_srcs, d_range_dsts, d_range_sizes.begin(), num_ranges, env);
+      REQUIRE(error == cudaSuccess);
+      REQUIRE(cudaSuccess == cudaPeekAtLastError());
+      REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+
+      auto d_temp        = c2h::device_vector<uint8_t>(num_bytes, thrust::no_init);
+      void* temp_storage = thrust::raw_pointer_cast(d_temp.data());
+
+      error = cub::DeviceCopy::Batched(
+        temp_storage, num_bytes, d_range_srcs, d_range_dsts, d_range_sizes.begin(), num_ranges, env);
+      REQUIRE(error == cudaSuccess);
+      REQUIRE(cudaSuccess == cudaPeekAtLastError());
+      REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+
+      // Prepare CPU-side result for verification
+      for (range_offset_t i = 0; i < num_ranges; i++)
+      {
+        auto out_begin = h_out.begin() + h_dst_offsets[i];
+        auto out_end   = out_begin + h_range_sizes[i];
+        std::fill(out_begin, out_end, static_cast<std::uint8_t>(i));
+      }
+      REQUIRE(d_out == h_out);
+    };
+
+    int current_device;
+    error = cudaGetDevice(&current_device);
+    REQUIRE(error == cudaSuccess);
+
+    SECTION("DeviceCopy::Batched works with cudaStream_t")
+    {
+      cuda::stream stream{cuda::devices[current_device]};
+      test_copy_batched(stream.get());
+    }
+
+    SECTION("DeviceCopy::Batched works with cuda::stream")
+    {
+      cuda::stream stream{cuda::devices[current_device]};
+      test_copy_batched(stream);
+    }
+
+    SECTION("DeviceCopy::Batched works with cuda::stream_ref")
+    {
+      cuda::stream stream{cuda::devices[current_device]};
+      cuda::stream_ref stream_ref{stream};
+      test_copy_batched(stream_ref);
+    }
+
+    SECTION("DeviceCopy::Batched works with cuda::std::execution::env")
+    {
+      cuda::std::execution::env env{};
+      test_copy_batched(env);
+    }
+
+    SECTION("DeviceCopy::Batched works with cuda::execution::gpu")
+    {
+      const auto policy = cuda::execution::gpu;
+      test_copy_batched(policy);
+    }
+
+    SECTION("DeviceCopy::Batched works with cuda::execution::gpu with stream")
+    {
+      cuda::stream stream{cuda::devices[current_device]};
+      const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);
+      test_copy_batched(policy);
+    }
+  }
 }
 catch (std::bad_alloc& e)
 {

@@ -10,7 +10,9 @@
 #include <thrust/host_vector.h>
 #include <thrust/sequence.h>
 
+#include <cuda/devices>
 #include <cuda/std/array>
+#include <cuda/std/execution>
 #include <cuda/std/mdspan>
 
 #include <c2h/catch2_test_helper.h>
@@ -58,6 +60,91 @@ C2H_TEST("DeviceCopy::Copy: 1D, 2D, 4D mdspan with matching layouts", "[copy][md
   REQUIRE(d_input == d_output);
 }
 
+C2H_TEST("DeviceCopy::Copy: 1D, 2D, 4D mdspan with matching layouts and user provided memory", "[copy][mdspan]")
+{
+  constexpr size_t num_items = 10000;
+  c2h::device_vector<int> d_input(num_items, thrust::no_init);
+  c2h::device_vector<int> d_output(num_items, thrust::no_init);
+  thrust::sequence(d_input.begin(), d_input.end(), 0);
+  thrust::fill(d_output.begin(), d_output.end(), 42);
+
+  // We do not really need any device storage
+  auto d_temp        = c2h::device_vector<uint8_t>(1, thrust::no_init);
+  void* temp_storage = thrust::raw_pointer_cast(d_temp.data());
+  size_t num_bytes   = 1;
+
+  auto test_mdspan_copy = [&](const auto& env) {
+    auto mdspan_in1  = cuda::std::mdspan{thrust::raw_pointer_cast(d_input.data()), dims_1d_t{num_items}};
+    auto mdspan_out1 = cuda::std::mdspan{thrust::raw_pointer_cast(d_output.data()), dims_1d_t{num_items}};
+    auto error       = cub::DeviceCopy::Copy(temp_storage, num_bytes, mdspan_in1, mdspan_out1, env);
+    REQUIRE(error == cudaSuccess);
+    REQUIRE(cudaSuccess == cudaPeekAtLastError());
+    REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+    REQUIRE(d_input == d_output);
+    thrust::fill(d_output.begin(), d_output.end(), 42);
+
+    using mdspan_2d_left_t = cuda::std::mdspan<int, dims_2d_t, cuda::std::layout_left>;
+    auto d_mdspan_in2      = mdspan_2d_left_t(thrust::raw_pointer_cast(d_input.data()), dims_2d_t{100, 100});
+    auto d_mdspan_out2     = mdspan_2d_left_t(thrust::raw_pointer_cast(d_output.data()), dims_2d_t{100, 100});
+    error                  = cub::DeviceCopy::Copy(temp_storage, num_bytes, d_mdspan_in2, d_mdspan_out2, env);
+    REQUIRE(error == cudaSuccess);
+    REQUIRE(cudaSuccess == cudaPeekAtLastError());
+    REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+    REQUIRE(d_input == d_output);
+    thrust::fill(d_output.begin(), d_output.end(), 42);
+
+    auto mdspan_in3  = cuda::std::mdspan(thrust::raw_pointer_cast(d_input.data()), dims_4d_t{10, 10, 10, 10});
+    auto mdspan_out3 = cuda::std::mdspan(thrust::raw_pointer_cast(d_output.data()), dims_4d_t{10, 10, 10, 10});
+    error            = cub::DeviceCopy::Copy(temp_storage, num_bytes, mdspan_in3, mdspan_out3, env);
+    REQUIRE(error == cudaSuccess);
+    REQUIRE(cudaSuccess == cudaPeekAtLastError());
+    REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+    REQUIRE(d_input == d_output);
+  };
+
+  int current_device;
+  error = cudaGetDevice(&current_device);
+  REQUIRE(error == cudaSuccess);
+
+  SECTION("DeviceCopy::Copy works with cudaStream_t")
+  {
+    cuda::stream stream{cuda::devices[current_device]};
+    test_mdspan_copy(stream.get());
+  }
+
+  SECTION("DeviceCopy::Copy works with cuda::stream")
+  {
+    cuda::stream stream{cuda::devices[current_device]};
+    test_mdspan_copy(stream);
+  }
+
+  SECTION("DeviceCopy::Copy works with cuda::stream_ref")
+  {
+    cuda::stream stream{cuda::devices[current_device]};
+    cuda::stream_ref stream_ref{stream};
+    test_mdspan_copy(stream_ref);
+  }
+
+  SECTION("DeviceCopy::Copy works with cuda::std::execution::env")
+  {
+    cuda::std::execution::env env{};
+    test_mdspan_copy(env);
+  }
+
+  SECTION("DeviceCopy::Copy works with cuda::execution::gpu")
+  {
+    const auto policy = cuda::execution::gpu;
+    test_mdspan_copy(policy);
+  }
+
+  SECTION("DeviceCopy::Copy works with cuda::execution::gpu with stream")
+  {
+    cuda::stream stream{cuda::devices[current_device]};
+    const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);
+    test_mdspan_copy(policy);
+  }
+}
+
 C2H_TEST("DeviceCopy::Copy: 2D, 4D mdspan with compatible layouts", "[copy][mdspan]")
 {
   constexpr size_t num_items = 10000;