Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 36 additions & 32 deletions cub/cub/device/device_copy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ struct DeviceCopy
//! **[inferred]** Device-accessible random-access input iterator type providing the number of items to be
//! copied for each pair of ranges
//!
//! @tparam EnvT
//! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
//!
//! @param[in] d_temp_storage
//! @devicestorage
//!
Expand All @@ -140,29 +143,28 @@ struct DeviceCopy
//! @param[in] num_ranges
//! The total number of range pairs
//!
//! @param[in] stream
//! @rst
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
//! @endrst
template <typename InputIt, typename OutputIt, typename SizeIteratorT>
//! @param[in] env
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
template <typename InputIt, typename OutputIt, typename SizeIteratorT, typename EnvT = ::cuda::std::execution::env<>>
CUB_RUNTIME_FUNCTION static cudaError_t Batched(
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIt input_it,
OutputIt output_it,
SizeIteratorT sizes,
::cuda::std::int64_t num_ranges,
cudaStream_t stream = nullptr)
const EnvT& env = {})
{
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceCopy::Batched");

// Integer type large enough to hold any offset in [0, num_thread_blocks_launched), where a safe
// upper bound on num_thread_blocks_launched can be assumed to be given by
// IDIV_CEIL(num_ranges, 64)
using BlockOffsetT = uint32_t;
Comment on lines -159 to -162

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: Why do we need to remove this comment?

using BlockOffsetT = uint32_t;
using default_policy_selector = detail::batch_memcpy::policy_selector;

return detail::batch_memcpy::dispatch<CopyAlg::Copy, BlockOffsetT>(
d_temp_storage, temp_storage_bytes, input_it, output_it, sizes, num_ranges, stream);
return detail::dispatch_with_env_and_tuning<default_policy_selector>(
d_temp_storage, temp_storage_bytes, env, [&](auto policy_selector, void* storage, size_t& bytes, auto stream) {
return detail::batch_memcpy::dispatch<CopyAlg::Copy, BlockOffsetT>(
storage, bytes, input_it, output_it, sizes, num_ranges, stream, policy_selector);
});
}

//! @rst
Expand Down Expand Up @@ -224,8 +226,8 @@ struct DeviceCopy
//! @param[in] env
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
template <typename InputIt, typename OutputIt, typename SizeIteratorT, typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
Batched(InputIt input_it, OutputIt output_it, SizeIteratorT sizes, ::cuda::std::int64_t num_ranges, EnvT env = {})
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Batched(
InputIt input_it, OutputIt output_it, SizeIteratorT sizes, ::cuda::std::int64_t num_ranges, const EnvT& env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceCopy::Batched");

Expand Down Expand Up @@ -294,6 +296,9 @@ struct DeviceCopy
//! @tparam Accessor_Out
//! **[inferred]** The accessor type of the destination mdspan
//!
//! @tparam EnvT
//! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
//!
//! @param[in] d_temp_storage
//! @devicestorage
//!
Expand All @@ -306,34 +311,35 @@ struct DeviceCopy
//! @param[in] mdspan_out
//! Destination mdspan where the data will be copied
//!
//! @param[in] stream
//! @rst
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
//! @endrst
//!
//! @returns
//! @rst
//! **cudaSuccess** on success, **cudaErrorInvalidValue** if mdspan extents don't match, or error code on failure
//! @endrst
//! @param[in] env
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
template <typename T_In,
typename Extents_In,
typename Layout_In,
typename Accessor_In,
typename T_Out,
typename Extents_Out,
typename Layout_Out,
typename Accessor_Out>
typename Accessor_Out,
typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
Copy(void* d_temp_storage,
size_t& temp_storage_bytes,
::cuda::std::mdspan<T_In, Extents_In, Layout_In, Accessor_In> mdspan_in,
::cuda::std::mdspan<T_Out, Extents_Out, Layout_Out, Accessor_Out> mdspan_out,
::cudaStream_t stream = nullptr)
const EnvT& env = {})
{
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceCopy::Copy");
_CCCL_ASSERT(mdspan_in.extents() == mdspan_out.extents(), "mdspan extents must be equal");
_CCCL_ASSERT((mdspan_in.data_handle() != nullptr && mdspan_out.data_handle() != nullptr) || mdspan_in.size() == 0,
"mdspan data handle must not be nullptr if the size is not 0");

if (d_temp_storage == nullptr)
{
temp_storage_bytes = 1;
return ::cudaSuccess;
}

// Check for memory overlap between input and output mdspans
if (mdspan_in.size() != 0)
{
Expand All @@ -344,12 +350,8 @@ struct DeviceCopy
// TODO(fbusato): replace with __are_ptrs_overlapping
_CCCL_ASSERT(!(in_end >= out_start && out_end >= in_start), "mdspan memory ranges must not overlap");
}
if (d_temp_storage == nullptr)
{
temp_storage_bytes = 1;
return ::cudaSuccess;
}
return detail::copy_mdspan::copy(mdspan_in, mdspan_out, stream);

return detail::copy_mdspan::copy(mdspan_in, mdspan_out, env);
}

//! @rst
Expand Down Expand Up @@ -435,12 +437,14 @@ struct DeviceCopy
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
Copy(::cuda::std::mdspan<T_In, Extents_In, Layout_In, Accessor_In> mdspan_in,
::cuda::std::mdspan<T_Out, Extents_Out, Layout_Out, Accessor_Out> mdspan_out,
EnvT env = {})
const EnvT& env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceCopy::Copy");
_CCCL_ASSERT(mdspan_in.extents() == mdspan_out.extents(), "mdspan extents must be equal");
_CCCL_ASSERT((mdspan_in.data_handle() != nullptr && mdspan_out.data_handle() != nullptr) || mdspan_in.size() == 0,
"mdspan data handle must not be nullptr if the size is not 0");

// Check for memory overlap between input and output mdspans
if (mdspan_in.size() != 0)
{
auto in_start = mdspan_in.data_handle();
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_copy_mdspan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ template <typename T_In,
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
copy(::cuda::std::mdspan<T_In, E_In, L_In, A_In> mdspan_in,
::cuda::std::mdspan<T_Out, E_Out, L_Out, A_Out> mdspan_out,
EnvT env = {})
const EnvT& env = {})
{
if (mdspan_in.is_exhaustive() && mdspan_out.is_exhaustive()
&& detail::have_same_strides(mdspan_in.mapping(), mdspan_out.mapping()))
Expand Down
93 changes: 84 additions & 9 deletions cub/test/catch2_test_device_copy_batched.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include <cuda/devices>
#include <cuda/iterator>
#include <cuda/std/execution>

#include <cstdint>

Expand Down Expand Up @@ -162,18 +163,92 @@ try
static_cast<std::uint8_t*>(thrust::raw_pointer_cast(d_out.data()))};
auto d_range_dsts = cuda::transform_iterator(d_range_dst_offsets.begin(), dst_transform_op);

// Invoke device-side algorithm
copy_batched(d_range_srcs, d_range_dsts, d_range_sizes.begin(), num_ranges);

// Prepare CPU-side result for verification
for (range_offset_t i = 0; i < num_ranges; i++)
SECTION("With environment")
{
auto out_begin = h_out.begin() + h_dst_offsets[i];
auto out_end = out_begin + h_range_sizes[i];
std::fill(out_begin, out_end, static_cast<std::uint8_t>(i));
// Invoke device-side algorithm
copy_batched(d_range_srcs, d_range_dsts, d_range_sizes.begin(), num_ranges);

// Prepare CPU-side result for verification
for (range_offset_t i = 0; i < num_ranges; i++)
{
auto out_begin = h_out.begin() + h_dst_offsets[i];
auto out_end = out_begin + h_range_sizes[i];
std::fill(out_begin, out_end, static_cast<std::uint8_t>(i));
}
REQUIRE(d_out == h_out);
}

REQUIRE(d_out == h_out);
SECTION("With user provided memory and environment")
{
auto test_copy_batched = [&](const auto& env) {
size_t num_bytes = 0;
auto error = cub::DeviceCopy::Batched(
static_cast<void*>(nullptr), num_bytes, d_range_srcs, d_range_dsts, d_range_sizes.begin(), num_ranges, env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());

auto d_temp = c2h::device_vector<uint8_t>(num_bytes, thrust::no_init);
void* temp_storage = thrust::raw_pointer_cast(d_temp.data());

error = cub::DeviceCopy::Batched(
temp_storage, num_bytes, d_range_srcs, d_range_dsts, d_range_sizes.begin(), num_ranges, env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());

// Prepare CPU-side result for verification
for (range_offset_t i = 0; i < num_ranges; i++)
{
auto out_begin = h_out.begin() + h_dst_offsets[i];
auto out_end = out_begin + h_range_sizes[i];
std::fill(out_begin, out_end, static_cast<std::uint8_t>(i));
}
REQUIRE(d_out == h_out);
Comment thread
coderabbitai[bot] marked this conversation as resolved.
};

int current_device;
error = cudaGetDevice(&current_device);
REQUIRE(error == cudaSuccess);

SECTION("DeviceCopy::Batched works with cudaStream_t")
{
cuda::stream stream{cuda::devices[current_device]};
test_copy_batched(stream.get());
}

SECTION("DeviceCopy::Batched works with cuda::stream")
{
cuda::stream stream{cuda::devices[current_device]};
test_copy_batched(stream);
}

SECTION("DeviceCopy::Batched works with cuda::stream_ref")
{
cuda::stream stream{cuda::devices[current_device]};
cuda::stream_ref stream_ref{stream};
test_copy_batched(stream_ref);
}

SECTION("DeviceCopy::Batched works with cuda::std::execution::env")
{
cuda::std::execution::env env{};
test_copy_batched(env);
}

SECTION("DeviceCopy::Batched works with cuda::execution::gpu")
{
const auto policy = cuda::execution::gpu;
test_copy_batched(policy);
}

SECTION("DeviceCopy::Batched works with cuda::execution::gpu with stream")
{
cuda::stream stream{cuda::devices[current_device]};
const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);
test_copy_batched(policy);
}
}
}
catch (std::bad_alloc& e)
{
Expand Down
87 changes: 87 additions & 0 deletions cub/test/catch2_test_device_copy_mdspan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
#include <thrust/host_vector.h>
#include <thrust/sequence.h>

#include <cuda/devices>
#include <cuda/std/array>
#include <cuda/std/execution>
#include <cuda/std/mdspan>

#include <c2h/catch2_test_helper.h>
Expand Down Expand Up @@ -58,6 +60,91 @@ C2H_TEST("DeviceCopy::Copy: 1D, 2D, 4D mdspan with matching layouts", "[copy][md
REQUIRE(d_input == d_output);
}

C2H_TEST("DeviceCopy::Copy: 1D, 2D, 4D mdspan with matching layouts and user provided memory", "[copy][mdspan]")
{
constexpr size_t num_items = 10000;
c2h::device_vector<int> d_input(num_items, thrust::no_init);
c2h::device_vector<int> d_output(num_items, thrust::no_init);
thrust::sequence(d_input.begin(), d_input.end(), 0);
thrust::fill(d_output.begin(), d_output.end(), 42);

// We do not really need any device storage
auto d_temp = c2h::device_vector<uint8_t>(1, thrust::no_init);
void* temp_storage = thrust::raw_pointer_cast(d_temp.data());
size_t num_bytes = 1;

auto test_mdspan_copy = [&](const auto& env) {
auto mdspan_in1 = cuda::std::mdspan{thrust::raw_pointer_cast(d_input.data()), dims_1d_t{num_items}};
auto mdspan_out1 = cuda::std::mdspan{thrust::raw_pointer_cast(d_output.data()), dims_1d_t{num_items}};
auto error = cub::DeviceCopy::Copy(temp_storage, num_bytes, mdspan_in1, mdspan_out1, env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());
REQUIRE(d_input == d_output);
thrust::fill(d_output.begin(), d_output.end(), 42);

using mdspan_2d_left_t = cuda::std::mdspan<int, dims_2d_t, cuda::std::layout_left>;
auto d_mdspan_in2 = mdspan_2d_left_t(thrust::raw_pointer_cast(d_input.data()), dims_2d_t{100, 100});
auto d_mdspan_out2 = mdspan_2d_left_t(thrust::raw_pointer_cast(d_output.data()), dims_2d_t{100, 100});
error = cub::DeviceCopy::Copy(temp_storage, num_bytes, d_mdspan_in2, d_mdspan_out2, env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());
REQUIRE(d_input == d_output);
thrust::fill(d_output.begin(), d_output.end(), 42);

auto mdspan_in3 = cuda::std::mdspan(thrust::raw_pointer_cast(d_input.data()), dims_4d_t{10, 10, 10, 10});
auto mdspan_out3 = cuda::std::mdspan(thrust::raw_pointer_cast(d_output.data()), dims_4d_t{10, 10, 10, 10});
error = cub::DeviceCopy::Copy(temp_storage, num_bytes, mdspan_in3, mdspan_out3, env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());
REQUIRE(d_input == d_output);
};
Comment thread
miscco marked this conversation as resolved.

int current_device;
error = cudaGetDevice(&current_device);
REQUIRE(error == cudaSuccess);

SECTION("DeviceCopy::Copy works with cudaStream_t")
{
cuda::stream stream{cuda::devices[current_device]};
test_mdspan_copy(stream.get());
}

SECTION("DeviceCopy::Copy works with cuda::stream")
{
cuda::stream stream{cuda::devices[current_device]};
test_mdspan_copy(stream);
}

SECTION("DeviceCopy::Copy works with cuda::stream_ref")
{
cuda::stream stream{cuda::devices[current_device]};
cuda::stream_ref stream_ref{stream};
test_mdspan_copy(stream_ref);
}

SECTION("DeviceCopy::Copy works with cuda::std::execution::env")
{
cuda::std::execution::env env{};
test_mdspan_copy(env);
}

SECTION("DeviceCopy::Copy works with cuda::execution::gpu")
{
const auto policy = cuda::execution::gpu;
test_mdspan_copy(policy);
}

SECTION("DeviceCopy::Copy works with cuda::execution::gpu with stream")
{
cuda::stream stream{cuda::devices[current_device]};
const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);
test_mdspan_copy(policy);
}
}

C2H_TEST("DeviceCopy::Copy: 2D, 4D mdspan with compatible layouts", "[copy][mdspan]")
{
constexpr size_t num_items = 10000;
Expand Down
Loading