Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
222 changes: 222 additions & 0 deletions cudax/include/cuda/experimental/__multi_gpu/concepts.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_EXPERIMENTAL___MULTI_GPU_CONCEPTS_H
#define _CUDA_EXPERIMENTAL___MULTI_GPU_CONCEPTS_H

#include <cuda/std/detail/__config> // IWYU pragma: export

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/__stream/stream_ref.h>
#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/__concepts/convertible_to.h>
#include <cuda/std/__concepts/same_as.h>
#include <cuda/std/__functional/operations.h>
#include <cuda/std/__utility/declval.h>
#include <cuda/std/cstdint>

#include <cuda/std/__cccl/prologue.h>

// NOLINTBEGIN(bugprone-reserved-identifier)
namespace cuda::experimental
{
// Needed because the C++17 concept emulation can't handle the implicit first template
// parameter of real concepts.
Comment thread
Jacobfaib marked this conversation as resolved.
template <class _Tp>
_CCCL_CONCEPT __convertible_to_int32 = ::cuda::std::convertible_to<_Tp, ::cuda::std::int32_t>;

// Requires communicator<_Comm> to be checked first, but that invokes a circular dependency
template <class _Comm, class _Ptr = void*>
_CCCL_CONCEPT __has_send = _CCCL_REQUIRES_EXPR(
(_Comm, _Ptr),
_Comm& __comm,
_Ptr __buf,
::cuda::std::size_t __count,
::cuda::std::int32_t __peer,
::cuda::stream_ref __stream)(_Same_as(void) __comm.send(
::cuda::std::declval<typename _Comm::group_token_type&>(), __buf, __count, __peer, __stream));

// Requires communicator<_Comm> to be checked first, but that invokes a circular dependency
template <class _Comm, class _Ptr = void*>
_CCCL_CONCEPT __has_recv = _CCCL_REQUIRES_EXPR(
(_Comm, _Ptr),
_Comm& __comm,
_Ptr __buf,
::cuda::std::size_t __count,
::cuda::std::int32_t __peer,
::cuda::stream_ref __stream)(_Same_as(void) __comm.recv(
::cuda::std::declval<typename _Comm::group_token_type&>(), __buf, __count, __peer, __stream));

template <class _Comm>
_CCCL_CONCEPT communicator = _CCCL_REQUIRES_EXPR((_Comm), _Comm& __comm)(
typename(typename _Comm::native_handle_type),
_Same_as(typename _Comm::native_handle_type) __comm.native_handle(),
noexcept(__comm.native_handle()),
_Satisfies(__convertible_to_int32) __comm.rank(),
_Satisfies(__convertible_to_int32) __comm.size(),
typename(typename _Comm::group_token_type),
_Same_as(typename _Comm::group_token_type) __comm.group_token(),
requires(__has_send<_Comm>),
requires(__has_recv<_Comm>) //
);

// ==========================================================================================

// Use a typed pointer as default here, since the op may need to instantiated with a
// dereferenceable pointer type for reductions
template <class _Comm, class _Ptr = int*>
_CCCL_CONCEPT __has_reduce = _CCCL_REQUIRES_EXPR(
(_Comm, _Ptr),
_Comm& __comm,
_Ptr __sendbuff,
_Ptr __recvbuff,
::cuda::std::size_t __count,
::cuda::std::plus<> __op,
::cuda::std::int32_t __root,
::cuda::stream_ref __stream)(
requires(communicator<_Comm>),
_Same_as(void) __comm.reduce(
::cuda::std::declval<typename _Comm::group_token_type&>(), __sendbuff, __recvbuff, __count, __op, __root, __stream));

// ==========================================================================================

// Use a typed pointer as default here, since the op may need to instantiated with a
// dereferenceable pointer type for reductions
template <class _Comm, class _Ptr = int*>
_CCCL_CONCEPT __has_all_reduce = _CCCL_REQUIRES_EXPR(
(_Comm, _Ptr),
_Comm& __comm,
_Ptr __sendbuff,
_Ptr __recvbuff,
::cuda::std::size_t __count,
::cuda::std::plus<> __op,
::cuda::stream_ref __stream)(
requires(communicator<_Comm>),
_Same_as(void) __comm.all_reduce(
::cuda::std::declval<typename _Comm::group_token_type&>(), __sendbuff, __recvbuff, __count, __op, __stream));

// ==========================================================================================

template <class _Comm, class _Ptr = void*>
_CCCL_CONCEPT __has_gather = _CCCL_REQUIRES_EXPR(
(_Comm, _Ptr),
_Comm& __comm,
_Ptr __sendbuff,
_Ptr __recvbuff,
::cuda::std::size_t __count,
::cuda::std::int32_t __root,
::cuda::stream_ref __stream)(
requires(communicator<_Comm>),
_Same_as(void) __comm.gather(
::cuda::std::declval<typename _Comm::group_token_type&>(), __sendbuff, __recvbuff, __count, __root, __stream));

// ==========================================================================================

template <class _Comm, class _Ptr = void*>
_CCCL_CONCEPT __has_gather_v = _CCCL_REQUIRES_EXPR(
(_Comm, _Ptr),
_Comm& __comm,
_Ptr __sendbuff,
::cuda::std::size_t __send_count,
_Ptr __recvbuff,
const ::cuda::std::size_t* __recv_counts,
const ::cuda::std::size_t* __displs,
::cuda::std::int32_t __root,
::cuda::stream_ref __stream)(
requires(communicator<_Comm>),
_Same_as(void) __comm.gather_v(
::cuda::std::declval<typename _Comm::group_token_type&>(),
__sendbuff,
__send_count,
__recvbuff,
__recv_counts,
__displs,
__root,
__stream));

// ==========================================================================================

template <class _Comm, class _Ptr = void*>
_CCCL_CONCEPT __has_all_gather = _CCCL_REQUIRES_EXPR(
(_Comm, _Ptr),
_Comm& __comm,
_Ptr __sendbuff,
_Ptr __recvbuff,
::cuda::std::size_t __count,
::cuda::stream_ref __stream)(
requires(communicator<_Comm>),
_Same_as(void) __comm.all_gather(
::cuda::std::declval<typename _Comm::group_token_type&>(), __sendbuff, __recvbuff, __count, __stream));

// ==========================================================================================

template <class _Comm, class _Ptr = void*>
_CCCL_CONCEPT __has_broadcast = _CCCL_REQUIRES_EXPR(
(_Comm, _Ptr),
_Comm& __comm,
_Ptr __sendbuff,
_Ptr __recvbuff,
::cuda::std::size_t __count,
::cuda::std::int32_t __root,
::cuda::stream_ref __stream)(
requires(communicator<_Comm>),
_Same_as(void) __comm.broadcast(
::cuda::std::declval<typename _Comm::group_token_type&>(), __sendbuff, __recvbuff, __count, __root, __stream));

// ==========================================================================================

template <class _Comm, class _Ptr = void*>
_CCCL_CONCEPT __has_all_to_all = _CCCL_REQUIRES_EXPR(
(_Comm, _Ptr),
_Comm& __comm,
_Ptr __sendbuff,
_Ptr __recvbuff,
::cuda::std::size_t __count,
::cuda::stream_ref __stream)(
requires(communicator<_Comm>),
_Same_as(void) __comm.all_to_all(
::cuda::std::declval<typename _Comm::group_token_type&>(), __sendbuff, __recvbuff, __count, __stream));

// ==========================================================================================

template <class _Comm, class _Ptr = void*>
_CCCL_CONCEPT __has_all_to_all_v = _CCCL_REQUIRES_EXPR(
(_Comm, _Ptr),
_Comm& __comm,
_Ptr __sendbuff,
const ::cuda::std::size_t* __send_counts,
const ::cuda::std::size_t* __send_displs,
_Ptr __recvbuff,
const ::cuda::std::size_t* __recv_counts,
const ::cuda::std::size_t* __recv_displs,
::cuda::stream_ref __stream)(
requires(communicator<_Comm>),
_Same_as(void) __comm.all_to_all_v(
::cuda::std::declval<typename _Comm::group_token_type&>(),
__sendbuff,
__send_counts,
__send_displs,
__recvbuff,
__recv_counts,
__recv_displs,
__stream));
} // namespace cuda::experimental
// NOLINTEND(bugprone-reserved-identifier)

#include <cuda/std/__cccl/epilogue.h>

#endif // _CUDA_EXPERIMENTAL___MULTI_GPU_CONCEPTS_H
2 changes: 2 additions & 0 deletions cudax/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ cudax_add_catch2_test(test_target algorithm
algorithm/copy.cu
)

add_subdirectory(multi_gpu)

cudax_add_catch2_test(test_target group.mapping.binary_partition
group/mapping/binary_partition.cu
)
Expand Down
11 changes: 11 additions & 0 deletions cudax/test/multi_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#===----------------------------------------------------------------------===##
#
# Part of CUDA Experimental in CUDA C++ Core Libraries,
# under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
#
#===----------------------------------------------------------------------===##

add_subdirectory(concepts)
38 changes: 38 additions & 0 deletions cudax/test/multi_gpu/concepts/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#===----------------------------------------------------------------------===##
#
# Part of CUDA Experimental in CUDA C++ Core Libraries,
# under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
#
#===----------------------------------------------------------------------===##

set(
multi_gpu_concepts_test_sources
communicator.cu
has_all_gather.cu
has_all_reduce.cu
has_all_to_all.cu
has_all_to_all_v.cu
has_broadcast.cu
has_gather.cu
has_gather_v.cu
has_recv.cu
has_reduce.cu
has_send.cu
)

function(cudax_add_multi_gpu_concepts_test target_name_var source)
cmake_path(GET source STEM filename)
set(test_target "cudax.test.multi_gpu.concepts.${filename}")

cccl_add_executable(${test_target} SOURCES "${source}" ADD_CTEST)
target_link_libraries(${test_target} PRIVATE cudax.compiler_interface)

set(${target_name_var} ${test_target} PARENT_SCOPE)
endfunction()

foreach (source IN LISTS multi_gpu_concepts_test_sources)
cudax_add_multi_gpu_concepts_test(test_target "${source}")
endforeach()
66 changes: 66 additions & 0 deletions cudax/test/multi_gpu/concepts/collective_concepts_common.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDAX_TEST_MULTI_GPU_COLLECTIVE_CONCEPTS_COMMON_CUH
#define _CUDAX_TEST_MULTI_GPU_COLLECTIVE_CONCEPTS_COMMON_CUH

#include <cuda/__stream/stream_ref.h>
#include <cuda/std/__cstddef/types.h>
#include <cuda/std/cstdint>

#include "concepts_common.cuh"
Comment thread
Jacobfaib marked this conversation as resolved.

namespace cudax_multi_gpu_concepts
{
struct collective_communicator_model : communicator_model
{
template <class Tp, class Op>
void reduce(group_token_type&, Tp*, Tp*, ::cuda::std::size_t, Op, ::cuda::std::int32_t, ::cuda::stream_ref);

template <class Tp, class Op>
void all_reduce(group_token_type&, Tp*, Tp*, ::cuda::std::size_t, Op, ::cuda::stream_ref);

template <class Tp>
void gather(group_token_type&, Tp*, Tp*, ::cuda::std::size_t, ::cuda::std::int32_t, ::cuda::stream_ref);

template <class Tp>
void gather_v(
group_token_type&,
Tp*,
::cuda::std::size_t,
Tp*,
const ::cuda::std::size_t*,
const ::cuda::std::size_t*,
::cuda::std::int32_t,
::cuda::stream_ref);

template <class Tp>
void all_gather(group_token_type&, Tp*, Tp*, ::cuda::std::size_t, ::cuda::stream_ref);

template <class Tp>
void broadcast(group_token_type&, Tp*, Tp*, ::cuda::std::size_t, ::cuda::std::int32_t, ::cuda::stream_ref);

template <class Tp>
void all_to_all(group_token_type&, Tp*, Tp*, ::cuda::std::size_t, ::cuda::stream_ref);

template <class Tp>
void all_to_all_v(
group_token_type&,
Tp*,
const ::cuda::std::size_t*,
const ::cuda::std::size_t*,
Tp*,
const ::cuda::std::size_t*,
const ::cuda::std::size_t*,
::cuda::stream_ref);
};
} // namespace cudax_multi_gpu_concepts

#endif // _CUDAX_TEST_MULTI_GPU_COLLECTIVE_CONCEPTS_COMMON_CUH
Loading
Loading