Skip to content

Add Cooperative Groups API integration #87

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -14,7 +14,7 @@ members = [
]

exclude = [
"crates/optix/examples/common"
"crates/optix/examples/common",
]

[profile.dev.package.rustc_codegen_nvvm]
10 changes: 9 additions & 1 deletion crates/cuda_builder/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "cuda_builder"
version = "0.3.0"
version = "0.4.0"
edition = "2021"
authors = ["Riccardo D'Ambrosio <[email protected]>", "The rust-gpu Authors"]
license = "MIT OR Apache-2.0"
@@ -9,8 +9,16 @@ repository = "https://github.com/Rust-GPU/Rust-CUDA"
readme = "../../README.md"

[dependencies]
anyhow = "1"
thiserror = "1"
cc = { version = "1", default-features = false, optional = true }
cust = { path = "../cust", optional = true }
rustc_codegen_nvvm = { version = "0.3", path = "../rustc_codegen_nvvm" }
nvvm = { path = "../nvvm", version = "0.1" }
serde = { version = "1.0.130", features = ["derive"] }
serde_json = "1.0.68"
find_cuda_helper = { version = "0.2", path = "../find_cuda_helper" }

[features]
default = []
cooperative_groups = ["cc", "cust"]
61 changes: 61 additions & 0 deletions crates/cuda_builder/cg/cg_bridge.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#include "cooperative_groups.h"
#include "cg_bridge.cuh"
namespace cg = cooperative_groups;

__device__ GridGroup this_grid()
{
cg::grid_group gg = cg::this_grid();
GridGroupWrapper* ggp = new GridGroupWrapper { gg };
return ggp;
}

__device__ void GridGroup_destroy(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
delete g;
}

__device__ bool GridGroup_is_valid(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
return g->gg.is_valid();
}

__device__ void GridGroup_sync(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
return g->gg.sync();
}

__device__ unsigned long long GridGroup_size(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
return g->gg.size();
}

__device__ unsigned long long GridGroup_thread_rank(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
return g->gg.thread_rank();
}

__device__ unsigned long long GridGroup_num_threads(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
return g->gg.num_threads();
}

__device__ unsigned long long GridGroup_num_blocks(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
return g->gg.num_blocks();
}

__device__ unsigned long long GridGroup_block_rank(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
return g->gg.block_rank();
}

__host__ int main()
{}
21 changes: 21 additions & 0 deletions crates/cuda_builder/cg/cg_bridge.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once
#include "cooperative_groups.h"
namespace cg = cooperative_groups;

typedef struct GridGroupWrapper {
cg::grid_group gg;
} GridGroupWrapper;

extern "C" typedef void* GridGroup;
extern "C" __device__ GridGroup this_grid();
extern "C" __device__ void GridGroup_destroy(GridGroup gg);
extern "C" __device__ bool GridGroup_is_valid(GridGroup gg);
extern "C" __device__ void GridGroup_sync(GridGroup gg);
extern "C" __device__ unsigned long long GridGroup_size(GridGroup gg);
extern "C" __device__ unsigned long long GridGroup_thread_rank(GridGroup gg);
// extern "C" dim3 GridGroup_group_dim(); // TODO: impl these.
extern "C" __device__ unsigned long long GridGroup_num_threads(GridGroup gg);
// extern "C" dim3 GridGroup_dim_blocks(); // TODO: impl these.
extern "C" __device__ unsigned long long GridGroup_num_blocks(GridGroup gg);
// extern "C" dim3 GridGroup_block_index(); // TODO: impl these.
extern "C" __device__ unsigned long long GridGroup_block_rank(GridGroup gg);
174 changes: 174 additions & 0 deletions crates/cuda_builder/src/cg.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
//! Cooperative Groups compilation and linking.

use std::path::{Path, PathBuf};

use anyhow::Context;

use crate::{CudaBuilderError, CudaBuilderResult};

/// An artifact which may be linked together with the Cooperative Groups API bridge PTX code.
pub enum LinkableArtifact {
/// A PTX artifact.
Ptx(PathBuf),
/// A cubin artifact.
Cubin(PathBuf),
/// A fatbin artifact.
Fatbin(PathBuf),
}

impl LinkableArtifact {
/// Add this artifact to the given linker.
fn link_artifact(&self, linker: &mut cust::link::Linker) -> CudaBuilderResult<()> {
match &self {
LinkableArtifact::Ptx(path) => {
let mut data = std::fs::read_to_string(&path).with_context(|| {
format!("error reading PTX file for linking, file={:?}", path)
})?;
if !data.ends_with('\0') {
// If the PTX is not null-terminated, then linking will fail. Only required for PTX.
data.push('\0');
}
linker
.add_ptx(&data)
.with_context(|| format!("error linking PTX file={:?}", path))?;
}
LinkableArtifact::Cubin(path) => {
let data = std::fs::read(&path).with_context(|| {
format!("error reading cubin file for linking, file={:?}", path)
})?;
linker
.add_cubin(&data)
.with_context(|| format!("error linking cubin file={:?}", path))?;
}
LinkableArtifact::Fatbin(path) => {
let data = std::fs::read(&path).with_context(|| {
format!("error reading fatbin file for linking, file={:?}", path)
})?;
linker
.add_fatbin(&data)
.with_context(|| format!("error linking fatbin file={:?}", path))?;
}
}
Ok(())
}
}

/// A builder which will compile the Cooperative Groups API bridging code, and will then link it
/// together with any other artifacts provided to this builder.
///
/// The result of this process will be a `cubin` file containing the linked Cooperative Groups
/// PTX code along with any other linked artifacts provided to this builder. The output `cubin`
/// may then be loaded via `cust::module::Module::from_cubin(..)` and used as normal.
#[derive(Default)]
pub struct CooperativeGroups {
/// Artifacts to be linked together with the Cooperative Groups bridge code.
artifacts: Vec<LinkableArtifact>,
/// Flags to pass to nvcc for Cooperative Groups API bridge compilation.
nvcc_flags: Vec<String>,
}

impl CooperativeGroups {
/// Construct a new instance.
pub fn new() -> Self {
Self::default()
}

/// Add the artifact at the given path for linking.
///
/// This only applies to linking with the Cooperative Groups API bridge code. Typically,
/// this will be the PTX of your main program which has already been built via `CudaBuilder`.
pub fn link(mut self, artifact: LinkableArtifact) -> Self {
self.artifacts.push(artifact);
self
}

/// Add a flag to be passed along to `nvcc` during compilation of the Cooperative Groups API bridge code.
///
/// This provides maximum flexibility for code generation. If needed, multiple architectures
/// may be generated by adding the appropriate flags to the `nvcc` call.
///
/// By default, `nvcc` will generate code for `sm_52`. Override by specifying any of `--gpu-architecture`,
/// `--gpu-code`, or `--generate-code` flags.
///
/// Regardless of the flags added via this method, this builder will always added the following flags:
/// - `-I<cudaRoot>/include`: ensuring `cooperative_groups.h` can be found.
/// - `-Icg`: ensuring the bridging header can be found.
/// - `--ptx`: forces the compiled output to be in PTX form.
/// - `--device-c`: to compile the bridging code as relocatable device code.
/// - `src/cg_bridge.cu` will be added as the code to be compiled, which generates the
/// Cooperative Groups API bridge.
///
/// Docs: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#command-option-description
pub fn nvcc_flag(mut self, val: impl AsRef<str>) -> Self {
self.nvcc_flags.push(val.as_ref().to_string());
self
}

/// Compile the Cooperative Groups API bridging code, and then link it together
/// with any other artifacts provided to this builder.
///
/// - `cg_out` specifies the output location for the Cooperative Groups API bridge PTX.
/// - `cubin_out` specifies the output location for the fully linked `cubin`.
///
/// ## Errors
/// - At least one artifact must be provided to this builder for linking.
/// - Any errors which take place from the `nvcc` compilation of the Cooperative Groups briding
/// code, or any errors which take place during module linking.
pub fn compile(
mut self,
cg_out: impl AsRef<Path>,
cubin_out: impl AsRef<Path>,
) -> CudaBuilderResult<()> {
// Perform some initial validation.
if self.artifacts.is_empty() {
return Err(anyhow::anyhow!("must provide at least 1 ptx/cubin/fatbin artifact to be linked with the Cooperative Groups API bridge code").into());
}

// Find the cuda installation directory for compilation of CG API.
let cuda_root =
find_cuda_helper::find_cuda_root().ok_or(CudaBuilderError::CudaRootNotFound)?;
let cuda_include = cuda_root.join("include");
let cg_src = std::path::Path::new(std::file!())
.parent()
.context("error accessing parent dir cuda_builder/src")?
.parent()
.context("error accessing parent dir cuda_builder")?
.join("cg")
.canonicalize()
.context("error taking canonical path to cooperative groups API bridge code")?;
let cg_bridge_cu = cg_src.join("cg_bridge.cu");

// Build up the `nvcc` invocation and then build the bridging code.
let mut nvcc = std::process::Command::new("nvcc");
nvcc.arg(format!("-I{:?}", &cuda_include).as_str())
.arg(format!("-I{:?}", &cg_src).as_str())
.arg("--ptx")
.arg("-o")
.arg(cg_out.as_ref().to_string_lossy().as_ref())
.arg("--device-c")
.arg(cg_bridge_cu.to_string_lossy().as_ref());
for flag in self.nvcc_flags.iter() {
nvcc.arg(flag.as_str());
}
nvcc.status()
.context("error calling nvcc for Cooperative Groups API bridge compilation")?;

// Link together the briding code with any given PTX/cubin/fatbin artifacts.
let _ctx = cust::quick_init().context("error building cuda context")?;
let mut linker = cust::link::Linker::new().context("error building cust linker")?;
self.artifacts
.push(LinkableArtifact::Ptx(cg_out.as_ref().to_path_buf()));
for artifact in self.artifacts.iter() {
artifact.link_artifact(&mut linker)?;
}
let linked_cubin = linker
.complete()
.context("error linking artifacts with Cooperative Groups API bridge PTX")?;

// Write finalized cubin.
std::fs::write(&cubin_out, &linked_cubin)
.with_context(|| format!("error writing linked cubin to {:?}", cubin_out.as_ref()))?;

Ok(())
}
}
35 changes: 18 additions & 17 deletions crates/cuda_builder/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,36 +1,37 @@
//! Utility crate for easily building CUDA crates using rustc_codegen_nvvm. Derived from rust-gpu's spirv_builder.

#[cfg(feature = "cooperative_groups")]
pub mod cg;

pub use nvvm::*;
use serde::Deserialize;
use std::{
borrow::Borrow,
env,
ffi::OsString,
fmt,
path::{Path, PathBuf},
process::{Command, Stdio},
};

#[derive(Debug)]
/// Cuda builder result type.
pub type CudaBuilderResult<T> = Result<T, CudaBuilderError>;

/// Cuda builder error type.
#[derive(thiserror::Error, Debug)]
#[non_exhaustive]
pub enum CudaBuilderError {
#[error("crate path {0} does not exist")]
CratePathDoesntExist(PathBuf),
FailedToCopyPtxFile(std::io::Error),
#[error("build failed")]
BuildFailed,
}

impl fmt::Display for CudaBuilderError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
CudaBuilderError::CratePathDoesntExist(path) => {
write!(f, "Crate path {} does not exist", path.display())
}
CudaBuilderError::BuildFailed => f.write_str("Build failed"),
CudaBuilderError::FailedToCopyPtxFile(err) => {
f.write_str(&format!("Failed to copy PTX file: {:?}", err))
}
}
}
#[error("failed to copy PTX file: {0:?}")]
FailedToCopyPtxFile(#[from] std::io::Error),
#[cfg(feature = "cooperative_groups")]
#[error("could not find cuda root installation dir")]
CudaRootNotFound,
#[cfg(feature = "cooperative_groups")]
#[error("compilation of the Cooperative Groups API bridge code failed: {0}")]
CGError(#[from] anyhow::Error),
}

#[derive(Debug, Clone, Copy, PartialEq)]
7 changes: 7 additions & 0 deletions crates/cuda_std/Cargo.toml
Original file line number Diff line number Diff line change
@@ -13,3 +13,10 @@ cuda_std_macros = { version = "0.2", path = "../cuda_std_macros" }
half = "1.7.1"
bitflags = "1.3.2"
paste = "1.0.5"

[features]
default = []
cooperative_groups = []

[package.metadata.docs.rs]
all-features = true
74 changes: 74 additions & 0 deletions crates/cuda_std/src/cg.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
//! Cuda Cooperative Groups API interface.
use crate::gpu_only;

mod ffi {
use core::ffi::c_void;

pub type GridGroup = *mut c_void;
extern "C" {
pub(super) fn this_grid() -> GridGroup;
pub(super) fn GridGroup_destroy(gg: GridGroup);
pub(super) fn GridGroup_is_valid(gg: GridGroup) -> bool;
pub(super) fn GridGroup_sync(gg: GridGroup);
pub(super) fn GridGroup_size(gg: GridGroup) -> u64;
pub(super) fn GridGroup_thread_rank(gg: GridGroup) -> u64;
pub(super) fn GridGroup_num_threads(gg: GridGroup) -> u64;
pub(super) fn GridGroup_num_blocks(gg: GridGroup) -> u64;
pub(super) fn GridGroup_block_rank(gg: GridGroup) -> u64;
// dim3 GridGroup_group_dim(); // TODO: impl these.
// dim3 GridGroup_dim_blocks(); // TODO: impl these.
// dim3 GridGroup_block_index(); // TODO: impl these.
}
}

pub struct GridGroup(ffi::GridGroup);

impl Drop for GridGroup {
fn drop(&mut self) {
unsafe { ffi::GridGroup_destroy(self.0) }
}
}

impl GridGroup {
#[gpu_only]
pub fn this_grid() -> Self {
let ptr = unsafe { ffi::this_grid() };
GridGroup(ptr)
}

#[gpu_only]
pub fn is_valid(&mut self) -> bool {
unsafe { ffi::GridGroup_is_valid(self.0) }
}

#[gpu_only]
pub fn sync(&mut self) {
unsafe { ffi::GridGroup_sync(self.0) }
}

#[gpu_only]
pub fn size(&mut self) -> u64 {
unsafe { ffi::GridGroup_size(self.0) }
}

#[gpu_only]
pub fn thread_rank(&mut self) -> u64 {
unsafe { ffi::GridGroup_thread_rank(self.0) }
}

#[gpu_only]
pub fn num_threads(&mut self) -> u64 {
unsafe { ffi::GridGroup_num_threads(self.0) }
}

#[gpu_only]
pub fn num_blocks(&mut self) -> u64 {
unsafe { ffi::GridGroup_num_blocks(self.0) }
}

#[gpu_only]
pub fn block_rank(&mut self) -> u64 {
unsafe { ffi::GridGroup_block_rank(self.0) }
}
}
2 changes: 2 additions & 0 deletions crates/cuda_std/src/lib.rs
Original file line number Diff line number Diff line change
@@ -46,6 +46,8 @@ pub mod misc;
// pub mod rt;
pub mod atomic;
pub mod cfg;
#[cfg(feature = "cooperative_groups")]
pub mod cg;
pub mod ptr;
pub mod shared;
pub mod thread;
6 changes: 5 additions & 1 deletion crates/cust/src/error.rs
Original file line number Diff line number Diff line change
@@ -78,6 +78,7 @@ pub enum CudaError {
InvalidAddressSpace = 717,
InvalidProgramCounter = 718,
LaunchFailed = 719,
CooperativeLaunchTooLarge = 720,
NotPermitted = 800,
NotSupported = 801,
UnknownError = 999,
@@ -209,9 +210,12 @@ impl ToResult for cudaError_enum {
cudaError_enum::CUDA_ERROR_INVALID_ADDRESS_SPACE => Err(CudaError::InvalidAddressSpace),
cudaError_enum::CUDA_ERROR_INVALID_PC => Err(CudaError::InvalidProgramCounter),
cudaError_enum::CUDA_ERROR_LAUNCH_FAILED => Err(CudaError::LaunchFailed),
cudaError_enum::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE => {
Err(CudaError::CooperativeLaunchTooLarge)
}
cudaError_enum::CUDA_ERROR_NOT_PERMITTED => Err(CudaError::NotPermitted),
cudaError_enum::CUDA_ERROR_NOT_SUPPORTED => Err(CudaError::NotSupported),
_ => Err(CudaError::UnknownError),
err => Err(CudaError::UnknownError),
}
}
}
35 changes: 35 additions & 0 deletions crates/cust/src/function.rs
Original file line number Diff line number Diff line change
@@ -545,3 +545,38 @@ macro_rules! launch {
}
};
}

/// Launch a cooperative kernel function asynchronously.
///
/// This macro is the same as `launch!`, except that it will launch kernels using the driver API
/// `cuLaunchCooperativeKernel` function.
#[macro_export]
macro_rules! launch_cooperative {
($module:ident . $function:ident <<<$grid:expr, $block:expr, $shared:expr, $stream:ident>>>( $( $arg:expr),* $(,)?)) => {
{
let function = $module.get_function(stringify!($function));
match function {
Ok(f) => launch_cooperative!(f<<<$grid, $block, $shared, $stream>>>( $($arg),* ) ),
Err(e) => Err(e),
}
}
};
($function:ident <<<$grid:expr, $block:expr, $shared:expr, $stream:ident>>>( $( $arg:expr),* $(,)?)) => {
{
fn assert_impl_devicecopy<T: $crate::memory::DeviceCopy>(_val: T) {}
if false {
$(
assert_impl_devicecopy($arg);
)*
};

$stream.launch_cooperative(&$function, $grid, $block, $shared,
&[
$(
&$arg as *const _ as *mut ::std::ffi::c_void,
)*
]
)
}
};
}
32 changes: 32 additions & 0 deletions crates/cust/src/stream.rs
Original file line number Diff line number Diff line change
@@ -278,6 +278,38 @@ impl Stream {
.to_result()
}

// Hidden implementation detail function. Highly unsafe. Use the `launch!` macro instead.
#[doc(hidden)]
pub unsafe fn launch_cooperative<G, B>(
&self,
func: &Function,
grid_size: G,
block_size: B,
shared_mem_bytes: u32,
args: &[*mut c_void],
) -> CudaResult<()>
where
G: Into<GridSize>,
B: Into<BlockSize>,
{
let grid_size: GridSize = grid_size.into();
let block_size: BlockSize = block_size.into();

cuda::cuLaunchCooperativeKernel(
func.to_raw(),
grid_size.x,
grid_size.y,
grid_size.z,
block_size.x,
block_size.y,
block_size.z,
shared_mem_bytes,
self.inner,
args.as_ptr() as *mut _,
)
.to_result()
}

// Get the inner `CUstream` from the `Stream`. If you use this handle elsewhere,
// make sure not to use it after the stream has been dropped. Or ManuallyDrop the struct to be safe.
pub fn as_inner(&self) -> CUstream {
27 changes: 17 additions & 10 deletions crates/cust_raw/bindgen.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,21 +1,28 @@
#!/bin/bash
set -exu

if [ -z ${CUDA_PATH} ]; then
echo 'env var ${CUDA_PATH} must be defined, and must point to the root directory of the target Cuda installation'
exit 1
fi

bindgen \
--whitelist-type="^CU.*" \
--whitelist-type="^cuuint(32|64)_t" \
--whitelist-type="^cudaError_enum" \
--whitelist-type="^cu.*Complex$" \
--whitelist-type="^cuda.*" \
--whitelist-type="^libraryPropertyType.*" \
--whitelist-var="^CU.*" \
--whitelist-function="^cu.*" \
--allowlist-type="^CU.*" \
--allowlist-type="^cuuint(32|64)_t" \
--allowlist-type="^cudaError_enum" \
--allowlist-type="^cu.*Complex$" \
--allowlist-type="^cuda.*" \
--allowlist-type="^libraryPropertyType.*" \
--allowlist-var="^CU.*" \
--allowlist-function="^cu.*" \
--default-enum-style=rust \
--no-doc-comments \
--with-derive-default \
--with-derive-eq \
--with-derive-hash \
--with-derive-ord \
--size_t-is-usize \
wrapper.h -- -I/opt/cuda/include \
> src/cuda.rs
wrapper.h \
-- \
-I${CUDA_PATH}/include \
> src/cuda.rs
8 changes: 8 additions & 0 deletions crates/nvvm/src/lib.rs
Original file line number Diff line number Diff line change
@@ -254,6 +254,8 @@ impl FromStr for NvvmOption {
"72" => NvvmArch::Compute72,
"75" => NvvmArch::Compute75,
"80" => NvvmArch::Compute80,
"86" => NvvmArch::Compute86,
"87" => NvvmArch::Compute87,
_ => return Err("unknown arch"),
};
Self::Arch(arch)
@@ -278,6 +280,8 @@ pub enum NvvmArch {
Compute72,
Compute75,
Compute80,
Compute86,
Compute87,
}

impl Display for NvvmArch {
@@ -432,6 +436,8 @@ mod tests {
"-arch=compute_72",
"-arch=compute_75",
"-arch=compute_80",
"-arch=compute_86",
"-arch=compute_87",
"-ftz=1",
"-prec-sqrt=0",
"-prec-div=0",
@@ -453,6 +459,8 @@ mod tests {
Arch(Compute72),
Arch(Compute75),
Arch(Compute80),
Arch(Compute86),
Arch(Compute87),
Ftz,
FastSqrt,
FastDiv,