Rust-GPU · thedodd · Sep 15, 2022
diff --git a/Cargo.toml b/Cargo.toml
@@ -14,7 +14,7 @@ members = [
 ]
 
 exclude = [
-    "crates/optix/examples/common"
+    "crates/optix/examples/common",
 ]
 
 [profile.dev.package.rustc_codegen_nvvm]

diff --git a/crates/cuda_builder/Cargo.toml b/crates/cuda_builder/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "cuda_builder"
-version = "0.3.0"
+version = "0.4.0"
 edition = "2021"
 authors = ["Riccardo D'Ambrosio <[email protected]>", "The rust-gpu Authors"]
 license = "MIT OR Apache-2.0"
@@ -9,8 +9,16 @@ repository = "https://github.com/Rust-GPU/Rust-CUDA"
 readme = "../../README.md"
 
 [dependencies]
+anyhow = "1"
+thiserror = "1"
+cc = { version = "1", default-features = false, optional = true }
+cust = { path = "../cust", optional = true }
 rustc_codegen_nvvm = { version = "0.3", path = "../rustc_codegen_nvvm" }
 nvvm = { path = "../nvvm", version = "0.1" }
 serde = { version = "1.0.130", features = ["derive"] }
 serde_json = "1.0.68"
 find_cuda_helper = { version = "0.2", path = "../find_cuda_helper" }
+
+[features]
+default = []
+cooperative_groups = ["cc", "cust"]
diff --git a/crates/cuda_builder/cg/cg_bridge.cu b/crates/cuda_builder/cg/cg_bridge.cu
@@ -0,0 +1,61 @@
+#include "cooperative_groups.h"
+#include "cg_bridge.cuh"
+namespace cg = cooperative_groups;
+
+__device__ GridGroup this_grid()
+{
+    cg::grid_group gg = cg::this_grid();
+    GridGroupWrapper* ggp = new GridGroupWrapper { gg };
+    return ggp;
+}
+
+__device__ void GridGroup_destroy(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    delete g;
+}
+
+__device__ bool GridGroup_is_valid(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.is_valid();
+}
+
+__device__ void GridGroup_sync(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.sync();
+}
+
+__device__ unsigned long long GridGroup_size(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.size();
+}
+
+__device__ unsigned long long GridGroup_thread_rank(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.thread_rank();
+}
+
+__device__ unsigned long long GridGroup_num_threads(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.num_threads();
+}
+
+__device__ unsigned long long GridGroup_num_blocks(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.num_blocks();
+}
+
+__device__ unsigned long long GridGroup_block_rank(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.block_rank();
+}
+
+__host__ int main()
+{}
diff --git a/crates/cuda_builder/cg/cg_bridge.cuh b/crates/cuda_builder/cg/cg_bridge.cuh
@@ -0,0 +1,21 @@
+#pragma once
+#include "cooperative_groups.h"
+namespace cg = cooperative_groups;
+
+typedef struct GridGroupWrapper {
+    cg::grid_group gg;
+} GridGroupWrapper;
+
+extern "C" typedef void* GridGroup;
+extern "C" __device__ GridGroup this_grid();
+extern "C" __device__ void GridGroup_destroy(GridGroup gg);
+extern "C" __device__ bool GridGroup_is_valid(GridGroup gg);
+extern "C" __device__ void GridGroup_sync(GridGroup gg);
+extern "C" __device__ unsigned long long GridGroup_size(GridGroup gg);
+extern "C" __device__ unsigned long long GridGroup_thread_rank(GridGroup gg);
+// extern "C" dim3 GridGroup_group_dim(); // TODO: impl these.
+extern "C" __device__ unsigned long long GridGroup_num_threads(GridGroup gg);
+// extern "C" dim3 GridGroup_dim_blocks(); // TODO: impl these.
+extern "C" __device__ unsigned long long GridGroup_num_blocks(GridGroup gg);
+// extern "C" dim3 GridGroup_block_index(); // TODO: impl these.
+extern "C" __device__ unsigned long long GridGroup_block_rank(GridGroup gg);
diff --git a/crates/cuda_builder/src/cg.rs b/crates/cuda_builder/src/cg.rs
@@ -0,0 +1,174 @@
+//! Cooperative Groups compilation and linking.
+
+use std::path::{Path, PathBuf};
+
+use anyhow::Context;
+
+use crate::{CudaBuilderError, CudaBuilderResult};
+
+/// An artifact which may be linked together with the Cooperative Groups API bridge PTX code.
+pub enum LinkableArtifact {
+    /// A PTX artifact.
+    Ptx(PathBuf),
+    /// A cubin artifact.
+    Cubin(PathBuf),
+    /// A fatbin artifact.
+    Fatbin(PathBuf),
+}
+
+impl LinkableArtifact {
+    /// Add this artifact to the given linker.
+    fn link_artifact(&self, linker: &mut cust::link::Linker) -> CudaBuilderResult<()> {
+        match &self {
+            LinkableArtifact::Ptx(path) => {
+                let mut data = std::fs::read_to_string(&path).with_context(|| {
+                    format!("error reading PTX file for linking, file={:?}", path)
+                })?;
+                if !data.ends_with('\0') {
+                    // If the PTX is not null-terminated, then linking will fail. Only required for PTX.
+                    data.push('\0');
+                }
+                linker
+                    .add_ptx(&data)
+                    .with_context(|| format!("error linking PTX file={:?}", path))?;
+            }
+            LinkableArtifact::Cubin(path) => {
+                let data = std::fs::read(&path).with_context(|| {
+                    format!("error reading cubin file for linking, file={:?}", path)
+                })?;
+                linker
+                    .add_cubin(&data)
+                    .with_context(|| format!("error linking cubin file={:?}", path))?;
+            }
+            LinkableArtifact::Fatbin(path) => {
+                let data = std::fs::read(&path).with_context(|| {
+                    format!("error reading fatbin file for linking, file={:?}", path)
+                })?;
+                linker
+                    .add_fatbin(&data)
+                    .with_context(|| format!("error linking fatbin file={:?}", path))?;
+            }
+        }
+        Ok(())
+    }
+}
+
+/// A builder which will compile the Cooperative Groups API bridging code, and will then link it
+/// together with any other artifacts provided to this builder.
+///
+/// The result of this process will be a `cubin` file containing the linked Cooperative Groups
+/// PTX code along with any other linked artifacts provided to this builder. The output `cubin`
+/// may then be loaded via `cust::module::Module::from_cubin(..)` and used as normal.
+#[derive(Default)]
+pub struct CooperativeGroups {
+    /// Artifacts to be linked together with the Cooperative Groups bridge code.
+    artifacts: Vec<LinkableArtifact>,
+    /// Flags to pass to nvcc for Cooperative Groups API bridge compilation.
+    nvcc_flags: Vec<String>,
+}
+
+impl CooperativeGroups {
+    /// Construct a new instance.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Add the artifact at the given path for linking.
+    ///
+    /// This only applies to linking with the Cooperative Groups API bridge code. Typically,
+    /// this will be the PTX of your main program which has already been built via `CudaBuilder`.
+    pub fn link(mut self, artifact: LinkableArtifact) -> Self {
+        self.artifacts.push(artifact);
+        self
+    }
+
+    /// Add a flag to be passed along to `nvcc` during compilation of the Cooperative Groups API bridge code.
+    ///
+    /// This provides maximum flexibility for code generation. If needed, multiple architectures
+    /// may be generated by adding the appropriate flags to the `nvcc` call.
+    ///
+    /// By default, `nvcc` will generate code for `sm_52`. Override by specifying any of `--gpu-architecture`,
+    /// `--gpu-code`, or `--generate-code` flags.
+    ///
+    /// Regardless of the flags added via this method, this builder will always added the following flags:
+    /// - `-I<cudaRoot>/include`: ensuring `cooperative_groups.h` can be found.
+    /// - `-Icg`: ensuring the bridging header can be found.
+    /// - `--ptx`: forces the compiled output to be in PTX form.
+    /// - `--device-c`: to compile the bridging code as relocatable device code.
+    /// - `src/cg_bridge.cu` will be added as the code to be compiled, which generates the
+    /// Cooperative Groups API bridge.
+    ///
+    /// Docs: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#command-option-description
+    pub fn nvcc_flag(mut self, val: impl AsRef<str>) -> Self {
+        self.nvcc_flags.push(val.as_ref().to_string());
+        self
+    }
+
+    /// Compile the Cooperative Groups API bridging code, and then link it together
+    /// with any other artifacts provided to this builder.
+    ///
+    /// - `cg_out` specifies the output location for the Cooperative Groups API bridge PTX.
+    /// - `cubin_out` specifies the output location for the fully linked `cubin`.
+    ///
+    /// ## Errors
+    /// - At least one artifact must be provided to this builder for linking.
+    /// - Any errors which take place from the `nvcc` compilation of the Cooperative Groups briding
+    /// code, or any errors which take place during module linking.
+    pub fn compile(
+        mut self,
+        cg_out: impl AsRef<Path>,
+        cubin_out: impl AsRef<Path>,
+    ) -> CudaBuilderResult<()> {
+        // Perform some initial validation.
+        if self.artifacts.is_empty() {
+            return Err(anyhow::anyhow!("must provide at least 1 ptx/cubin/fatbin artifact to be linked with the Cooperative Groups API bridge code").into());
+        }
+
+        // Find the cuda installation directory for compilation of CG API.
+        let cuda_root =
+            find_cuda_helper::find_cuda_root().ok_or(CudaBuilderError::CudaRootNotFound)?;
+        let cuda_include = cuda_root.join("include");
+        let cg_src = std::path::Path::new(std::file!())
+            .parent()
+            .context("error accessing parent dir cuda_builder/src")?
+            .parent()
+            .context("error accessing parent dir cuda_builder")?
+            .join("cg")
+            .canonicalize()
+            .context("error taking canonical path to cooperative groups API bridge code")?;
+        let cg_bridge_cu = cg_src.join("cg_bridge.cu");
+
+        // Build up the `nvcc` invocation and then build the bridging code.
+        let mut nvcc = std::process::Command::new("nvcc");
+        nvcc.arg(format!("-I{:?}", &cuda_include).as_str())
+            .arg(format!("-I{:?}", &cg_src).as_str())
+            .arg("--ptx")
+            .arg("-o")
+            .arg(cg_out.as_ref().to_string_lossy().as_ref())
+            .arg("--device-c")
+            .arg(cg_bridge_cu.to_string_lossy().as_ref());
+        for flag in self.nvcc_flags.iter() {
+            nvcc.arg(flag.as_str());
+        }
+        nvcc.status()
+            .context("error calling nvcc for Cooperative Groups API bridge compilation")?;
+
+        // Link together the briding code with any given PTX/cubin/fatbin artifacts.
+        let _ctx = cust::quick_init().context("error building cuda context")?;
+        let mut linker = cust::link::Linker::new().context("error building cust linker")?;
+        self.artifacts
+            .push(LinkableArtifact::Ptx(cg_out.as_ref().to_path_buf()));
+        for artifact in self.artifacts.iter() {
+            artifact.link_artifact(&mut linker)?;
+        }
+        let linked_cubin = linker
+            .complete()
+            .context("error linking artifacts with Cooperative Groups API bridge PTX")?;
+
+        // Write finalized cubin.
+        std::fs::write(&cubin_out, &linked_cubin)
+            .with_context(|| format!("error writing linked cubin to {:?}", cubin_out.as_ref()))?;
+
+        Ok(())
+    }
+}
diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs
@@ -1,36 +1,37 @@
 //! Utility crate for easily building CUDA crates using rustc_codegen_nvvm. Derived from rust-gpu's spirv_builder.
 
+#[cfg(feature = "cooperative_groups")]
+pub mod cg;
+
 pub use nvvm::*;
 use serde::Deserialize;
 use std::{
     borrow::Borrow,
     env,
     ffi::OsString,
-    fmt,
     path::{Path, PathBuf},
     process::{Command, Stdio},
 };
 
-#[derive(Debug)]
+/// Cuda builder result type.
+pub type CudaBuilderResult<T> = Result<T, CudaBuilderError>;
+
+/// Cuda builder error type.
+#[derive(thiserror::Error, Debug)]
 #[non_exhaustive]
 pub enum CudaBuilderError {
+    #[error("crate path {0} does not exist")]
     CratePathDoesntExist(PathBuf),
-    FailedToCopyPtxFile(std::io::Error),
+    #[error("build failed")]
     BuildFailed,
-}
-
-impl fmt::Display for CudaBuilderError {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self {
-            CudaBuilderError::CratePathDoesntExist(path) => {
-                write!(f, "Crate path {} does not exist", path.display())
-            }
-            CudaBuilderError::BuildFailed => f.write_str("Build failed"),
-            CudaBuilderError::FailedToCopyPtxFile(err) => {
-                f.write_str(&format!("Failed to copy PTX file: {:?}", err))
-            }
-        }
-    }
+    #[error("failed to copy PTX file: {0:?}")]
+    FailedToCopyPtxFile(#[from] std::io::Error),
+    #[cfg(feature = "cooperative_groups")]
+    #[error("could not find cuda root installation dir")]
+    CudaRootNotFound,
+    #[cfg(feature = "cooperative_groups")]
+    #[error("compilation of the Cooperative Groups API bridge code failed: {0}")]
+    CGError(#[from] anyhow::Error),
 }
 
 #[derive(Debug, Clone, Copy, PartialEq)]

diff --git a/crates/cuda_std/Cargo.toml b/crates/cuda_std/Cargo.toml
@@ -13,3 +13,10 @@ cuda_std_macros = { version = "0.2", path = "../cuda_std_macros" }
 half = "1.7.1"
 bitflags = "1.3.2"
 paste = "1.0.5"
+
+[features]
+default = []
+cooperative_groups = []
+
+[package.metadata.docs.rs]
+all-features = true
diff --git a/crates/cuda_std/src/cg.rs b/crates/cuda_std/src/cg.rs
@@ -0,0 +1,74 @@
+//! Cuda Cooperative Groups API interface.
+
+use crate::gpu_only;
+
+mod ffi {
+    use core::ffi::c_void;
+
+    pub type GridGroup = *mut c_void;
+    extern "C" {
+        pub(super) fn this_grid() -> GridGroup;
+        pub(super) fn GridGroup_destroy(gg: GridGroup);
+        pub(super) fn GridGroup_is_valid(gg: GridGroup) -> bool;
+        pub(super) fn GridGroup_sync(gg: GridGroup);
+        pub(super) fn GridGroup_size(gg: GridGroup) -> u64;
+        pub(super) fn GridGroup_thread_rank(gg: GridGroup) -> u64;
+        pub(super) fn GridGroup_num_threads(gg: GridGroup) -> u64;
+        pub(super) fn GridGroup_num_blocks(gg: GridGroup) -> u64;
+        pub(super) fn GridGroup_block_rank(gg: GridGroup) -> u64;
+        // dim3 GridGroup_group_dim(); // TODO: impl these.
+        // dim3 GridGroup_dim_blocks(); // TODO: impl these.
+        // dim3 GridGroup_block_index(); // TODO: impl these.
+    }
+}
+
+pub struct GridGroup(ffi::GridGroup);
+
+impl Drop for GridGroup {
+    fn drop(&mut self) {
+        unsafe { ffi::GridGroup_destroy(self.0) }
+    }
+}
+
+impl GridGroup {
+    #[gpu_only]
+    pub fn this_grid() -> Self {
+        let ptr = unsafe { ffi::this_grid() };
+        GridGroup(ptr)
+    }
+
+    #[gpu_only]
+    pub fn is_valid(&mut self) -> bool {
+        unsafe { ffi::GridGroup_is_valid(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn sync(&mut self) {
+        unsafe { ffi::GridGroup_sync(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn size(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_size(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn thread_rank(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_thread_rank(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn num_threads(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_num_threads(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn num_blocks(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_num_blocks(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn block_rank(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_block_rank(self.0) }
+    }
+}
diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs
@@ -46,6 +46,8 @@ pub mod misc;
 // pub mod rt;
 pub mod atomic;
 pub mod cfg;
+#[cfg(feature = "cooperative_groups")]
+pub mod cg;
 pub mod ptr;
 pub mod shared;
 pub mod thread;

diff --git a/crates/cust/src/error.rs b/crates/cust/src/error.rs
@@ -78,6 +78,7 @@ pub enum CudaError {
     InvalidAddressSpace = 717,
     InvalidProgramCounter = 718,
     LaunchFailed = 719,
+    CooperativeLaunchTooLarge = 720,
     NotPermitted = 800,
     NotSupported = 801,
     UnknownError = 999,
@@ -209,9 +210,12 @@ impl ToResult for cudaError_enum {
             cudaError_enum::CUDA_ERROR_INVALID_ADDRESS_SPACE => Err(CudaError::InvalidAddressSpace),
             cudaError_enum::CUDA_ERROR_INVALID_PC => Err(CudaError::InvalidProgramCounter),
             cudaError_enum::CUDA_ERROR_LAUNCH_FAILED => Err(CudaError::LaunchFailed),
+            cudaError_enum::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE => {
+                Err(CudaError::CooperativeLaunchTooLarge)
+            }
             cudaError_enum::CUDA_ERROR_NOT_PERMITTED => Err(CudaError::NotPermitted),
             cudaError_enum::CUDA_ERROR_NOT_SUPPORTED => Err(CudaError::NotSupported),
-            _ => Err(CudaError::UnknownError),
+            err => Err(CudaError::UnknownError),
         }
     }
 }
diff --git a/crates/cust/src/function.rs b/crates/cust/src/function.rs
@@ -545,3 +545,38 @@ macro_rules! launch {
         }
     };
 }
+
+/// Launch a cooperative kernel function asynchronously.
+///
+/// This macro is the same as `launch!`, except that it will launch kernels using the driver API
+/// `cuLaunchCooperativeKernel` function.
+#[macro_export]
+macro_rules! launch_cooperative {
+    ($module:ident . $function:ident <<<$grid:expr, $block:expr, $shared:expr, $stream:ident>>>( $( $arg:expr),* $(,)?)) => {
+        {
+            let function = $module.get_function(stringify!($function));
+            match function {
+                Ok(f) => launch_cooperative!(f<<<$grid, $block, $shared, $stream>>>( $($arg),* ) ),
+                Err(e) => Err(e),
+            }
+        }
+    };
+    ($function:ident <<<$grid:expr, $block:expr, $shared:expr, $stream:ident>>>( $( $arg:expr),* $(,)?)) => {
+        {
+            fn assert_impl_devicecopy<T: $crate::memory::DeviceCopy>(_val: T) {}
+            if false {
+                $(
+                    assert_impl_devicecopy($arg);
+                )*
+            };
+
+            $stream.launch_cooperative(&$function, $grid, $block, $shared,
+                &[
+                    $(
+                        &$arg as *const _ as *mut ::std::ffi::c_void,
+                    )*
+                ]
+            )
+        }
+    };
+}
diff --git a/crates/cust/src/stream.rs b/crates/cust/src/stream.rs
@@ -278,6 +278,38 @@ impl Stream {
         .to_result()
     }
 
+    // Hidden implementation detail function. Highly unsafe. Use the `launch!` macro instead.
+    #[doc(hidden)]
+    pub unsafe fn launch_cooperative<G, B>(
+        &self,
+        func: &Function,
+        grid_size: G,
+        block_size: B,
+        shared_mem_bytes: u32,
+        args: &[*mut c_void],
+    ) -> CudaResult<()>
+    where
+        G: Into<GridSize>,
+        B: Into<BlockSize>,
+    {
+        let grid_size: GridSize = grid_size.into();
+        let block_size: BlockSize = block_size.into();
+
+        cuda::cuLaunchCooperativeKernel(
+            func.to_raw(),
+            grid_size.x,
+            grid_size.y,
+            grid_size.z,
+            block_size.x,
+            block_size.y,
+            block_size.z,
+            shared_mem_bytes,
+            self.inner,
+            args.as_ptr() as *mut _,
+        )
+        .to_result()
+    }
+
     // Get the inner `CUstream` from the `Stream`. If you use this handle elsewhere,
     // make sure not to use it after the stream has been dropped. Or ManuallyDrop the struct to be safe.
     pub fn as_inner(&self) -> CUstream {

diff --git a/crates/cust_raw/bindgen.sh b/crates/cust_raw/bindgen.sh
@@ -1,21 +1,28 @@
 #!/bin/bash
 set -exu
 
+if [ -z ${CUDA_PATH} ]; then
+  echo 'env var ${CUDA_PATH} must be defined, and must point to the root directory of the target Cuda installation'
+  exit 1
+fi
+
 bindgen \
-  --whitelist-type="^CU.*" \
-  --whitelist-type="^cuuint(32|64)_t" \
-  --whitelist-type="^cudaError_enum" \
-  --whitelist-type="^cu.*Complex$" \
-  --whitelist-type="^cuda.*" \
-  --whitelist-type="^libraryPropertyType.*" \
-  --whitelist-var="^CU.*" \
-  --whitelist-function="^cu.*" \
+  --allowlist-type="^CU.*" \
+  --allowlist-type="^cuuint(32|64)_t" \
+  --allowlist-type="^cudaError_enum" \
+  --allowlist-type="^cu.*Complex$" \
+  --allowlist-type="^cuda.*" \
+  --allowlist-type="^libraryPropertyType.*" \
+  --allowlist-var="^CU.*" \
+  --allowlist-function="^cu.*" \
   --default-enum-style=rust \
   --no-doc-comments \
   --with-derive-default \
   --with-derive-eq \
   --with-derive-hash \
   --with-derive-ord \
   --size_t-is-usize \
-  wrapper.h -- -I/opt/cuda/include \
-  > src/cuda.rs
+  wrapper.h \
+  -- \
+  -I${CUDA_PATH}/include \
+  > src/cuda.rs
diff --git a/crates/nvvm/src/lib.rs b/crates/nvvm/src/lib.rs
@@ -254,6 +254,8 @@ impl FromStr for NvvmOption {
                     "72" => NvvmArch::Compute72,
                     "75" => NvvmArch::Compute75,
                     "80" => NvvmArch::Compute80,
+                    "86" => NvvmArch::Compute86,
+                    "87" => NvvmArch::Compute87,
                     _ => return Err("unknown arch"),
                 };
                 Self::Arch(arch)
@@ -278,6 +280,8 @@ pub enum NvvmArch {
     Compute72,
     Compute75,
     Compute80,
+    Compute86,
+    Compute87,
 }
 
 impl Display for NvvmArch {
@@ -432,6 +436,8 @@ mod tests {
             "-arch=compute_72",
             "-arch=compute_75",
             "-arch=compute_80",
+            "-arch=compute_86",
+            "-arch=compute_87",
             "-ftz=1",
             "-prec-sqrt=0",
             "-prec-div=0",
@@ -453,6 +459,8 @@ mod tests {
             Arch(Compute72),
             Arch(Compute75),
             Arch(Compute80),
+            Arch(Compute86),
+            Arch(Compute87),
             Ftz,
             FastSqrt,
             FastDiv,