Skip to content

Commit e8ddcf5

Browse files
authored
Fix some compilation errors, warnings, and clippy issues (#156)
1 parent 8fc34d5 commit e8ddcf5

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

88 files changed

+1440
-1515
lines changed

crates/blastoff/src/context.rs

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,13 @@ impl CublasContext {
140140
) -> Result<T> {
141141
unsafe {
142142
// cudaStream_t is the same as CUstream
143-
sys::v2::cublasSetStream_v2(self.raw, mem::transmute(stream.as_inner())).to_result()?;
143+
sys::v2::cublasSetStream_v2(
144+
self.raw,
145+
mem::transmute::<*mut cust::sys::CUstream_st, *mut cublas_sys::v2::CUstream_st>(
146+
stream.as_inner(),
147+
),
148+
)
149+
.to_result()?;
144150
let res = func(self)?;
145151
// reset the stream back to NULL just in case someone calls with_stream, then drops the stream, and tries to
146152
// execute a raw sys function with the context's handle.
@@ -227,10 +233,11 @@ impl CublasContext {
227233
/// ```
228234
pub fn set_math_mode(&self, math_mode: MathMode) -> Result<()> {
229235
unsafe {
230-
Ok(
231-
sys::v2::cublasSetMathMode(self.raw, mem::transmute(math_mode.bits()))
232-
.to_result()?,
236+
Ok(sys::v2::cublasSetMathMode(
237+
self.raw,
238+
mem::transmute::<u32, cublas_sys::v2::cublasMath_t>(math_mode.bits()),
233239
)
240+
.to_result()?)
234241
}
235242
}
236243

crates/blastoff/src/level1.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ fn check_stride<T: BlasDatatype>(x: &impl GpuBuffer<T>, n: usize, stride: Option
2424
);
2525
}
2626

27-
/// Scalar and Vector-based operations such as `min`, `max`, `axpy`, `copy`, `dot`, `nrm2`, `rot`, `rotg`, `rotm`, `rotmg`, `scal`, and `swap`.
28-
27+
/// Scalar and Vector-based operations such as `min`, `max`, `axpy`, `copy`, `dot`,
28+
/// `nrm2`, `rot`, `rotg`, `rotm`, `rotmg`, `scal`, and `swap`.
2929
impl CublasContext {
3030
/// Same as [`CublasContext::amin`] but with an explicit stride.
3131
///

crates/blastoff/src/lib.rs

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -93,22 +93,17 @@ pub(crate) mod private {
9393

9494
/// An optional operation to apply to a matrix before a matrix operation. This includes
9595
/// no operation, transpose, or conjugate transpose.
96-
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
96+
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
9797
pub enum MatrixOp {
9898
/// No operation, leave the matrix as is. This is the default.
99+
#[default]
99100
None,
100101
/// Transpose the matrix in place.
101102
Transpose,
102103
/// Conjugate transpose the matrix in place.
103104
ConjugateTranspose,
104105
}
105106

106-
impl Default for MatrixOp {
107-
fn default() -> Self {
108-
MatrixOp::None
109-
}
110-
}
111-
112107
impl MatrixOp {
113108
/// Returns the corresponding `cublasOperation_t` for this operation.
114109
pub fn to_raw(self) -> sys::v2::cublasOperation_t {

crates/cuda_builder/src/lib.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -246,13 +246,13 @@ impl CudaBuilder {
246246

247247
/// Emit LLVM IR, the exact same as rustc's `--emit=llvm-ir`.
248248
pub fn emit_llvm_ir(mut self, emit_llvm_ir: bool) -> Self {
249-
self.emit = emit_llvm_ir.then(|| EmitOption::LlvmIr);
249+
self.emit = emit_llvm_ir.then_some(EmitOption::LlvmIr);
250250
self
251251
}
252252

253253
/// Emit LLVM Bitcode, the exact same as rustc's `--emit=llvm-bc`.
254254
pub fn emit_llvm_bitcode(mut self, emit_llvm_bitcode: bool) -> Self {
255-
self.emit = emit_llvm_bitcode.then(|| EmitOption::Bitcode);
255+
self.emit = emit_llvm_bitcode.then_some(EmitOption::Bitcode);
256256
self
257257
}
258258

@@ -435,7 +435,7 @@ fn invoke_rustc(builder: &CudaBuilder) -> Result<PathBuf, CudaBuilderError> {
435435
}
436436

437437
let mut cargo = Command::new("cargo");
438-
cargo.args(&[
438+
cargo.args([
439439
"build",
440440
"--lib",
441441
"--message-format=json-render-diagnostics",
@@ -525,7 +525,7 @@ fn get_last_artifact(out: &str) -> Option<PathBuf> {
525525
}
526526
})
527527
.filter(|line| line.reason == "compiler-artifact")
528-
.last()
528+
.next_back()
529529
.expect("Did not find output file in rustc output");
530530

531531
let mut filenames = last

crates/cuda_std/src/float.rs

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -356,12 +356,13 @@ impl GpuFloat for f32 {
356356
f32_intrinsic!(self, atan())
357357
}
358358

359-
/// Computes the four quadrant arctangent of `self` (`y`) and `other` (`x`) in radians.
359+
/// Computes the four quadrant arctangent of `self` (`y`) and `other` (`x`) in
360+
/// radians.
360361
///
361-
/// * `x = 0`, `y = 0`: `0`
362-
/// * `x >= 0`: `arctan(y/x)` -> `[-pi/2, pi/2]`
363-
/// * `y >= 0`: `arctan(y/x) + pi` -> `(pi/2, pi]`
364-
/// * `y < 0`: `arctan(y/x) - pi` -> `(-pi, -pi/2)`intrinsics
362+
/// * `x = 0`, `y = 0`: `0`
363+
/// * `x >= 0`: `arctan(y/x)` -> `[-pi/2, pi/2]`
364+
/// * `y >= 0`: `arctan(y/x) + pi` -> `(pi/2, pi]`
365+
/// * `y < 0`: `arctan(y/x) - pi` -> `(-pi, -pi/2)`
365366
#[inline]
366367
fn atan2(self, other: f32) -> f32 {
367368
f32_intrinsic!(self, atan2(other))
@@ -687,12 +688,13 @@ impl GpuFloat for f64 {
687688
f64_intrinsic!(self, atan())
688689
}
689690

690-
/// Computes the four quadrant arctangent of `self` (`y`) and `other` (`x`) in radians.
691+
/// Computes the four quadrant arctangent of `self` (`y`) and `other` (`x`) in
692+
/// radians.
691693
///
692-
/// * `x = 0`, `y = 0`: `0`
693-
/// * `x >= 0`: `arctan(y/x)` -> `[-pi/2, pi/2]`
694-
/// * `y >= 0`: `arctan(y/x) + pi` -> `(pi/2, pi]`
695-
/// * `y < 0`: `arctan(y/x) - pi` -> `(-pi, -pi/2)`intrinsics
694+
/// * `x = 0`, `y = 0`: `0`
695+
/// * `x >= 0`: `arctan(y/x)` -> `[-pi/2, pi/2]`
696+
/// * `y >= 0`: `arctan(y/x) + pi` -> `(pi/2, pi]`
697+
/// * `y < 0`: `arctan(y/x) - pi` -> `(-pi, -pi/2)`
696698
#[inline]
697699
fn atan2(self, other: f64) -> f64 {
698700
f64_intrinsic!(self, atan2(other))

crates/cuda_std/src/shared.rs

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,31 +2,41 @@
22
33
use crate::gpu_only;
44

5-
/// Statically allocates a buffer large enough for `len` elements of `array_type`, yielding
6-
/// a `*mut array_type` that points to uninitialized shared memory. `len` must be a constant expression.
5+
/// Statically allocates a buffer large enough for `len` elements of `array_type`,
6+
/// yielding a `*mut array_type` that points to uninitialized shared memory. `len` must
7+
/// be a constant expression.
78
///
8-
/// Note that this allocates the memory __statically__, it expands to a static in the `shared` address space.
9-
/// Therefore, calling this macro multiple times in a loop will always yield the same data. However, separate
10-
/// invocations of the macro will yield different buffers.
9+
/// Note that this allocates the memory __statically__, it expands to a static in the
10+
/// `shared` address space. Therefore, calling this macro multiple times in a loop will
11+
/// always yield the same data. However, separate invocations of the macro will yield
12+
/// different buffers.
1113
///
12-
/// The data is uninitialized by default, therefore, you must be careful to not read the data before it is written to.
13-
/// The semantics of what "uninitialized" actually means on the GPU (i.e. if it yields unknown data or if it is UB to read it whatsoever)
14-
/// are not well known, so even if the type is valid for any backing memory, make sure to not read uninitialized data.
14+
/// The data is uninitialized by default, therefore, you must be careful to not read the
15+
/// data before it is written to. The semantics of what "uninitialized" actually means
16+
/// on the GPU (i.e. if it yields unknown data or if it is UB to read it whatsoever) are
17+
/// not well known, so even if the type is valid for any backing memory, make sure to
18+
/// not read uninitialized data.
1519
///
1620
/// # Safety
1721
///
18-
/// Shared memory usage is fundamentally extremely unsafe and impossible to statically prove, therefore
19-
/// the burden of correctness is on the user. Some of the things you must ensure in your usage of
20-
/// shared memory are:
21-
/// - Shared memory is only shared across __thread blocks__, not the entire device, therefore it is
22-
/// unsound to try and rely on sharing data across more than one block.
23-
/// - You must write to the shared buffer before reading from it as the data is uninitialized by default.
24-
/// - [`thread::sync_threads`](crate::thread::sync_threads) must be called before relying on the results of other
25-
/// threads, this ensures every thread has reached that point before going on. For example, reading another thread's
26-
/// data after writing to the buffer.
27-
/// - No access may be out of bounds, this usually means making sure the amount of threads and their dimensions are correct.
22+
/// Shared memory usage is fundamentally extremely unsafe and impossible to statically
23+
/// prove, therefore the burden of correctness is on the user. Some of the things you
24+
/// must ensure in your usage of shared memory are:
2825
///
29-
/// It is suggested to run your executable in `cuda-memcheck` to make sure usages of shared memory are right.
26+
/// - Shared memory is only shared across __thread blocks__, not the entire device,
27+
/// therefore it is unsound to try and rely on sharing data across more than one
28+
/// block.
29+
/// - You must write to the shared buffer before reading from it as the data is
30+
/// uninitialized by default.
31+
/// - [`thread::sync_threads`](crate::thread::sync_threads) must be called before
32+
/// relying on the results of other threads, this ensures every thread has reached
33+
/// that point before going on. For example, reading another thread's data after
34+
/// writing to the buffer.
35+
/// - No access may be out of bounds, this usually means making sure the amount of
36+
/// threads and their dimensions are correct.
37+
///
38+
/// It is suggested to run your executable in `cuda-memcheck` to make sure usages of
39+
/// shared memory are right.
3040
///
3141
/// # Examples
3242
///

0 commit comments

Comments (0)