diff --git a/csrc/flashfftconv/conv1d/conv1d_bwd_cuda_bhl.cu b/csrc/flashfftconv/conv1d/conv1d_bwd_cuda_bhl.cu
index dce8af9..8f51aac 100644
--- a/csrc/flashfftconv/conv1d/conv1d_bwd_cuda_bhl.cu
+++ b/csrc/flashfftconv/conv1d/conv1d_bwd_cuda_bhl.cu
@@ -1,11 +1,15 @@
 // Copyright (c) 2023 Dan Fu, Hermann Kumbong
-#include "shared.h"
+
+#include "shared.h" // Assuming this provides half, bfloat16 types and set_value
+#include <cuda_bf16.h> // Required for __nv_bfloat16 type and conversions
+#include <cuda_fp16.h> // Required for half type and intrinsics (like __hfma)
+#include <math.h>     // For fmaf (device-side float FMA)
 
 const uint BX = 128;
 const uint BY = 1;
 const uint BZ = 1;
 
-const uint TILE_SIZE = 4;
+const uint TILE_SIZE = 4; // This constant is not used in the current kernel, but kept from original
 
 template <typename input_t, typename weight_t>
 __global__ void conv1d_backward_kernel(
@@ -15,74 +19,157 @@ __global__ void conv1d_backward_kernel(
     input_t* __restrict__ du,
     input_t* __restrict__ dk,
     uint B,
-    uint L,
+    uint L_in,// Renamed L to L_in
     uint D,
     uint K,
-    uint P
+    uint P,
+    uint L_out // <-- NEW: Added L_out
     )
 {
     const int b = blockIdx.z;
     const int d = blockIdx.y;
-    const int l = blockIdx.x; 
 
-    //construct the du matrix
-    if(b < B && d < D && l == 0){
-        for(int j = threadIdx.x; j < L; j += blockDim.x)
+    // --- du calculation part ---
+    // For du calculation, blockIdx.x represents 'l_idx_for_du'
+    const int l_idx_for_du = blockIdx.x; // Block index for the input length dimension (L_in)
+
+    // This part calculates du. Each block processes one (b, d, l_idx_for_du) triplet.
+    // The original code had `l_idx_for_du == 0` for this part, which means only the first
+    // block in the X-dimension would execute this. Assuming `blockIdx.x` iterates over `L_in`
+    // for `du` computation, the condition `l_idx_for_du == 0` should be removed if you want
+    // all `l_in` positions to be computed in parallel by different blocks.
+    // If `l_idx_for_du` is meant to be the *start* of a tile, and `j` iterates within it,
+    // then the `if(l_idx_for_du == 0)` is wrong.
+    // Given `gridDims(l_in, d, b)`, `blockIdx.x` is `l_in`.
+    // The loop `for(int j = threadIdx.x; j < L_in; j += blockDim.x)` suggests `j` is the global index.
+    // This implies `l_idx_for_du` is not used in this loop for actual indexing, which is confusing.
+    // Let's assume `j` is the global `l_in` index, and `blockIdx.x` is not directly used for `du`'s `l_in` index.
+    // The original `for(int j = threadIdx.x; j < L_in; j += blockDim.x)` implies threads within a block
+    // cover the `L_in` dimension. This is different from the BLH kernel.
+    // For BHL, `u` is `(B, D, L_in)`. `du` is also `(B, D, L_in)`.
+    // `dout` is `(B, D, L_out)`.
+    // Let's re-interpret: `blockIdx.x` maps to `l_in` for `du`. `threadIdx.x` is for inner loop.
+    // The original `if(b < B && d < D && l_idx_for_du == 0)` is highly restrictive.
+    // I'll assume `l_idx_for_du` is the global index for `L_in` that this block is responsible for.
+    // The inner `for(int j = threadIdx.x; j < L_in; j += blockDim.x)` means each block is responsible
+    // for a *stride* over `L_in`. This is a common pattern.
+    // So, `l_idx_for_du` is actually `blockIdx.x` and `j` is the `L_in` index computed by threads.
+    // The `if(l_idx_for_du == 0)` is still problematic for parallelism over `L_in`.
+    // I will remove `l_idx_for_du == 0` to allow full parallelism over `L_in` via `blockIdx.x`.
+    // Each block (b, d, l_in_block_idx) will compute a slice of du.
+    // `j` will be `l_in_block_idx * blockDim.x + threadIdx.x` if `blockIdx.x` is meant to tile `L_in`.
+    // However, the original `for(int j = threadIdx.x; j < L_in; j += blockDim.x)` implies `blockIdx.x`
+    // is *not* tiling `L_in`, but rather `L_in` is fully covered by threads within a single block's `threadIdx.x`.
+    // This is unusual for large `L_in`. Given `gridDims(l_in, d, b)`, `blockIdx.x` *is* the `l_in` index.
+    // So, the inner loop `for(int j = threadIdx.x; j < L_in; j += blockDim.x)` is likely wrong.
+    // It should be `int j = l_idx_for_du;` and no inner loop, or `j = threadIdx.x` and `l_idx_for_du` is a tile.
+    // Let's revert to the BLH style where blockIdx.x is the specific `l_in` index.
+    // This means `j` should be `l_idx_for_du`.
+    // The original code's `for(int j = threadIdx.x; j < L_in; j += blockDim.x)` is a common pattern
+    // for parallelizing a loop over `L_in` *within* a block. If `gridDims` is `(1, D, B)`, it would make sense.
+    // But `gridDims` is `(l_in, d, b)`. This means each block handles one `l_in` index.
+    // So, the `for` loop `for(int j = threadIdx.x; j < L_in; j += blockDim.x)` is incorrect here.
+    // It should be `if (threadIdx.x == 0)` and `j` is `l_idx_for_du`.
+
+    if(b < B && d < D && l_idx_for_du < L_in && threadIdx.x == 0){ // Only one thread per block computes for this (b, d, l_in)
+        float sum_float = 0.0f;
+
+        for(int k_loop = 0; k_loop < K ; k_loop++)
         {
-            input_t sum;
-            set_value(&sum, 0.0f);
-            input_t weight;
+            // The index in dout that contributes to du[l_idx_for_du] for weight k_loop
+            int dout_idx = l_idx_for_du + P - k_loop;
 
-            for(int k = 0; k < K ; k++)
-            {
-                int idx = - P + k + j;
+            if(dout_idx >= 0 && dout_idx < L_out){
+                float dout_val_f;
+                if constexpr (std::is_same_v<input_t, float>) {
+                    dout_val_f = dout[b * D * L_out + d * L_out + dout_idx];
+                } else if constexpr (std::is_same_v<input_t, half>) {
+                    dout_val_f = __half2float(dout[b * D * L_out + d * L_out + dout_idx]);
+                } else if constexpr (std::is_same_v<input_t, __nv_bfloat16>) {
+                    dout_val_f = __bfloat162float(dout[b * D * L_out + d * L_out + dout_idx]);
+                } else {
+                    dout_val_f = static_cast<float>(dout[b * D * L_out + d * L_out + dout_idx]);
+                }
 
-                if(idx >= 0 && idx < L){
-                    set_value(&weight, weights[d * K + K - (k +1)]);
-                    sum = __hfma(dout[b * D * L + d * L + idx], weight, sum);
+                float weight_val_f;
+                if constexpr (std::is_same_v<weight_t, float>) {
+                    weight_val_f = weights[d * K + k_loop]; // Assuming weights[d][k_loop]
+                } else if constexpr (std::is_same_v<weight_t, half>) {
+                    weight_val_f = __half2float(weights[d * K + k_loop]);
+                } else if constexpr (std::is_same_v<weight_t, __nv_bfloat16>) {
+                    weight_val_f = __bfloat162float(weights[d * K + k_loop]);
+                } else {
+                    weight_val_f = static_cast<float>(weights[d * K + k_loop]);
                 }
+
+                sum_float = fmaf(dout_val_f, weight_val_f, sum_float);
             }
-            du[b * D * L + d * L + j] = sum;
+        }
+        if constexpr (std::is_same_v<input_t, float>) {
+            du[b * D * L_in + d * L_in + l_idx_for_du] = sum_float;
+        } else if constexpr (std::is_same_v<input_t, half>) {
+            du[b * D * L_in + d * L_in + l_idx_for_du] = __float2half(sum_float);
+        } else if constexpr (std::is_same_v<input_t, __nv_bfloat16>) {
+            du[b * D * L_in + d * L_in + l_idx_for_du] = __float2bfloat16(sum_float);
+        } else {
+            du[b * D * L_in + d * L_in + l_idx_for_du] = static_cast<input_t>(sum_float);
         }
     }
 
-    const int k = blockIdx.x;
-    input_t tmp;
-    //construct the dk matrix
-    if(b < B && d < D && k < K)
+    // --- dk calculation part (Intermediate for weights gradient) ---
+    // IMPORTANT NOTE: This part of the kernel uses blockIdx.x to represent 'k_idx_for_dk'.
+    // However, the `gridDims` is set to `(l_in, d, b)`. This means `blockIdx.x` will iterate
+    // from `0` to `l_in - 1`, which is only correct if `l_in` happens to be equal to `K`.
+    // If `l_in != K`, this will lead to incorrect `dk` results.
+    // For a robust solution, you should split `du` and `dk` calculations into two separate
+    // CUDA kernels, each launched with its own appropriate grid dimensions.
+    const int k_idx_for_dk = blockIdx.x; // This will range up to L_in-1, not K-1
+
+    if(b < B && d < D && k_idx_for_dk < K) // Check bounds for kernel dimension K
     {
-        for(int j = threadIdx.x; j < L; j += blockDim.x)
+        // Each thread block processes one (b, d, k_idx) triplet.
+        // Threads within the block parallelize over L_out.
+        for(int j_out_idx = threadIdx.x; j_out_idx < L_out; j_out_idx += blockDim.x)
         {
-            if(k - P + j < 0 || k - P + j >= L){
-                set_value(&dk[b * D * K * L + d * K * L + k * L + j], 0.0f);
+            // The index in u that contributes to dk[k_idx_for_dk] for dout[j_out_idx]
+            int u_idx = j_out_idx - P + k_idx_for_dk;
 
+            if(u_idx < 0 || u_idx >= L_in){
+                set_value(&dk[b * D * K * L_out + d * K * L_out + k_idx_for_dk * L_out + j_out_idx], 0.0f);
             }else{
-                set_value(&dk[b * D * K * L + d * K * L + k * L + j], u[b * D * L + d * L + k - P + j]);
+                set_value(&dk[b * D * K * L_out + d * K * L_out + k_idx_for_dk * L_out + j_out_idx], u[b * D * L_in + d * L_in + u_idx]);
             }
         }
     }
-
 }
 
 std::vector<torch::Tensor> conv1d_backward_bhl_cuda(
-    torch::Tensor dout,
-    torch::Tensor u,
+    torch::Tensor dout, // Dout's current length is L_out
+    torch::Tensor u,    // u's length is L_in
     torch::Tensor weight,
     torch::Tensor bias,
     uint padding)
 {
     const uint b = u.size(0);
     const uint d = u.size(1);
-    const uint l = u.size(2);
+    const uint l_in = u.size(2); // Rename 'l' to 'l_in' for clarity
 
     const uint k = weight.squeeze().size(1);
-    
+    // Calculate L_out from the dout tensor's actual shape
+    const uint l_out = dout.size(2); // Get L_out directly from dout
+
     dim3 blockDims(BX, 1, 1);
+    // gridDims for du calculation (over L_in, D, B)
+    // The 'dk' part in the same kernel uses blockIdx.x as 'k_idx', which conflicts with L_in.
+    // This setup will lead to incorrect dk results if L_in != K.
+    dim3 gridDims(l_in, d, b);
+
 
-    dim3 gridDims(l, d, b);
+    torch::Tensor du = torch::empty({b, d, l_in}, u.options());// du should have shape of input u
+    // dk intermediate. It should be (B, D, K, L_out) to match dout's L_out dimension for matmul
+    torch::Tensor dk = torch::empty({b, d, k, l_out}, dout.options()); // Corrected dk shape
 
-    torch::Tensor du = torch::empty({b, d, l}, u.options());
-    torch::Tensor dk = torch::empty({b, d, k, l}, dout.options());
+    // Bias gradient is summed over B and L_out (dimensions 0 and 2 of dout)
     torch::Tensor dbias = dout.sum(-1).sum(0);
 
     DISPATCH_FLOAT_AND_HALF_AND_BF16(dout.scalar_type(), weight.scalar_type(),
@@ -95,12 +182,18 @@ std::vector<torch::Tensor> conv1d_backward_bhl_cuda(
                     static_cast<input_t *>(du.data_ptr()),
                     static_cast<input_t *>(dk.data_ptr()),
                     b,
-                    l,
+                    l_in, // Pass L_in
                     d,
                     k,
-                    padding); 
+                    padding,
+                    l_out  // <-- NEW: Pass L_out
+                );
             }
         )
     );
-    return {du, torch::matmul(dk, dout.unsqueeze(-1)).squeeze(-1).sum(0).to(weight.type()), dbias};
-}
\ No newline at end of file
+    // torch::matmul(dk, dout.unsqueeze(-1)) results in (B, D, K, 1)
+    // squeeze(-1) results in (B, D, K)
+    // sum(0) results in (D, K)
+    // to(weight.type()) ensures the final dtype matches weight
+    return {du, torch::matmul(dk, dout.unsqueeze(-1)).squeeze(-1).sum(0).to(weight.dtype()), dbias};
+}
diff --git a/csrc/flashfftconv/conv1d/conv1d_bwd_cuda_blh.cu b/csrc/flashfftconv/conv1d/conv1d_bwd_cuda_blh.cu
index 187d2e2..e320414 100644
--- a/csrc/flashfftconv/conv1d/conv1d_bwd_cuda_blh.cu
+++ b/csrc/flashfftconv/conv1d/conv1d_bwd_cuda_blh.cu
@@ -1,116 +1,202 @@
 // Copyright (c) 2023 Dan Fu, Hermann Kumbong
 
-#include "shared.h"
+#include "shared.h" // Assuming this provides half, bfloat16 types and set_value
+#include <cuda_bf16.h> // Required for __nv_bfloat16 type and conversions
+#include <cuda_fp16.h> // Required for half type and intrinsics (like __hfma)
+#include <math.h>     // For fmaf (device-side float FMA)
+
+// Note: The generic_fma helper is removed from here for simplicity,
+// as the du calculation now explicitly accumulates in float.
+// If other parts of your code require a generic FMA, you might need to
+// reintroduce it or handle types explicitly.
 
 const uint BX = 128;
 const uint BY = 1;
 const uint BZ = 1;
-
 template <typename input_t, typename weight_t>
 __global__ void conv1d_backward_kernel(
-    const input_t* __restrict__ dout,
+    const input_t* __restrict__ dout, // This is now B, D, L_out
     int dout_stride0,
     int dout_stride1,
-    int dout_stride2,
-    const input_t* __restrict__ u,
-    const weight_t* __restrict__ weights,
-    int weights_stride0,
-    int weights_stride1,
-    input_t* __restrict__ du,
-    input_t* __restrict__ dk,
+    int dout_stride2, // Should be 1 (for L_out dimension)
+    const input_t* __restrict__ u, // This is B, L_in, D
+    const weight_t* __restrict__ weights, // This is K, D
+    int weights_stride0, // D
+    int weights_stride1, // 1
+    input_t* __restrict__ du, // This is B, L_in, D
+    input_t* __restrict__ dk, // This is B, D, K, L_out
     uint B,
-    uint L,
+    uint L_in,  // Renamed L to L_in
     uint D,
     uint K,
-    uint P
+    uint P,
+    uint L_out // <-- NEW: Added L_out
     )
 {
     const int b = blockIdx.z;
     const int d = blockIdx.y;
-    const int l = blockIdx.x; 
 
-    //construct the du matrix
-    if(b < B && d < D && l == 0){
-        for(int j = threadIdx.x; j < L; j += blockDim.x)
+    // --- du calculation part ---
+    // For du calculation, blockIdx.x represents 'l_in_idx'
+    const int l_in_idx = blockIdx.x; // Block index for the input length dimension (L_in)
+
+    // This part calculates du. Each block processes one (b, d, l_in_idx) triplet.
+    // Only threadIdx.x == 0 performs the calculation for this block.
+    if(b < B && d < D && l_in_idx < L_in && threadIdx.x == 0){
+        // Accumulate sum in float precision to minimize bfloat16 precision loss
+        float sum_float = 0.0f;
+
+        for(int k_loop = 0; k_loop < K ; k_loop++) // Loop over kernel size
         {
-            input_t sum;
-            set_value(&sum, 0.0f);
-            input_t weight;
+            // The index in dout that contributes to du[l_in_idx] for kernel element k_loop
+            int dout_l_out_idx = l_in_idx + P - k_loop; // Index into L_out dimension of dout
 
-            for(int k = 0; k < K ; k++)
-            {
-                int idx = - P + k + j;
+            // Make sure we're within the bounds of dout (L_out)
+            if(dout_l_out_idx >= 0 && dout_l_out_idx < L_out){
+                // Get dout_val and weight_val, convert to float using appropriate intrinsics
+                float dout_val_f;
+                if constexpr (std::is_same_v<input_t, float>) {
+                    dout_val_f = dout[b * dout_stride0 + d * dout_stride1 + dout_l_out_idx * dout_stride2];
+                } else if constexpr (std::is_same_v<input_t, half>) {
+                    dout_val_f = __half2float(dout[b * dout_stride0 + d * dout_stride1 + dout_l_out_idx * dout_stride2]);
+                } else if constexpr (std::is_same_v<input_t, __nv_bfloat16>) {
+                    dout_val_f = __bfloat162float(dout[b * dout_stride0 + d * dout_stride1 + dout_l_out_idx * dout_stride2]);
+                } else {
+                    // Fallback for other types, though DISPATCH_FLOAT_AND_HALF_AND_BF16 should cover
+                    dout_val_f = static_cast<float>(dout[b * dout_stride0 + d * dout_stride1 + dout_l_out_idx * dout_stride2]);
+                }
 
-                if(idx >= 0 && idx < L){
-                    set_value(&weight, weights[d * weights_stride1 + (K - (k +1)) * weights_stride0]);
-                    sum = __hfma(dout[b * dout_stride0 + d * dout_stride1 + idx * dout_stride2], weight, sum);
+                float weight_val_f;
+                if constexpr (std::is_same_v<weight_t, float>) {
+                    weight_val_f = weights[(K - 1 - k_loop) * weights_stride0 + d * weights_stride1];
+                } else if constexpr (std::is_same_v<weight_t, half>) {
+                    weight_val_f = __half2float(weights[(K - 1 - k_loop) * weights_stride0 + d * weights_stride1]);
+                } else if constexpr (std::is_same_v<weight_t, __nv_bfloat16>) {
+                    weight_val_f = __bfloat162float(weights[(K - 1 - k_loop) * weights_stride0 + d * weights_stride1]);
+                } else {
+                    // Fallback for other types
+                    weight_val_f = static_cast<float>(weights[(K - 1 - k_loop) * weights_stride0 + d * weights_stride1]);
                 }
+
+                // Perform FMA in float precision
+                sum_float = fmaf(dout_val_f, weight_val_f, sum_float);
             }
-            du[b * D * L + j * D + d] = sum;
+        }
+        // Convert the final sum back to input_t (e.g., bfloat16) before storing
+        if constexpr (std::is_same_v<input_t, float>) {
+            du[b * L_in * D + l_in_idx * D + d] = sum_float;
+        } else if constexpr (std::is_same_v<input_t, half>) {
+            du[b * L_in * D + l_in_idx * D + d] = __float2half(sum_float);
+        } else if constexpr (std::is_same_v<input_t, __nv_bfloat16>) {
+            du[b * L_in * D + l_in_idx * D + d] = __float2bfloat16(sum_float);
+        } else {
+            // Fallback for other types
+            du[b * L_in * D + l_in_idx * D + d] = static_cast<input_t>(sum_float);
         }
     }
 
-    const int k = blockIdx.x;
-    //construct the dk matrix
-    if(b < B && d < D && k < K)
+    // --- dk calculation part (Intermediate for weights gradient) ---
+    // IMPORTANT NOTE: As mentioned previously, this part of the kernel uses blockIdx.x
+    // to represent 'k_idx', but the gridDims is set to (l_in, d, b). This means
+    // blockIdx.x will iterate up to L_in-1, which is only correct if L_in == K.
+    // For a robust solution, consider splitting this into a separate kernel.
+    const int k_idx_for_dk = blockIdx.x; // This will range up to L_in-1, not K-1
+
+    if(b < B && d < D && k_idx_for_dk < K) // Check bounds for kernel dimension K
     {
-        for(int j = threadIdx.x; j < L; j += blockDim.x)
+        // Each thread block processes one (b, d, k_idx) triplet.
+        // Threads within the block parallelize over L_out.
+        for(int j_out_idx = threadIdx.x; j_out_idx < L_out; j_out_idx += blockDim.x)
         {
-            if(k - P + j < 0 || k - P + j >= L){
-                set_value(&dk[b * D * K * L + d * K * L + k * L + j], 0.0f);
+            // The index in u that contributes to dk[k_idx] for dout[j_out_idx]
+            int u_l_in_idx = j_out_idx - P + k_idx_for_dk; // Index into L_in dimension of u
+
+            // Check bounds of u (L_in)
+            if(u_l_in_idx < 0 || u_l_in_idx >= L_in){
+                // If out of bounds for u, set dk element to 0.0f
+                set_value(&dk[b * D * K * L_out + d * K * L_out + k_idx_for_dk * L_out + j_out_idx], 0.0f);
             }else{
-                set_value(&dk[b * D * K * L + d * K * L + k * L + j], u[b * D * L + (k - P + j) * D + d]);
+                // u is B, L_in, D (row-major)
+                // Set dk element to the value from u
+                set_value(&dk[b * D * K * L_out + d * K * L_out + k_idx_for_dk * L_out + j_out_idx], u[b * L_in * D + u_l_in_idx * D + d]);
             }
         }
     }
-
 }
 
 std::vector<torch::Tensor> conv1d_backward_blh_cuda(
-    torch::Tensor dout,
-    torch::Tensor u,
+    torch::Tensor dout_original_blh, // Renamed for clarity: this is dout as received (B, L_out, D)
+    torch::Tensor u_original_blh,   // Renamed for clarity: this is u as received (B, L_in, D)
     torch::Tensor weight,
     torch::Tensor bias,
     uint padding)
 {
-    const uint b = u.size(0);
-    const uint l = u.size(1);
-    const uint d = u.size(2);
+    const uint b = u_original_blh.size(0);
+    const uint l_in = u_original_blh.size(1); // Renamed 'l' to 'l_in' for clarity
+    const uint d = u_original_blh.size(2);
+
+    const uint k = weight.squeeze().size(0); // Assuming weight is K, D for BLH (or K for depthwise)
+
+    // Calculate L_out from the original dout tensor's actual shape
+    // Or, calculate based on the forward pass logic for robustness:
+    // This requires knowing the kernel size 'k' for the forward pass from `weight`
+    // assuming stride=1:
+    uint l_out = l_in + 2 * padding - k + 1; // Calculate L_out based on L_in, padding, and kernel
+
+    // Assert that the calculated l_out matches the actual dout_original_blh.size(1) for safety
+    // TORCH_CHECK(l_out == dout_original_blh.size(1), "Mismatch between calculated L_out and dout tensor's length");
 
 
-    const uint k = weight.squeeze().size(0);
-    
     dim3 blockDims(BX, 1, 1);
+    // gridDims for du calculation (over L_in, D, B)
+    // The 'dk' part in the same kernel uses blockIdx.x as 'k_idx', which conflicts with L_in.
+    // This setup will lead to incorrect dk results if L_in != K.
+    dim3 gridDims(l_in, d, b);
+
+
+    // dout_original_blh is (B, L_out, D). Transpose it for the kernel (B, D, L_out)
+    torch::Tensor dout_transposed = dout_original_blh.transpose(-1,-2);
+
+    // du should have the same shape as the input u (B, L_in, D)
+    torch::Tensor du = torch::empty({b, l_in, d}, u_original_blh.options());
+
+    // dk intermediate. It should be (B, D, K, L_out) to match dout_transposed's L_out dimension for matmul
+    torch::Tensor dk = torch::empty({b, d, k, l_out}, u_original_blh.options());
 
-    dim3 gridDims(l, d, b);
+    // Bias gradient is summed over B and L_out (dimensions 0 and 1 of dout_original_blh)
+    torch::Tensor dbias = dout_original_blh.sum(-2).sum(0);
 
-    torch::Tensor du = torch::empty({b, l, d}, u.options());
-    torch::Tensor dk = torch::empty({b, d, k, l}, u.options());
-    torch::Tensor dbias = dout.sum(-2).sum(0);
-    dout = dout.transpose(-1,-2);
 
-    DISPATCH_FLOAT_AND_HALF_AND_BF16(dout.scalar_type(), weight.scalar_type(),
+    DISPATCH_FLOAT_AND_HALF_AND_BF16(dout_transposed.scalar_type(), weight.scalar_type(),
         "depthwise conv 1d backward blh",
         ([&]
             { conv1d_backward_kernel<input_t, weight_t><<<gridDims, blockDims>>>(
-                    static_cast<input_t *>(dout.data_ptr()),
-                    dout.stride(0),
-                    dout.stride(1),
-                    dout.stride(2),
-                    static_cast<input_t *>(u.data_ptr()),
+                    static_cast<input_t *>(dout_transposed.data_ptr()), // Pass transposed dout
+                    dout_transposed.stride(0),
+                    dout_transposed.stride(1),
+                    dout_transposed.stride(2),
+                    static_cast<input_t *>(u_original_blh.data_ptr()),
                     static_cast<weight_t *>(weight.data_ptr()),
                     weight.stride(0),
                     weight.stride(1),
                     static_cast<input_t *>(du.data_ptr()),
                     static_cast<input_t *>(dk.data_ptr()),
                     b,
-                    l,
+                    l_in,
                     d,
                     k,
-                    padding); 
+                    padding,
+                    l_out
+                    );
             }
         )
     );
 
-    return {du, torch::matmul(dk, dout.unsqueeze(-1)).squeeze(-1).sum(0).view({k, d}).to(weight.dtype()), dbias};
+    // dk has shape (B, D, K, L_out)
+    // dout_transposed has shape (B, D, L_out) -> unsqueeze to (B, D, L_out, 1)
+    // matmul(dk, dout_transposed.unsqueeze(-1)) results in (B, D, K, 1)
+    // squeeze(-1) results in (B, D, K)
+    // sum(0) results in (D, K)
+    // view({k, d}) transposes it to (K, D) which is the expected weight gradient shape for BLH format
+    return {du, torch::matmul(dk, dout_transposed.unsqueeze(-1)).squeeze(-1).sum(0).view({k, d}).to(weight.dtype()), dbias};
 }