vortex-data
diff --git a/‎vortex-cuda/kernels/src/alp.cu‎
Lines changed: 69 additions & 28 deletions b/‎vortex-cuda/kernels/src/alp.cu‎
Lines changed: 69 additions & 28 deletions
@@ -1,36 +1,77 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-#include "scalar_kernel.cuh"
-
-// ALP (Adaptive Lossless floating-Point) decode operation.
-// Converts integers to floats by multiplying by precomputed exponent factors.
-// Formula: decoded = (float)encoded * f * e
-// Where f = F10[exponents.f] and e = IF10[exponents.e] are passed directly.
-template <typename EncodedT, typename FloatT>
-struct AlpOp {
-    FloatT f; // F10[exponents.f] - power of 10
-    FloatT e; // IF10[exponents.e] - inverse power of 10
-
-    __device__ inline FloatT operator()(EncodedT value) const {
-        return static_cast<FloatT>(value) * f * e;
+#include "patches.cuh"
+
+// ALP (Adaptive Lossless floating-Point) decode: out[i] = (FloatT)in[i] * f * e.
+//
+// Each block processes one 1024-element chunk cooperatively and applies patches
+// into shared memory before writing to global memory, mirroring the strategy
+// used by bit_unpack. f = F10[exponents.f], e = IF10[exponents.e].
+//
+// The cast from EncT to FloatT must preserve ALP's lossless contract: f32 is
+// only encoded as i32, and f64 is only encoded as i64. The i64 → double cast
+// is lossless for all values ALP can produce.
+template <typename EncT, typename FloatT>
+__device__ void _alp_device(const EncT *__restrict in, FloatT *__restrict out, FloatT f,
+                            FloatT e, uint64_t array_len, int thread_idx, GPUPatches &patches) {
+    constexpr int ThreadCount = 32;
+    __shared__ FloatT shared_out[1024];
+
+    constexpr int per_thread = 1024 / ThreadCount;
+    uint64_t chunk_base = static_cast<uint64_t>(blockIdx.x) * 1024;
+
+    // Step 1: decode the chunk into shared memory. The tail block is bounds-checked;
+    // all interior blocks take the fast path with no per-element branch.
+    if (chunk_base + 1024 <= array_len) {
+        #pragma unroll
+        for (int i = 0; i < per_thread; i++) {
+            int idx = i * ThreadCount + thread_idx;
+            shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
+        }
+    } else {
+        #pragma unroll
+        for (int i = 0; i < per_thread; i++) {
+            int idx = i * ThreadCount + thread_idx;
+            uint64_t global_idx = chunk_base + static_cast<uint64_t>(idx);
+            if (global_idx < array_len) {
+                shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
+            } else {
+                shared_out[idx] = FloatT{};
+            }
+        }
     }
-};
-
-// Macro to generate ALP kernel for each type combination.
-// Input is integer (encoded), output is float (decoded).
-#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType)                                    \
-    extern "C" __global__ void alp_##enc_suffix##_##float_suffix(const EncType *__restrict encoded,          \
-                                                                 FloatType *__restrict decoded,              \
-                                                                 FloatType f,                                \
-                                                                 FloatType e,                                \
-                                                                 uint64_t array_len) {                       \
-        scalar_kernel(encoded, decoded, array_len, AlpOp<EncType, FloatType> {f, e});                        \
+    __syncwarp();
+
+    // Step 2: apply patches in parallel across the warp.
+    PatchesCursor<FloatT> cursor(patches, blockIdx.x, thread_idx, static_cast<uint32_t>(ThreadCount));
+    auto patch = cursor.next();
+    while (patch.index != 1024) {
+        shared_out[patch.index] = patch.value;
+        patch = cursor.next();
     }
+    __syncwarp();
 
-// f32 variants (ALP for f32 encodes as i32 or i64)
-GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
-GENERATE_ALP_KERNEL(i64, f32, int64_t, float)
+    // Step 3: coalesced write-out. Slop past `array_len` in the tail chunk is
+    // overwritten harmlessly; the caller slices the final buffer to `array_len`.
+    #pragma unroll
+    for (int i = 0; i < per_thread; i++) {
+        int idx = i * ThreadCount + thread_idx;
+        out[idx] = shared_out[idx];
+    }
+}
+
+#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncT, FloatT)                          \
+    extern "C" __global__ void alp_##enc_suffix##_##float_suffix##_32t(                      \
+        const EncT *__restrict full_in, FloatT *__restrict full_out, FloatT f, FloatT e,     \
+        uint64_t array_len, GPUPatches patches) {                                            \
+        int thread_idx = threadIdx.x;                                                        \
+        auto in = full_in + (blockIdx.x * 1024);                                             \
+        auto out = full_out + (blockIdx.x * 1024);                                           \
+        _alp_device<EncT, FloatT>(in, out, f, e, array_len, thread_idx, patches);            \
+    }
 
-// f64 variants (ALP for f64 encodes as i64)
+// The only ALPInt bindings produced by the encoder are (f32, i32) and (f64, i64).
+// i64 → double is lossless; i32 → float is lossless for all values ALP emits.
+GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
 GENERATE_ALP_KERNEL(i64, f64, int64_t, double)