Skip to content

Commit 949acdd

Browse files
committed
data-parallel patched ALP standalone kernel
Signed-off-by: Andrew Duffy <andrew@a10y.dev>
1 parent be2a14b commit 949acdd

4 files changed

Lines changed: 358 additions & 107 deletions

File tree

vortex-cuda/kernels/src/alp.cu

Lines changed: 69 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,77 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
#include "scalar_kernel.cuh"
5-
6-
// ALP (Adaptive Lossless floating-Point) decode operation.
7-
// Converts integers to floats by multiplying by precomputed exponent factors.
8-
// Formula: decoded = (float)encoded * f * e
9-
// Where f = F10[exponents.f] and e = IF10[exponents.e] are passed directly.
10-
template <typename EncodedT, typename FloatT>
11-
struct AlpOp {
12-
FloatT f; // F10[exponents.f] - power of 10
13-
FloatT e; // IF10[exponents.e] - inverse power of 10
14-
15-
__device__ inline FloatT operator()(EncodedT value) const {
16-
return static_cast<FloatT>(value) * f * e;
4+
#include "patches.cuh"
5+
6+
// ALP (Adaptive Lossless floating-Point) decode: out[i] = (FloatT)in[i] * f * e.
7+
//
8+
// Each block processes one 1024-element chunk cooperatively and applies patches
9+
// into shared memory before writing to global memory, mirroring the strategy
10+
// used by bit_unpack. f = F10[exponents.f], e = IF10[exponents.e].
11+
//
12+
// The cast from EncT to FloatT must preserve ALP's lossless contract: f32 is
13+
// only encoded as i32, and f64 is only encoded as i64. The i64 → double cast
14+
// is lossless for all values ALP can produce.
15+
template <typename EncT, typename FloatT>
16+
__device__ void _alp_device(const EncT *__restrict in, FloatT *__restrict out, FloatT f,
17+
FloatT e, uint64_t array_len, int thread_idx, GPUPatches &patches) {
18+
constexpr int ThreadCount = 32;
19+
__shared__ FloatT shared_out[1024];
20+
21+
constexpr int per_thread = 1024 / ThreadCount;
22+
uint64_t chunk_base = static_cast<uint64_t>(blockIdx.x) * 1024;
23+
24+
// Step 1: decode the chunk into shared memory. The tail block is bounds-checked;
25+
// all interior blocks take the fast path with no per-element branch.
26+
if (chunk_base + 1024 <= array_len) {
27+
#pragma unroll
28+
for (int i = 0; i < per_thread; i++) {
29+
int idx = i * ThreadCount + thread_idx;
30+
shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
31+
}
32+
} else {
33+
#pragma unroll
34+
for (int i = 0; i < per_thread; i++) {
35+
int idx = i * ThreadCount + thread_idx;
36+
uint64_t global_idx = chunk_base + static_cast<uint64_t>(idx);
37+
if (global_idx < array_len) {
38+
shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
39+
} else {
40+
shared_out[idx] = FloatT{};
41+
}
42+
}
1743
}
18-
};
19-
20-
// Macro to generate ALP kernel for each type combination.
21-
// Input is integer (encoded), output is float (decoded).
22-
#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \
23-
extern "C" __global__ void alp_##enc_suffix##_##float_suffix(const EncType *__restrict encoded, \
24-
FloatType *__restrict decoded, \
25-
FloatType f, \
26-
FloatType e, \
27-
uint64_t array_len) { \
28-
scalar_kernel(encoded, decoded, array_len, AlpOp<EncType, FloatType> {f, e}); \
44+
__syncwarp();
45+
46+
// Step 2: apply patches in parallel across the warp.
47+
PatchesCursor<FloatT> cursor(patches, blockIdx.x, thread_idx, static_cast<uint32_t>(ThreadCount));
48+
auto patch = cursor.next();
49+
while (patch.index != 1024) {
50+
shared_out[patch.index] = patch.value;
51+
patch = cursor.next();
2952
}
53+
__syncwarp();
3054

31-
// f32 variants (ALP for f32 encodes as i32 or i64)
32-
GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
33-
GENERATE_ALP_KERNEL(i64, f32, int64_t, float)
55+
// Step 3: coalesced write-out. Slop past `array_len` in the tail chunk is
56+
// overwritten harmlessly; the caller slices the final buffer to `array_len`.
57+
#pragma unroll
58+
for (int i = 0; i < per_thread; i++) {
59+
int idx = i * ThreadCount + thread_idx;
60+
out[idx] = shared_out[idx];
61+
}
62+
}
63+
64+
#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncT, FloatT) \
65+
extern "C" __global__ void alp_##enc_suffix##_##float_suffix##_32t( \
66+
const EncT *__restrict full_in, FloatT *__restrict full_out, FloatT f, FloatT e, \
67+
uint64_t array_len, GPUPatches patches) { \
68+
int thread_idx = threadIdx.x; \
69+
auto in = full_in + (blockIdx.x * 1024); \
70+
auto out = full_out + (blockIdx.x * 1024); \
71+
_alp_device<EncT, FloatT>(in, out, f, e, array_len, thread_idx, patches); \
72+
}
3473

35-
// f64 variants (ALP for f64 encodes as i64)
74+
// The only ALPInt bindings produced by the encoder are (f32, i32) and (f64, i64).
75+
// i64 → double is lossless; i32 → float is lossless for all values ALP emits.
76+
GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
3677
GENERATE_ALP_KERNEL(i64, f64, int64_t, double)

0 commit comments

Comments
 (0)