Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 22 additions & 28 deletions csrc/bindings.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ extern "C" void day10_conv1d(const float* input, const float* kernel, float* out
extern "C" void day11_softmax(const float* input, float* output, int feature_size);
extern "C" void day12_layernorm(const float* input, float* output, const float* gamma, const float* beta, int feature_size, float eps);
extern "C" void day13_rmsnorm(const float* input, float* output, const float* weight, int feature_size, float eps);
extern "C" void day14_fused_softmax(const float* input, float* output, const float* mask, int batch_size, int seq_len, int feature_size, float scale);
extern "C" void day15_fused_attention(const float* Q, const float* K, const float* V, float* output, const float* mask, int batch_size, int num_heads, int seq_len, int head_dim, float scale);
extern "C" void day14_fused_softmax(const float* input, float* output, const float* mask, int seq_len, int feature_size, float scale);
extern "C" void day15_fused_attention(const float* Q, const float* K, const float* V, float* output, const float* mask, int num_heads, int seq_len, int head_dim, float scale);
extern "C" void day16_group_gemm(const float* A, const float* B, float* C, int num_groups, int M, int N, int K);
extern "C" void day17_persistent_matmul(const float* A, const float* B, float* C, int M, int N, int K);
extern "C" void day18_block_scaled_matmul(const float* A, const float* B, float* C, int M, int N, int K, float scale);
extern "C" void day19_rope(const float* query, const float* key, float* rotated_query, float* rotated_key, const float* cos_cache, const float* sin_cache, int batch_size, int num_heads, int seq_len, int head_dim);
extern "C" void day20_conv2d(const float* input, const float* kernel, float* output, int batch_size, int in_channels, int out_channels, int input_h, int input_w, int kernel_h, int kernel_w, int output_h, int output_w, int pad_h, int pad_w, int stride_h, int stride_w);
extern "C" void day19_rope(const float* query, const float* key, float* rotated_query, float* rotated_key, const float* cos_cache, const float* sin_cache, int num_heads, int seq_len, int head_dim);
extern "C" void day20_conv2d(const float* input, const float* kernel, float* output, int in_channels, int out_channels, int input_h, int input_w, int kernel_h, int kernel_w, int output_h, int output_w, int pad_h, int pad_w, int stride_h, int stride_w);

// Helper macro for CUDA error checking
#define CUDA_CHECK(call) \
Expand Down Expand Up @@ -243,11 +243,10 @@ torch::Tensor day13_rmsnorm_wrapper(torch::Tensor input, torch::Tensor weight, f
torch::Tensor day14_fused_softmax_wrapper(torch::Tensor input, torch::Tensor mask, float scale) {
TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor");
TORCH_CHECK(input.dtype() == torch::kFloat32, "Input must be float32");
TORCH_CHECK(input.dim() == 3, "Input must be 3D tensor (batch_size, seq_len, feature_size)");
TORCH_CHECK(input.dim() == 2, "Input must be 2D tensor (seq_len, feature_size)");

int batch_size = input.size(0);
int seq_len = input.size(1);
int feature_size = input.size(2);
int seq_len = input.size(0);
int feature_size = input.size(1);

// Default mask if not provided (empty tensor)
const float* mask_ptr = nullptr;
Expand All @@ -262,7 +261,6 @@ torch::Tensor day14_fused_softmax_wrapper(torch::Tensor input, torch::Tensor mas
input.data_ptr<float>(),
output.data_ptr<float>(),
mask_ptr,
batch_size,
seq_len,
feature_size,
scale
Expand All @@ -275,12 +273,11 @@ torch::Tensor day14_fused_softmax_wrapper(torch::Tensor input, torch::Tensor mas
torch::Tensor day15_fused_attention_wrapper(torch::Tensor Q, torch::Tensor K, torch::Tensor V, torch::Tensor mask, float scale) {
TORCH_CHECK(Q.is_cuda() && K.is_cuda() && V.is_cuda(), "Inputs must be CUDA tensors");
TORCH_CHECK(Q.dtype() == torch::kFloat32 && K.dtype() == torch::kFloat32 && V.dtype() == torch::kFloat32, "Inputs must be float32");
TORCH_CHECK(Q.dim() == 4, "Q, K, V must be 4D tensors (batch_size, num_heads, seq_len, head_dim)");
TORCH_CHECK(Q.dim() == 3, "Q, K, V must be 3D tensors (num_heads, seq_len, head_dim)");

int batch_size = Q.size(0);
int num_heads = Q.size(1);
int seq_len = Q.size(2);
int head_dim = Q.size(3);
int num_heads = Q.size(0);
int seq_len = Q.size(1);
int head_dim = Q.size(2);

// Default scale if not provided
if (scale <= 0.0f) {
Expand All @@ -302,7 +299,6 @@ torch::Tensor day15_fused_attention_wrapper(torch::Tensor Q, torch::Tensor K, to
V.data_ptr<float>(),
output.data_ptr<float>(),
mask_ptr,
batch_size,
num_heads,
seq_len,
head_dim,
Expand Down Expand Up @@ -392,12 +388,11 @@ std::tuple<torch::Tensor, torch::Tensor> day19_rope_wrapper(
) {
TORCH_CHECK(query.is_cuda() && key.is_cuda(), "Inputs must be CUDA tensors");
TORCH_CHECK(query.dtype() == torch::kFloat32 && key.dtype() == torch::kFloat32, "Inputs must be float32");
TORCH_CHECK(query.dim() == 4 && key.dim() == 4, "Query and key must be 4D tensors (batch_size, num_heads, seq_len, head_dim)");
TORCH_CHECK(query.dim() == 3 && key.dim() == 3, "Query and key must be 3D tensors (num_heads, seq_len, head_dim)");

int batch_size = query.size(0);
int num_heads = query.size(1);
int seq_len = query.size(2);
int head_dim = query.size(3);
int num_heads = query.size(0);
int seq_len = query.size(1);
int head_dim = query.size(2);

torch::Tensor rotated_query = torch::zeros_like(query);
torch::Tensor rotated_key = torch::zeros_like(key);
Expand All @@ -409,7 +404,7 @@ std::tuple<torch::Tensor, torch::Tensor> day19_rope_wrapper(
rotated_key.data_ptr<float>(),
cos_cache.data_ptr<float>(),
sin_cache.data_ptr<float>(),
batch_size, num_heads, seq_len, head_dim
num_heads, seq_len, head_dim
);

return std::make_tuple(rotated_query, rotated_key);
Expand All @@ -426,13 +421,12 @@ torch::Tensor day20_conv2d_wrapper(
) {
TORCH_CHECK(input.is_cuda() && kernel.is_cuda(), "Inputs must be CUDA tensors");
TORCH_CHECK(input.dtype() == torch::kFloat32 && kernel.dtype() == torch::kFloat32, "Inputs must be float32");
TORCH_CHECK(input.dim() == 4, "Input must be 4D tensor (batch_size, in_channels, height, width)");
TORCH_CHECK(input.dim() == 3, "Input must be 3D tensor (in_channels, height, width)");
TORCH_CHECK(kernel.dim() == 4, "Kernel must be 4D tensor (out_channels, in_channels, kernel_h, kernel_w)");

int batch_size = input.size(0);
int in_channels = input.size(1);
int input_h = input.size(2);
int input_w = input.size(3);
int in_channels = input.size(0);
int input_h = input.size(1);
int input_w = input.size(2);

int out_channels = kernel.size(0);
int kernel_h = kernel.size(2);
Expand All @@ -443,13 +437,13 @@ torch::Tensor day20_conv2d_wrapper(

TORCH_CHECK(output_h > 0 && output_w > 0, "Output dimensions must be positive");

torch::Tensor output = torch::zeros({batch_size, out_channels, output_h, output_w}, input.options());
torch::Tensor output = torch::zeros({out_channels, output_h, output_w}, input.options());

day20_conv2d(
input.data_ptr<float>(),
kernel.data_ptr<float>(),
output.data_ptr<float>(),
batch_size, in_channels, out_channels,
in_channels, out_channels,
input_h, input_w, kernel_h, kernel_w,
output_h, output_w,
pad_h, pad_w, stride_h, stride_w
Expand Down
18 changes: 8 additions & 10 deletions csrc/day14_fused_softmax.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,26 @@
// 2. 추가 연산 (scale, mask 등)을 같은 커널에서 처리
// 3. 메모리 접근 최소화
//
// 입력: input (batch_size, seq_len, feature_size)
// 입력: input (seq_len, feature_size) - batch_size는 항상 1
// scale (optional) - scaling factor
// mask (optional) - attention mask
// 출력: output (batch_size, seq_len, feature_size)
// 출력: output (seq_len, feature_size)

__global__ void fused_softmax_kernel(
const float* input,
float* output,
const float* mask,
int batch_size,
int seq_len,
int feature_size,
float scale
) {
// TODO: 구현하세요
// Fused operations: scale -> mask -> softmax
int batch_idx = blockIdx.z;
int seq_idx = blockIdx.y;
int seq_idx = blockIdx.x;
int feature_idx = threadIdx.x;

if (batch_idx < batch_size && seq_idx < seq_len && feature_idx < feature_size) {
int idx = batch_idx * seq_len * feature_size + seq_idx * feature_size + feature_idx;
if (seq_idx < seq_len && feature_idx < feature_size) {
int idx = seq_idx * feature_size + feature_idx;
// TODO: Fused Softmax 계산
output[idx] = input[idx];
}
Expand All @@ -42,17 +40,17 @@ extern "C" void day14_fused_softmax(
const float* input,
float* output,
const float* mask,
int batch_size,
int seq_len,
int feature_size,
float scale
) {
// TODO: kernel launch configuration 설정
// batch_size는 항상 1이므로 제거
dim3 threadsPerBlock(feature_size);
dim3 blocksPerGrid(seq_len, batch_size);
dim3 blocksPerGrid(seq_len);

fused_softmax_kernel<<<blocksPerGrid, threadsPerBlock>>>(
input, output, mask, batch_size, seq_len, feature_size, scale
input, output, mask, seq_len, feature_size, scale
);
cudaDeviceSynchronize();
}
22 changes: 9 additions & 13 deletions csrc/day15_fused_attention.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,38 +13,34 @@
//
// 모든 연산을 하나의 커널로 융합하여 성능 최적화
//
// 입력: Q (batch_size, num_heads, seq_len, head_dim)
// K (batch_size, num_heads, seq_len, head_dim)
// V (batch_size, num_heads, seq_len, head_dim)
// 입력: Q (num_heads, seq_len, head_dim) - batch_size는 항상 1
// K (num_heads, seq_len, head_dim)
// V (num_heads, seq_len, head_dim)
// mask (optional) - attention mask
// 출력: output (batch_size, num_heads, seq_len, head_dim)
// 출력: output (num_heads, seq_len, head_dim)

__global__ void fused_attention_kernel(
const float* Q,
const float* K,
const float* V,
float* output,
const float* mask,
int batch_size,
int num_heads,
int seq_len,
int head_dim,
float scale
) {
// TODO: 구현하세요
// Fused Attention: QK^T -> scale -> mask -> softmax -> @V
int batch_idx = blockIdx.z;
int head_idx = blockIdx.y;
int seq_idx = blockIdx.x;

// TODO: Attention 계산
// 각 thread는 하나의 head_dim element를 처리할 수 있습니다
int feature_idx = threadIdx.x;

if (batch_idx < batch_size && head_idx < num_heads &&
seq_idx < seq_len && feature_idx < head_dim) {
int idx = batch_idx * num_heads * seq_len * head_dim +
head_idx * seq_len * head_dim +
if (head_idx < num_heads && seq_idx < seq_len && feature_idx < head_dim) {
int idx = head_idx * seq_len * head_dim +
seq_idx * head_dim +
feature_idx;
// TODO: Fused Attention 계산
Expand All @@ -58,18 +54,18 @@ extern "C" void day15_fused_attention(
const float* V,
float* output,
const float* mask,
int batch_size,
int num_heads,
int seq_len,
int head_dim,
float scale
) {
// TODO: kernel launch configuration 설정
// batch_size는 항상 1이므로 제거
dim3 threadsPerBlock(head_dim);
dim3 blocksPerGrid(seq_len, num_heads, batch_size);
dim3 blocksPerGrid(seq_len, num_heads);

fused_attention_kernel<<<blocksPerGrid, threadsPerBlock>>>(
Q, K, V, output, mask, batch_size, num_heads, seq_len, head_dim, scale
Q, K, V, output, mask, num_heads, seq_len, head_dim, scale
);
cudaDeviceSynchronize();
}
18 changes: 7 additions & 11 deletions csrc/day19_rope.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
// 2. 위치 정보를 이용한 회전 변환
// 3. Query와 Key에 각각 적용
//
// 입력: query (batch_size, num_heads, seq_len, head_dim)
// key (batch_size, num_heads, seq_len, head_dim)
// 입력: query (num_heads, seq_len, head_dim) - batch_size는 항상 1
// key (num_heads, seq_len, head_dim)
// cos_cache (seq_len, head_dim / 2)
// sin_cache (seq_len, head_dim / 2)
// 출력: rotated_query, rotated_key
Expand All @@ -23,22 +23,18 @@ __global__ void rope_kernel(
float* rotated_key,
const float* cos_cache,
const float* sin_cache,
int batch_size,
int num_heads,
int seq_len,
int head_dim
) {
// TODO: 구현하세요
int batch_idx = blockIdx.z;
int head_idx = blockIdx.y;
int seq_idx = blockIdx.x;
int dim_idx = threadIdx.x;

if (batch_idx < batch_size && head_idx < num_heads &&
seq_idx < seq_len && dim_idx < head_dim) {
if (head_idx < num_heads && seq_idx < seq_len && dim_idx < head_dim) {
// TODO: RoPE 계산
int q_idx = batch_idx * num_heads * seq_len * head_dim +
head_idx * seq_len * head_dim +
int q_idx = head_idx * seq_len * head_dim +
seq_idx * head_dim +
dim_idx;
rotated_query[q_idx] = query[q_idx];
Expand All @@ -53,19 +49,19 @@ extern "C" void day19_rope(
float* rotated_key,
const float* cos_cache,
const float* sin_cache,
int batch_size,
int num_heads,
int seq_len,
int head_dim
) {
// TODO: kernel launch configuration 설정
// batch_size는 항상 1이므로 제거
dim3 threadsPerBlock(head_dim);
dim3 blocksPerGrid(seq_len, num_heads, batch_size);
dim3 blocksPerGrid(seq_len, num_heads);

rope_kernel<<<blocksPerGrid, threadsPerBlock>>>(
query, key, rotated_query, rotated_key,
cos_cache, sin_cache,
batch_size, num_heads, seq_len, head_dim
num_heads, seq_len, head_dim
);
cudaDeviceSynchronize();
}
18 changes: 7 additions & 11 deletions csrc/day20_conv2d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,15 @@
// 2. 메모리 타일링 최적화
// 3. Shared memory 활용
//
// 입력: input (batch_size, in_channels, height, width)
// 입력: input (in_channels, height, width) - batch_size는 항상 1
// kernel (out_channels, in_channels, kernel_h, kernel_w)
// padding, stride
// 출력: output (batch_size, out_channels, out_height, out_width)
// 출력: output (out_channels, out_height, out_width)

__global__ void conv2d_kernel(
const float* input,
const float* kernel,
float* output,
int batch_size,
int in_channels,
int out_channels,
int input_h,
Expand All @@ -34,18 +33,15 @@ __global__ void conv2d_kernel(
) {
// TODO: 구현하세요
// 2D 컨볼루션 계산
int batch_idx = blockIdx.z;
int out_channel_idx = blockIdx.y;
int out_row = blockIdx.x / output_w;
int out_col = blockIdx.x % output_w;

int thread_idx = threadIdx.x;

if (batch_idx < batch_size && out_channel_idx < out_channels &&
out_row < output_h && out_col < output_w) {
if (out_channel_idx < out_channels && out_row < output_h && out_col < output_w) {
// TODO: 2D Convolution 계산
int out_idx = batch_idx * out_channels * output_h * output_w +
out_channel_idx * output_h * output_w +
int out_idx = out_channel_idx * output_h * output_w +
out_row * output_w +
out_col;
output[out_idx] = 0.0f;
Expand All @@ -56,7 +52,6 @@ extern "C" void day20_conv2d(
const float* input,
const float* kernel,
float* output,
int batch_size,
int in_channels,
int out_channels,
int input_h,
Expand All @@ -71,12 +66,13 @@ extern "C" void day20_conv2d(
int stride_w
) {
// TODO: kernel launch configuration 설정
// batch_size는 항상 1이므로 제거
dim3 threadsPerBlock(256);
dim3 blocksPerGrid(output_h * output_w, out_channels, batch_size);
dim3 blocksPerGrid(output_h * output_w, out_channels);

conv2d_kernel<<<blocksPerGrid, threadsPerBlock>>>(
input, kernel, output,
batch_size, in_channels, out_channels,
in_channels, out_channels,
input_h, input_w, kernel_h, kernel_w,
output_h, output_w,
pad_h, pad_w, stride_h, stride_w
Expand Down
Loading
Loading