Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 3 additions & 16 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,33 +11,20 @@ concurrency:
jobs:
pre-commit:
runs-on: ubuntu-latest
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
token: ${{ secrets.GH_PAT }}

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'

- name: Install uv
uses: astral-sh/setup-uv@v4
with:
version: "latest"

- name: Install dependencies
run: |
uv sync --extra dev

- name: Run mypy
- name: Install pre-commit
run: |
uv run mypy src/gpu_20days --ignore-missing-imports || true
pip install pre-commit

- name: Run pre-commit hooks
run: |
uv run pre-commit run --from-ref=origin/${{ github.base_ref }} --to-ref=HEAD || true
pre-commit run --from-ref=origin/${{ github.base_ref }} --to-ref=HEAD
16 changes: 6 additions & 10 deletions csrc/bindings.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ extern "C" void day08_relu(const float* input, float* output, int N);
extern "C" void day09_silu(const float* input, float* output, int N);
extern "C" void day10_conv1d(const float* input, const float* kernel, float* output, int input_size, int kernel_size);
extern "C" void day11_softmax(const float* input, float* output, int feature_size);
extern "C" void day12_layernorm(const float* input, float* output, const float* gamma, const float* beta, int batch_size, int feature_size, float eps);
extern "C" void day13_rmsnorm(const float* input, float* output, const float* weight, int batch_size, int feature_size, float eps);
extern "C" void day12_layernorm(const float* input, float* output, const float* gamma, const float* beta, int feature_size, float eps);
extern "C" void day13_rmsnorm(const float* input, float* output, const float* weight, int feature_size, float eps);
extern "C" void day14_fused_softmax(const float* input, float* output, const float* mask, int batch_size, int seq_len, int feature_size, float scale);
extern "C" void day15_fused_attention(const float* Q, const float* K, const float* V, float* output, const float* mask, int batch_size, int num_heads, int seq_len, int head_dim, float scale);
extern "C" void day16_group_gemm(const float* A, const float* B, float* C, int num_groups, int M, int N, int K);
Expand Down Expand Up @@ -187,10 +187,9 @@ torch::Tensor day11_softmax_wrapper(torch::Tensor input) {
torch::Tensor day12_layernorm_wrapper(torch::Tensor input, torch::Tensor gamma, torch::Tensor beta, float eps) {
TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor");
TORCH_CHECK(input.dtype() == torch::kFloat32, "Input must be float32");
TORCH_CHECK(input.dim() == 2, "Input must be 2D tensor (batch_size, feature_size)");
TORCH_CHECK(input.dim() == 1, "Input must be 1D tensor (feature_size)");

int batch_size = input.size(0);
int feature_size = input.size(1);
int feature_size = input.size(0);

// Default gamma and beta if not provided
if (gamma.numel() == 0) {
Expand All @@ -207,7 +206,6 @@ torch::Tensor day12_layernorm_wrapper(torch::Tensor input, torch::Tensor gamma,
output.data_ptr<float>(),
gamma.data_ptr<float>(),
beta.data_ptr<float>(),
batch_size,
feature_size,
eps
);
Expand All @@ -219,10 +217,9 @@ torch::Tensor day12_layernorm_wrapper(torch::Tensor input, torch::Tensor gamma,
torch::Tensor day13_rmsnorm_wrapper(torch::Tensor input, torch::Tensor weight, float eps) {
TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor");
TORCH_CHECK(input.dtype() == torch::kFloat32, "Input must be float32");
TORCH_CHECK(input.dim() == 2, "Input must be 2D tensor (batch_size, feature_size)");
TORCH_CHECK(input.dim() == 1, "Input must be 1D tensor (feature_size)");

int batch_size = input.size(0);
int feature_size = input.size(1);
int feature_size = input.size(0);

// Default weight if not provided
if (weight.numel() == 0) {
Expand All @@ -235,7 +232,6 @@ torch::Tensor day13_rmsnorm_wrapper(torch::Tensor input, torch::Tensor weight, f
input.data_ptr<float>(),
output.data_ptr<float>(),
weight.data_ptr<float>(),
batch_size,
feature_size,
eps
);
Expand Down
43 changes: 17 additions & 26 deletions csrc/day12_layernorm.cu
Original file line number Diff line number Diff line change
@@ -1,55 +1,46 @@
#include <cuda_runtime.h>
#define ceil(x, y) (((x) + (y) - 1) / (y))

// TODO: LayerNorm kernel 구현
// LayerNorm(x) = gamma * (x - mean) / sqrt(variance + eps) + beta
//
// 힌트:
// 1. 각 row의 평균을 계산합니다 (reduction)
// 2. 각 row의 분산을 계산합니다 (reduction)
// 3. 정규화: (x - mean) / sqrt(variance + eps)
// 4. affine transformation: gamma * normalized + beta
//
// 입력: input (batch_size, feature_size)
// gamma (feature_size) - optional, 없으면 1.0 사용
// beta (feature_size) - optional, 없으면 0.0 사용
// eps - 작은 상수 (기본값 1e-5)
// 출력: output (batch_size, feature_size)

// LayerNorm over a single 1-D vector (batch_size is always 1 in this project):
//   output[i] = gamma[i] * (input[i] - mean) / sqrt(variance + eps) + beta[i]
//
// Launch: 1-D grid, one thread per feature element; threads past
// feature_size exit via the bounds check. All pointers are device memory.
//
// NOTE(review): every thread redoes the full mean/variance loops over the
// whole vector, so total work is O(feature_size^2). A shared-memory or
// warp-shuffle reduction would make it O(feature_size); kept serial here
// for clarity.
__global__ void layernorm_kernel(
    const float* input,
    float* output,
    const float* gamma,
    const float* beta,
    int feature_size,
    float eps
) {
    int feature_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (feature_idx >= feature_size) return;

    // Mean of the whole vector.
    float mean = 0.0f;
    for (int i = 0; i < feature_size; i++) {
        mean += input[i];
    }
    mean /= feature_size;

    // Variance with divisor N (biased estimator).
    float variance = 0.0f;
    for (int i = 0; i < feature_size; i++) {
        float d = input[i] - mean;
        variance += d * d;
    }
    variance /= feature_size;

    // sqrtf keeps the computation in float; bare sqrt promotes to double.
    output[feature_idx] = gamma[feature_idx] * (input[feature_idx] - mean)
                          / sqrtf(variance + eps) + beta[feature_idx];
}

// Host launcher for layernorm_kernel (batch_size is always 1, so only the
// feature dimension is passed). All pointers must point to device memory.
// Synchronous: blocks until the kernel finishes.
//
// NOTE(review): no cudaGetLastError()/return-code check after the launch —
// failures are silent. Worth adding an error-reporting path.
extern "C" void day12_layernorm(
    const float* input,
    float* output,
    const float* gamma,
    const float* beta,
    int feature_size,
    float eps
) {
    const int kBlockSize = 256;
    // Ceil-division so a final partial block covers the tail of the vector.
    // Written inline instead of the file's ceil(x, y) macro, which shadows
    // the standard math ceil().
    const int numBlocks = (feature_size + kBlockSize - 1) / kBlockSize;

    layernorm_kernel<<<numBlocks, kBlockSize>>>(
        input, output, gamma, beta, feature_size, eps
    );
    cudaDeviceSynchronize();
}
35 changes: 13 additions & 22 deletions csrc/day13_rmsnorm.cu
Original file line number Diff line number Diff line change
@@ -1,52 +1,43 @@
#include <cuda_runtime.h>
#include <cmath>
#define BLOCKSIZE 256
#define ceil(x, y) (((x) + (y) - 1) / (y))

// TODO: RMS Normalization kernel 구현
// RMSNorm(x) = (x / sqrt(mean(x^2) + eps)) * weight
//
// 힌트:
// 1. 각 row의 x^2의 평균을 계산합니다 (RMS: Root Mean Square)
// 2. x / sqrt(mean(x^2) + eps) 계산
// 3. weight를 곱합니다
//
// 입력: input (batch_size, feature_size)
// weight (feature_size) - optional, 없으면 1.0 사용
// eps - 작은 상수 (기본값 1e-5)
// 출력: output (batch_size, feature_size)

// RMSNorm over a single 1-D vector (batch_size is always 1 in this project):
//   output[i] = input[i] / sqrt(mean(input^2) + eps) * weight[i]
//
// Launch: 1-D grid, one thread per feature element; threads past
// feature_size exit via the bounds check. All pointers are device memory.
//
// NOTE(review): every thread recomputes the full mean-of-squares loop, so
// total work is O(feature_size^2); a block-level reduction would make it
// O(feature_size).
__global__ void rmsnorm_kernel(
    const float* input,
    float* output,
    const float* weight,
    int feature_size,
    float eps
) {
    int feature_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (feature_idx >= feature_size) return;

    // Mean of squares (the "RMS" denominator before the sqrt).
    float mean_sq = 0.0f;
    for (int i = 0; i < feature_size; i++) {
        mean_sq += input[i] * input[i];
    }
    mean_sq /= feature_size;

    // sqrtf keeps the computation in float; bare sqrt promotes to double.
    output[feature_idx] = input[feature_idx] / sqrtf(mean_sq + eps)
                          * weight[feature_idx];
}

// Host launcher for rmsnorm_kernel (batch_size is always 1, so only the
// feature dimension is passed). All pointers must point to device memory.
// Synchronous: blocks until the kernel finishes.
//
// NOTE(review): no cudaGetLastError()/return-code check after the launch —
// failures are silent. Worth adding an error-reporting path.
extern "C" void day13_rmsnorm(
    const float* input,
    float* output,
    const float* weight,
    int feature_size,
    float eps
) {
    const int kBlockSize = 256;
    // Ceil-division so a final partial block covers the tail of the vector.
    // Written inline instead of the file's ceil(x, y) macro, which shadows
    // the standard math ceil().
    const int numBlocks = (feature_size + kBlockSize - 1) / kBlockSize;

    rmsnorm_kernel<<<numBlocks, kBlockSize>>>(
        input, output, weight, feature_size, eps
    );
    cudaDeviceSynchronize();
}
50 changes: 0 additions & 50 deletions src/gpu_20days/__init__.py

This file was deleted.

2 changes: 1 addition & 1 deletion src/gpu_20days/cuda_kernels.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def day11_softmax(input: torch.Tensor) -> torch.Tensor:
def day12_layernorm(
input: torch.Tensor, gamma: torch.Tensor = None, beta: torch.Tensor = None, eps: float = 1e-5
) -> torch.Tensor:
"""Day 12: Layer normalization"""
"""Day 12: Layer normalization (batch_size is always 1)"""
_check_cuda()
assert cuda_ops is not None # Type guard for mypy
if gamma is None:
Expand Down
56 changes: 38 additions & 18 deletions src/gpu_20days/day12_layernorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ def day12_layernorm_kernel(
output_ptr,
gamma_ptr,
beta_ptr,
batch_size,
feature_size,
eps,
BLOCK_SIZE: tl.constexpr,
Expand All @@ -23,26 +22,47 @@ def day12_layernorm_kernel(

LayerNorm(x) = gamma * (x - mean) / sqrt(variance + eps) + beta

힌트:
1. 각 row의 평균을 계산합니다 (reduction)
2. 각 row의 분산을 계산합니다 (reduction)
3. 정규화: (x - mean) / sqrt(variance + eps)
4. affine transformation: gamma * normalized + beta
"""
# TODO: 구현하세요
# batch_idx = tl.program_id(0)
# feature_idx = tl.arange(0, BLOCK_SIZE)
# mask = feature_idx < feature_size
pass

pid = tl.program_id(0)
feature_idx = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
mask = feature_idx < feature_size

# Compute mean
_mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
for off in range(0, feature_size, BLOCK_SIZE):
cols = off + tl.arange(0, BLOCK_SIZE)
mask_cols = cols < feature_size
a = tl.load(input_ptr + cols, mask=mask_cols, other=0.0).to(tl.float32)
_mean += tl.where(mask_cols, a, 0.0)
mean = tl.sum(_mean, axis=0) / feature_size

# Compute variance
_variance = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
for off in range(0, feature_size, BLOCK_SIZE):
cols = off + tl.arange(0, BLOCK_SIZE)
mask_cols = cols < feature_size
a = tl.load(input_ptr + cols, mask=mask_cols, other=0.0).to(tl.float32)
_variance += tl.where(mask_cols, (a - mean) * (a - mean), 0.0)
variance = tl.sum(_variance, axis=0) / feature_size

input_vals = tl.load(input_ptr + feature_idx, mask=mask, other=0.0)
gamma_vals = tl.load(gamma_ptr + feature_idx, mask=mask, other=1.0)
beta_vals = tl.load(beta_ptr + feature_idx, mask=mask, other=0.0)
normalized = (input_vals - mean) / tl.sqrt(variance + eps)
output = gamma_vals * normalized + beta_vals
tl.store(output_ptr + feature_idx, output, mask=mask)


def day12_layernorm(
input: torch.Tensor, gamma: torch.Tensor = None, beta: torch.Tensor = None, eps: float = 1e-5
) -> torch.Tensor:
"""Day 12: Layer normalization"""
# TODO: 구현하세요
"""Day 12: Layer normalization (batch_size is always 1)"""
BLOCK_SIZE = 256
batch_size, feature_size = input.shape
if input.dim() != 1:
raise ValueError("day12_layernorm expects 1D tensor (feature_size), batch_size is always 1")

feature_size = input.size(0)

if gamma is None:
gamma = torch.ones(feature_size, device=input.device, dtype=input.dtype)
Expand All @@ -52,9 +72,9 @@ def day12_layernorm(
output = torch.zeros_like(input)

def grid(meta):
return (batch_size,)
return (triton.cdiv(feature_size, BLOCK_SIZE),)

# day12_layernorm_kernel[grid](
# input, output, gamma, beta, batch_size, feature_size, eps, BLOCK_SIZE=BLOCK_SIZE
# )
day12_layernorm_kernel[grid](
input, output, gamma, beta, feature_size, eps, BLOCK_SIZE=BLOCK_SIZE
)
return output
Loading
Loading