Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 3 additions & 16 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,33 +11,20 @@ concurrency:
jobs:
pre-commit:
runs-on: ubuntu-latest
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
token: ${{ secrets.GH_PAT }}

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'

- name: Install uv
uses: astral-sh/setup-uv@v4
with:
version: "latest"

- name: Install dependencies
run: |
uv sync --extra dev

- name: Run mypy
- name: Install pre-commit
run: |
uv run mypy src/gpu_20days --ignore-missing-imports || true
pip install pre-commit

- name: Run pre-commit hooks
run: |
uv run pre-commit run --from-ref=origin/${{ github.base_ref }} --to-ref=HEAD || true
pre-commit run --from-ref=origin/${{ github.base_ref }} --to-ref=HEAD
16 changes: 6 additions & 10 deletions csrc/bindings.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ extern "C" void day08_relu(const float* input, float* output, int N);
extern "C" void day09_silu(const float* input, float* output, int N);
extern "C" void day10_conv1d(const float* input, const float* kernel, float* output, int input_size, int kernel_size);
extern "C" void day11_softmax(const float* input, float* output, int feature_size);
extern "C" void day12_layernorm(const float* input, float* output, const float* gamma, const float* beta, int batch_size, int feature_size, float eps);
extern "C" void day13_rmsnorm(const float* input, float* output, const float* weight, int batch_size, int feature_size, float eps);
extern "C" void day12_layernorm(const float* input, float* output, const float* gamma, const float* beta, int feature_size, float eps);
extern "C" void day13_rmsnorm(const float* input, float* output, const float* weight, int feature_size, float eps);
extern "C" void day14_fused_softmax(const float* input, float* output, const float* mask, int batch_size, int seq_len, int feature_size, float scale);
extern "C" void day15_fused_attention(const float* Q, const float* K, const float* V, float* output, const float* mask, int batch_size, int num_heads, int seq_len, int head_dim, float scale);
extern "C" void day16_group_gemm(const float* A, const float* B, float* C, int num_groups, int M, int N, int K);
Expand Down Expand Up @@ -187,10 +187,9 @@ torch::Tensor day11_softmax_wrapper(torch::Tensor input) {
torch::Tensor day12_layernorm_wrapper(torch::Tensor input, torch::Tensor gamma, torch::Tensor beta, float eps) {
TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor");
TORCH_CHECK(input.dtype() == torch::kFloat32, "Input must be float32");
TORCH_CHECK(input.dim() == 2, "Input must be 2D tensor (batch_size, feature_size)");
TORCH_CHECK(input.dim() == 1, "Input must be 1D tensor (feature_size)");

int batch_size = input.size(0);
int feature_size = input.size(1);
int feature_size = input.size(0);

// Default gamma and beta if not provided
if (gamma.numel() == 0) {
Expand All @@ -207,7 +206,6 @@ torch::Tensor day12_layernorm_wrapper(torch::Tensor input, torch::Tensor gamma,
output.data_ptr<float>(),
gamma.data_ptr<float>(),
beta.data_ptr<float>(),
batch_size,
feature_size,
eps
);
Expand All @@ -219,10 +217,9 @@ torch::Tensor day12_layernorm_wrapper(torch::Tensor input, torch::Tensor gamma,
torch::Tensor day13_rmsnorm_wrapper(torch::Tensor input, torch::Tensor weight, float eps) {
TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor");
TORCH_CHECK(input.dtype() == torch::kFloat32, "Input must be float32");
TORCH_CHECK(input.dim() == 2, "Input must be 2D tensor (batch_size, feature_size)");
TORCH_CHECK(input.dim() == 1, "Input must be 1D tensor (feature_size)");

int batch_size = input.size(0);
int feature_size = input.size(1);
int feature_size = input.size(0);

// Default weight if not provided
if (weight.numel() == 0) {
Expand All @@ -235,7 +232,6 @@ torch::Tensor day13_rmsnorm_wrapper(torch::Tensor input, torch::Tensor weight, f
input.data_ptr<float>(),
output.data_ptr<float>(),
weight.data_ptr<float>(),
batch_size,
feature_size,
eps
);
Expand Down
43 changes: 17 additions & 26 deletions csrc/day12_layernorm.cu
Original file line number Diff line number Diff line change
@@ -1,55 +1,46 @@
#include <cuda_runtime.h>
#define ceil(x, y) (((x) + (y) - 1) / (y))

// TODO: LayerNorm kernel 구현
// LayerNorm(x) = gamma * (x - mean) / sqrt(variance + eps) + beta
//
// 힌트:
// 1. 각 row의 평균을 계산합니다 (reduction)
// 2. 각 row의 분산을 계산합니다 (reduction)
// 3. 정규화: (x - mean) / sqrt(variance + eps)
// 4. affine transformation: gamma * normalized + beta
//
// 입력: input (batch_size, feature_size)
// gamma (feature_size) - optional, 없으면 1.0 사용
// beta (feature_size) - optional, 없으면 0.0 사용
// eps - 작은 상수 (기본값 1e-5)
// 출력: output (batch_size, feature_size)

// LayerNorm over a single 1-D vector (batch_size is always 1 in this project):
//   output[i] = gamma[i] * (input[i] - mean) / sqrt(variance + eps) + beta[i]
//
// Launch: 1-D grid, one thread per feature element; threads past
// feature_size exit via the bounds check. All pointers are device memory.
//
// NOTE(review): every thread redoes the full mean/variance loops over the
// whole vector, so total work is O(feature_size^2). A shared-memory or
// warp-shuffle reduction would make it O(feature_size); kept serial here
// for clarity.
__global__ void layernorm_kernel(
    const float* input,
    float* output,
    const float* gamma,
    const float* beta,
    int feature_size,
    float eps
) {
    int feature_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (feature_idx >= feature_size) return;

    // Mean of the whole vector.
    float mean = 0.0f;
    for (int i = 0; i < feature_size; i++) {
        mean += input[i];
    }
    mean /= feature_size;

    // Variance with divisor N (biased estimator).
    float variance = 0.0f;
    for (int i = 0; i < feature_size; i++) {
        float d = input[i] - mean;
        variance += d * d;
    }
    variance /= feature_size;

    // sqrtf keeps the computation in float; bare sqrt promotes to double.
    output[feature_idx] = gamma[feature_idx] * (input[feature_idx] - mean)
                          / sqrtf(variance + eps) + beta[feature_idx];
}

// Host launcher for layernorm_kernel (batch_size is always 1, so only the
// feature dimension is passed). All pointers must point to device memory.
// Synchronous: blocks until the kernel finishes.
//
// NOTE(review): no cudaGetLastError()/return-code check after the launch —
// failures are silent. Worth adding an error-reporting path.
extern "C" void day12_layernorm(
    const float* input,
    float* output,
    const float* gamma,
    const float* beta,
    int feature_size,
    float eps
) {
    const int kBlockSize = 256;
    // Ceil-division so a final partial block covers the tail of the vector.
    // Written inline instead of the file's ceil(x, y) macro, which shadows
    // the standard math ceil().
    const int numBlocks = (feature_size + kBlockSize - 1) / kBlockSize;

    layernorm_kernel<<<numBlocks, kBlockSize>>>(
        input, output, gamma, beta, feature_size, eps
    );
    cudaDeviceSynchronize();
}
35 changes: 13 additions & 22 deletions csrc/day13_rmsnorm.cu
Original file line number Diff line number Diff line change
@@ -1,52 +1,43 @@
#include <cuda_runtime.h>
#include <cmath>
#define BLOCKSIZE 256
#define ceil(x, y) (((x) + (y) - 1) / (y))

// TODO: RMS Normalization kernel 구현
// RMSNorm(x) = (x / sqrt(mean(x^2) + eps)) * weight
//
// 힌트:
// 1. 각 row의 x^2의 평균을 계산합니다 (RMS: Root Mean Square)
// 2. x / sqrt(mean(x^2) + eps) 계산
// 3. weight를 곱합니다
//
// 입력: input (batch_size, feature_size)
// weight (feature_size) - optional, 없으면 1.0 사용
// eps - 작은 상수 (기본값 1e-5)
// 출력: output (batch_size, feature_size)

// RMSNorm over a single 1-D vector (batch_size is always 1 in this project):
//   output[i] = input[i] / sqrt(mean(input^2) + eps) * weight[i]
//
// Launch: 1-D grid, one thread per feature element; threads past
// feature_size exit via the bounds check. All pointers are device memory.
//
// NOTE(review): every thread recomputes the full mean-of-squares loop, so
// total work is O(feature_size^2); a block-level reduction would make it
// O(feature_size).
__global__ void rmsnorm_kernel(
    const float* input,
    float* output,
    const float* weight,
    int feature_size,
    float eps
) {
    int feature_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (feature_idx >= feature_size) return;

    // Mean of squares (the "RMS" denominator before the sqrt).
    float mean_sq = 0.0f;
    for (int i = 0; i < feature_size; i++) {
        mean_sq += input[i] * input[i];
    }
    mean_sq /= feature_size;

    // sqrtf keeps the computation in float; bare sqrt promotes to double.
    output[feature_idx] = input[feature_idx] / sqrtf(mean_sq + eps)
                          * weight[feature_idx];
}

// Host launcher for rmsnorm_kernel (batch_size is always 1, so only the
// feature dimension is passed). All pointers must point to device memory.
// Synchronous: blocks until the kernel finishes.
//
// NOTE(review): no cudaGetLastError()/return-code check after the launch —
// failures are silent. Worth adding an error-reporting path.
extern "C" void day13_rmsnorm(
    const float* input,
    float* output,
    const float* weight,
    int feature_size,
    float eps
) {
    const int kBlockSize = 256;
    // Ceil-division so a final partial block covers the tail of the vector.
    // Written inline instead of the file's ceil(x, y) macro, which shadows
    // the standard math ceil().
    const int numBlocks = (feature_size + kBlockSize - 1) / kBlockSize;

    rmsnorm_kernel<<<numBlocks, kBlockSize>>>(
        input, output, weight, feature_size, eps
    );
    cudaDeviceSynchronize();
}
50 changes: 0 additions & 50 deletions src/gpu_20days/__init__.py

This file was deleted.

2 changes: 1 addition & 1 deletion src/gpu_20days/cuda_kernels.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def day11_softmax(input: torch.Tensor) -> torch.Tensor:
def day12_layernorm(
input: torch.Tensor, gamma: torch.Tensor = None, beta: torch.Tensor = None, eps: float = 1e-5
) -> torch.Tensor:
"""Day 12: Layer normalization"""
"""Day 12: Layer normalization (batch_size is always 1)"""
_check_cuda()
assert cuda_ops is not None # Type guard for mypy
if gamma is None:
Expand Down
56 changes: 38 additions & 18 deletions src/gpu_20days/day12_layernorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ def day12_layernorm_kernel(
output_ptr,
gamma_ptr,
beta_ptr,
batch_size,
feature_size,
eps,
BLOCK_SIZE: tl.constexpr,
Expand All @@ -23,26 +22,47 @@ def day12_layernorm_kernel(

LayerNorm(x) = gamma * (x - mean) / sqrt(variance + eps) + beta

힌트:
1. 각 row의 평균을 계산합니다 (reduction)
2. 각 row의 분산을 계산합니다 (reduction)
3. 정규화: (x - mean) / sqrt(variance + eps)
4. affine transformation: gamma * normalized + beta
"""
# TODO: 구현하세요
# batch_idx = tl.program_id(0)
# feature_idx = tl.arange(0, BLOCK_SIZE)
# mask = feature_idx < feature_size
pass

pid = tl.program_id(0)
feature_idx = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
mask = feature_idx < feature_size

# Compute mean
_mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
for off in range(0, feature_size, BLOCK_SIZE):
cols = off + tl.arange(0, BLOCK_SIZE)
mask_cols = cols < feature_size
a = tl.load(input_ptr + cols, mask=mask_cols, other=0.0).to(tl.float32)
_mean += tl.where(mask_cols, a, 0.0)
mean = tl.sum(_mean, axis=0) / feature_size

# Compute variance
_variance = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
for off in range(0, feature_size, BLOCK_SIZE):
cols = off + tl.arange(0, BLOCK_SIZE)
mask_cols = cols < feature_size
a = tl.load(input_ptr + cols, mask=mask_cols, other=0.0).to(tl.float32)
_variance += tl.where(mask_cols, (a - mean) * (a - mean), 0.0)
variance = tl.sum(_variance, axis=0) / feature_size

input_vals = tl.load(input_ptr + feature_idx, mask=mask, other=0.0)
gamma_vals = tl.load(gamma_ptr + feature_idx, mask=mask, other=1.0)
beta_vals = tl.load(beta_ptr + feature_idx, mask=mask, other=0.0)
normalized = (input_vals - mean) / tl.sqrt(variance + eps)
output = gamma_vals * normalized + beta_vals
tl.store(output_ptr + feature_idx, output, mask=mask)


def day12_layernorm(
input: torch.Tensor, gamma: torch.Tensor = None, beta: torch.Tensor = None, eps: float = 1e-5
) -> torch.Tensor:
"""Day 12: Layer normalization"""
# TODO: 구현하세요
"""Day 12: Layer normalization (batch_size is always 1)"""
BLOCK_SIZE = 256
batch_size, feature_size = input.shape
if input.dim() != 1:
raise ValueError("day12_layernorm expects 1D tensor (feature_size), batch_size is always 1")

feature_size = input.size(0)

if gamma is None:
gamma = torch.ones(feature_size, device=input.device, dtype=input.dtype)
Expand All @@ -52,9 +72,9 @@ def day12_layernorm(
output = torch.zeros_like(input)

def grid(meta):
return (batch_size,)
return (triton.cdiv(feature_size, BLOCK_SIZE),)

# day12_layernorm_kernel[grid](
# input, output, gamma, beta, batch_size, feature_size, eps, BLOCK_SIZE=BLOCK_SIZE
# )
day12_layernorm_kernel[grid](
input, output, gamma, beta, feature_size, eps, BLOCK_SIZE=BLOCK_SIZE
)
return output
Loading
Loading