diff --git a/csrc/bindings.cu b/csrc/bindings.cu
index 73b9ef1..2b785ed 100644
--- a/csrc/bindings.cu
+++ b/csrc/bindings.cu
@@ -16,13 +16,13 @@ extern "C" void day10_conv1d(const float* input, const float* kernel, float* out
 extern "C" void day11_softmax(const float* input, float* output, int feature_size);
 extern "C" void day12_layernorm(const float* input, float* output, const float* gamma, const float* beta, int feature_size, float eps);
 extern "C" void day13_rmsnorm(const float* input, float* output, const float* weight, int feature_size, float eps);
-extern "C" void day14_fused_softmax(const float* input, float* output, const float* mask, int batch_size, int seq_len, int feature_size, float scale);
-extern "C" void day15_fused_attention(const float* Q, const float* K, const float* V, float* output, const float* mask, int batch_size, int num_heads, int seq_len, int head_dim, float scale);
+extern "C" void day14_fused_softmax(const float* input, float* output, const float* mask, int seq_len, int feature_size, float scale);
+extern "C" void day15_fused_attention(const float* Q, const float* K, const float* V, float* output, const float* mask, int num_heads, int seq_len, int head_dim, float scale);
 extern "C" void day16_group_gemm(const float* A, const float* B, float* C, int num_groups, int M, int N, int K);
 extern "C" void day17_persistent_matmul(const float* A, const float* B, float* C, int M, int N, int K);
 extern "C" void day18_block_scaled_matmul(const float* A, const float* B, float* C, int M, int N, int K, float scale);
-extern "C" void day19_rope(const float* query, const float* key, float* rotated_query, float* rotated_key, const float* cos_cache, const float* sin_cache, int batch_size, int num_heads, int seq_len, int head_dim);
-extern "C" void day20_conv2d(const float* input, const float* kernel, float* output, int batch_size, int in_channels, int out_channels, int input_h, int input_w, int kernel_h, int kernel_w, int output_h, int output_w, int pad_h, int pad_w, int stride_h, int stride_w);
+extern "C" void day19_rope(const float* query, const float* key, float* rotated_query, float* rotated_key, const float* cos_cache, const float* sin_cache, int num_heads, int seq_len, int head_dim);
+extern "C" void day20_conv2d(const float* input, const float* kernel, float* output, int in_channels, int out_channels, int input_h, int input_w, int kernel_h, int kernel_w, int output_h, int output_w, int pad_h, int pad_w, int stride_h, int stride_w);
 
 // Helper macro for CUDA error checking
 #define CUDA_CHECK(call) \
@@ -243,11 +243,10 @@ torch::Tensor day13_rmsnorm_wrapper(torch::Tensor input, torch::Tensor weight, f
 torch::Tensor day14_fused_softmax_wrapper(torch::Tensor input, torch::Tensor mask, float scale) {
     TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor");
     TORCH_CHECK(input.dtype() == torch::kFloat32, "Input must be float32");
-    TORCH_CHECK(input.dim() == 3, "Input must be 3D tensor (batch_size, seq_len, feature_size)");
+    TORCH_CHECK(input.dim() == 2, "Input must be 2D tensor (seq_len, feature_size)");
 
-    int batch_size = input.size(0);
-    int seq_len = input.size(1);
-    int feature_size = input.size(2);
+    int seq_len = input.size(0);
+    int feature_size = input.size(1);
 
     // Default mask if not provided (empty tensor)
     const float* mask_ptr = nullptr;
@@ -262,7 +261,6 @@ torch::Tensor day14_fused_softmax_wrapper(torch::Tensor input, torch::Tensor mas
         input.data_ptr<float>(),
         output.data_ptr<float>(),
         mask_ptr,
-        batch_size,
         seq_len,
         feature_size,
         scale
@@ -275,12 +273,11 @@ torch::Tensor day14_fused_softmax_wrapper(torch::Tensor input, torch::Tensor mas
 torch::Tensor day15_fused_attention_wrapper(torch::Tensor Q, torch::Tensor K, torch::Tensor V, torch::Tensor mask, float scale) {
     TORCH_CHECK(Q.is_cuda() && K.is_cuda() && V.is_cuda(), "Inputs must be CUDA tensors");
     TORCH_CHECK(Q.dtype() == torch::kFloat32 && K.dtype() == torch::kFloat32 && V.dtype() == torch::kFloat32, "Inputs must be float32");
-    TORCH_CHECK(Q.dim() == 4, "Q, K, V must be 4D tensors (batch_size, num_heads, seq_len, head_dim)");
+    TORCH_CHECK(Q.dim() == 3, "Q, K, V must be 3D tensors (num_heads, seq_len, head_dim)");
 
-    int batch_size = Q.size(0);
-    int num_heads = Q.size(1);
-    int seq_len = Q.size(2);
-    int head_dim = Q.size(3);
+    int num_heads = Q.size(0);
+    int seq_len = Q.size(1);
+    int head_dim = Q.size(2);
 
     // Default scale if not provided
     if (scale <= 0.0f) {
@@ -302,7 +299,6 @@ torch::Tensor day15_fused_attention_wrapper(torch::Tensor Q, torch::Tensor K, to
         V.data_ptr<float>(),
         output.data_ptr<float>(),
         mask_ptr,
-        batch_size,
         num_heads,
         seq_len,
         head_dim,
@@ -392,12 +388,11 @@ std::tuple<torch::Tensor, torch::Tensor> day19_rope_wrapper(
 ) {
     TORCH_CHECK(query.is_cuda() && key.is_cuda(), "Inputs must be CUDA tensors");
     TORCH_CHECK(query.dtype() == torch::kFloat32 && key.dtype() == torch::kFloat32, "Inputs must be float32");
-    TORCH_CHECK(query.dim() == 4 && key.dim() == 4, "Query and key must be 4D tensors (batch_size, num_heads, seq_len, head_dim)");
+    TORCH_CHECK(query.dim() == 3 && key.dim() == 3, "Query and key must be 3D tensors (num_heads, seq_len, head_dim)");
 
-    int batch_size = query.size(0);
-    int num_heads = query.size(1);
-    int seq_len = query.size(2);
-    int head_dim = query.size(3);
+    int num_heads = query.size(0);
+    int seq_len = query.size(1);
+    int head_dim = query.size(2);
 
     torch::Tensor rotated_query = torch::zeros_like(query);
     torch::Tensor rotated_key = torch::zeros_like(key);
@@ -409,7 +404,7 @@ std::tuple<torch::Tensor, torch::Tensor> day19_rope_wrapper(
         rotated_key.data_ptr<float>(),
         cos_cache.data_ptr<float>(),
         sin_cache.data_ptr<float>(),
-        batch_size, num_heads, seq_len, head_dim
+        num_heads, seq_len, head_dim
     );
 
     return std::make_tuple(rotated_query, rotated_key);
@@ -426,13 +421,12 @@ torch::Tensor day20_conv2d_wrapper(
 ) {
     TORCH_CHECK(input.is_cuda() && kernel.is_cuda(), "Inputs must be CUDA tensors");
     TORCH_CHECK(input.dtype() == torch::kFloat32 && kernel.dtype() == torch::kFloat32, "Inputs must be float32");
-    TORCH_CHECK(input.dim() == 4, "Input must be 4D tensor (batch_size, in_channels, height, width)");
+    TORCH_CHECK(input.dim() == 3, "Input must be 3D tensor (in_channels, height, width)");
     TORCH_CHECK(kernel.dim() == 4, "Kernel must be 4D tensor (out_channels, in_channels, kernel_h, kernel_w)");
 
-    int batch_size = input.size(0);
-    int in_channels = input.size(1);
-    int input_h = input.size(2);
-    int input_w = input.size(3);
+    int in_channels = input.size(0);
+    int input_h = input.size(1);
+    int input_w = input.size(2);
 
     int out_channels = kernel.size(0);
     int kernel_h = kernel.size(2);
@@ -443,13 +437,13 @@ torch::Tensor day20_conv2d_wrapper(
 
     TORCH_CHECK(output_h > 0 && output_w > 0, "Output dimensions must be positive");
 
-    torch::Tensor output = torch::zeros({batch_size, out_channels, output_h, output_w}, input.options());
+    torch::Tensor output = torch::zeros({out_channels, output_h, output_w}, input.options());
 
     day20_conv2d(
         input.data_ptr<float>(),
         kernel.data_ptr<float>(),
         output.data_ptr<float>(),
-        batch_size, in_channels, out_channels,
+        in_channels, out_channels,
         input_h, input_w, kernel_h, kernel_w,
         output_h, output_w,
         pad_h, pad_w, stride_h, stride_w
diff --git a/csrc/day14_fused_softmax.cu b/csrc/day14_fused_softmax.cu
index ca027a2..69af8b4 100644
--- a/csrc/day14_fused_softmax.cu
+++ b/csrc/day14_fused_softmax.cu
@@ -11,28 +11,26 @@
 // 2. 추가 연산 (scale, mask 등)을 같은 커널에서 처리
 // 3. 메모리 접근 최소화
 //
-// 입력: input (batch_size, seq_len, feature_size)
+// 입력: input (seq_len, feature_size) - batch_size는 항상 1
 //      scale (optional) - scaling factor
 //      mask (optional) - attention mask
-// 출력: output (batch_size, seq_len, feature_size)
+// 출력: output (seq_len, feature_size)
 
 __global__ void fused_softmax_kernel(
     const float* input,
     float* output,
     const float* mask,
-    int batch_size,
     int seq_len,
     int feature_size,
     float scale
 ) {
     // TODO: 구현하세요
     // Fused operations: scale -> mask -> softmax
-    int batch_idx = blockIdx.z;
-    int seq_idx = blockIdx.y;
+    int seq_idx = blockIdx.x;
     int feature_idx = threadIdx.x;
 
-    if (batch_idx < batch_size && seq_idx < seq_len && feature_idx < feature_size) {
-        int idx = batch_idx * seq_len * feature_size + seq_idx * feature_size + feature_idx;
+    if (seq_idx < seq_len && feature_idx < feature_size) {
+        int idx = seq_idx * feature_size + feature_idx;
         // TODO: Fused Softmax 계산
         output[idx] = input[idx];
     }
@@ -42,17 +40,17 @@ extern "C" void day14_fused_softmax(
     const float* input,
     float* output,
     const float* mask,
-    int batch_size,
     int seq_len,
     int feature_size,
     float scale
 ) {
     // TODO: kernel launch configuration 설정
+    // batch_size는 항상 1이므로 제거
     dim3 threadsPerBlock(feature_size);
-    dim3 blocksPerGrid(seq_len, batch_size);
+    dim3 blocksPerGrid(seq_len);
 
     fused_softmax_kernel<<<blocksPerGrid, threadsPerBlock>>>(
-        input, output, mask, batch_size, seq_len, feature_size, scale
+        input, output, mask, seq_len, feature_size, scale
     );
     cudaDeviceSynchronize();
 }
diff --git a/csrc/day15_fused_attention.cu b/csrc/day15_fused_attention.cu
index 82a924b..06f3ff7 100644
--- a/csrc/day15_fused_attention.cu
+++ b/csrc/day15_fused_attention.cu
@@ -13,11 +13,11 @@
 //
 // 모든 연산을 하나의 커널로 융합하여 성능 최적화
 //
-// 입력: Q (batch_size, num_heads, seq_len, head_dim)
-//      K (batch_size, num_heads, seq_len, head_dim)
-//      V (batch_size, num_heads, seq_len, head_dim)
+// 입력: Q (num_heads, seq_len, head_dim) - batch_size는 항상 1
+//      K (num_heads, seq_len, head_dim)
+//      V (num_heads, seq_len, head_dim)
 //      mask (optional) - attention mask
-// 출력: output (batch_size, num_heads, seq_len, head_dim)
+// 출력: output (num_heads, seq_len, head_dim)
 
 __global__ void fused_attention_kernel(
     const float* Q,
@@ -25,7 +25,6 @@ __global__ void fused_attention_kernel(
     const float* V,
     float* output,
     const float* mask,
-    int batch_size,
     int num_heads,
     int seq_len,
     int head_dim,
@@ -33,7 +32,6 @@ __global__ void fused_attention_kernel(
 ) {
     // TODO: 구현하세요
     // Fused Attention: QK^T -> scale -> mask -> softmax -> @V
-    int batch_idx = blockIdx.z;
     int head_idx = blockIdx.y;
     int seq_idx = blockIdx.x;
 
@@ -41,10 +39,8 @@ __global__ void fused_attention_kernel(
     // 각 thread는 하나의 head_dim element를 처리할 수 있습니다
     int feature_idx = threadIdx.x;
 
-    if (batch_idx < batch_size && head_idx < num_heads &&
-        seq_idx < seq_len && feature_idx < head_dim) {
-        int idx = batch_idx * num_heads * seq_len * head_dim +
-                  head_idx * seq_len * head_dim +
+    if (head_idx < num_heads && seq_idx < seq_len && feature_idx < head_dim) {
+        int idx = head_idx * seq_len * head_dim +
                   seq_idx * head_dim +
                   feature_idx;
         // TODO: Fused Attention 계산
@@ -58,18 +54,18 @@ extern "C" void day15_fused_attention(
     const float* V,
     float* output,
     const float* mask,
-    int batch_size,
     int num_heads,
     int seq_len,
     int head_dim,
     float scale
 ) {
     // TODO: kernel launch configuration 설정
+    // batch_size는 항상 1이므로 제거
     dim3 threadsPerBlock(head_dim);
-    dim3 blocksPerGrid(seq_len, num_heads, batch_size);
+    dim3 blocksPerGrid(seq_len, num_heads);
 
     fused_attention_kernel<<<blocksPerGrid, threadsPerBlock>>>(
-        Q, K, V, output, mask, batch_size, num_heads, seq_len, head_dim, scale
+        Q, K, V, output, mask, num_heads, seq_len, head_dim, scale
     );
     cudaDeviceSynchronize();
 }
diff --git a/csrc/day19_rope.cu b/csrc/day19_rope.cu
index 7d0d450..0f0d708 100644
--- a/csrc/day19_rope.cu
+++ b/csrc/day19_rope.cu
@@ -10,8 +10,8 @@
 // 2. 위치 정보를 이용한 회전 변환
 // 3. Query와 Key에 각각 적용
 //
-// 입력: query (batch_size, num_heads, seq_len, head_dim)
-//      key (batch_size, num_heads, seq_len, head_dim)
+// 입력: query (num_heads, seq_len, head_dim) - batch_size는 항상 1
+//      key (num_heads, seq_len, head_dim)
 //      cos_cache (seq_len, head_dim / 2)
 //      sin_cache (seq_len, head_dim / 2)
 // 출력: rotated_query, rotated_key
@@ -23,22 +23,18 @@ __global__ void rope_kernel(
     float* rotated_key,
     const float* cos_cache,
     const float* sin_cache,
-    int batch_size,
     int num_heads,
     int seq_len,
     int head_dim
 ) {
     // TODO: 구현하세요
-    int batch_idx = blockIdx.z;
     int head_idx = blockIdx.y;
     int seq_idx = blockIdx.x;
     int dim_idx = threadIdx.x;
 
-    if (batch_idx < batch_size && head_idx < num_heads &&
-        seq_idx < seq_len && dim_idx < head_dim) {
+    if (head_idx < num_heads && seq_idx < seq_len && dim_idx < head_dim) {
         // TODO: RoPE 계산
-        int q_idx = batch_idx * num_heads * seq_len * head_dim +
-                    head_idx * seq_len * head_dim +
+        int q_idx = head_idx * seq_len * head_dim +
                     seq_idx * head_dim +
                     dim_idx;
         rotated_query[q_idx] = query[q_idx];
@@ -53,19 +49,19 @@ extern "C" void day19_rope(
     float* rotated_key,
     const float* cos_cache,
     const float* sin_cache,
-    int batch_size,
     int num_heads,
     int seq_len,
     int head_dim
 ) {
     // TODO: kernel launch configuration 설정
+    // batch_size는 항상 1이므로 제거
     dim3 threadsPerBlock(head_dim);
-    dim3 blocksPerGrid(seq_len, num_heads, batch_size);
+    dim3 blocksPerGrid(seq_len, num_heads);
 
     rope_kernel<<<blocksPerGrid, threadsPerBlock>>>(
         query, key, rotated_query, rotated_key,
         cos_cache, sin_cache,
-        batch_size, num_heads, seq_len, head_dim
+        num_heads, seq_len, head_dim
     );
     cudaDeviceSynchronize();
 }
diff --git a/csrc/day20_conv2d.cu b/csrc/day20_conv2d.cu
index e0a95b7..5c66e6f 100644
--- a/csrc/day20_conv2d.cu
+++ b/csrc/day20_conv2d.cu
@@ -9,16 +9,15 @@
 // 2. 메모리 타일링 최적화
 // 3. Shared memory 활용
 //
-// 입력: input (batch_size, in_channels, height, width)
+// 입력: input (in_channels, height, width) - batch_size는 항상 1
 //      kernel (out_channels, in_channels, kernel_h, kernel_w)
 //      padding, stride
-// 출력: output (batch_size, out_channels, out_height, out_width)
+// 출력: output (out_channels, out_height, out_width)
 
 __global__ void conv2d_kernel(
     const float* input,
     const float* kernel,
     float* output,
-    int batch_size,
     int in_channels,
     int out_channels,
     int input_h,
@@ -34,18 +33,15 @@ __global__ void conv2d_kernel(
 ) {
     // TODO: 구현하세요
     // 2D 컨볼루션 계산
-    int batch_idx = blockIdx.z;
     int out_channel_idx = blockIdx.y;
     int out_row = blockIdx.x / output_w;
     int out_col = blockIdx.x % output_w;
 
     int thread_idx = threadIdx.x;
 
-    if (batch_idx < batch_size && out_channel_idx < out_channels &&
-        out_row < output_h && out_col < output_w) {
+    if (out_channel_idx < out_channels && out_row < output_h && out_col < output_w) {
         // TODO: 2D Convolution 계산
-        int out_idx = batch_idx * out_channels * output_h * output_w +
-                      out_channel_idx * output_h * output_w +
+        int out_idx = out_channel_idx * output_h * output_w +
                       out_row * output_w +
                       out_col;
         output[out_idx] = 0.0f;
@@ -56,7 +52,6 @@ extern "C" void day20_conv2d(
     const float* input,
     const float* kernel,
     float* output,
-    int batch_size,
     int in_channels,
     int out_channels,
     int input_h,
@@ -71,12 +66,13 @@ extern "C" void day20_conv2d(
     int stride_w
 ) {
     // TODO: kernel launch configuration 설정
+    // batch_size는 항상 1이므로 제거
     dim3 threadsPerBlock(256);
-    dim3 blocksPerGrid(output_h * output_w, out_channels, batch_size);
+    dim3 blocksPerGrid(output_h * output_w, out_channels);
 
     conv2d_kernel<<<blocksPerGrid, threadsPerBlock>>>(
         input, kernel, output,
-        batch_size, in_channels, out_channels,
+        in_channels, out_channels,
         input_h, input_w, kernel_h, kernel_w,
         output_h, output_w,
         pad_h, pad_w, stride_h, stride_w
diff --git a/src/gpu_20days/day14_fused_softmax.py b/src/gpu_20days/day14_fused_softmax.py
index 4017539..ae19d28 100644
--- a/src/gpu_20days/day14_fused_softmax.py
+++ b/src/gpu_20days/day14_fused_softmax.py
@@ -12,7 +12,6 @@ def day14_fused_softmax_kernel(
     input_ptr,
     output_ptr,
     mask_ptr,
-    batch_size,
     seq_len,
     feature_size,
     scale,
@@ -29,29 +28,33 @@ def day14_fused_softmax_kernel(
     1. 기본 Softmax 구현
     2. 추가 연산 (scale, mask 등)을 같은 커널에서 처리
     3. 메모리 접근 최소화
+
+    batch_size는 항상 1입니다.
     """
     # TODO: 구현하세요
-    # batch_idx = tl.program_id(0)
-    # seq_idx = tl.program_id(1)
-    # feature_idx = tl.arange(0, BLOCK_SIZE)
-    # mask = feature_idx < feature_size
     pass
 
 
 def day14_fused_softmax(
     input: torch.Tensor, mask: torch.Tensor = None, scale: float = 1.0
 ) -> torch.Tensor:
-    """Day 14: Fused softmax operation"""
+    """Day 14: Fused softmax operation (batch_size is always 1)"""
     # TODO: 구현하세요
     BLOCK_SIZE = 256
-    batch_size, seq_len, feature_size = input.shape
+    # batch_size는 항상 1이므로 입력은 2D
+    if input.dim() != 2:
+        raise ValueError(
+            "day14_fused_softmax expects 2D tensor (seq_len, feature_size), batch_size is always 1"
+        )
+
+    seq_len, feature_size = input.shape
 
     output = torch.zeros_like(input)
 
     def grid(meta):
-        return (batch_size, seq_len)
+        return (triton.cdiv(seq_len, BLOCK_SIZE),)
 
     # day14_fused_softmax_kernel[grid](
-    #     input, output, mask, batch_size, seq_len, feature_size, scale, BLOCK_SIZE=BLOCK_SIZE
+    #     input, output, mask, seq_len, feature_size, scale, BLOCK_SIZE=BLOCK_SIZE
     # )
     return output
diff --git a/src/gpu_20days/day15_fused_attention.py b/src/gpu_20days/day15_fused_attention.py
index 00f527e..4d8f586 100644
--- a/src/gpu_20days/day15_fused_attention.py
+++ b/src/gpu_20days/day15_fused_attention.py
@@ -16,7 +16,6 @@ def day15_fused_attention_kernel(
     V_ptr,
     output_ptr,
     mask_ptr,
-    batch_size,
     num_heads,
     seq_len,
     head_dim,
@@ -36,13 +35,10 @@ def day15_fused_attention_kernel(
     5. @ V (matrix multiplication)
 
     모든 연산을 하나의 커널로 융합하여 성능 최적화
+
+    batch_size는 항상 1입니다.
     """
     # TODO: 구현하세요
-    # batch_idx = tl.program_id(0)
-    # head_idx = tl.program_id(1)
-    # seq_idx = tl.program_id(2)
-    # feature_idx = tl.arange(0, BLOCK_SIZE)
-    # mask = feature_idx < head_dim
     pass
 
 
@@ -53,10 +49,16 @@ def day15_fused_attention(
     mask: Optional[torch.Tensor] = None,
     scale: Optional[float] = None,
 ) -> torch.Tensor:
-    """Day 15: Fused attention mechanism"""
+    """Day 15: Fused attention mechanism (batch_size is always 1)"""
     # TODO: 구현하세요
     BLOCK_SIZE = 256
-    batch_size, num_heads, seq_len, head_dim = Q.shape
+    # batch_size는 항상 1이므로 입력은 3D
+    if Q.dim() != 3:
+        raise ValueError(
+            "day15_fused_attention expects 3D tensor (num_heads, seq_len, head_dim), batch_size is always 1"
+        )
+
+    num_heads, seq_len, head_dim = Q.shape
 
     if scale is None:
         scale = 1.0 / (head_dim**0.5)
@@ -64,9 +66,9 @@ def day15_fused_attention(
     output = torch.zeros_like(Q)
 
     def grid(meta):
-        return (batch_size, num_heads, seq_len)
+        return (num_heads, triton.cdiv(seq_len, BLOCK_SIZE))
 
     # day15_fused_attention_kernel[grid](
-    #     Q, K, V, output, mask, batch_size, num_heads, seq_len, head_dim, scale, BLOCK_SIZE=BLOCK_SIZE
+    #     Q, K, V, output, mask, num_heads, seq_len, head_dim, scale, BLOCK_SIZE=BLOCK_SIZE
     # )
     return output
diff --git a/src/gpu_20days/day19_rope.py b/src/gpu_20days/day19_rope.py
index afb900f..b6c9be4 100644
--- a/src/gpu_20days/day19_rope.py
+++ b/src/gpu_20days/day19_rope.py
@@ -15,7 +15,6 @@ def day19_rope_kernel(
     rotated_key_ptr,
     cos_cache_ptr,
     sin_cache_ptr,
-    batch_size,
     num_heads,
     seq_len,
     head_dim,
@@ -30,34 +29,37 @@ def day19_rope_kernel(
     1. 삼각함수 연산 (sin, cos) 사용
     2. 위치 정보를 이용한 회전 변환
     3. Query와 Key에 각각 적용
+
+    batch_size는 항상 1입니다.
     """
     # TODO: 구현하세요
-    # batch_idx = tl.program_id(0)
-    # head_idx = tl.program_id(1)
-    # seq_idx = tl.program_id(2)
-    # dim_idx = tl.arange(0, BLOCK_SIZE)
-    # mask = dim_idx < head_dim
     pass
 
 
 def day19_rope(
     query: torch.Tensor, key: torch.Tensor, cos_cache: torch.Tensor, sin_cache: torch.Tensor
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    """Day 19: Rotary position embedding"""
+    """Day 19: Rotary position embedding (batch_size is always 1)"""
     # TODO: 구현하세요
     BLOCK_SIZE = 256
-    batch_size, num_heads, seq_len, head_dim = query.shape
+    # batch_size는 항상 1이므로 입력은 3D
+    if query.dim() != 3:
+        raise ValueError(
+            "day19_rope expects 3D tensor (num_heads, seq_len, head_dim), batch_size is always 1"
+        )
+
+    num_heads, seq_len, head_dim = query.shape
 
     rotated_query = torch.zeros_like(query)
     rotated_key = torch.zeros_like(key)
 
     def grid(meta):
-        return (batch_size, num_heads, seq_len)
+        return (num_heads, triton.cdiv(seq_len, BLOCK_SIZE))
 
     # day19_rope_kernel[grid](
     #     query, key, rotated_query, rotated_key,
     #     cos_cache, sin_cache,
-    #     batch_size, num_heads, seq_len, head_dim,
+    #     num_heads, seq_len, head_dim,
     #     BLOCK_SIZE=BLOCK_SIZE
     # )
     return rotated_query, rotated_key
diff --git a/src/gpu_20days/day20_conv2d.py b/src/gpu_20days/day20_conv2d.py
index 4436e8b..ff3fed5 100644
--- a/src/gpu_20days/day20_conv2d.py
+++ b/src/gpu_20days/day20_conv2d.py
@@ -12,7 +12,6 @@ def day20_conv2d_kernel(
     input_ptr,
     kernel_ptr,
     output_ptr,
-    batch_size,
     in_channels,
     out_channels,
     input_h,
@@ -36,12 +35,10 @@ def day20_conv2d_kernel(
     1. 2D 슬라이딩 윈도우 패턴
     2. 메모리 타일링 최적화
     3. Shared memory 활용
+
+    batch_size는 항상 1입니다.
     """
     # TODO: 구현하세요
-    # batch_idx = tl.program_id(0)
-    # out_channel_idx = tl.program_id(1)
-    # out_row = tl.program_id(2) // output_w
-    # out_col = tl.program_id(2) % output_w
     pass
 
 
@@ -51,10 +48,16 @@ def day20_conv2d(
     padding: tuple[int, int] = (0, 0),
     stride: tuple[int, int] = (1, 1),
 ) -> torch.Tensor:
-    """Day 20: Two-dimensional convolution"""
+    """Day 20: Two-dimensional convolution (batch_size is always 1)"""
     # TODO: 구현하세요
     BLOCK_SIZE = 256
-    batch_size, in_channels, input_h, input_w = input.shape
+    # batch_size는 항상 1이므로 입력은 3D
+    if input.dim() != 3:
+        raise ValueError(
+            "day20_conv2d expects 3D tensor (in_channels, height, width), batch_size is always 1"
+        )
+
+    in_channels, input_h, input_w = input.shape
     out_channels, _, kernel_h, kernel_w = kernel.shape
 
     pad_h, pad_w = padding
@@ -63,16 +66,14 @@ def day20_conv2d(
     output_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
     output_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1
 
-    output = torch.zeros(
-        batch_size, out_channels, output_h, output_w, device=input.device, dtype=input.dtype
-    )
+    output = torch.zeros(out_channels, output_h, output_w, device=input.device, dtype=input.dtype)
 
     def grid(meta):
-        return (batch_size, out_channels, output_h * output_w)
+        return (out_channels, triton.cdiv(output_h * output_w, BLOCK_SIZE))
 
     # day20_conv2d_kernel[grid](
     #     input, kernel, output,
-    #     batch_size, in_channels, out_channels,
+    #     in_channels, out_channels,
     #     input_h, input_w, kernel_h, kernel_w,
     #     output_h, output_w,
     #     pad_h, pad_w, stride_h, stride_w,
diff --git a/tests/test_day14.py b/tests/test_day14.py
index 3157e26..9efcafc 100644
--- a/tests/test_day14.py
+++ b/tests/test_day14.py
@@ -15,16 +15,16 @@
 
 from conftest import benchmark_kernel_vs_pytorch, compare_kernel_with_pytorch, ensure_cuda_device
 
-# Test cases: (batch_size, seq_len, feature_size, description)
+# Test cases: (seq_len, feature_size, description) - batch_size is always 1
 FUSED_SOFTMAX_TEST_CASES = [
-    (1, 10, 32, "small_1x10x32"),
-    (4, 32, 64, "medium_4x32x64"),
-    (8, 128, 128, "medium_8x128x128"),
+    (10, 32, "small_10x32"),
+    (32, 64, "medium_32x64"),
+    (128, 128, "medium_128x128"),
 ]
 
 
-@pytest.mark.parametrize("batch_size,seq_len,feature_size,description", FUSED_SOFTMAX_TEST_CASES)
-def test_fused_softmax_triton(batch_size, seq_len, feature_size, description):
+@pytest.mark.parametrize("seq_len,feature_size,description", FUSED_SOFTMAX_TEST_CASES)
+def test_fused_softmax_triton(seq_len, feature_size, description):
     """Test Triton Fused Softmax"""
     try:
         from gpu_20days.day14_fused_softmax import day14_fused_softmax
@@ -33,12 +33,9 @@ def test_fused_softmax_triton(batch_size, seq_len, feature_size, description):
 
     device = ensure_cuda_device()
 
-    print(
-        f"Testing Triton Fused Softmax with shape ({batch_size}, {seq_len}, {feature_size}) ({description})..."
-    )
-    input_tensor = torch.randn(
-        batch_size, seq_len, feature_size, device=device, dtype=torch.float32
-    )
+    print(f"Testing Triton Fused Softmax with shape ({seq_len}, {feature_size}) ({description})...")
+    # batch_size is always 1, so input is 2D
+    input_tensor = torch.randn(seq_len, feature_size, device=device, dtype=torch.float32)
 
     output = day14_fused_softmax(input_tensor)
     expected = torch.softmax(input_tensor, dim=-1)
@@ -46,8 +43,8 @@ def test_fused_softmax_triton(batch_size, seq_len, feature_size, description):
     torch.testing.assert_close(output, expected, rtol=1e-4, atol=1e-5)
 
 
-@pytest.mark.parametrize("batch_size,seq_len,feature_size,description", FUSED_SOFTMAX_TEST_CASES)
-def test_fused_softmax_cuda(batch_size, seq_len, feature_size, description):
+@pytest.mark.parametrize("seq_len,feature_size,description", FUSED_SOFTMAX_TEST_CASES)
+def test_fused_softmax_cuda(seq_len, feature_size, description):
     """Test CUDA Fused Softmax"""
     try:
         from gpu_20days.cuda_kernels import day14_fused_softmax
@@ -56,12 +53,9 @@ def test_fused_softmax_cuda(batch_size, seq_len, feature_size, description):
 
     device = ensure_cuda_device()
 
-    print(
-        f"Testing CUDA Fused Softmax with shape ({batch_size}, {seq_len}, {feature_size}) ({description})..."
-    )
-    input_tensor = torch.randn(
-        batch_size, seq_len, feature_size, device=device, dtype=torch.float32
-    )
+    print(f"Testing CUDA Fused Softmax with shape ({seq_len}, {feature_size}) ({description})...")
+    # batch_size is always 1, so input is 2D
+    input_tensor = torch.randn(seq_len, feature_size, device=device, dtype=torch.float32)
 
     output = day14_fused_softmax(input_tensor)
     expected = torch.softmax(input_tensor, dim=-1)
diff --git a/tests/test_day15.py b/tests/test_day15.py
index afed1e8..a677a9a 100644
--- a/tests/test_day15.py
+++ b/tests/test_day15.py
@@ -15,16 +15,16 @@
 
 from conftest import benchmark_kernel_vs_pytorch, compare_kernel_with_pytorch, ensure_cuda_device
 
-# Test cases: (batch_size, num_heads, seq_len, head_dim, description)
+# Test cases: (num_heads, seq_len, head_dim, description) - batch_size is always 1
 ATTENTION_TEST_CASES = [
-    (1, 2, 8, 32, "small_1x2x8x32"),
-    (2, 4, 16, 64, "medium_2x4x16x64"),
-    (4, 8, 32, 128, "medium_4x8x32x128"),
+    (2, 8, 32, "small_2x8x32"),
+    (4, 16, 64, "medium_4x16x64"),
+    (8, 32, 128, "medium_8x32x128"),
 ]
 
 
-@pytest.mark.parametrize("batch_size,num_heads,seq_len,head_dim,description", ATTENTION_TEST_CASES)
-def test_fused_attention_triton(batch_size, num_heads, seq_len, head_dim, description):
+@pytest.mark.parametrize("num_heads,seq_len,head_dim,description", ATTENTION_TEST_CASES)
+def test_fused_attention_triton(num_heads, seq_len, head_dim, description):
     """Test Triton Fused Attention"""
     try:
         from gpu_20days.day15_fused_attention import day15_fused_attention
@@ -34,11 +34,12 @@ def test_fused_attention_triton(batch_size, num_heads, seq_len, head_dim, descri
     device = ensure_cuda_device()
 
     print(
-        f"Testing Triton Fused Attention with shape ({batch_size}, {num_heads}, {seq_len}, {head_dim}) ({description})..."
+        f"Testing Triton Fused Attention with shape ({num_heads}, {seq_len}, {head_dim}) ({description})..."
     )
-    Q = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
-    K = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
-    V = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
+    # batch_size is always 1, so input is 3D
+    Q = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
+    K = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
+    V = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
 
     output = day15_fused_attention(Q, K, V)
     # Reference: scaled dot-product attention
@@ -50,8 +51,8 @@ def test_fused_attention_triton(batch_size, num_heads, seq_len, head_dim, descri
     torch.testing.assert_close(output, expected, rtol=1e-4, atol=1e-5)
 
 
-@pytest.mark.parametrize("batch_size,num_heads,seq_len,head_dim,description", ATTENTION_TEST_CASES)
-def test_fused_attention_cuda(batch_size, num_heads, seq_len, head_dim, description):
+@pytest.mark.parametrize("num_heads,seq_len,head_dim,description", ATTENTION_TEST_CASES)
+def test_fused_attention_cuda(num_heads, seq_len, head_dim, description):
     """Test CUDA Fused Attention"""
     try:
         from gpu_20days.cuda_kernels import day15_fused_attention
@@ -61,11 +62,12 @@ def test_fused_attention_cuda(batch_size, num_heads, seq_len, head_dim, descript
     device = ensure_cuda_device()
 
     print(
-        f"Testing CUDA Fused Attention with shape ({batch_size}, {num_heads}, {seq_len}, {head_dim}) ({description})..."
+        f"Testing CUDA Fused Attention with shape ({num_heads}, {seq_len}, {head_dim}) ({description})..."
     )
-    Q = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
-    K = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
-    V = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
+    # batch_size is always 1, so input is 3D
+    Q = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
+    K = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
+    V = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
 
     output = day15_fused_attention(Q, K, V)
     scale = 1.0 / (head_dim**0.5)
diff --git a/tests/test_day19.py b/tests/test_day19.py
index fb7d392..d455e92 100644
--- a/tests/test_day19.py
+++ b/tests/test_day19.py
@@ -16,10 +16,10 @@
 
 from conftest import benchmark_kernel_vs_pytorch, compare_kernel_with_pytorch, ensure_cuda_device
 
-# Test cases: (batch_size, num_heads, seq_len, head_dim, description)
+# Test cases: (num_heads, seq_len, head_dim, description) - batch_size is always 1
 ROPE_TEST_CASES = [
-    (1, 2, 8, 32, "small_1x2x8x32"),
-    (2, 4, 16, 64, "medium_2x4x16x64"),
+    (2, 8, 32, "small_2x8x32"),
+    (4, 16, 64, "medium_4x16x64"),
 ]
 
 
@@ -33,8 +33,8 @@ def create_rope_cache(seq_len, head_dim, device):
     return cos_cache, sin_cache
 
 
-@pytest.mark.parametrize("batch_size,num_heads,seq_len,head_dim,description", ROPE_TEST_CASES)
-def test_rope_triton(batch_size, num_heads, seq_len, head_dim, description):
+@pytest.mark.parametrize("num_heads,seq_len,head_dim,description", ROPE_TEST_CASES)
+def test_rope_triton(num_heads, seq_len, head_dim, description):
     """Test Triton RoPE"""
     try:
         from gpu_20days.day19_rope import day19_rope
@@ -43,13 +43,10 @@ def test_rope_triton(batch_size, num_heads, seq_len, head_dim, description):
 
     device = ensure_cuda_device()
 
-    print(
-        f"Testing Triton RoPE with shape ({batch_size}, {num_heads}, {seq_len}, {head_dim}) ({description})..."
-    )
-    query = torch.randn(
-        batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32
-    )
-    key = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
+    print(f"Testing Triton RoPE with shape ({num_heads}, {seq_len}, {head_dim}) ({description})...")
+    # batch_size is always 1, so input is 3D
+    query = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
+    key = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
     cos_cache, sin_cache = create_rope_cache(seq_len, head_dim // 2, device)
 
     rotated_query, rotated_key = day19_rope(query, key, cos_cache, sin_cache)
@@ -59,8 +56,8 @@ def test_rope_triton(batch_size, num_heads, seq_len, head_dim, description):
     assert rotated_key.shape == key.shape
 
 
-@pytest.mark.parametrize("batch_size,num_heads,seq_len,head_dim,description", ROPE_TEST_CASES)
-def test_rope_cuda(batch_size, num_heads, seq_len, head_dim, description):
+@pytest.mark.parametrize("num_heads,seq_len,head_dim,description", ROPE_TEST_CASES)
+def test_rope_cuda(num_heads, seq_len, head_dim, description):
     """Test CUDA RoPE"""
     try:
         from gpu_20days.cuda_kernels import day19_rope
@@ -69,13 +66,10 @@ def test_rope_cuda(batch_size, num_heads, seq_len, head_dim, description):
 
     device = ensure_cuda_device()
 
-    print(
-        f"Testing CUDA RoPE with shape ({batch_size}, {num_heads}, {seq_len}, {head_dim}) ({description})..."
-    )
-    query = torch.randn(
-        batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32
-    )
-    key = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
+    print(f"Testing CUDA RoPE with shape ({num_heads}, {seq_len}, {head_dim}) ({description})...")
+    # batch_size is always 1, so input is 3D
+    query = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
+    key = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32)
     cos_cache, sin_cache = create_rope_cache(seq_len, head_dim // 2, device)
 
     rotated_query, rotated_key = day19_rope(query, key, cos_cache, sin_cache)
diff --git a/tests/test_day20.py b/tests/test_day20.py
index 4844eb0..aa17463 100644
--- a/tests/test_day20.py
+++ b/tests/test_day20.py
@@ -15,50 +15,48 @@
 
 from conftest import benchmark_kernel_vs_pytorch, compare_kernel_with_pytorch, ensure_cuda_device
 
-# Test cases: (batch_size, in_channels, out_channels, height, width, kernel_size, description)
+# Test cases: (in_channels, out_channels, height, width, kernel_size, description) - batch_size is always 1
 CONV2D_TEST_CASES = [
-    (1, 1, 1, 8, 8, 3, "small_1x1x1_8x8_k3"),
-    (2, 3, 4, 16, 16, 3, "medium_2x3x4_16x16_k3"),
-    (4, 8, 16, 32, 32, 5, "medium_4x8x16_32x32_k5"),
+    (1, 1, 8, 8, 3, "small_1x1_8x8_k3"),
+    (3, 4, 16, 16, 3, "medium_3x4_16x16_k3"),
+    (8, 16, 32, 32, 5, "medium_8x16_32x32_k5"),
 ]
 
 
 @pytest.mark.parametrize(
-    "batch_size,in_channels,out_channels,height,width,kernel_size,description", CONV2D_TEST_CASES
+    "in_channels,out_channels,height,width,kernel_size,description", CONV2D_TEST_CASES
 )
-def test_conv2d_triton(
-    batch_size, in_channels, out_channels, height, width, kernel_size, description
-):
+def test_conv2d_triton(in_channels, out_channels, height, width, kernel_size, description):
     """Test Triton 2D Convolution"""
     try:
-        from gpu_20days.day20_conv import day20_conv2d
+        from gpu_20days.day20_conv2d import day20_conv2d
     except ImportError:
         pytest.skip("gpu_20days package not available")
 
     device = ensure_cuda_device()
 
     print(
-        f"Testing Triton 2D Conv with shape ({batch_size}, {in_channels}, {height}, {width}), kernel={kernel_size} ({description})..."
-    )
-    input_tensor = torch.randn(
-        batch_size, in_channels, height, width, device=device, dtype=torch.float32
+        f"Testing Triton 2D Conv with shape ({in_channels}, {height}, {width}), kernel={kernel_size} ({description})..."
     )
+    # batch_size is always 1, so input is 3D
+    input_tensor = torch.randn(in_channels, height, width, device=device, dtype=torch.float32)
     kernel = torch.randn(
         out_channels, in_channels, kernel_size, kernel_size, device=device, dtype=torch.float32
     )
 
     output = day20_conv2d(input_tensor, kernel, padding=(1, 1), stride=(1, 1))
-    expected = torch.nn.functional.conv2d(input_tensor, kernel, padding=(1, 1), stride=(1, 1))
+    # Add batch dimension for PyTorch reference
+    input_with_batch = input_tensor.unsqueeze(0)
+    expected = torch.nn.functional.conv2d(input_with_batch, kernel, padding=(1, 1), stride=(1, 1))
+    expected = expected.squeeze(0)  # Remove batch dimension
 
     torch.testing.assert_close(output, expected, rtol=1e-4, atol=1e-5)
 
 
 @pytest.mark.parametrize(
-    "batch_size,in_channels,out_channels,height,width,kernel_size,description", CONV2D_TEST_CASES
+    "in_channels,out_channels,height,width,kernel_size,description", CONV2D_TEST_CASES
 )
-def test_conv2d_cuda(
-    batch_size, in_channels, out_channels, height, width, kernel_size, description
-):
+def test_conv2d_cuda(in_channels, out_channels, height, width, kernel_size, description):
     """Test CUDA 2D Convolution"""
     try:
         from gpu_20days.cuda_kernels import day20_conv2d
@@ -68,16 +66,18 @@ def test_conv2d_cuda(
     device = ensure_cuda_device()
 
     print(
-        f"Testing CUDA 2D Conv with shape ({batch_size}, {in_channels}, {height}, {width}), kernel={kernel_size} ({description})..."
-    )
-    input_tensor = torch.randn(
-        batch_size, in_channels, height, width, device=device, dtype=torch.float32
+        f"Testing CUDA 2D Conv with shape ({in_channels}, {height}, {width}), kernel={kernel_size} ({description})..."
     )
+    # batch_size is always 1, so input is 3D
+    input_tensor = torch.randn(in_channels, height, width, device=device, dtype=torch.float32)
     kernel = torch.randn(
         out_channels, in_channels, kernel_size, kernel_size, device=device, dtype=torch.float32
     )
 
     output = day20_conv2d(input_tensor, kernel, padding=(1, 1), stride=(1, 1))
-    expected = torch.nn.functional.conv2d(input_tensor, kernel, padding=(1, 1), stride=(1, 1))
+    # Add batch dimension for PyTorch reference
+    input_with_batch = input_tensor.unsqueeze(0)
+    expected = torch.nn.functional.conv2d(input_with_batch, kernel, padding=(1, 1), stride=(1, 1))
+    expected = expected.squeeze(0)  # Remove batch dimension
 
     torch.testing.assert_close(output, expected, rtol=1e-4, atol=1e-5)