opus24 · opus24 · Jan 20, 2026 · Jan 20, 2026 · Jan 20, 2026
diff --git a/csrc/bindings.cu b/csrc/bindings.cu
@@ -16,13 +16,13 @@ extern "C" void day10_conv1d(const float* input, const float* kernel, float* out
 extern "C" void day11_softmax(const float* input, float* output, int feature_size);
 extern "C" void day12_layernorm(const float* input, float* output, const float* gamma, const float* beta, int feature_size, float eps);
 extern "C" void day13_rmsnorm(const float* input, float* output, const float* weight, int feature_size, float eps);
-extern "C" void day14_fused_softmax(const float* input, float* output, const float* mask, int batch_size, int seq_len, int feature_size, float scale);
-extern "C" void day15_fused_attention(const float* Q, const float* K, const float* V, float* output, const float* mask, int batch_size, int num_heads, int seq_len, int head_dim, float scale);
+extern "C" void day14_fused_softmax(const float* input, float* output, const float* mask, int seq_len, int feature_size, float scale);
+extern "C" void day15_fused_attention(const float* Q, const float* K, const float* V, float* output, const float* mask, int num_heads, int seq_len, int head_dim, float scale);
 extern "C" void day16_group_gemm(const float* A, const float* B, float* C, int num_groups, int M, int N, int K);
 extern "C" void day17_persistent_matmul(const float* A, const float* B, float* C, int M, int N, int K);
 extern "C" void day18_block_scaled_matmul(const float* A, const float* B, float* C, int M, int N, int K, float scale);
-extern "C" void day19_rope(const float* query, const float* key, float* rotated_query, float* rotated_key, const float* cos_cache, const float* sin_cache, int batch_size, int num_heads, int seq_len, int head_dim);
-extern "C" void day20_conv2d(const float* input, const float* kernel, float* output, int batch_size, int in_channels, int out_channels, int input_h, int input_w, int kernel_h, int kernel_w, int output_h, int output_w, int pad_h, int pad_w, int stride_h, int stride_w);
+extern "C" void day19_rope(const float* query, const float* key, float* rotated_query, float* rotated_key, const float* cos_cache, const float* sin_cache, int num_heads, int seq_len, int head_dim);
+extern "C" void day20_conv2d(const float* input, const float* kernel, float* output, int in_channels, int out_channels, int input_h, int input_w, int kernel_h, int kernel_w, int output_h, int output_w, int pad_h, int pad_w, int stride_h, int stride_w);
 
 // Helper macro for CUDA error checking
 #define CUDA_CHECK(call) \
@@ -243,11 +243,10 @@ torch::Tensor day13_rmsnorm_wrapper(torch::Tensor input, torch::Tensor weight, f
 torch::Tensor day14_fused_softmax_wrapper(torch::Tensor input, torch::Tensor mask, float scale) {
     TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor");
     TORCH_CHECK(input.dtype() == torch::kFloat32, "Input must be float32");
-    TORCH_CHECK(input.dim() == 3, "Input must be 3D tensor (batch_size, seq_len, feature_size)");
+    TORCH_CHECK(input.dim() == 2, "Input must be 2D tensor (seq_len, feature_size)");
 
-    int batch_size = input.size(0);
-    int seq_len = input.size(1);
-    int feature_size = input.size(2);
+    int seq_len = input.size(0);
+    int feature_size = input.size(1);
 
     // Default mask if not provided (empty tensor)
     const float* mask_ptr = nullptr;
@@ -262,7 +261,6 @@ torch::Tensor day14_fused_softmax_wrapper(torch::Tensor input, torch::Tensor mas
         input.data_ptr<float>(),
         output.data_ptr<float>(),
         mask_ptr,
-        batch_size,
         seq_len,
         feature_size,
         scale
@@ -275,12 +273,11 @@ torch::Tensor day14_fused_softmax_wrapper(torch::Tensor input, torch::Tensor mas
 torch::Tensor day15_fused_attention_wrapper(torch::Tensor Q, torch::Tensor K, torch::Tensor V, torch::Tensor mask, float scale) {
     TORCH_CHECK(Q.is_cuda() && K.is_cuda() && V.is_cuda(), "Inputs must be CUDA tensors");
     TORCH_CHECK(Q.dtype() == torch::kFloat32 && K.dtype() == torch::kFloat32 && V.dtype() == torch::kFloat32, "Inputs must be float32");
-    TORCH_CHECK(Q.dim() == 4, "Q, K, V must be 4D tensors (batch_size, num_heads, seq_len, head_dim)");
+    TORCH_CHECK(Q.dim() == 3, "Q, K, V must be 3D tensors (num_heads, seq_len, head_dim)");
 
-    int batch_size = Q.size(0);
-    int num_heads = Q.size(1);
-    int seq_len = Q.size(2);
-    int head_dim = Q.size(3);
+    int num_heads = Q.size(0);
+    int seq_len = Q.size(1);
+    int head_dim = Q.size(2);
 
     // Default scale if not provided
     if (scale <= 0.0f) {
@@ -302,7 +299,6 @@ torch::Tensor day15_fused_attention_wrapper(torch::Tensor Q, torch::Tensor K, to
         V.data_ptr<float>(),
         output.data_ptr<float>(),
         mask_ptr,
-        batch_size,
         num_heads,
         seq_len,
         head_dim,
@@ -392,12 +388,11 @@ std::tuple<torch::Tensor, torch::Tensor> day19_rope_wrapper(
 ) {
     TORCH_CHECK(query.is_cuda() && key.is_cuda(), "Inputs must be CUDA tensors");
     TORCH_CHECK(query.dtype() == torch::kFloat32 && key.dtype() == torch::kFloat32, "Inputs must be float32");
-    TORCH_CHECK(query.dim() == 4 && key.dim() == 4, "Query and key must be 4D tensors (batch_size, num_heads, seq_len, head_dim)");
+    TORCH_CHECK(query.dim() == 3 && key.dim() == 3, "Query and key must be 3D tensors (num_heads, seq_len, head_dim)");
 
-    int batch_size = query.size(0);
-    int num_heads = query.size(1);
-    int seq_len = query.size(2);
-    int head_dim = query.size(3);
+    int num_heads = query.size(0);
+    int seq_len = query.size(1);
+    int head_dim = query.size(2);
 
     torch::Tensor rotated_query = torch::zeros_like(query);
     torch::Tensor rotated_key = torch::zeros_like(key);
@@ -409,7 +404,7 @@ std::tuple<torch::Tensor, torch::Tensor> day19_rope_wrapper(
         rotated_key.data_ptr<float>(),
         cos_cache.data_ptr<float>(),
         sin_cache.data_ptr<float>(),
-        batch_size, num_heads, seq_len, head_dim
+        num_heads, seq_len, head_dim
     );
 
     return std::make_tuple(rotated_query, rotated_key);
@@ -426,13 +421,12 @@ torch::Tensor day20_conv2d_wrapper(
 ) {
     TORCH_CHECK(input.is_cuda() && kernel.is_cuda(), "Inputs must be CUDA tensors");
     TORCH_CHECK(input.dtype() == torch::kFloat32 && kernel.dtype() == torch::kFloat32, "Inputs must be float32");
-    TORCH_CHECK(input.dim() == 4, "Input must be 4D tensor (batch_size, in_channels, height, width)");
+    TORCH_CHECK(input.dim() == 3, "Input must be 3D tensor (in_channels, height, width)");
     TORCH_CHECK(kernel.dim() == 4, "Kernel must be 4D tensor (out_channels, in_channels, kernel_h, kernel_w)");
 
-    int batch_size = input.size(0);
-    int in_channels = input.size(1);
-    int input_h = input.size(2);
-    int input_w = input.size(3);
+    int in_channels = input.size(0);
+    int input_h = input.size(1);
+    int input_w = input.size(2);
 
     int out_channels = kernel.size(0);
     int kernel_h = kernel.size(2);
@@ -443,13 +437,13 @@ torch::Tensor day20_conv2d_wrapper(
 
     TORCH_CHECK(output_h > 0 && output_w > 0, "Output dimensions must be positive");
 
-    torch::Tensor output = torch::zeros({batch_size, out_channels, output_h, output_w}, input.options());
+    torch::Tensor output = torch::zeros({out_channels, output_h, output_w}, input.options());
 
     day20_conv2d(
         input.data_ptr<float>(),
         kernel.data_ptr<float>(),
         output.data_ptr<float>(),
-        batch_size, in_channels, out_channels,
+        in_channels, out_channels,
         input_h, input_w, kernel_h, kernel_w,
         output_h, output_w,
         pad_h, pad_w, stride_h, stride_w

diff --git a/csrc/day14_fused_softmax.cu b/csrc/day14_fused_softmax.cu
@@ -11,28 +11,26 @@
 // 2. 추가 연산 (scale, mask 등)을 같은 커널에서 처리
 // 3. 메모리 접근 최소화
 //
-// 입력: input (batch_size, seq_len, feature_size)
+// 입력: input (seq_len, feature_size) - batch_size는 항상 1
 //      scale (optional) - scaling factor
 //      mask (optional) - attention mask
-// 출력: output (batch_size, seq_len, feature_size)
+// 출력: output (seq_len, feature_size)
 
 __global__ void fused_softmax_kernel(
     const float* input,
     float* output,
     const float* mask,
-    int batch_size,
     int seq_len,
     int feature_size,
     float scale
 ) {
     // TODO: 구현하세요
     // Fused operations: scale -> mask -> softmax
-    int batch_idx = blockIdx.z;
-    int seq_idx = blockIdx.y;
+    int seq_idx = blockIdx.x;
     int feature_idx = threadIdx.x;
 
-    if (batch_idx < batch_size && seq_idx < seq_len && feature_idx < feature_size) {
-        int idx = batch_idx * seq_len * feature_size + seq_idx * feature_size + feature_idx;
+    if (seq_idx < seq_len && feature_idx < feature_size) {
+        int idx = seq_idx * feature_size + feature_idx;
         // TODO: Fused Softmax 계산
         output[idx] = input[idx];
     }
@@ -42,17 +40,17 @@ extern "C" void day14_fused_softmax(
     const float* input,
     float* output,
     const float* mask,
-    int batch_size,
     int seq_len,
     int feature_size,
     float scale
 ) {
     // TODO: kernel launch configuration 설정
+    // batch_size는 항상 1이므로 제거
     dim3 threadsPerBlock(feature_size);
-    dim3 blocksPerGrid(seq_len, batch_size);
+    dim3 blocksPerGrid(seq_len);
 
     fused_softmax_kernel<<<blocksPerGrid, threadsPerBlock>>>(
-        input, output, mask, batch_size, seq_len, feature_size, scale
+        input, output, mask, seq_len, feature_size, scale
     );
     cudaDeviceSynchronize();
 }
diff --git a/csrc/day15_fused_attention.cu b/csrc/day15_fused_attention.cu
@@ -13,38 +13,34 @@
 //
 // 모든 연산을 하나의 커널로 융합하여 성능 최적화
 //
-// 입력: Q (batch_size, num_heads, seq_len, head_dim)
-//      K (batch_size, num_heads, seq_len, head_dim)
-//      V (batch_size, num_heads, seq_len, head_dim)
+// 입력: Q (num_heads, seq_len, head_dim) - batch_size는 항상 1
+//      K (num_heads, seq_len, head_dim)
+//      V (num_heads, seq_len, head_dim)
 //      mask (optional) - attention mask
-// 출력: output (batch_size, num_heads, seq_len, head_dim)
+// 출력: output (num_heads, seq_len, head_dim)
 
 __global__ void fused_attention_kernel(
     const float* Q,
     const float* K,
     const float* V,
     float* output,
     const float* mask,
-    int batch_size,
     int num_heads,
     int seq_len,
     int head_dim,
     float scale
 ) {
     // TODO: 구현하세요
     // Fused Attention: QK^T -> scale -> mask -> softmax -> @V
-    int batch_idx = blockIdx.z;
     int head_idx = blockIdx.y;
     int seq_idx = blockIdx.x;
 
     // TODO: Attention 계산
     // 각 thread는 하나의 head_dim element를 처리할 수 있습니다
     int feature_idx = threadIdx.x;
 
-    if (batch_idx < batch_size && head_idx < num_heads &&
-        seq_idx < seq_len && feature_idx < head_dim) {
-        int idx = batch_idx * num_heads * seq_len * head_dim +
-                  head_idx * seq_len * head_dim +
+    if (head_idx < num_heads && seq_idx < seq_len && feature_idx < head_dim) {
+        int idx = head_idx * seq_len * head_dim +
                   seq_idx * head_dim +
                   feature_idx;
         // TODO: Fused Attention 계산
@@ -58,18 +54,18 @@ extern "C" void day15_fused_attention(
     const float* V,
     float* output,
     const float* mask,
-    int batch_size,
     int num_heads,
     int seq_len,
     int head_dim,
     float scale
 ) {
     // TODO: kernel launch configuration 설정
+    // batch_size는 항상 1이므로 제거
     dim3 threadsPerBlock(head_dim);
-    dim3 blocksPerGrid(seq_len, num_heads, batch_size);
+    dim3 blocksPerGrid(seq_len, num_heads);
 
     fused_attention_kernel<<<blocksPerGrid, threadsPerBlock>>>(
-        Q, K, V, output, mask, batch_size, num_heads, seq_len, head_dim, scale
+        Q, K, V, output, mask, num_heads, seq_len, head_dim, scale
     );
     cudaDeviceSynchronize();
 }
diff --git a/csrc/day19_rope.cu b/csrc/day19_rope.cu
@@ -10,8 +10,8 @@
 // 2. 위치 정보를 이용한 회전 변환
 // 3. Query와 Key에 각각 적용
 //
-// 입력: query (batch_size, num_heads, seq_len, head_dim)
-//      key (batch_size, num_heads, seq_len, head_dim)
+// 입력: query (num_heads, seq_len, head_dim) - batch_size는 항상 1
+//      key (num_heads, seq_len, head_dim)
 //      cos_cache (seq_len, head_dim / 2)
 //      sin_cache (seq_len, head_dim / 2)
 // 출력: rotated_query, rotated_key
@@ -23,22 +23,18 @@ __global__ void rope_kernel(
     float* rotated_key,
     const float* cos_cache,
     const float* sin_cache,
-    int batch_size,
     int num_heads,
     int seq_len,
     int head_dim
 ) {
     // TODO: 구현하세요
-    int batch_idx = blockIdx.z;
     int head_idx = blockIdx.y;
     int seq_idx = blockIdx.x;
     int dim_idx = threadIdx.x;
 
-    if (batch_idx < batch_size && head_idx < num_heads &&
-        seq_idx < seq_len && dim_idx < head_dim) {
+    if (head_idx < num_heads && seq_idx < seq_len && dim_idx < head_dim) {
         // TODO: RoPE 계산
-        int q_idx = batch_idx * num_heads * seq_len * head_dim +
-                    head_idx * seq_len * head_dim +
+        int q_idx = head_idx * seq_len * head_dim +
                     seq_idx * head_dim +
                     dim_idx;
         rotated_query[q_idx] = query[q_idx];
@@ -53,19 +49,19 @@ extern "C" void day19_rope(
     float* rotated_key,
     const float* cos_cache,
     const float* sin_cache,
-    int batch_size,
     int num_heads,
     int seq_len,
     int head_dim
 ) {
     // TODO: kernel launch configuration 설정
+    // batch_size는 항상 1이므로 제거
     dim3 threadsPerBlock(head_dim);
-    dim3 blocksPerGrid(seq_len, num_heads, batch_size);
+    dim3 blocksPerGrid(seq_len, num_heads);
 
     rope_kernel<<<blocksPerGrid, threadsPerBlock>>>(
         query, key, rotated_query, rotated_key,
         cos_cache, sin_cache,
-        batch_size, num_heads, seq_len, head_dim
+        num_heads, seq_len, head_dim
     );
     cudaDeviceSynchronize();
 }
diff --git a/csrc/day20_conv2d.cu b/csrc/day20_conv2d.cu
@@ -9,16 +9,15 @@
 // 2. 메모리 타일링 최적화
 // 3. Shared memory 활용
 //
-// 입력: input (batch_size, in_channels, height, width)
+// 입력: input (in_channels, height, width) - batch_size는 항상 1
 //      kernel (out_channels, in_channels, kernel_h, kernel_w)
 //      padding, stride
-// 출력: output (batch_size, out_channels, out_height, out_width)
+// 출력: output (out_channels, out_height, out_width)
 
 __global__ void conv2d_kernel(
     const float* input,
     const float* kernel,
     float* output,
-    int batch_size,
     int in_channels,
     int out_channels,
     int input_h,
@@ -34,18 +33,15 @@ __global__ void conv2d_kernel(
 ) {
     // TODO: 구현하세요
     // 2D 컨볼루션 계산
-    int batch_idx = blockIdx.z;
     int out_channel_idx = blockIdx.y;
     int out_row = blockIdx.x / output_w;
     int out_col = blockIdx.x % output_w;
 
     int thread_idx = threadIdx.x;
 
-    if (batch_idx < batch_size && out_channel_idx < out_channels &&
-        out_row < output_h && out_col < output_w) {
+    if (out_channel_idx < out_channels && out_row < output_h && out_col < output_w) {
         // TODO: 2D Convolution 계산
-        int out_idx = batch_idx * out_channels * output_h * output_w +
-                      out_channel_idx * output_h * output_w +
+        int out_idx = out_channel_idx * output_h * output_w +
                       out_row * output_w +
                       out_col;
         output[out_idx] = 0.0f;
@@ -56,7 +52,6 @@ extern "C" void day20_conv2d(
     const float* input,
     const float* kernel,
     float* output,
-    int batch_size,
     int in_channels,
     int out_channels,
     int input_h,
@@ -71,12 +66,13 @@ extern "C" void day20_conv2d(
     int stride_w
 ) {
     // TODO: kernel launch configuration 설정
+    // batch_size는 항상 1이므로 제거
     dim3 threadsPerBlock(256);
-    dim3 blocksPerGrid(output_h * output_w, out_channels, batch_size);
+    dim3 blocksPerGrid(output_h * output_w, out_channels);
 
     conv2d_kernel<<<blocksPerGrid, threadsPerBlock>>>(
         input, kernel, output,
-        batch_size, in_channels, out_channels,
+        in_channels, out_channels,
         input_h, input_w, kernel_h, kernel_w,
         output_h, output_w,
         pad_h, pad_w, stride_h, stride_w