diff --git a/csrc/bindings.cu b/csrc/bindings.cu index 73b9ef1..2b785ed 100644 --- a/csrc/bindings.cu +++ b/csrc/bindings.cu @@ -16,13 +16,13 @@ extern "C" void day10_conv1d(const float* input, const float* kernel, float* out extern "C" void day11_softmax(const float* input, float* output, int feature_size); extern "C" void day12_layernorm(const float* input, float* output, const float* gamma, const float* beta, int feature_size, float eps); extern "C" void day13_rmsnorm(const float* input, float* output, const float* weight, int feature_size, float eps); -extern "C" void day14_fused_softmax(const float* input, float* output, const float* mask, int batch_size, int seq_len, int feature_size, float scale); -extern "C" void day15_fused_attention(const float* Q, const float* K, const float* V, float* output, const float* mask, int batch_size, int num_heads, int seq_len, int head_dim, float scale); +extern "C" void day14_fused_softmax(const float* input, float* output, const float* mask, int seq_len, int feature_size, float scale); +extern "C" void day15_fused_attention(const float* Q, const float* K, const float* V, float* output, const float* mask, int num_heads, int seq_len, int head_dim, float scale); extern "C" void day16_group_gemm(const float* A, const float* B, float* C, int num_groups, int M, int N, int K); extern "C" void day17_persistent_matmul(const float* A, const float* B, float* C, int M, int N, int K); extern "C" void day18_block_scaled_matmul(const float* A, const float* B, float* C, int M, int N, int K, float scale); -extern "C" void day19_rope(const float* query, const float* key, float* rotated_query, float* rotated_key, const float* cos_cache, const float* sin_cache, int batch_size, int num_heads, int seq_len, int head_dim); -extern "C" void day20_conv2d(const float* input, const float* kernel, float* output, int batch_size, int in_channels, int out_channels, int input_h, int input_w, int kernel_h, int kernel_w, int output_h, int output_w, int pad_h, int pad_w, int stride_h, int stride_w); +extern "C" void day19_rope(const float* query, const float* key, float* rotated_query, float* rotated_key, const float* cos_cache, const float* sin_cache, int num_heads, int seq_len, int head_dim); +extern "C" void day20_conv2d(const float* input, const float* kernel, float* output, int in_channels, int out_channels, int input_h, int input_w, int kernel_h, int kernel_w, int output_h, int output_w, int pad_h, int pad_w, int stride_h, int stride_w); // Helper macro for CUDA error checking #define CUDA_CHECK(call) \ @@ -243,11 +243,10 @@ torch::Tensor day13_rmsnorm_wrapper(torch::Tensor input, torch::Tensor weight, f torch::Tensor day14_fused_softmax_wrapper(torch::Tensor input, torch::Tensor mask, float scale) { TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor"); TORCH_CHECK(input.dtype() == torch::kFloat32, "Input must be float32"); - TORCH_CHECK(input.dim() == 3, "Input must be 3D tensor (batch_size, seq_len, feature_size)"); + TORCH_CHECK(input.dim() == 2, "Input must be 2D tensor (seq_len, feature_size)"); - int batch_size = input.size(0); - int seq_len = input.size(1); - int feature_size = input.size(2); + int seq_len = input.size(0); + int feature_size = input.size(1); // Default mask if not provided (empty tensor) const float* mask_ptr = nullptr; @@ -262,7 +261,6 @@ torch::Tensor day14_fused_softmax_wrapper(torch::Tensor input, torch::Tensor mas input.data_ptr(), output.data_ptr(), mask_ptr, - batch_size, seq_len, feature_size, scale @@ -275,12 +273,11 @@ torch::Tensor day14_fused_softmax_wrapper(torch::Tensor input, torch::Tensor mas torch::Tensor day15_fused_attention_wrapper(torch::Tensor Q, torch::Tensor K, torch::Tensor V, torch::Tensor mask, float scale) { TORCH_CHECK(Q.is_cuda() && K.is_cuda() && V.is_cuda(), "Inputs must be CUDA tensors"); TORCH_CHECK(Q.dtype() == torch::kFloat32 && K.dtype() == torch::kFloat32 && V.dtype() == torch::kFloat32, "Inputs must be float32"); - TORCH_CHECK(Q.dim() == 4, "Q, K, V must be 4D tensors (batch_size, num_heads, seq_len, head_dim)"); + TORCH_CHECK(Q.dim() == 3, "Q, K, V must be 3D tensors (num_heads, seq_len, head_dim)"); - int batch_size = Q.size(0); - int num_heads = Q.size(1); - int seq_len = Q.size(2); - int head_dim = Q.size(3); + int num_heads = Q.size(0); + int seq_len = Q.size(1); + int head_dim = Q.size(2); // Default scale if not provided if (scale <= 0.0f) { @@ -302,7 +299,6 @@ torch::Tensor day15_fused_attention_wrapper(torch::Tensor Q, torch::Tensor K, to V.data_ptr(), output.data_ptr(), mask_ptr, - batch_size, num_heads, seq_len, head_dim, @@ -392,12 +388,11 @@ std::tuple day19_rope_wrapper( ) { TORCH_CHECK(query.is_cuda() && key.is_cuda(), "Inputs must be CUDA tensors"); TORCH_CHECK(query.dtype() == torch::kFloat32 && key.dtype() == torch::kFloat32, "Inputs must be float32"); - TORCH_CHECK(query.dim() == 4 && key.dim() == 4, "Query and key must be 4D tensors (batch_size, num_heads, seq_len, head_dim)"); + TORCH_CHECK(query.dim() == 3 && key.dim() == 3, "Query and key must be 3D tensors (num_heads, seq_len, head_dim)"); - int batch_size = query.size(0); - int num_heads = query.size(1); - int seq_len = query.size(2); - int head_dim = query.size(3); + int num_heads = query.size(0); + int seq_len = query.size(1); + int head_dim = query.size(2); torch::Tensor rotated_query = torch::zeros_like(query); torch::Tensor rotated_key = torch::zeros_like(key); @@ -409,7 +404,7 @@ std::tuple day19_rope_wrapper( rotated_key.data_ptr(), cos_cache.data_ptr(), sin_cache.data_ptr(), - batch_size, num_heads, seq_len, head_dim + num_heads, seq_len, head_dim ); return std::make_tuple(rotated_query, rotated_key); @@ -426,13 +421,12 @@ torch::Tensor day20_conv2d_wrapper( ) { TORCH_CHECK(input.is_cuda() && kernel.is_cuda(), "Inputs must be CUDA tensors"); TORCH_CHECK(input.dtype() == torch::kFloat32 && kernel.dtype() == torch::kFloat32, "Inputs must be float32"); - TORCH_CHECK(input.dim() == 4, "Input must be 4D tensor (batch_size, in_channels, height, width)"); + TORCH_CHECK(input.dim() == 3, "Input must be 3D tensor (in_channels, height, width)"); TORCH_CHECK(kernel.dim() == 4, "Kernel must be 4D tensor (out_channels, in_channels, kernel_h, kernel_w)"); - int batch_size = input.size(0); - int in_channels = input.size(1); - int input_h = input.size(2); - int input_w = input.size(3); + int in_channels = input.size(0); + int input_h = input.size(1); + int input_w = input.size(2); int out_channels = kernel.size(0); int kernel_h = kernel.size(2); @@ -443,13 +437,13 @@ torch::Tensor day20_conv2d_wrapper( TORCH_CHECK(output_h > 0 && output_w > 0, "Output dimensions must be positive"); - torch::Tensor output = torch::zeros({batch_size, out_channels, output_h, output_w}, input.options()); + torch::Tensor output = torch::zeros({out_channels, output_h, output_w}, input.options()); day20_conv2d( input.data_ptr(), kernel.data_ptr(), output.data_ptr(), - batch_size, in_channels, out_channels, + in_channels, out_channels, input_h, input_w, kernel_h, kernel_w, output_h, output_w, pad_h, pad_w, stride_h, stride_w diff --git a/csrc/day14_fused_softmax.cu b/csrc/day14_fused_softmax.cu index ca027a2..69af8b4 100644 --- a/csrc/day14_fused_softmax.cu +++ b/csrc/day14_fused_softmax.cu @@ -11,28 +11,26 @@ // 2. 추가 연산 (scale, mask 등)을 같은 커널에서 처리 // 3. 메모리 접근 최소화 // -// 입력: input (batch_size, seq_len, feature_size) +// 입력: input (seq_len, feature_size) - batch_size는 항상 1 // scale (optional) - scaling factor // mask (optional) - attention mask -// 출력: output (batch_size, seq_len, feature_size) +// 출력: output (seq_len, feature_size) __global__ void fused_softmax_kernel( const float* input, float* output, const float* mask, - int batch_size, int seq_len, int feature_size, float scale ) { // TODO: 구현하세요 // Fused operations: scale -> mask -> softmax - int batch_idx = blockIdx.z; - int seq_idx = blockIdx.y; + int seq_idx = blockIdx.x; int feature_idx = threadIdx.x; - if (batch_idx < batch_size && seq_idx < seq_len && feature_idx < feature_size) { - int idx = batch_idx * seq_len * feature_size + seq_idx * feature_size + feature_idx; + if (seq_idx < seq_len && feature_idx < feature_size) { + int idx = seq_idx * feature_size + feature_idx; // TODO: Fused Softmax 계산 output[idx] = input[idx]; } @@ -42,17 +40,17 @@ extern "C" void day14_fused_softmax( const float* input, float* output, const float* mask, - int batch_size, int seq_len, int feature_size, float scale ) { // TODO: kernel launch configuration 설정 + // batch_size는 항상 1이므로 제거 dim3 threadsPerBlock(feature_size); - dim3 blocksPerGrid(seq_len, batch_size); + dim3 blocksPerGrid(seq_len); fused_softmax_kernel<<>>( - input, output, mask, batch_size, seq_len, feature_size, scale + input, output, mask, seq_len, feature_size, scale ); cudaDeviceSynchronize(); } diff --git a/csrc/day15_fused_attention.cu b/csrc/day15_fused_attention.cu index 82a924b..06f3ff7 100644 --- a/csrc/day15_fused_attention.cu +++ b/csrc/day15_fused_attention.cu @@ -13,11 +13,11 @@ // // 모든 연산을 하나의 커널로 융합하여 성능 최적화 // -// 입력: Q (batch_size, num_heads, seq_len, head_dim) -// K (batch_size, num_heads, seq_len, head_dim) -// V (batch_size, num_heads, seq_len, head_dim) +// 입력: Q (num_heads, seq_len, head_dim) - batch_size는 항상 1 +// K (num_heads, seq_len, head_dim) +// V (num_heads, seq_len, head_dim) // mask (optional) - attention mask -// 출력: output (batch_size, num_heads, seq_len, head_dim) +// 출력: output (num_heads, seq_len, head_dim) __global__ void fused_attention_kernel( const float* Q, @@ -25,7 +25,6 @@ __global__ void fused_attention_kernel( const float* V, float* output, const float* mask, - int batch_size, int num_heads, int seq_len, int head_dim, @@ -33,7 +32,6 @@ __global__ void fused_attention_kernel( ) { // TODO: 구현하세요 // Fused Attention: QK^T -> scale -> mask -> softmax -> @V - int batch_idx = blockIdx.z; int head_idx = blockIdx.y; int seq_idx = blockIdx.x; @@ -41,10 +39,8 @@ __global__ void fused_attention_kernel( // 각 thread는 하나의 head_dim element를 처리할 수 있습니다 int feature_idx = threadIdx.x; - if (batch_idx < batch_size && head_idx < num_heads && - seq_idx < seq_len && feature_idx < head_dim) { - int idx = batch_idx * num_heads * seq_len * head_dim + - head_idx * seq_len * head_dim + + if (head_idx < num_heads && seq_idx < seq_len && feature_idx < head_dim) { + int idx = head_idx * seq_len * head_dim + seq_idx * head_dim + feature_idx; // TODO: Fused Attention 계산 @@ -58,18 +54,18 @@ extern "C" void day15_fused_attention( const float* V, float* output, const float* mask, - int batch_size, int num_heads, int seq_len, int head_dim, float scale ) { // TODO: kernel launch configuration 설정 + // batch_size는 항상 1이므로 제거 dim3 threadsPerBlock(head_dim); - dim3 blocksPerGrid(seq_len, num_heads, batch_size); + dim3 blocksPerGrid(seq_len, num_heads); fused_attention_kernel<<>>( - Q, K, V, output, mask, batch_size, num_heads, seq_len, head_dim, scale + Q, K, V, output, mask, num_heads, seq_len, head_dim, scale ); cudaDeviceSynchronize(); } diff --git a/csrc/day19_rope.cu b/csrc/day19_rope.cu index 7d0d450..0f0d708 100644 --- a/csrc/day19_rope.cu +++ b/csrc/day19_rope.cu @@ -10,8 +10,8 @@ // 2. 위치 정보를 이용한 회전 변환 // 3. Query와 Key에 각각 적용 // -// 입력: query (batch_size, num_heads, seq_len, head_dim) -// key (batch_size, num_heads, seq_len, head_dim) +// 입력: query (num_heads, seq_len, head_dim) - batch_size는 항상 1 +// key (num_heads, seq_len, head_dim) // cos_cache (seq_len, head_dim / 2) // sin_cache (seq_len, head_dim / 2) // 출력: rotated_query, rotated_key @@ -23,22 +23,18 @@ __global__ void rope_kernel( float* rotated_key, const float* cos_cache, const float* sin_cache, - int batch_size, int num_heads, int seq_len, int head_dim ) { // TODO: 구현하세요 - int batch_idx = blockIdx.z; int head_idx = blockIdx.y; int seq_idx = blockIdx.x; int dim_idx = threadIdx.x; - if (batch_idx < batch_size && head_idx < num_heads && - seq_idx < seq_len && dim_idx < head_dim) { + if (head_idx < num_heads && seq_idx < seq_len && dim_idx < head_dim) { // TODO: RoPE 계산 - int q_idx = batch_idx * num_heads * seq_len * head_dim + - head_idx * seq_len * head_dim + + int q_idx = head_idx * seq_len * head_dim + seq_idx * head_dim + dim_idx; rotated_query[q_idx] = query[q_idx]; @@ -53,19 +49,19 @@ extern "C" void day19_rope( float* rotated_key, const float* cos_cache, const float* sin_cache, - int batch_size, int num_heads, int seq_len, int head_dim ) { // TODO: kernel launch configuration 설정 + // batch_size는 항상 1이므로 제거 dim3 threadsPerBlock(head_dim); - dim3 blocksPerGrid(seq_len, num_heads, batch_size); + dim3 blocksPerGrid(seq_len, num_heads); rope_kernel<<>>( query, key, rotated_query, rotated_key, cos_cache, sin_cache, - batch_size, num_heads, seq_len, head_dim + num_heads, seq_len, head_dim ); cudaDeviceSynchronize(); } diff --git a/csrc/day20_conv2d.cu b/csrc/day20_conv2d.cu index e0a95b7..5c66e6f 100644 --- a/csrc/day20_conv2d.cu +++ b/csrc/day20_conv2d.cu @@ -9,16 +9,15 @@ // 2. 메모리 타일링 최적화 // 3. Shared memory 활용 // -// 입력: input (batch_size, in_channels, height, width) +// 입력: input (in_channels, height, width) - batch_size는 항상 1 // kernel (out_channels, in_channels, kernel_h, kernel_w) // padding, stride -// 출력: output (batch_size, out_channels, out_height, out_width) +// 출력: output (out_channels, out_height, out_width) __global__ void conv2d_kernel( const float* input, const float* kernel, float* output, - int batch_size, int in_channels, int out_channels, int input_h, @@ -34,18 +33,15 @@ __global__ void conv2d_kernel( ) { // TODO: 구현하세요 // 2D 컨볼루션 계산 - int batch_idx = blockIdx.z; int out_channel_idx = blockIdx.y; int out_row = blockIdx.x / output_w; int out_col = blockIdx.x % output_w; int thread_idx = threadIdx.x; - if (batch_idx < batch_size && out_channel_idx < out_channels && - out_row < output_h && out_col < output_w) { + if (out_channel_idx < out_channels && out_row < output_h && out_col < output_w) { // TODO: 2D Convolution 계산 - int out_idx = batch_idx * out_channels * output_h * output_w + - out_channel_idx * output_h * output_w + + int out_idx = out_channel_idx * output_h * output_w + out_row * output_w + out_col; output[out_idx] = 0.0f; @@ -56,7 +52,6 @@ extern "C" void day20_conv2d( const float* input, const float* kernel, float* output, - int batch_size, int in_channels, int out_channels, int input_h, @@ -71,12 +66,13 @@ extern "C" void day20_conv2d( int stride_w ) { // TODO: kernel launch configuration 설정 + // batch_size는 항상 1이므로 제거 dim3 threadsPerBlock(256); - dim3 blocksPerGrid(output_h * output_w, out_channels, batch_size); + dim3 blocksPerGrid(output_h * output_w, out_channels); conv2d_kernel<<>>( input, kernel, output, - batch_size, in_channels, out_channels, + in_channels, out_channels, input_h, input_w, kernel_h, kernel_w, output_h, output_w, pad_h, pad_w, stride_h, stride_w diff --git a/src/gpu_20days/day14_fused_softmax.py b/src/gpu_20days/day14_fused_softmax.py index 4017539..ae19d28 100644 --- a/src/gpu_20days/day14_fused_softmax.py +++ b/src/gpu_20days/day14_fused_softmax.py @@ -12,7 +12,6 @@ def day14_fused_softmax_kernel( input_ptr, output_ptr, mask_ptr, - batch_size, seq_len, feature_size, scale, @@ -29,29 +28,33 @@ def day14_fused_softmax_kernel( 1. 기본 Softmax 구현 2. 추가 연산 (scale, mask 등)을 같은 커널에서 처리 3. 메모리 접근 최소화 + + batch_size는 항상 1입니다. """ # TODO: 구현하세요 - # batch_idx = tl.program_id(0) - # seq_idx = tl.program_id(1) - # feature_idx = tl.arange(0, BLOCK_SIZE) - # mask = feature_idx < feature_size pass def day14_fused_softmax( input: torch.Tensor, mask: torch.Tensor = None, scale: float = 1.0 ) -> torch.Tensor: - """Day 14: Fused softmax operation""" + """Day 14: Fused softmax operation (batch_size is always 1)""" # TODO: 구현하세요 BLOCK_SIZE = 256 - batch_size, seq_len, feature_size = input.shape + # batch_size는 항상 1이므로 입력은 2D + if input.dim() != 2: + raise ValueError( + "day14_fused_softmax expects 2D tensor (seq_len, feature_size), batch_size is always 1" + ) + + seq_len, feature_size = input.shape output = torch.zeros_like(input) def grid(meta): - return (batch_size, seq_len) + return (triton.cdiv(seq_len, BLOCK_SIZE),) # day14_fused_softmax_kernel[grid]( - # input, output, mask, batch_size, seq_len, feature_size, scale, BLOCK_SIZE=BLOCK_SIZE + # input, output, mask, seq_len, feature_size, scale, BLOCK_SIZE=BLOCK_SIZE # ) return output diff --git a/src/gpu_20days/day15_fused_attention.py b/src/gpu_20days/day15_fused_attention.py index 00f527e..4d8f586 100644 --- a/src/gpu_20days/day15_fused_attention.py +++ b/src/gpu_20days/day15_fused_attention.py @@ -16,7 +16,6 @@ def day15_fused_attention_kernel( V_ptr, output_ptr, mask_ptr, - batch_size, num_heads, seq_len, head_dim, @@ -36,13 +35,10 @@ def day15_fused_attention_kernel( 5. @ V (matrix multiplication) 모든 연산을 하나의 커널로 융합하여 성능 최적화 + + batch_size는 항상 1입니다. """ # TODO: 구현하세요 - # batch_idx = tl.program_id(0) - # head_idx = tl.program_id(1) - # seq_idx = tl.program_id(2) - # feature_idx = tl.arange(0, BLOCK_SIZE) - # mask = feature_idx < head_dim pass @@ -53,10 +49,16 @@ def day15_fused_attention( mask: Optional[torch.Tensor] = None, scale: Optional[float] = None, ) -> torch.Tensor: - """Day 15: Fused attention mechanism""" + """Day 15: Fused attention mechanism (batch_size is always 1)""" # TODO: 구현하세요 BLOCK_SIZE = 256 - batch_size, num_heads, seq_len, head_dim = Q.shape + # batch_size는 항상 1이므로 입력은 3D + if Q.dim() != 3: + raise ValueError( + "day15_fused_attention expects 3D tensor (num_heads, seq_len, head_dim), batch_size is always 1" + ) + + num_heads, seq_len, head_dim = Q.shape if scale is None: scale = 1.0 / (head_dim**0.5) @@ -64,9 +66,9 @@ def day15_fused_attention( output = torch.zeros_like(Q) def grid(meta): - return (batch_size, num_heads, seq_len) + return (num_heads, triton.cdiv(seq_len, BLOCK_SIZE)) # day15_fused_attention_kernel[grid]( - # Q, K, V, output, mask, batch_size, num_heads, seq_len, head_dim, scale, BLOCK_SIZE=BLOCK_SIZE + # Q, K, V, output, mask, num_heads, seq_len, head_dim, scale, BLOCK_SIZE=BLOCK_SIZE # ) return output diff --git a/src/gpu_20days/day19_rope.py b/src/gpu_20days/day19_rope.py index afb900f..b6c9be4 100644 --- a/src/gpu_20days/day19_rope.py +++ b/src/gpu_20days/day19_rope.py @@ -15,7 +15,6 @@ def day19_rope_kernel( rotated_key_ptr, cos_cache_ptr, sin_cache_ptr, - batch_size, num_heads, seq_len, head_dim, @@ -30,34 +29,37 @@ def day19_rope_kernel( 1. 삼각함수 연산 (sin, cos) 사용 2. 위치 정보를 이용한 회전 변환 3. Query와 Key에 각각 적용 + + batch_size는 항상 1입니다. """ # TODO: 구현하세요 - # batch_idx = tl.program_id(0) - # head_idx = tl.program_id(1) - # seq_idx = tl.program_id(2) - # dim_idx = tl.arange(0, BLOCK_SIZE) - # mask = dim_idx < head_dim pass def day19_rope( query: torch.Tensor, key: torch.Tensor, cos_cache: torch.Tensor, sin_cache: torch.Tensor ) -> tuple[torch.Tensor, torch.Tensor]: - """Day 19: Rotary position embedding""" + """Day 19: Rotary position embedding (batch_size is always 1)""" # TODO: 구현하세요 BLOCK_SIZE = 256 - batch_size, num_heads, seq_len, head_dim = query.shape + # batch_size는 항상 1이므로 입력은 3D + if query.dim() != 3: + raise ValueError( + "day19_rope expects 3D tensor (num_heads, seq_len, head_dim), batch_size is always 1" + ) + + num_heads, seq_len, head_dim = query.shape rotated_query = torch.zeros_like(query) rotated_key = torch.zeros_like(key) def grid(meta): - return (batch_size, num_heads, seq_len) + return (num_heads, triton.cdiv(seq_len, BLOCK_SIZE)) # day19_rope_kernel[grid]( # query, key, rotated_query, rotated_key, # cos_cache, sin_cache, - # batch_size, num_heads, seq_len, head_dim, + # num_heads, seq_len, head_dim, # BLOCK_SIZE=BLOCK_SIZE # ) return rotated_query, rotated_key diff --git a/src/gpu_20days/day20_conv2d.py b/src/gpu_20days/day20_conv2d.py index 4436e8b..ff3fed5 100644 --- a/src/gpu_20days/day20_conv2d.py +++ b/src/gpu_20days/day20_conv2d.py @@ -12,7 +12,6 @@ def day20_conv2d_kernel( input_ptr, kernel_ptr, output_ptr, - batch_size, in_channels, out_channels, input_h, @@ -36,12 +35,10 @@ def day20_conv2d_kernel( 1. 2D 슬라이딩 윈도우 패턴 2. 메모리 타일링 최적화 3. Shared memory 활용 + + batch_size는 항상 1입니다. """ # TODO: 구현하세요 - # batch_idx = tl.program_id(0) - # out_channel_idx = tl.program_id(1) - # out_row = tl.program_id(2) // output_w - # out_col = tl.program_id(2) % output_w pass @@ -51,10 +48,16 @@ def day20_conv2d( padding: tuple[int, int] = (0, 0), stride: tuple[int, int] = (1, 1), ) -> torch.Tensor: - """Day 20: Two-dimensional convolution""" + """Day 20: Two-dimensional convolution (batch_size is always 1)""" # TODO: 구현하세요 BLOCK_SIZE = 256 - batch_size, in_channels, input_h, input_w = input.shape + # batch_size는 항상 1이므로 입력은 3D + if input.dim() != 3: + raise ValueError( + "day20_conv2d expects 3D tensor (in_channels, height, width), batch_size is always 1" + ) + + in_channels, input_h, input_w = input.shape out_channels, _, kernel_h, kernel_w = kernel.shape pad_h, pad_w = padding @@ -63,16 +66,14 @@ def day20_conv2d( output_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1 output_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1 - output = torch.zeros( - batch_size, out_channels, output_h, output_w, device=input.device, dtype=input.dtype - ) + output = torch.zeros(out_channels, output_h, output_w, device=input.device, dtype=input.dtype) def grid(meta): - return (batch_size, out_channels, output_h * output_w) + return (out_channels, triton.cdiv(output_h * output_w, BLOCK_SIZE)) # day20_conv2d_kernel[grid]( # input, kernel, output, - # batch_size, in_channels, out_channels, + # in_channels, out_channels, # input_h, input_w, kernel_h, kernel_w, # output_h, output_w, # pad_h, pad_w, stride_h, stride_w, diff --git a/tests/test_day14.py b/tests/test_day14.py index 3157e26..9efcafc 100644 --- a/tests/test_day14.py +++ b/tests/test_day14.py @@ -15,16 +15,16 @@ from conftest import benchmark_kernel_vs_pytorch, compare_kernel_with_pytorch, ensure_cuda_device -# Test cases: (batch_size, seq_len, feature_size, description) +# Test cases: (seq_len, feature_size, description) - batch_size is always 1 FUSED_SOFTMAX_TEST_CASES = [ - (1, 10, 32, "small_1x10x32"), - (4, 32, 64, "medium_4x32x64"), - (8, 128, 128, "medium_8x128x128"), + (10, 32, "small_10x32"), + (32, 64, "medium_32x64"), + (128, 128, "medium_128x128"), ] -@pytest.mark.parametrize("batch_size,seq_len,feature_size,description", FUSED_SOFTMAX_TEST_CASES) -def test_fused_softmax_triton(batch_size, seq_len, feature_size, description): +@pytest.mark.parametrize("seq_len,feature_size,description", FUSED_SOFTMAX_TEST_CASES) +def test_fused_softmax_triton(seq_len, feature_size, description): """Test Triton Fused Softmax""" try: from gpu_20days.day14_fused_softmax import day14_fused_softmax @@ -33,12 +33,9 @@ def test_fused_softmax_triton(batch_size, seq_len, feature_size, description): device = ensure_cuda_device() - print( - f"Testing Triton Fused Softmax with shape ({batch_size}, {seq_len}, {feature_size}) ({description})..." - ) - input_tensor = torch.randn( - batch_size, seq_len, feature_size, device=device, dtype=torch.float32 - ) + print(f"Testing Triton Fused Softmax with shape ({seq_len}, {feature_size}) ({description})...") + # batch_size is always 1, so input is 2D + input_tensor = torch.randn(seq_len, feature_size, device=device, dtype=torch.float32) output = day14_fused_softmax(input_tensor) expected = torch.softmax(input_tensor, dim=-1) @@ -46,8 +43,8 @@ def test_fused_softmax_triton(batch_size, seq_len, feature_size, description): torch.testing.assert_close(output, expected, rtol=1e-4, atol=1e-5) -@pytest.mark.parametrize("batch_size,seq_len,feature_size,description", FUSED_SOFTMAX_TEST_CASES) -def test_fused_softmax_cuda(batch_size, seq_len, feature_size, description): +@pytest.mark.parametrize("seq_len,feature_size,description", FUSED_SOFTMAX_TEST_CASES) +def test_fused_softmax_cuda(seq_len, feature_size, description): """Test CUDA Fused Softmax""" try: from gpu_20days.cuda_kernels import day14_fused_softmax @@ -56,12 +53,9 @@ def test_fused_softmax_cuda(batch_size, seq_len, feature_size, description): device = ensure_cuda_device() - print( - f"Testing CUDA Fused Softmax with shape ({batch_size}, {seq_len}, {feature_size}) ({description})..." - ) - input_tensor = torch.randn( - batch_size, seq_len, feature_size, device=device, dtype=torch.float32 - ) + print(f"Testing CUDA Fused Softmax with shape ({seq_len}, {feature_size}) ({description})...") + # batch_size is always 1, so input is 2D + input_tensor = torch.randn(seq_len, feature_size, device=device, dtype=torch.float32) output = day14_fused_softmax(input_tensor) expected = torch.softmax(input_tensor, dim=-1) diff --git a/tests/test_day15.py b/tests/test_day15.py index afed1e8..a677a9a 100644 --- a/tests/test_day15.py +++ b/tests/test_day15.py @@ -15,16 +15,16 @@ from conftest import benchmark_kernel_vs_pytorch, compare_kernel_with_pytorch, ensure_cuda_device -# Test cases: (batch_size, num_heads, seq_len, head_dim, description) +# Test cases: (num_heads, seq_len, head_dim, description) - batch_size is always 1 ATTENTION_TEST_CASES = [ - (1, 2, 8, 32, "small_1x2x8x32"), - (2, 4, 16, 64, "medium_2x4x16x64"), - (4, 8, 32, 128, "medium_4x8x32x128"), + (2, 8, 32, "small_2x8x32"), + (4, 16, 64, "medium_4x16x64"), + (8, 32, 128, "medium_8x32x128"), ] -@pytest.mark.parametrize("batch_size,num_heads,seq_len,head_dim,description", ATTENTION_TEST_CASES) -def test_fused_attention_triton(batch_size, num_heads, seq_len, head_dim, description): +@pytest.mark.parametrize("num_heads,seq_len,head_dim,description", ATTENTION_TEST_CASES) +def test_fused_attention_triton(num_heads, seq_len, head_dim, description): """Test Triton Fused Attention""" try: from gpu_20days.day15_fused_attention import day15_fused_attention @@ -34,11 +34,12 @@ def test_fused_attention_triton(batch_size, num_heads, seq_len, head_dim, descri device = ensure_cuda_device() print( - f"Testing Triton Fused Attention with shape ({batch_size}, {num_heads}, {seq_len}, {head_dim}) ({description})..." + f"Testing Triton Fused Attention with shape ({num_heads}, {seq_len}, {head_dim}) ({description})..." ) - Q = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32) - K = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32) - V = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32) + # batch_size is always 1, so input is 3D + Q = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32) + K = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32) + V = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32) output = day15_fused_attention(Q, K, V) # Reference: scaled dot-product attention @@ -50,8 +51,8 @@ def test_fused_attention_triton(batch_size, num_heads, seq_len, head_dim, descri torch.testing.assert_close(output, expected, rtol=1e-4, atol=1e-5) -@pytest.mark.parametrize("batch_size,num_heads,seq_len,head_dim,description", ATTENTION_TEST_CASES) -def test_fused_attention_cuda(batch_size, num_heads, seq_len, head_dim, description): +@pytest.mark.parametrize("num_heads,seq_len,head_dim,description", ATTENTION_TEST_CASES) +def test_fused_attention_cuda(num_heads, seq_len, head_dim, description): """Test CUDA Fused Attention""" try: from gpu_20days.cuda_kernels import day15_fused_attention @@ -61,11 +62,12 @@ def test_fused_attention_cuda(batch_size, num_heads, seq_len, head_dim, descript device = ensure_cuda_device() print( - f"Testing CUDA Fused Attention with shape ({batch_size}, {num_heads}, {seq_len}, {head_dim}) ({description})..." + f"Testing CUDA Fused Attention with shape ({num_heads}, {seq_len}, {head_dim}) ({description})..." ) - Q = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32) - K = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32) - V = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32) + # batch_size is always 1, so input is 3D + Q = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32) + K = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32) + V = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32) output = day15_fused_attention(Q, K, V) scale = 1.0 / (head_dim**0.5) diff --git a/tests/test_day19.py b/tests/test_day19.py index fb7d392..d455e92 100644 --- a/tests/test_day19.py +++ b/tests/test_day19.py @@ -16,10 +16,10 @@ from conftest import benchmark_kernel_vs_pytorch, compare_kernel_with_pytorch, ensure_cuda_device -# Test cases: (batch_size, num_heads, seq_len, head_dim, description) +# Test cases: (num_heads, seq_len, head_dim, description) - batch_size is always 1 ROPE_TEST_CASES = [ - (1, 2, 8, 32, "small_1x2x8x32"), - (2, 4, 16, 64, "medium_2x4x16x64"), + (2, 8, 32, "small_2x8x32"), + (4, 16, 64, "medium_4x16x64"), ] @@ -33,8 +33,8 @@ def create_rope_cache(seq_len, head_dim, device): return cos_cache, sin_cache -@pytest.mark.parametrize("batch_size,num_heads,seq_len,head_dim,description", ROPE_TEST_CASES) -def test_rope_triton(batch_size, num_heads, seq_len, head_dim, description): +@pytest.mark.parametrize("num_heads,seq_len,head_dim,description", ROPE_TEST_CASES) +def test_rope_triton(num_heads, seq_len, head_dim, description): """Test Triton RoPE""" try: from gpu_20days.day19_rope import day19_rope @@ -43,13 +43,10 @@ def test_rope_triton(batch_size, num_heads, seq_len, head_dim, description): device = ensure_cuda_device() - print( - f"Testing Triton RoPE with shape ({batch_size}, {num_heads}, {seq_len}, {head_dim}) ({description})..." - ) - query = torch.randn( - batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32 - ) - key = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32) + print(f"Testing Triton RoPE with shape ({num_heads}, {seq_len}, {head_dim}) ({description})...") + # batch_size is always 1, so input is 3D + query = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32) + key = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32) cos_cache, sin_cache = create_rope_cache(seq_len, head_dim // 2, device) rotated_query, rotated_key = day19_rope(query, key, cos_cache, sin_cache) @@ -59,8 +56,8 @@ def test_rope_triton(batch_size, num_heads, seq_len, head_dim, description): assert rotated_key.shape == key.shape -@pytest.mark.parametrize("batch_size,num_heads,seq_len,head_dim,description", ROPE_TEST_CASES) -def test_rope_cuda(batch_size, num_heads, seq_len, head_dim, description): +@pytest.mark.parametrize("num_heads,seq_len,head_dim,description", ROPE_TEST_CASES) +def test_rope_cuda(num_heads, seq_len, head_dim, description): """Test CUDA RoPE""" try: from gpu_20days.cuda_kernels import day19_rope @@ -69,13 +66,10 @@ def test_rope_cuda(batch_size, num_heads, seq_len, head_dim, description): device = ensure_cuda_device() - print( - f"Testing CUDA RoPE with shape ({batch_size}, {num_heads}, {seq_len}, {head_dim}) ({description})..." - ) - query = torch.randn( - batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32 - ) - key = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=torch.float32) + print(f"Testing CUDA RoPE with shape ({num_heads}, {seq_len}, {head_dim}) ({description})...") + # batch_size is always 1, so input is 3D + query = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32) + key = torch.randn(num_heads, seq_len, head_dim, device=device, dtype=torch.float32) cos_cache, sin_cache = create_rope_cache(seq_len, head_dim // 2, device) rotated_query, rotated_key = day19_rope(query, key, cos_cache, sin_cache) diff --git a/tests/test_day20.py b/tests/test_day20.py index 4844eb0..aa17463 100644 --- a/tests/test_day20.py +++ b/tests/test_day20.py @@ -15,50 +15,48 @@ from conftest import benchmark_kernel_vs_pytorch, compare_kernel_with_pytorch, ensure_cuda_device -# Test cases: (batch_size, in_channels, out_channels, height, width, kernel_size, description) +# Test cases: (in_channels, out_channels, height, width, kernel_size, description) - batch_size is always 1 CONV2D_TEST_CASES = [ - (1, 1, 1, 8, 8, 3, "small_1x1x1_8x8_k3"), - (2, 3, 4, 16, 16, 3, "medium_2x3x4_16x16_k3"), - (4, 8, 16, 32, 32, 5, "medium_4x8x16_32x32_k5"), + (1, 1, 8, 8, 3, "small_1x1_8x8_k3"), + (3, 4, 16, 16, 3, "medium_3x4_16x16_k3"), + (8, 16, 32, 32, 5, "medium_8x16_32x32_k5"), ] @pytest.mark.parametrize( - "batch_size,in_channels,out_channels,height,width,kernel_size,description", CONV2D_TEST_CASES + "in_channels,out_channels,height,width,kernel_size,description", CONV2D_TEST_CASES ) -def test_conv2d_triton( - batch_size, in_channels, out_channels, height, width, kernel_size, description -): +def test_conv2d_triton(in_channels, out_channels, height, width, kernel_size, description): """Test Triton 2D Convolution""" try: - from gpu_20days.day20_conv import day20_conv2d + from gpu_20days.day20_conv2d import day20_conv2d except ImportError: pytest.skip("gpu_20days package not available") device = ensure_cuda_device() print( - f"Testing Triton 2D Conv with shape ({batch_size}, {in_channels}, {height}, {width}), kernel={kernel_size} ({description})..." - ) - input_tensor = torch.randn( - batch_size, in_channels, height, width, device=device, dtype=torch.float32 + f"Testing Triton 2D Conv with shape ({in_channels}, {height}, {width}), kernel={kernel_size} ({description})..." ) + # batch_size is always 1, so input is 3D + input_tensor = torch.randn(in_channels, height, width, device=device, dtype=torch.float32) kernel = torch.randn( out_channels, in_channels, kernel_size, kernel_size, device=device, dtype=torch.float32 ) output = day20_conv2d(input_tensor, kernel, padding=(1, 1), stride=(1, 1)) - expected = torch.nn.functional.conv2d(input_tensor, kernel, padding=(1, 1), stride=(1, 1)) + # Add batch dimension for PyTorch reference + input_with_batch = input_tensor.unsqueeze(0) + expected = torch.nn.functional.conv2d(input_with_batch, kernel, padding=(1, 1), stride=(1, 1)) + expected = expected.squeeze(0) # Remove batch dimension torch.testing.assert_close(output, expected, rtol=1e-4, atol=1e-5) @pytest.mark.parametrize( - "batch_size,in_channels,out_channels,height,width,kernel_size,description", CONV2D_TEST_CASES + "in_channels,out_channels,height,width,kernel_size,description", CONV2D_TEST_CASES ) -def test_conv2d_cuda( - batch_size, in_channels, out_channels, height, width, kernel_size, description -): +def test_conv2d_cuda(in_channels, out_channels, height, width, kernel_size, description): """Test CUDA 2D Convolution""" try: from gpu_20days.cuda_kernels import day20_conv2d @@ -68,16 +66,18 @@ def test_conv2d_cuda( device = ensure_cuda_device() print( - f"Testing CUDA 2D Conv with shape ({batch_size}, {in_channels}, {height}, {width}), kernel={kernel_size} ({description})..." - ) - input_tensor = torch.randn( - batch_size, in_channels, height, width, device=device, dtype=torch.float32 + f"Testing CUDA 2D Conv with shape ({in_channels}, {height}, {width}), kernel={kernel_size} ({description})..." ) + # batch_size is always 1, so input is 3D + input_tensor = torch.randn(in_channels, height, width, device=device, dtype=torch.float32) kernel = torch.randn( out_channels, in_channels, kernel_size, kernel_size, device=device, dtype=torch.float32 ) output = day20_conv2d(input_tensor, kernel, padding=(1, 1), stride=(1, 1)) - expected = torch.nn.functional.conv2d(input_tensor, kernel, padding=(1, 1), stride=(1, 1)) + # Add batch dimension for PyTorch reference + input_with_batch = input_tensor.unsqueeze(0) + expected = torch.nn.functional.conv2d(input_with_batch, kernel, padding=(1, 1), stride=(1, 1)) + expected = expected.squeeze(0) # Remove batch dimension torch.testing.assert_close(output, expected, rtol=1e-4, atol=1e-5)