From 3ac295aead72a566e467d29610ce30c226e6ebec Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 21 Sep 2025 10:18:41 -0700 Subject: [PATCH 01/10] Fix indenting --- ...-matrix-transpose-local-array_solution.cpp | 54 +++++++++++-------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp index d57dc5df53..ac6829f3f1 100644 --- a/exercises/launch-matrix-transpose-local-array_solution.cpp +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -182,9 +182,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), + [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), + [&] (RAJA::TypedRangeSegment const &col_tile) { RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; @@ -232,9 +234,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), + [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), + [&] (RAJA::TypedRangeSegment const &col_tile) { RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; @@ -285,12 +289,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::launch( RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), + RAJA::Threads(c_block_sz, r_block_sz)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), + [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), + [&] (RAJA::TypedRangeSegment const &col_tile) { RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; @@ -302,16 +308,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(col, row) = Tile_Array[ty][tx]; - }); - }); + }); + }); - }); - }); + }); + }); }); @@ -357,12 +363,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), + RAJA::Threads(c_block_sz, r_block_sz)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), + [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), + [&] (RAJA::TypedRangeSegment const &col_tile) { RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; @@ -375,15 +383,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - d_Atview(col, row) = Tile_Array[ty][tx]; + d_Atview(col, row) = Tile_Array[ty][tx]; - }); - }); + }); + }); - }); - }); + }); + }); }); From 8060a324b641f2548782d7382afac311a9a352f5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 21 Sep 2025 10:19:11 -0700 Subject: [PATCH 02/10] Use unchecked policies --- .../kernel-matrix-transpose-local-array_solution.cpp | 8 ++++---- .../launch-matrix-transpose-local-array_solution.cpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/exercises/kernel-matrix-transpose-local-array_solution.cpp b/exercises/kernel-matrix-transpose-local-array_solution.cpp index bf61bca57b..e252808cf7 100644 --- a/exercises/kernel-matrix-transpose-local-array_solution.cpp +++ b/exercises/kernel-matrix-transpose-local-array_solution.cpp @@ -399,8 +399,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_direct_unchecked, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_direct_unchecked, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. The entries in the RAJA::ParamList @@ -494,8 +494,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_direct_unchecked, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_direct_unchecked, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. The entries in the RAJA::ParamList diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp index ac6829f3f1..1a08cc1b91 100644 --- a/exercises/launch-matrix-transpose-local-array_solution.cpp +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -278,8 +278,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); - using cuda_teams_y = RAJA::LoopPolicy; - using cuda_teams_x = RAJA::LoopPolicy; + using cuda_teams_y = RAJA::LoopPolicy; + using cuda_teams_x = RAJA::LoopPolicy; using cuda_threads_y = RAJA::LoopPolicy; using cuda_threads_x = RAJA::LoopPolicy; @@ -352,8 +352,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); - using hip_teams_y = RAJA::LoopPolicy; - using hip_teams_x = RAJA::LoopPolicy; + using hip_teams_y = RAJA::LoopPolicy; + using hip_teams_x = RAJA::LoopPolicy; using hip_threads_y = RAJA::LoopPolicy; using hip_threads_x = RAJA::LoopPolicy; From d0c36cec613d6554cdbd7a80a294f8ea10268826 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 21 Sep 2025 10:19:37 -0700 Subject: [PATCH 03/10] Fix team synchronization --- .../kernel-matrix-transpose-local-array_solution.cpp | 10 ++-------- .../launch-matrix-transpose-local-array_solution.cpp | 4 ++++ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/exercises/kernel-matrix-transpose-local-array_solution.cpp b/exercises/kernel-matrix-transpose-local-array_solution.cpp index e252808cf7..f5bfe5f038 100644 --- a/exercises/kernel-matrix-transpose-local-array_solution.cpp +++ b/exercises/kernel-matrix-transpose-local-array_solution.cpp @@ -431,10 +431,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<1> > - >, - // Synchronize threads to ensure all reads - // from the local array are complete - RAJA::statement::CudaSyncThreads + > > > > @@ -526,10 +523,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<1> > - >, - // Synchronize threads to ensure all reads - // from the local array are complete - RAJA::statement::HipSyncThreads + > > > > diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp index 1a08cc1b91..4da5c5a47d 100644 --- a/exercises/launch-matrix-transpose-local-array_solution.cpp +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -308,6 +308,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); + ctx.teamSync(); + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { @@ -382,6 +384,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); + ctx.teamSync(); + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { From 1e1d347c1965d285f2aef597638d2d245f487386 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 21 Sep 2025 10:19:47 -0700 Subject: [PATCH 04/10] Fix device synchronization --- exercises/kernel-matrix-transpose-local-array_solution.cpp | 1 + exercises/launch-matrix-transpose-local-array_solution.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/exercises/kernel-matrix-transpose-local-array_solution.cpp b/exercises/kernel-matrix-transpose-local-array_solution.cpp index f5bfe5f038..5ae76aeb87 100644 --- a/exercises/kernel-matrix-transpose-local-array_solution.cpp +++ b/exercises/kernel-matrix-transpose-local-array_solution.cpp @@ -550,6 +550,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost); + CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp index 4da5c5a47d..787f26a2be 100644 --- a/exercises/launch-matrix-transpose-local-array_solution.cpp +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -400,6 +400,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost); + CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif From 410f2bb1687680727844c0072bc506646717c729 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 21 Sep 2025 10:20:28 -0700 Subject: [PATCH 05/10] Remap launch thread mapping --- .../launch-matrix-transpose-local-array_solution.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp index 787f26a2be..b2025b1774 100644 --- a/exercises/launch-matrix-transpose-local-array_solution.cpp +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -310,8 +310,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ctx.teamSync(); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { Atview(col, row) = Tile_Array[ty][tx]; @@ -386,8 +386,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ctx.teamSync(); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { d_Atview(col, row) = Tile_Array[ty][tx]; From e5d6004807afa94119c3e9b17d2b5a6337c20512 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Sun, 21 Sep 2025 14:31:55 -0700 Subject: [PATCH 06/10] clean up pass for launch --- .../launch-matrix-transpose-local-array.cpp | 45 ++++++++++--------- ...-matrix-transpose-local-array_solution.cpp | 42 ++++++++--------- 2 files changed, 45 insertions(+), 42 deletions(-) diff --git a/exercises/launch-matrix-transpose-local-array.cpp b/exercises/launch-matrix-transpose-local-array.cpp index dda01643e0..1ce8c3aa72 100644 --- a/exercises/launch-matrix-transpose-local-array.cpp +++ b/exercises/launch-matrix-transpose-local-array.cpp @@ -145,18 +145,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (2) Inner loops to write array data into output array tile // - // Note: loop order is swapped from above so that output matrix - // data access is stride-1. // for (int tx = 0; tx < TILE_DIM; ++tx) { for (int ty = 0; ty < TILE_DIM; ++ty) { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + // Tranpose tile offset + int col = by * TILE_DIM + tx; // Matrix column index + int row = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { - Atview(col, row) = Tile[ty][tx]; + if (row < N_c && col < N_r) { + Atview(row, col) = Tile[tx][ty]; } } } @@ -195,10 +194,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// input matrix into the RAJA_TEAM_SHARED memory array /// - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(row, col) = Tile_Array[tx][ty]; }); }); @@ -244,18 +243,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - Tile_Array[ty][tx] = Aview(row, col); + Tile_Array[ty][tx] = Aview(row, col); }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(row, col) = Tile_Array[tx][ty]; }); }); @@ -307,10 +306,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + ctx.teamSync(); - Atview(col, row) = Tile_Array[ty][tx]; + RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { + + Atview(row, col) = Tile_Array[tx][ty]; }); }); @@ -379,10 +380,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + ctx.teamSync(); + + RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { - d_Atview(col, row) = Tile_Array[ty][tx]; + d_Atview(row, col) = Tile_Array[tx][ty]; }); }); diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp index b2025b1774..4b02702431 100644 --- a/exercises/launch-matrix-transpose-local-array_solution.cpp +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -145,18 +145,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (2) Inner loops to write array data into output array tile // - // Note: loop order is swapped from above so that output matrix - // data access is stride-1. // - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + // Tranpose tile offset + int col = by * TILE_DIM + tx; // Matrix column index + int row = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { - Atview(col, row) = Tile[ty][tx]; + if (row < N_c && col < N_r) { + Atview(row, col) = Tile[tx][ty]; } } } @@ -182,8 +181,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), - [&] (RAJA::TypedRangeSegment const &row_tile) { + [&] (RAJA::TypedRangeSegment const &row_tile) { RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { @@ -198,10 +198,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(row, col) = Tile_Array[tx][ty]; }); }); @@ -250,10 +250,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(row, col) = Tile_Array[tx][ty]; }); }); @@ -310,10 +310,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ctx.teamSync(); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(row, col) = Tile_Array[tx][ty]; }); }); @@ -386,10 +386,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ctx.teamSync(); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { - d_Atview(col, row) = Tile_Array[ty][tx]; + d_Atview(row, col) = Tile_Array[tx][ty]; }); }); From 93b5a2a9b89fa211418f6455104ca23bb474ecd7 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 22 Sep 2025 12:59:01 -0700 Subject: [PATCH 07/10] use row/col_t --- .../launch-matrix-transpose-local-array.cpp | 32 +++++++++---------- ...-matrix-transpose-local-array_solution.cpp | 30 ++++++++--------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/exercises/launch-matrix-transpose-local-array.cpp b/exercises/launch-matrix-transpose-local-array.cpp index 1ce8c3aa72..2663f4356a 100644 --- a/exercises/launch-matrix-transpose-local-array.cpp +++ b/exercises/launch-matrix-transpose-local-array.cpp @@ -150,12 +150,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int ty = 0; ty < TILE_DIM; ++ty) { // Tranpose tile offset - int col = by * TILE_DIM + tx; // Matrix column index - int row = bx * TILE_DIM + ty; // Matrix row index + int col_t = by * TILE_DIM + tx; // Matrix column index + int row_t = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_c && col < N_r) { - Atview(row, col) = Tile[tx][ty]; + if (row_t < N_c && col_t < N_r) { + Atview(row_t, col_t) = Tile[tx][ty]; } } } @@ -194,10 +194,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// input matrix into the RAJA_TEAM_SHARED memory array /// - RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - Atview(row, col) = Tile_Array[tx][ty]; + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -251,10 +251,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - Atview(row, col) = Tile_Array[tx][ty]; + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -308,10 +308,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ctx.teamSync(); - RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - Atview(row, col) = Tile_Array[tx][ty]; + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -382,10 +382,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ctx.teamSync(); - RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - d_Atview(row, col) = Tile_Array[tx][ty]; + d_Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp index 4b02702431..474bab015c 100644 --- a/exercises/launch-matrix-transpose-local-array_solution.cpp +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -150,12 +150,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int tx = 0; tx < TILE_DIM; ++tx) { // Tranpose tile offset - int col = by * TILE_DIM + tx; // Matrix column index - int row = bx * TILE_DIM + ty; // Matrix row index + int col_t = by * TILE_DIM + tx; // Matrix column index + int row_t = bx * TILE_DIM + ty; // Matrix row index // Bounds check if (row < N_c && col < N_r) { - Atview(row, col) = Tile[tx][ty]; + Atview(row_t, col_t) = Tile[tx][ty]; } } } @@ -198,10 +198,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - Atview(row, col) = Tile_Array[tx][ty]; + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -250,10 +250,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - Atview(row, col) = Tile_Array[tx][ty]; + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -310,10 +310,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ctx.teamSync(); - RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - Atview(row, col) = Tile_Array[tx][ty]; + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -386,10 +386,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ctx.teamSync(); - RAJA::loop_icount(ctx, col_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, row_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - d_Atview(row, col) = Tile_Array[tx][ty]; + d_Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); From ec57de9e8f728d2887a84bd07bbd62d46ba90fdc Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 22 Sep 2025 13:40:27 -0700 Subject: [PATCH 08/10] missed updating variable names --- exercises/launch-matrix-transpose-local-array_solution.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp index 474bab015c..2fcf8d5770 100644 --- a/exercises/launch-matrix-transpose-local-array_solution.cpp +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -154,7 +154,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) int row_t = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_c && col < N_r) { + if (row_t < N_c && col_t < N_r) { Atview(row_t, col_t) = Tile[tx][ty]; } } From 4ed8196f25d9b41914d2c6d10f1c3194bdfab364 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 2 Oct 2025 11:27:11 -0700 Subject: [PATCH 09/10] fix tranpose c example --- exercises/kernel-matrix-transpose-local-array.cpp | 7 ++++--- .../kernel-matrix-transpose-local-array_solution.cpp | 9 +++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/exercises/kernel-matrix-transpose-local-array.cpp b/exercises/kernel-matrix-transpose-local-array.cpp index 793fa300a8..8ebc04de51 100644 --- a/exercises/kernel-matrix-transpose-local-array.cpp +++ b/exercises/kernel-matrix-transpose-local-array.cpp @@ -154,11 +154,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int tx = 0; tx < TILE_DIM; ++tx) { for (int ty = 0; ty < TILE_DIM; ++ty) { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + // Tranpose tile offset + int col_t = by * TILE_DIM + tx; // Matrix column index + int row_t = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row_t < N_c && col_t < N_r) { Atview(col, row) = Tile[ty][tx]; } } diff --git a/exercises/kernel-matrix-transpose-local-array_solution.cpp b/exercises/kernel-matrix-transpose-local-array_solution.cpp index 5ae76aeb87..bcc2a9d02c 100644 --- a/exercises/kernel-matrix-transpose-local-array_solution.cpp +++ b/exercises/kernel-matrix-transpose-local-array_solution.cpp @@ -154,12 +154,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int tx = 0; tx < TILE_DIM; ++tx) { for (int ty = 0; ty < TILE_DIM; ++ty) { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + // Tranpose tile offset + int col_t = by * TILE_DIM + tx; // Matrix column index + int row_t = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { - Atview(col, row) = Tile[ty][tx]; + if (row_t < N_c && col_t < N_r) { + Atview(row_t, col_t) = Tile[tx][ty]; } } } From 627f418e7a16c9941b80bd85688d48a287797e45 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 10 Dec 2025 13:47:42 -0800 Subject: [PATCH 10/10] Fix compilation errors --- exercises/kernel-matrix-transpose-local-array.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/exercises/kernel-matrix-transpose-local-array.cpp b/exercises/kernel-matrix-transpose-local-array.cpp index 8ebc04de51..22e25c1207 100644 --- a/exercises/kernel-matrix-transpose-local-array.cpp +++ b/exercises/kernel-matrix-transpose-local-array.cpp @@ -155,11 +155,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int ty = 0; ty < TILE_DIM; ++ty) { // Tranpose tile offset - int col_t = by * TILE_DIM + tx; // Matrix column index - int row_t = bx * TILE_DIM + ty; // Matrix row index + int col = by * TILE_DIM + tx; // Matrix column index + int row = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row_t < N_c && col_t < N_r) { + if (row < N_c && col < N_r) { Atview(col, row) = Tile[ty][tx]; } }