From b8bde0f2393a2d80dd0c9eb4a28453db3155155c Mon Sep 17 00:00:00 2001
From: Aurelien Bouteiller
Date: Wed, 12 Mar 2025 09:39:48 -0400
Subject: [PATCH 1/5] The GEMM NN GPU variant is missing the declaration of
 its datatype, which causes issue #138

---
 src/zgemm_wrapper.c   |  7 ++++++-
 tests/testing_zgemm.c | 10 +++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c
index 7052e7ca..098b8f69 100644
--- a/src/zgemm_wrapper.c
+++ b/src/zgemm_wrapper.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2024 The University of Tennessee and The University
+ * Copyright (c) 2010-2025 The University of Tennessee and The University
  *                         of Tennessee Research Foundation. All rights
  *                         reserved.
  * Copyright (c) 2013      Inria. All rights reserved.
@@ -390,6 +390,10 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB,
 #else
         tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED;
 #endif
+        dplasma_add2arena_tile( &tp->arenas_datatypes[PARSEC_zgemm_NN_gpu_DEFAULT_ADT_IDX],
+                                A->mb*A->nb*sizeof(dplasma_complex64_t),
+                                PARSEC_ARENA_ALIGNMENT_SSE,
+                                parsec_datatype_double_complex_t, A->mb );
 
         zgemm_tp = (parsec_taskpool_t *) tp;
         return zgemm_tp;
@@ -594,6 +598,7 @@ dplasma_zgemm_Destruct( parsec_taskpool_t *tp )
 #if defined(DPLASMA_HAVE_CUDA)
     } else if( zgemm_tp->_g_gemm_type == DPLASMA_ZGEMM_NN_GPU ) {
         parsec_zgemm_NN_gpu_taskpool_t *zgemm_gpu_tp = (parsec_zgemm_NN_gpu_taskpool_t *)tp;
+        dplasma_matrix_del2arena( &zgemm_gpu_tp->arenas_datatypes[PARSEC_zgemm_NN_gpu_DEFAULT_ADT_IDX] );
         free(zgemm_gpu_tp->_g_cuda_device_index);
 #endif /* DPLASMA_HAVE_CUDA */
     }
diff --git a/tests/testing_zgemm.c b/tests/testing_zgemm.c
index 75881b07..e14745f0 100644
--- a/tests/testing_zgemm.c
+++ b/tests/testing_zgemm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009-2024 The University of Tennessee and The University
+ * Copyright (c) 2009-2025 The University of Tennessee and The University
  *                         of Tennessee Research Foundation. All rights
  *                         reserved.
  *
@@ -77,13 +77,13 @@ int main(int argc, char ** argv)
             dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcC, Cseed);
     if(loud > 2) printf("Done\n");
 
-    /* Advice data on device */
+    /* Advise data on device */
 #if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
-    dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA,
+    dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA,
             (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
-    dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB,
+    dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB,
            (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
-    dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC,
+    dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC,
           (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
 #endif
 
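Note on PATCH 1/5: the other GEMM taskpools declare the datatype used to move tiles around when they are constructed; the NN GPU variant never did, which is the root cause of issue #138. The fix pairs a registration at construction with a release at destruction. A minimal sketch of that pairing, using only the calls that appear in the diff above (error handling omitted):

    /* Construction: register one mb x nb tile of complex doubles in the
     * taskpool's DEFAULT arena/datatype slot, so the runtime knows how to
     * (de)serialize tiles of A, B and C. */
    dplasma_add2arena_tile( &tp->arenas_datatypes[PARSEC_zgemm_NN_gpu_DEFAULT_ADT_IDX],
                            A->mb * A->nb * sizeof(dplasma_complex64_t),
                            PARSEC_ARENA_ALIGNMENT_SSE,
                            parsec_datatype_double_complex_t, A->mb );

    /* Destruction: release the matching datatype, as the other variants do. */
    dplasma_matrix_del2arena( &zgemm_gpu_tp->arenas_datatypes[PARSEC_zgemm_NN_gpu_DEFAULT_ADT_IDX] );
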
From a840839e43960a5b751587ed0e747af6321737e3 Mon Sep 17 00:00:00 2001
From: Aurelien Bouteiller
Date: Wed, 12 Mar 2025 20:42:07 -0400
Subject: [PATCH 2/5] Use ADTT_READ/DC in zgemm_NN_gpu; enable HIP advise

Co-authored-by: George Bosilca
Signed-off-by: Aurelien Bouteiller
---
 src/zgemm_NN_gpu.jdf |  76 ++++++++++++++++++++--------
 src/zgemm_wrapper.c  | 116 +++++++++++++++++++++++--------------------
 2 files changed, 117 insertions(+), 75 deletions(-)
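Note on PATCH 2/5: this patch replaces the single arena datatype introduced in PATCH 1/5 with the dplasma_data_collection_t / ADTT machinery that the SUMMA variants already use: each matrix gets a shape index (A_SHAPE, B_SHAPE, C_SHAPE), and the datatype of every dependency is looked up per tile location instead of being one taskpool-wide constant. The ADTT_READ macro defined in the JDF below is the read-side policy knob; a sketch of its behavior, derived from its definition:

    /* Unless FULL_CONVERSION is defined, ADTT_READ overrides the requested
     * layout with LAPACK, so tiles are read in their storage layout and any
     * conversion is deferred:
     *
     *   ADTT_READ(ddescA, loc_A, A_SHAPE, TILED)
     *     => ADTT_DC(ddescA, loc_A, A_SHAPE, LAPACK)
     */
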
diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf
index 94e60312..1577c2c3 100644
--- a/src/zgemm_NN_gpu.jdf
+++ b/src/zgemm_NN_gpu.jdf
@@ -1,6 +1,6 @@
 extern "C" %{
 /*
- * Copyright (c) 2017-2024 The University of Tennessee and The University
+ * Copyright (c) 2017-2025 The University of Tennessee and The University
  *                         of Tennessee Research Foundation. All rights
  *                         reserved.
  *
@@ -113,6 +113,23 @@ static int pred_z(int x, int y, int z, int xMax, int yMax, int zMax, int l)
     return z;
 }
 
+/* Define the different shapes this JDF is using */
+#define A_SHAPE 0
+#define B_SHAPE 1
+#define C_SHAPE 2
+
+/* Assume the functions on type & type_remote will return parsec_arena_datatype_t */
+#define JDF2C_TYPE_ADT_NOT_INDEX
+
+/* Include the functions to obtain the parsec_arena_datatype_t */
+#include "dplasmajdf_lapack_dtt.h"
+//#define FULL_CONVERSION
+#ifdef FULL_CONVERSION
+#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, layout)
+#else
+#define ADTT_READ(dM, loc, shape, layout) ADTT_DC(dM, loc, shape, LAPACK)
+#endif
+
 %}
 
 /* Keep this first, as in all jdf in this directory, to
@@ -126,9 +143,12 @@ transB [ type = int ]
 alpha  [ type = dplasma_complex64_t ]
 beta   [ type = dplasma_complex64_t ]
 
-descA  [ type = "const parsec_tiled_matrix_t*" ]
-descB  [ type = "const parsec_tiled_matrix_t*" ]
-descC  [ type = "parsec_tiled_matrix_t*" ]
+ddescA [type = "dplasma_data_collection_t*"]
+descA  [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescA)->dc_original" aligned=ddescA]
+ddescB [type = "dplasma_data_collection_t*"]
+descB  [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB]
+ddescC [type = "dplasma_data_collection_t*"]
+descC  [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescC)->dc_original" aligned=ddescC]
 
 /*
  * The process grid is tP x tQ
@@ -142,8 +162,8 @@ tQ [ type = int ]
 
 LOOK_AHEAD [ type = int ]
 
-nb_cuda_devices   [ type = "int" ]
-cuda_device_index [ type = "int *" ]
+nb_gpu_devices   [ type = "int" ]
+gpu_device_index [ type = "int *" ]
 
 xMax [ type = int default = "-1" hidden=on ]
 yMax [ type = int default = "-1" hidden=on ]
@@ -163,12 +183,14 @@ READ_A(m, k, x, y, z)
   z = k / tD .. k / tD
 
   nmax = %{ int n1 = (y+1)*tC*tQ-1; int n2 = descC->nt - 1;
-            return n1 < n2 ? n1 : n2; %}
+            return n1 < n2 ? n1 : n2; %}
+  loc_A = %{ return LOC(descA, m, k); %}
 
 : descA(m, k)
 
-READ A <- descA(m, k)
+READ A <- descA(m, k)    [ type = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, TILED); %}
+                           type_data = %{ return ADTT_READ(ddescA, loc_A, A_SHAPE, LAPACK); %} ]
        -> A GEMM(m, y*tC*tQ .. nmax, k)
 
 CTL Y <- Y GLOBAL_BARRIER(x, y, z)
@@ -191,12 +213,14 @@ READ_B(k, n, x, y, z)
   z = k / tD .. k / tD
 
  mmax = %{ int m1 = (x+1)*tB*tP-1; int m2 = descC->mt - 1;
-            return m1 < m2 ? m1 : m2; %}
+            return m1 < m2 ? m1 : m2; %}
+  loc_B = %{ return LOC(descB, k, n); %}
 
 : descB(k, n)
 
-READ B <- descB(k, n)
+READ B <- descB(k, n)    [ type = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, TILED); %}
+                           type_data = %{ return ADTT_READ(ddescB, loc_B, B_SHAPE, LAPACK); %} ]
       -> B GEMM(x*tB*tP .. mmax, n, k)
 
 CTL Y <- Y GLOBAL_BARRIER(x, y, z)
@@ -216,19 +240,22 @@ READ_C(m, n)
   u = r / tQ
   v = r % tQ
+  loc_C = %{ return LOC(descC, m, n); %}
 
 : descC(m, n)
 
-READ C <- descC(m, n)
+READ C <- descC(m, n)    [ type = %{ return ADTT_READ(ddescC, loc_C, C_SHAPE, TILED); %}
+                           type_data = %{ return ADTT_READ(ddescC, loc_C, C_SHAPE, LAPACK); %} ]
       -> C GEMM(m, n, 0)
 
 CTL Z <- Z LOCAL_BARRIER( m/(tB*tP), n/(tC*tQ), 0, u, v )
 
 BODY
-    if( nb_cuda_devices > 0 ) {
-        int g = (n / tQ) % nb_cuda_devices;
+    if( nb_gpu_devices > 0 ) {
+        int g = (n / tQ) % nb_gpu_devices;
         if( _f_C->original->preferred_device <= 0 ) {
             parsec_advise_data_on_device( _f_C->original,
-                                          cuda_device_index[g],
+                                          gpu_device_index[g],
                                           PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
         }
     }
@@ -352,14 +379,19 @@ GEMM(m, n, k)
   yn = %{ return succ_y(x, y, z, xMax, yMax, zMax, 1); %}
   zn = %{ return succ_z(x, y, z, xMax, yMax, zMax, 1); %}
 
+  loc_A = %{ return LOC(descA, m, k); %}
+  loc_B = %{ return LOC(descB, k, n); %}
+  loc_C = %{ return LOC(descC, m, n); %}
+
 : descC(m, n)
 
-READ A <- A READ_A(m, k, x, y, z)
-READ B <- B READ_B(k, n, x, y, z)
-RW   C <- k == 0 ? C READ_C(m, n)
-       :           C GEMM(m, n, k-1 )
-       -> k + 1 == descB->mt ? descC(m, n)
-       :                       C GEMM(m, n, k+1)
+READ A <- A READ_A(m, k, x, y, z)  [ type_remote = %{ return ADTT_DC(ddescA, loc_A, A_SHAPE, TILED); %} ]
+READ B <- B READ_B(k, n, x, y, z)  [ type_remote = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, TILED); %} ]
+RW   C <- k == 0 ? C READ_C(m, n)      [ type_remote = %{ return ADTT_DC(ddescC, loc_C, C_SHAPE, TILED); %} ]
+       <- k != 0 ? C GEMM(m, n, k-1 )  [ type_remote = %{ return ADTT_DC(ddescC, loc_C, C_SHAPE, TILED); %} ]
+       -> k + 1 == descB->mt ? descC(m, n)  [ type = %{ return ADTT_CP(_f_C, ddescC, loc_C, C_SHAPE); %}
+                                              type_data = %{ return ADTT_DC(ddescC, loc_C, C_SHAPE, LAPACK); %} ]
+       -> k + 1 != descB->mt ? C GEMM(m, n, k+1)  /* dep OUT: rely on datacopy dtt for sending */
 CTL Z <- ( k > 0 ) & ((k % tD) == 0) ? Z LOCAL_BARRIER(x, y, z, u, v)
      -> ((k == descB->mt-1) | (k == (z+1)*tD-1)) ? Z LOCAL_BARRIER(xn, yn, zn, u, v)
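Note: the renaming of nb_cuda_devices/cuda_device_index to nb_gpu_devices/gpu_device_index above is what "enable HIP advise" in the subject refers to: the device hint in READ_C is no longer CUDA-specific. The hint logic itself is unchanged; excerpted from the READ_C body with comments added:

    if( nb_gpu_devices > 0 ) {
        /* Distribute the column blocks of C round-robin over the GPUs... */
        int g = (n / tQ) % nb_gpu_devices;
        /* ...but only if the tile does not already have a preferred device. */
        if( _f_C->original->preferred_device <= 0 ) {
            parsec_advise_data_on_device( _f_C->original,
                                          gpu_device_index[g],
                                          PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
        }
    }
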
diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c
index 098b8f69..6d3b4dfe 100644
--- a/src/zgemm_wrapper.c
+++ b/src/zgemm_wrapper.c
@@ -135,17 +135,14 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB,
                             parsec_datatype_double_complex_t,
                             PARSEC_MATRIX_FULL/*uplo*/, 1/*diag:for PARSEC_MATRIX_UPPER or PARSEC_MATRIX_LOWER types*/,
                             &shape);
-
     dplasma_setup_adtt_all_loc( ddc_B,
                             parsec_datatype_double_complex_t,
                             PARSEC_MATRIX_FULL/*uplo*/, 1/*diag:for PARSEC_MATRIX_UPPER or PARSEC_MATRIX_LOWER types*/,
                             &shape);
-
     dplasma_setup_adtt_all_loc( ddc_C,
                             parsec_datatype_double_complex_t,
                             PARSEC_MATRIX_FULL/*uplo*/, 1/*diag:for PARSEC_MATRIX_UPPER or PARSEC_MATRIX_LOWER types*/,
                             &shape);
-
     assert(shape == MAX_SHAPES);
 
     (void)opt; //No user-defined options for this algorithm
@@ -196,17 +193,14 @@ dplasma_zgemm_default_new(dplasma_enum_t transA, dplasma_enum_t transB,
                             parsec_datatype_double_complex_t,
                             PARSEC_MATRIX_FULL/*uplo*/, 1/*diag:for PARSEC_MATRIX_UPPER or PARSEC_MATRIX_LOWER types*/,
                             &shape);
-
     dplasma_setup_adtt_all_loc( ddc_B,
                             parsec_datatype_double_complex_t,
                             PARSEC_MATRIX_FULL/*uplo*/, 1/*diag:for PARSEC_MATRIX_UPPER or PARSEC_MATRIX_LOWER types*/,
                             &shape);
-
     dplasma_setup_adtt_all_loc( ddc_C,
                             parsec_datatype_double_complex_t,
                             PARSEC_MATRIX_FULL/*uplo*/, 1/*diag:for PARSEC_MATRIX_UPPER or PARSEC_MATRIX_LOWER types*/,
                             &shape);
-
     assert(shape == MAX_SHAPES);
 
     (void)opt; //No user-defined options for this algorithm
@@ -362,9 +356,13 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB,
     assert( c*q <= C->nt );
 
     {
+        dplasma_data_collection_t * ddc_A = dplasma_wrap_data_collection((parsec_tiled_matrix_t*)A);
+        dplasma_data_collection_t * ddc_B = dplasma_wrap_data_collection((parsec_tiled_matrix_t*)B);
+        dplasma_data_collection_t * ddc_C = dplasma_wrap_data_collection(C);
+
         parsec_zgemm_NN_gpu_taskpool_t *tp;
         tp = parsec_zgemm_NN_gpu_new(transA, transB, alpha, beta,
-                                     A, B, C, b, c, d, p, q, look_ahead,
+                                     ddc_A, ddc_B, ddc_C, b, c, d, p, q, look_ahead,
                                      nbgpu, dev_index);
 
         u = C->super.myrank / q;
@@ -390,10 +388,21 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB,
 #else
         tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED;
 #endif
-        dplasma_add2arena_tile( &tp->arenas_datatypes[PARSEC_zgemm_NN_gpu_DEFAULT_ADT_IDX],
-                                A->mb*A->nb*sizeof(dplasma_complex64_t),
-                                PARSEC_ARENA_ALIGNMENT_SSE,
-                                parsec_datatype_double_complex_t, A->mb );
+
+        int shape = 0;
+        dplasma_setup_adtt_all_loc( ddc_A,
+                            parsec_datatype_double_complex_t,
+                            PARSEC_MATRIX_FULL/*uplo*/, 1/*diag:for PARSEC_MATRIX_UPPER or PARSEC_MATRIX_LOWER types*/,
+                            &shape);
+        dplasma_setup_adtt_all_loc( ddc_B,
+                            parsec_datatype_double_complex_t,
+                            PARSEC_MATRIX_FULL/*uplo*/, 1/*diag:for PARSEC_MATRIX_UPPER or PARSEC_MATRIX_LOWER types*/,
+                            &shape);
+        dplasma_setup_adtt_all_loc( ddc_C,
+                            parsec_datatype_double_complex_t,
+                            PARSEC_MATRIX_FULL/*uplo*/, 1/*diag:for PARSEC_MATRIX_UPPER or PARSEC_MATRIX_LOWER types*/,
+                            &shape);
+        assert(shape == MAX_SHAPES);
 
         zgemm_tp = (parsec_taskpool_t *) tp;
         return zgemm_tp;
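Note: with the hunk above, the GPU variant adopts the same datatype lifecycle as the SUMMA and default paths earlier in this file. A condensed sketch (names from the diff; the assumption here is that each dplasma_setup_adtt_all_loc() call advances shape by one, which is what makes the final assert meaningful):

    dplasma_data_collection_t *ddc_A = dplasma_wrap_data_collection((parsec_tiled_matrix_t*)A);
    /* ddc_B and ddc_C are wrapped the same way */
    int shape = 0;
    dplasma_setup_adtt_all_loc( ddc_A, parsec_datatype_double_complex_t,
                                PARSEC_MATRIX_FULL, 1, &shape );  /* A_SHAPE */
    dplasma_setup_adtt_all_loc( ddc_B, parsec_datatype_double_complex_t,
                                PARSEC_MATRIX_FULL, 1, &shape );  /* B_SHAPE */
    dplasma_setup_adtt_all_loc( ddc_C, parsec_datatype_double_complex_t,
                                PARSEC_MATRIX_FULL, 1, &shape );  /* C_SHAPE */
    assert(shape == MAX_SHAPES);  /* every shape declared in the JDF is registered */
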
@@ -491,19 +500,19 @@ dplasma_zgemm_New_ex( dplasma_enum_t transA, dplasma_enum_t transB,
     if ( C->dtype & parsec_matrix_block_cyclic_type ) {
 #if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
         int nb_gpu_devices = 0, devid;
-        int p = ((parsec_matrix_block_cyclic_t*)C)->grid.rows;
-        int q = ((parsec_matrix_block_cyclic_t*)C)->grid.cols;
-        int64_t gpu_mem_block_size = 0;
-        int64_t gpu_mem_nb_blocks = -1;
+        int p = ((parsec_matrix_block_cyclic_t*)C)->grid.rows;
+        int q = ((parsec_matrix_block_cyclic_t*)C)->grid.cols;
+        int64_t gpu_mem_block_size = 0;
+        int64_t gpu_mem_nb_blocks = -1;
         for(devid = 0; devid < (int)parsec_nb_devices; devid++) {
             parsec_device_module_t *device = parsec_mca_device_get(devid);
             if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) {
-                parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t*)device;
+                parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t*)device;
                 nb_gpu_devices++;
-                if( 0 == gpu_mem_block_size )
-                    gpu_mem_block_size = gpu_device->mem_block_size;
-                if( -1 == gpu_mem_nb_blocks || gpu_device->mem_nb_blocks < gpu_mem_nb_blocks )
-                    gpu_mem_nb_blocks = gpu_device->mem_nb_blocks;
+                if( 0 == gpu_mem_block_size )
+                    gpu_mem_block_size = gpu_device->mem_block_size;
+                if( -1 == gpu_mem_nb_blocks || gpu_device->mem_nb_blocks < gpu_mem_nb_blocks )
+                    gpu_mem_nb_blocks = gpu_device->mem_nb_blocks;
             }
         }
         if(0 < nb_gpu_devices) {
@@ -567,51 +576,52 @@ dplasma_zgemm_Destruct( parsec_taskpool_t *tp )
     parsec_zgemm_NN_taskpool_t *zgemm_tp = (parsec_zgemm_NN_taskpool_t *)tp;
     dplasma_data_collection_t *ddc_A = NULL, *ddc_B = NULL, *ddc_C = NULL;
 
-    if( zgemm_tp->_g_gemm_type == DPLASMA_ZGEMM_NN_SUMMA ||
-        zgemm_tp->_g_gemm_type == DPLASMA_ZGEMM_NT_SUMMA ||
-        zgemm_tp->_g_gemm_type == DPLASMA_ZGEMM_TN_SUMMA ||
-        zgemm_tp->_g_gemm_type == DPLASMA_ZGEMM_TT_SUMMA) {
+    switch( zgemm_tp->_g_gemm_type ) {
+    case DPLASMA_ZGEMM_NN:
+    case DPLASMA_ZGEMM_NT:
+    case DPLASMA_ZGEMM_TN:
+    case DPLASMA_ZGEMM_TT:
+        ddc_A = zgemm_tp->_g_ddescA;
+        ddc_B = zgemm_tp->_g_ddescB;
+        ddc_C = zgemm_tp->_g_ddescC;
+        break;
+    case DPLASMA_ZGEMM_NN_SUMMA:
+    case DPLASMA_ZGEMM_NT_SUMMA:
+    case DPLASMA_ZGEMM_TN_SUMMA:
+    case DPLASMA_ZGEMM_TT_SUMMA: {
         parsec_zgemm_NN_summa_taskpool_t *zgemm_summa_tp = (parsec_zgemm_NN_summa_taskpool_t *)tp;
+        ddc_A = zgemm_summa_tp->_g_ddescA;
+        ddc_B = zgemm_summa_tp->_g_ddescB;
+        ddc_C = zgemm_summa_tp->_g_ddescC;
         parsec_tiled_matrix_t* Cdist = (parsec_tiled_matrix_t*)zgemm_summa_tp->_g_Cdist;
         if ( NULL != Cdist ) {
             parsec_tiled_matrix_destroy( Cdist );
             free( Cdist );
         }
-        dplasma_clean_adtt_all_loc(zgemm_summa_tp->_g_ddescA, MAX_SHAPES);
-        dplasma_clean_adtt_all_loc(zgemm_summa_tp->_g_ddescB, MAX_SHAPES);
-        dplasma_clean_adtt_all_loc(zgemm_summa_tp->_g_ddescC, MAX_SHAPES);
-
-        ddc_A = zgemm_summa_tp->_g_ddescA;
-        ddc_B = zgemm_summa_tp->_g_ddescB;
-        ddc_C = zgemm_summa_tp->_g_ddescC;
-    } else if( zgemm_tp->_g_gemm_type == DPLASMA_ZGEMM_NN ||
-               zgemm_tp->_g_gemm_type == DPLASMA_ZGEMM_NT ||
-               zgemm_tp->_g_gemm_type == DPLASMA_ZGEMM_TN ||
-               zgemm_tp->_g_gemm_type == DPLASMA_ZGEMM_TT) {
-        dplasma_clean_adtt_all_loc(zgemm_tp->_g_ddescA, MAX_SHAPES);
-        dplasma_clean_adtt_all_loc(zgemm_tp->_g_ddescB, MAX_SHAPES);
-        dplasma_clean_adtt_all_loc(zgemm_tp->_g_ddescC, MAX_SHAPES);
-
-        ddc_A = zgemm_tp->_g_ddescA;
-        ddc_B = zgemm_tp->_g_ddescB;
-        ddc_C = zgemm_tp->_g_ddescC;
-#if defined(DPLASMA_HAVE_CUDA)
-    } else if( zgemm_tp->_g_gemm_type == DPLASMA_ZGEMM_NN_GPU ) {
+        break; }
+#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
+    case DPLASMA_ZGEMM_NN_GPU: {
         parsec_zgemm_NN_gpu_taskpool_t *zgemm_gpu_tp = (parsec_zgemm_NN_gpu_taskpool_t *)tp;
-        dplasma_matrix_del2arena( &zgemm_gpu_tp->arenas_datatypes[PARSEC_zgemm_NN_gpu_DEFAULT_ADT_IDX] );
-        free(zgemm_gpu_tp->_g_cuda_device_index);
-#endif /* DPLASMA_HAVE_CUDA */
+        ddc_A = zgemm_gpu_tp->_g_ddescA;
+        ddc_B = zgemm_gpu_tp->_g_ddescB;
+        ddc_C = zgemm_gpu_tp->_g_ddescC;
+        free(zgemm_gpu_tp->_g_gpu_device_index);
+        break; }
+#endif /* DPLASMA_HAVE_CUDA || defined(DPLASMA_HAVE_HIP) */
+    default:
+        parsec_warning("Invalid GEMM taskpool type during destruct!");
     }
 
+    dplasma_clean_adtt_all_loc(ddc_A, MAX_SHAPES);
+    dplasma_clean_adtt_all_loc(ddc_B, MAX_SHAPES);
+    dplasma_clean_adtt_all_loc(ddc_C, MAX_SHAPES);
+
     parsec_taskpool_free(tp);
 
     /* free the dplasma_data_collection_t, after the tp stops referring to them */
-    if(NULL != ddc_A)
-        dplasma_unwrap_data_collection(ddc_A);
-    if(NULL != ddc_B)
-        dplasma_unwrap_data_collection(ddc_B);
-    if(NULL != ddc_C)
-        dplasma_unwrap_data_collection(ddc_C);
+    dplasma_unwrap_data_collection(ddc_A);
+    dplasma_unwrap_data_collection(ddc_B);
+    dplasma_unwrap_data_collection(ddc_C);
 }
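Note: the rewritten destructor collapses three divergent branches into one switch whose only job is to recover the three data collections, so the cleanup sequence can be shared. The ordering is the important part (sketch of the sequence from the code above, shown for one collection):

    dplasma_clean_adtt_all_loc(ddc_A, MAX_SHAPES); /* 1. release the registered datatypes */
    parsec_taskpool_free(tp);                      /* 2. release the taskpool             */
    dplasma_unwrap_data_collection(ddc_A);         /* 3. free the wrapper only after the
                                                         taskpool stops referring to it   */

One caveat: the default branch only warns, so an unrecognized taskpool type would now reach these calls with NULL collections, where the previous code guarded each unwrap with a NULL check.
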
From 8449fccd10cd6d184c41e99e9da0d05609bc3f51 Mon Sep 17 00:00:00 2001
From: Aurelien Bouteiller
Date: Wed, 12 Mar 2025 21:23:57 -0400
Subject: [PATCH 3/5] debug-verbose 3 tells what gemm variant is used

Signed-off-by: Aurelien Bouteiller
---
 src/zgemm_wrapper.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c
index 6d3b4dfe..dd85d8f1 100644
--- a/src/zgemm_wrapper.c
+++ b/src/zgemm_wrapper.c
@@ -73,7 +73,7 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB,
     if( dplasmaNoTrans == transA ) {
         if( dplasmaNoTrans == transB ) {
-            PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_NN_summa\n");
+            PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_NN_summa");
             parsec_zgemm_NN_summa_taskpool_t* tp;
             tp = parsec_zgemm_NN_summa_new(transA, transB, alpha, beta,
                                            ddc_A, ddc_B, ddc_C,
                                            (parsec_data_collection_t*)Cdist);
@@ -86,7 +86,7 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB,
 #endif
             zgemm_tp = (parsec_taskpool_t*)tp;
         } else {
-            PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_NT_summa\n");
+            PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_NT_summa");
             parsec_zgemm_NT_summa_taskpool_t* tp;
             tp = parsec_zgemm_NT_summa_new(transA, transB, alpha, beta,
                                            ddc_A, ddc_B, ddc_C,
                                            (parsec_data_collection_t*)Cdist);
@@ -101,7 +101,7 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB,
         }
     } else {
         if( dplasmaNoTrans == transB ) {
-            PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_TN_summa\n");
+            PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_TN_summa");
             parsec_zgemm_TN_summa_taskpool_t* tp;
             tp = parsec_zgemm_TN_summa_new(transA, transB, alpha, beta,
                                            ddc_A, ddc_B, ddc_C,
                                            (parsec_data_collection_t*)Cdist);
@@ -114,7 +114,7 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB,
 #endif
             zgemm_tp = (parsec_taskpool_t*)tp;
         } else {
-            PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_TT_summa\n");
+            PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_TT_summa");
             parsec_zgemm_TT_summa_taskpool_t* tp;
             tp = parsec_zgemm_TT_summa_new(transA, transB, alpha, beta,
                                            ddc_A, ddc_B, ddc_C,
@@ -163,11 +163,13 @@ dplasma_zgemm_default_new(dplasma_enum_t transA, dplasma_enum_t transB,
 
     if( dplasmaNoTrans == transA ) {
         if( dplasmaNoTrans == transB ) {
+            PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_NN");
             parsec_zgemm_NN_taskpool_t* tp;
             tp = parsec_zgemm_NN_new(transA, transB, alpha, beta,
                                      ddc_A, ddc_B, ddc_C);
             zgemm_tp = (parsec_taskpool_t*)tp;
         } else {
+            PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_NT");
             parsec_zgemm_NT_taskpool_t* tp;
             tp = parsec_zgemm_NT_new(transA, transB, alpha, beta,
                                      ddc_A, ddc_B, ddc_C);
@@ -176,11 +178,13 @@ dplasma_zgemm_default_new(dplasma_enum_t transA, dplasma_enum_t transB,
     } else {
         if( dplasmaNoTrans == transB ) {
             parsec_zgemm_TN_taskpool_t* tp;
+            PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_TN");
             tp = parsec_zgemm_TN_new(transA, transB, alpha, beta,
                                      ddc_A, ddc_B, ddc_C);
             zgemm_tp = (parsec_taskpool_t*)tp;
         } else {
+            PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_TT");
             parsec_zgemm_TT_taskpool_t* tp;
             tp = parsec_zgemm_TT_new(transA, transB, alpha, beta,
                                      ddc_A, ddc_B, ddc_C);
@@ -360,6 +364,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB,
         dplasma_data_collection_t * ddc_B = dplasma_wrap_data_collection((parsec_tiled_matrix_t*)B);
         dplasma_data_collection_t * ddc_C = dplasma_wrap_data_collection(C);
 
+        PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_NN_gpu");
         parsec_zgemm_NN_gpu_taskpool_t *tp;
         tp = parsec_zgemm_NN_gpu_new(transA, transB, alpha, beta,
                                      ddc_A, ddc_B, ddc_C, b, c, d, p, q, look_ahead,
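Note: two small details in this patch: the pre-existing SUMMA messages lose their trailing "\n", presumably because PARSEC_DEBUG_VERBOSE terminates the line itself, and the four default variants plus zgemm_NN_gpu gain the same one-line trace, so with the PaRSEC debug verbosity at 3 or higher every selection path reports which specialization was picked:

    /* Emitted only when the runtime's debug verbosity is >= 3. */
    PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_NN_gpu");
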
"zgemm_TT_summa\n"); + PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_TT_summa"); parsec_zgemm_TT_summa_taskpool_t* tp; tp = parsec_zgemm_TT_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, @@ -163,11 +163,13 @@ dplasma_zgemm_default_new(dplasma_enum_t transA, dplasma_enum_t transB, if( dplasmaNoTrans == transA ) { if( dplasmaNoTrans == transB ) { + PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_NN"); parsec_zgemm_NN_taskpool_t* tp; tp = parsec_zgemm_NN_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C); zgemm_tp = (parsec_taskpool_t*)tp; } else { + PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_NT"); parsec_zgemm_NT_taskpool_t* tp; tp = parsec_zgemm_NT_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C); @@ -176,11 +178,13 @@ dplasma_zgemm_default_new(dplasma_enum_t transA, dplasma_enum_t transB, } else { if( dplasmaNoTrans == transB ) { parsec_zgemm_TN_taskpool_t* tp; + PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_TN"); tp = parsec_zgemm_TN_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C); zgemm_tp = (parsec_taskpool_t*)tp; } else { + PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_TT"); parsec_zgemm_TT_taskpool_t* tp; tp = parsec_zgemm_TT_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C); @@ -360,6 +364,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, dplasma_data_collection_t * ddc_B = dplasma_wrap_data_collection((parsec_tiled_matrix_t*)B); dplasma_data_collection_t * ddc_C = dplasma_wrap_data_collection(C); + PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_NN_gpu"); parsec_zgemm_NN_gpu_taskpool_t *tp; tp = parsec_zgemm_NN_gpu_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, b, c, d, p, q, look_ahead, From 272114ec5ab0691566dba9262df3a8d52c7796e1 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Wed, 12 Mar 2025 21:56:49 -0400 Subject: [PATCH 4/5] Proper error checking when zgemm_NN_gpu cannot create a taskpool Signed-off-by: Aurelien Bouteiller --- src/zgemm_wrapper.c | 5 +++++ tests/common.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c index dd85d8f1..9534375c 100644 --- a/src/zgemm_wrapper.c +++ b/src/zgemm_wrapper.c @@ -284,6 +284,11 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, nb_block_per_tile = (tile_size + gpu_mem_block_size -1 ) / gpu_mem_block_size; gpu_mem_nb_blocks = vd * gpu_mem_nb_blocks; nb_tile_per_gpu = gpu_mem_nb_blocks / nb_block_per_tile; + if(0 == nb_tile_per_gpu) { + dplasma_error("dplasma_zgemm_gpu_new", + "Not enough memory on the GPU to store a single tile!"); + goto cleanup; + } mt = A->mt; nt = B->nt; diff --git a/tests/common.h b/tests/common.h index 86c25849..5fc4befb 100644 --- a/tests/common.h +++ b/tests/common.h @@ -195,6 +195,7 @@ static inline int min(int a, int b) { return a < b ? a : b; } #define PASTE_CODE_ENQUEUE_KERNEL(PARSEC, KERNEL, PARAMS) \ SYNC_TIME_START(); \ parsec_taskpool_t* PARSEC_##KERNEL = dplasma_##KERNEL##_New PARAMS; \ + PARSEC_CHECK_ERROR(NULL == PARSEC_##KERNEL? PARSEC_ERROR: PARSEC_SUCCESS, "dplasma_"#KERNEL "_New"); \ PARSEC_CHECK_ERROR(parsec_context_add_taskpool(PARSEC, PARSEC_##KERNEL), "parsec_context_add_taskpool"); \ if( loud > 2 ) SYNC_TIME_PRINT(rank, ( #KERNEL "\tDAG created\n")); @@ -259,6 +260,7 @@ static inline int min(int a, int b) { return a < b ? 
From aee1c458d1a5af6c1062dd282f930bd7533db8fe Mon Sep 17 00:00:00 2001
From: Aurelien Bouteiller
Date: Mon, 24 Mar 2025 18:08:07 -0400
Subject: [PATCH 5/5] Use the ADTT_CP between READ_C and GEMM since they are
 :descC

---
 src/zgemm_NN_gpu.jdf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf
index 1577c2c3..a0713a25 100644
--- a/src/zgemm_NN_gpu.jdf
+++ b/src/zgemm_NN_gpu.jdf
@@ -387,8 +387,8 @@ GEMM(m, n, k)
 
 READ A <- A READ_A(m, k, x, y, z)  [ type_remote = %{ return ADTT_DC(ddescA, loc_A, A_SHAPE, TILED); %} ]
 READ B <- B READ_B(k, n, x, y, z)  [ type_remote = %{ return ADTT_DC(ddescB, loc_B, B_SHAPE, TILED); %} ]
-RW   C <- k == 0 ? C READ_C(m, n)      [ type_remote = %{ return ADTT_DC(ddescC, loc_C, C_SHAPE, TILED); %} ]
-       <- k != 0 ? C GEMM(m, n, k-1 )  [ type_remote = %{ return ADTT_DC(ddescC, loc_C, C_SHAPE, TILED); %} ]
+RW   C <- k == 0 ? C READ_C(m, n)
+       <- k != 0 ? C GEMM(m, n, k-1 )
        -> k + 1 == descB->mt ? descC(m, n)  [ type = %{ return ADTT_CP(_f_C, ddescC, loc_C, C_SHAPE); %}
                                              type_data = %{ return ADTT_DC(ddescC, loc_C, C_SHAPE, LAPACK); %} ]
        -> k + 1 != descB->mt ? C GEMM(m, n, k+1)  /* dep OUT: rely on datacopy dtt for sending */
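Note on PATCH 5/5: this is a simplification of PATCH 2/5 rather than a fix. READ_C(m, n) and every GEMM(m, n, k) share the same affinity, ": descC(m, n)", so the C flow between them never leaves the rank that owns C(m, n), and the type_remote annotations added earlier were dead weight on those two inputs; only the final write-back still converts. The resulting dependency, excerpted from the hunk above with comments added:

    RW  C <- k == 0 ? C READ_C(m, n)            /* same rank: no type_remote needed */
          <- k != 0 ? C GEMM(m, n, k-1 )        /* same rank, same reason           */
          -> k + 1 == descB->mt ? descC(m, n)   /* write-back: converts through the
                                                   ADTT_CP / ADTT_DC types          */
          -> k + 1 != descB->mt ? C GEMM(m, n, k+1)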