From eca1c11bed81e22de5cc6f99934dfdf70a78caf5 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 10 Apr 2025 11:42:35 +0800 Subject: [PATCH 1/8] add cusolver support for kpar --- source/module_hsolver/diago_cusolver.cpp | 18 +++++++++ source/module_hsolver/diago_cusolver.h | 7 ++++ source/module_hsolver/hsolver_lcao.cpp | 45 +++++++++++++++++---- source/module_hsolver/parallel_k2d.cpp | 51 +++++++++++++++++++++--- source/module_hsolver/parallel_k2d.h | 34 ++++++++++------ 5 files changed, 131 insertions(+), 24 deletions(-) diff --git a/source/module_hsolver/diago_cusolver.cpp b/source/module_hsolver/diago_cusolver.cpp index efaf20a28a..69b8a0bb07 100644 --- a/source/module_hsolver/diago_cusolver.cpp +++ b/source/module_hsolver/diago_cusolver.cpp @@ -111,6 +111,24 @@ static void distributePsi(const int* desc_psi, T* psi, T* psi_g) Cpxgemr2d(nrows, ncols, psi_g, 1, 1, descg, psi, 1, 1, descl, ctxt); } +template +void DiagoCusolver::diag_pool(hamilt::MatrixBlock& h_mat, + hamilt::MatrixBlock& s_mat, + psi::Psi& psi, + Real* eigenvalue_in, + MPI_Comm& comm) +{ + ModuleBase::TITLE("DiagoCusolver", "diag_pool"); + ModuleBase::timer::tick("DiagoCusolver", "diag_pool"); + std::vector eigen(PARAM.globalv.nlocal, 0.0); + std::vector eigenvectors(h_mat.row * h_mat.col); + this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data()); + const int size = psi.get_nbands() * psi.get_nbasis(); + BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1); + BlasConnector::copy(PARAM.inp.nbands, eigen.data(), 1, eigenvalue_in, 1); + ModuleBase::timer::tick("DiagoCusolver", "diag_pool"); +} + // Diagonalization function template void DiagoCusolver::diag(hamilt::Hamilt* phm_in, psi::Psi& psi, Real* eigenvalue_in) diff --git a/source/module_hsolver/diago_cusolver.h b/source/module_hsolver/diago_cusolver.h index ed67357f3f..61f37999be 100644 --- a/source/module_hsolver/diago_cusolver.h +++ b/source/module_hsolver/diago_cusolver.h @@ -24,6 +24,13 @@ class DiagoCusolver DiagoCusolver(const Parallel_Orbitals* ParaV = nullptr); ~DiagoCusolver(); + void diag_pool( + hamilt::MatrixBlock& h_mat, + hamilt::MatrixBlock& s_mat, + psi::Psi& psi, + Real* eigenvalue_in, + MPI_Comm& comm); + // Override the diag function for CUSOLVER diagonalization void diag(hamilt::Hamilt* phm_in, psi::Psi& psi, Real* eigenvalue_in); diff --git a/source/module_hsolver/hsolver_lcao.cpp b/source/module_hsolver/hsolver_lcao.cpp index 6a361e08db..0859e886b2 100644 --- a/source/module_hsolver/hsolver_lcao.cpp +++ b/source/module_hsolver/hsolver_lcao.cpp @@ -49,7 +49,8 @@ void HSolverLCAO::solve(hamilt::Hamilt* pHamilt, if (this->method != "pexsi") { if (PARAM.globalv.kpar_lcao > 1 - && (this->method == "genelpa" || this->method == "elpa" || this->method == "scalapack_gvx")) + && (this->method == "genelpa" || this->method == "elpa" + || this->method == "scalapack_gvx" || this->method == "cusolver")) { #ifdef __MPI this->parakSolve(pHamilt, psi, pes, PARAM.globalv.kpar_lcao); @@ -191,11 +192,20 @@ void HSolverLCAO::parakSolve(hamilt::Hamilt* pHamilt, int nks = psi.get_nk(); int nrow = this->ParaV->get_global_row_size(); int nb2d = this->ParaV->get_block_size(); - k2d.set_para_env(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK, PARAM.inp.nspin); + if(this->method == "cusolver") + { + k2d.set_para_env_cusolver(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK, PARAM.inp.nspin); + } else + { + k2d.set_para_env(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK, PARAM.inp.nspin); + } /// set psi_pool const int zero = 0; - int ncol_bands_pool - = numroc_(&(nbands), &(nb2d), &(k2d.get_p2D_pool()->coord[1]), &zero, &(k2d.get_p2D_pool()->dim1)); + int ncol_bands_pool = 0; + if(k2d.is_in_pool()) + { + ncol_bands_pool = numroc_(&(nbands), &(nb2d), &(k2d.get_p2D_pool()->coord[1]), &zero, &(k2d.get_p2D_pool()->dim1)); + } /// Loop over k points for solve Hamiltonian to charge density for (int ik = 0; ik < k2d.get_pKpoints()->get_max_nks_pool(); ++ik) { @@ -222,9 +232,12 @@ void HSolverLCAO::parakSolve(hamilt::Hamilt* pHamilt, } } k2d.distribute_hsk(pHamilt, ik_kpar, nrow); + auto psi_pool = psi::Psi(); + if(k2d.is_in_pool()) + { /// global index of k point int ik_global = ik + k2d.get_pKpoints()->startk_pool[k2d.get_my_pool()]; - auto psi_pool = psi::Psi(1, ncol_bands_pool, k2d.get_p2D_pool()->nrow, k2d.get_p2D_pool()->nrow, true); + psi_pool = psi::Psi(1, ncol_bands_pool, k2d.get_p2D_pool()->nrow, k2d.get_p2D_pool()->nrow, true); ModuleBase::Memory::record("HSolverLCAO::psi_pool", nrow * ncol_bands_pool * sizeof(T)); if (ik_global < psi.get_nk() && ik < k2d.get_pKpoints()->nks_pool[k2d.get_my_pool()]) { @@ -255,6 +268,13 @@ void HSolverLCAO::parakSolve(hamilt::Hamilt* pHamilt, DiagoElpaNative el; el.diag_pool(hk_pool, sk_pool, psi_pool, &(pes->ekb(ik_global, 0)), k2d.POOL_WORLD_K2D); } +#endif +#ifdef __CUDA + else if (this->method == "cusolver") + { + DiagoCusolver cs(nullptr); + cs.diag_pool(hk_pool, sk_pool, psi_pool, &(pes->ekb(ik_global, 0)), k2d.POOL_WORLD_K2D); + } #endif else { @@ -262,14 +282,25 @@ void HSolverLCAO::parakSolve(hamilt::Hamilt* pHamilt, "This type of eigensolver for k-parallelism diagnolization is not supported!"); } } + } MPI_Barrier(MPI_COMM_WORLD); ModuleBase::timer::tick("HSolverLCAO", "collect_psi"); for (int ipool = 0; ipool < ik_kpar.size(); ++ipool) { - int source = k2d.get_pKpoints()->get_startpro_pool(ipool); + int source = 0; + if(this->method != "cusolver") + { + source = k2d.get_pKpoints()->get_startpro_pool(ipool); + } else + { + source = ipool; + } MPI_Bcast(&(pes->ekb(ik_kpar[ipool], 0)), nbands, MPI_DOUBLE, source, MPI_COMM_WORLD); int desc_pool[9]; - std::copy(k2d.get_p2D_pool()->desc, k2d.get_p2D_pool()->desc + 9, desc_pool); + if(k2d.is_in_pool()) + { + std::copy(k2d.get_p2D_pool()->desc, k2d.get_p2D_pool()->desc + 9, desc_pool); + } if (k2d.get_my_pool() != ipool) { desc_pool[1] = -1; diff --git a/source/module_hsolver/parallel_k2d.cpp b/source/module_hsolver/parallel_k2d.cpp index 91ead02082..8d40bf2bca 100644 --- a/source/module_hsolver/parallel_k2d.cpp +++ b/source/module_hsolver/parallel_k2d.cpp @@ -34,6 +34,42 @@ void Parallel_K2D::set_para_env(int nks, this->P2D_pool->init(nw, nw, nb2d, this->POOL_WORLD_K2D); } +template +void Parallel_K2D::set_para_env_cusolver(int nks, + const int& nw, + const int& nb2d, + const int& nproc, + const int& my_rank, + const int& nspin) { + const int kpar = this->get_kpar(); + const int pool_id = (my_rank < kpar) ? my_rank : MPI_UNDEFINED; + MPI_Comm_split(MPI_COMM_WORLD, + pool_id, + 0, + &this->POOL_WORLD_K2D); + this->P2D_global = new Parallel_2D; + this->P2D_global->init(nw, nw, nb2d, MPI_COMM_WORLD); + if(this->POOL_WORLD_K2D != MPI_COMM_NULL) + { + this->MY_POOL = pool_id; + this->RANK_IN_POOL = 0; + this->NPROC_IN_POOL = 1; + this->in_pool = true; + this->P2D_pool = new Parallel_2D; + this->P2D_pool->init(nw, nw, nb2d, this->POOL_WORLD_K2D); + } + else + { + this->in_pool = false; + this->MY_POOL = -1; + this->RANK_IN_POOL = -1; + this->NPROC_IN_POOL = 0; + } + this->Pkpoints = new Parallel_Kpoints; + this->Pkpoints + ->kinfo(nks, kpar, this->MY_POOL, this->RANK_IN_POOL, nproc, nspin); +} + template void Parallel_K2D::distribute_hsk(hamilt::Hamilt* pHamilt, const std::vector& ik_kpar, @@ -45,13 +81,16 @@ void Parallel_K2D::distribute_hsk(hamilt::Hamilt* pHamilt, pHamilt->updateHk(ik_kpar[ipool]); hamilt::MatrixBlock HK_global, SK_global; pHamilt->matrix(HK_global, SK_global); - if (this->MY_POOL == this->Pkpoints->whichpool[ik_kpar[ipool]]) { + if (this->in_pool && this->MY_POOL == this->Pkpoints->whichpool[ik_kpar[ipool]]) { this->hk_pool.resize(this->P2D_pool->get_local_size(), 0.0); this->sk_pool.resize(this->P2D_pool->get_local_size(), 0.0); } int desc_pool[9]; - std::copy(this->P2D_pool->desc, this->P2D_pool->desc + 9, desc_pool); - if (this->MY_POOL != this->Pkpoints->whichpool[ik_kpar[ipool]]) { + if(this->in_pool) + { + std::copy(this->P2D_pool->desc, this->P2D_pool->desc + 9, desc_pool); + } + if ( !this->in_pool || this->MY_POOL != this->Pkpoints->whichpool[ik_kpar[ipool]]) { desc_pool[1] = -1; } Cpxgemr2d(nw, @@ -77,7 +116,6 @@ void Parallel_K2D::distribute_hsk(hamilt::Hamilt* pHamilt, desc_pool, this->P2D_global->blacs_ctxt); } - ModuleBase::Memory::record("Parallel_K2D::hsk_pool", this->P2D_pool->get_local_size() * 2 * sizeof(TK)); ModuleBase::timer::tick("Parallel_K2D", "distribute_hsk"); MPI_Barrier(MPI_COMM_WORLD); #endif @@ -97,7 +135,10 @@ void Parallel_K2D::unset_para_env() { delete this->P2D_pool; this->P2D_pool = nullptr; } - MPI_Comm_free(&this->POOL_WORLD_K2D); + if(this->POOL_WORLD_K2D != MPI_COMM_NULL) + { + MPI_Comm_free(&this->POOL_WORLD_K2D); + } } template diff --git a/source/module_hsolver/parallel_k2d.h b/source/module_hsolver/parallel_k2d.h index cdd7a951c8..620dea5a3a 100644 --- a/source/module_hsolver/parallel_k2d.h +++ b/source/module_hsolver/parallel_k2d.h @@ -18,20 +18,27 @@ template class Parallel_K2D { public: /// private constructor - Parallel_K2D() {} + Parallel_K2D() {}; /// private destructor - ~Parallel_K2D() {} + ~Parallel_K2D() {}; /** * Public member functions */ /// this function sets the parallel environment for k-points parallelism /// including the glabal and pool 2D parallel distribution void set_para_env(int nks, - const int& nw, - const int& nb2d, - const int& nproc, - const int& my_rank, - const int& nspin); + const int& nw, + const int& nb2d, + const int& nproc, + const int& my_rank, + const int& nspin); + + void set_para_env_cusolver(int nks, + const int& nw, + const int& nb2d, + const int& nproc, + const int& my_rank, + const int& nspin); /// this function distributes the Hk and Sk matrices to hk_pool and sk_pool void distribute_hsk(hamilt::Hamilt* pHamilt, @@ -44,15 +51,17 @@ class Parallel_K2D { /// set the number of k-points void set_kpar(int kpar); /// get the number of k-points - int get_kpar() { return this->kpar_; } + int get_kpar() { return this->kpar_; }; /// get my pool - int get_my_pool() { return this->MY_POOL; } + int get_my_pool() { return this->MY_POOL; }; + /// check if this proc is in any pool + bool is_in_pool() { return this->in_pool; }; /// get pKpoints - Parallel_Kpoints* get_pKpoints() { return this->Pkpoints; } + Parallel_Kpoints* get_pKpoints() { return this->Pkpoints; }; /// get p2D_global - Parallel_2D* get_p2D_global() { return this->P2D_global; } + Parallel_2D* get_p2D_global() { return this->P2D_global; }; /// get p2D_pool - Parallel_2D* get_p2D_pool() { return this->P2D_pool; } + Parallel_2D* get_p2D_pool() { return this->P2D_pool; }; /** * the local Hk, Sk matrices in POOL_WORLD_K2D @@ -69,6 +78,7 @@ class Parallel_K2D { * Private member variables */ int kpar_ = 0; + bool in_pool = true; /** * mpi info From 833dcfd286dad6bf9ae4a0fe9c4e2392857671c2 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 10 Apr 2025 11:49:23 +0800 Subject: [PATCH 2/8] add stream --- source/module_hsolver/kernels/cuda/diag_cusolver.cu | 3 +++ source/module_hsolver/kernels/cuda/diag_cusolver.cuh | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/source/module_hsolver/kernels/cuda/diag_cusolver.cu b/source/module_hsolver/kernels/cuda/diag_cusolver.cu index 90548f1c9b..a7a1a0dd81 100644 --- a/source/module_hsolver/kernels/cuda/diag_cusolver.cu +++ b/source/module_hsolver/kernels/cuda/diag_cusolver.cu @@ -6,6 +6,8 @@ Diag_Cusolver_gvd::Diag_Cusolver_gvd(){ // step 1: create cusolver/cublas handle cusolverH = NULL; checkCudaErrors( cusolverDnCreate(&cusolverH) ); + checkCudaErrors( cudaStreamCreate(&stream) ); + checkCudaErrors( cusolverDnSetStream(cusolverH, stream) ); itype = CUSOLVER_EIG_TYPE_1; // A*x = (lambda)*B*x jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvalues and eigenvectors. @@ -40,6 +42,7 @@ void Diag_Cusolver_gvd::finalize(){ Diag_Cusolver_gvd::~Diag_Cusolver_gvd(){ finalize(); if (cusolverH) {checkCudaErrors( cusolverDnDestroy(cusolverH) ); cusolverH = NULL;} + if (stream) {checkCudaErrors( cudaStreamDestroy(stream) ); stream = NULL;} //checkCudaErrors( cudaDeviceReset() ); } diff --git a/source/module_hsolver/kernels/cuda/diag_cusolver.cuh b/source/module_hsolver/kernels/cuda/diag_cusolver.cuh index 557cf9c3bd..60d0f3fd12 100644 --- a/source/module_hsolver/kernels/cuda/diag_cusolver.cuh +++ b/source/module_hsolver/kernels/cuda/diag_cusolver.cuh @@ -22,7 +22,8 @@ class Diag_Cusolver_gvd{ //------------------- cusolverDnHandle_t cusolverH = nullptr; - + cudaStream_t stream = nullptr; + cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; //problem type: A*x = (lambda)*B*x cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; // compute eigenvalues and eigenvectors. cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; From 8398b0ff2209375b5eeab119690bace2061f1219 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 10 Apr 2025 11:52:18 +0800 Subject: [PATCH 3/8] add multi-gpu support --- source/module_hsolver/kernels/cuda/diag_cusolver.cu | 5 +++++ source/module_hsolver/kernels/cuda/diag_cusolver.cuh | 3 ++- tests/performance/P101_si32_lcao/INPUT | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/source/module_hsolver/kernels/cuda/diag_cusolver.cu b/source/module_hsolver/kernels/cuda/diag_cusolver.cu index a7a1a0dd81..f50a61f30e 100644 --- a/source/module_hsolver/kernels/cuda/diag_cusolver.cu +++ b/source/module_hsolver/kernels/cuda/diag_cusolver.cu @@ -1,9 +1,14 @@ #include #include "diag_cusolver.cuh" #include "helper_cuda.h" +#include "module_base/module_device/device.h" Diag_Cusolver_gvd::Diag_Cusolver_gvd(){ // step 1: create cusolver/cublas handle +#ifdef __MPI + device_id = base_device::information::set_device_by_rank(); + checkCudaErrors( cudaSetDevice(device_id) ); +#endif cusolverH = NULL; checkCudaErrors( cusolverDnCreate(&cusolverH) ); checkCudaErrors( cudaStreamCreate(&stream) ); diff --git a/source/module_hsolver/kernels/cuda/diag_cusolver.cuh b/source/module_hsolver/kernels/cuda/diag_cusolver.cuh index 60d0f3fd12..6d86b4d04b 100644 --- a/source/module_hsolver/kernels/cuda/diag_cusolver.cuh +++ b/source/module_hsolver/kernels/cuda/diag_cusolver.cuh @@ -23,7 +23,8 @@ class Diag_Cusolver_gvd{ cusolverDnHandle_t cusolverH = nullptr; cudaStream_t stream = nullptr; - + int device_id = 0; + cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; //problem type: A*x = (lambda)*B*x cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; // compute eigenvalues and eigenvectors. cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; diff --git a/tests/performance/P101_si32_lcao/INPUT b/tests/performance/P101_si32_lcao/INPUT index 824e290b9a..7555ebfdd4 100644 --- a/tests/performance/P101_si32_lcao/INPUT +++ b/tests/performance/P101_si32_lcao/INPUT @@ -21,4 +21,4 @@ smearing_sigma 0.002 #Parameters (5.Mixing) mixing_type broyden -mixing_beta 0.3 +mixing_beta 0.3 \ No newline at end of file From 2b0296c7bfcc67b6021fb360c189973e29e5fdfa Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 11 Apr 2025 15:12:33 +0800 Subject: [PATCH 4/8] Revert "add stream" This reverts commit 833dcfd286dad6bf9ae4a0fe9c4e2392857671c2. --- source/module_hsolver/kernels/cuda/diag_cusolver.cu | 3 --- source/module_hsolver/kernels/cuda/diag_cusolver.cuh | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/source/module_hsolver/kernels/cuda/diag_cusolver.cu b/source/module_hsolver/kernels/cuda/diag_cusolver.cu index f50a61f30e..a88867d884 100644 --- a/source/module_hsolver/kernels/cuda/diag_cusolver.cu +++ b/source/module_hsolver/kernels/cuda/diag_cusolver.cu @@ -11,8 +11,6 @@ Diag_Cusolver_gvd::Diag_Cusolver_gvd(){ #endif cusolverH = NULL; checkCudaErrors( cusolverDnCreate(&cusolverH) ); - checkCudaErrors( cudaStreamCreate(&stream) ); - checkCudaErrors( cusolverDnSetStream(cusolverH, stream) ); itype = CUSOLVER_EIG_TYPE_1; // A*x = (lambda)*B*x jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvalues and eigenvectors. @@ -47,7 +45,6 @@ void Diag_Cusolver_gvd::finalize(){ Diag_Cusolver_gvd::~Diag_Cusolver_gvd(){ finalize(); if (cusolverH) {checkCudaErrors( cusolverDnDestroy(cusolverH) ); cusolverH = NULL;} - if (stream) {checkCudaErrors( cudaStreamDestroy(stream) ); stream = NULL;} //checkCudaErrors( cudaDeviceReset() ); } diff --git a/source/module_hsolver/kernels/cuda/diag_cusolver.cuh b/source/module_hsolver/kernels/cuda/diag_cusolver.cuh index 6d86b4d04b..f6926c2341 100644 --- a/source/module_hsolver/kernels/cuda/diag_cusolver.cuh +++ b/source/module_hsolver/kernels/cuda/diag_cusolver.cuh @@ -22,9 +22,8 @@ class Diag_Cusolver_gvd{ //------------------- cusolverDnHandle_t cusolverH = nullptr; - cudaStream_t stream = nullptr; int device_id = 0; - + cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; //problem type: A*x = (lambda)*B*x cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; // compute eigenvalues and eigenvectors. cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; From bdde73a0a6b4405d6c894b0376fdae806f821ca8 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 11 Apr 2025 15:22:23 +0800 Subject: [PATCH 5/8] add MPI_Barrier --- source/module_base/parallel_common.cpp | 13 +++++++++++++ source/module_hsolver/parallel_k2d.cpp | 5 +++++ tests/performance/P101_si32_lcao/INPUT | 2 +- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/source/module_base/parallel_common.cpp b/source/module_base/parallel_common.cpp index 6f8ce79fbc..7be024c0a8 100644 --- a/source/module_base/parallel_common.cpp +++ b/source/module_base/parallel_common.cpp @@ -21,44 +21,55 @@ void Parallel_Common::bcast_string(std::string& object) // Peize Lin fix bug 201 } MPI_Bcast(&object[0], size, MPI_CHAR, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); return; } void Parallel_Common::bcast_string(std::string* object, const int n) // Peize Lin fix bug 2019-03-18 { for (int i = 0; i < n; i++) + { bcast_string(object[i]); + } + MPI_Barrier(MPI_COMM_WORLD); return; } void Parallel_Common::bcast_complex_double(std::complex& object) { MPI_Bcast(&object, 1, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + } void Parallel_Common::bcast_complex_double(std::complex* object, const int n) { MPI_Bcast(object, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); } void Parallel_Common::bcast_double(double& object) { MPI_Bcast(&object, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); } void Parallel_Common::bcast_double(double* object, const int n) { MPI_Bcast(object, n, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); } void Parallel_Common::bcast_int(int& object) { MPI_Bcast(&object, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); } void Parallel_Common::bcast_int(int* object, const int n) { MPI_Bcast(object, n, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); } void Parallel_Common::bcast_bool(bool& object) @@ -69,6 +80,7 @@ void Parallel_Common::bcast_bool(bool& object) if (my_rank == 0) swap = object; MPI_Bcast(&swap, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); if (my_rank != 0) object = static_cast(swap); } @@ -76,6 +88,7 @@ void Parallel_Common::bcast_bool(bool& object) void Parallel_Common::bcast_char(char* object, const int n) { MPI_Bcast(object, n, MPI_CHAR, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); } #endif diff --git a/source/module_hsolver/parallel_k2d.cpp b/source/module_hsolver/parallel_k2d.cpp index 8d40bf2bca..8e003c22d7 100644 --- a/source/module_hsolver/parallel_k2d.cpp +++ b/source/module_hsolver/parallel_k2d.cpp @@ -42,6 +42,11 @@ void Parallel_K2D::set_para_env_cusolver(int nks, const int& my_rank, const int& nspin) { const int kpar = this->get_kpar(); + if(kpar <= 0 || kpar > nproc) + { + ModuleBase::WARNING_QUIT("Parallel_K2D::set_para_env_cusolver", + "kpar must be greater than 0 and less than nproc."); + } const int pool_id = (my_rank < kpar) ? my_rank : MPI_UNDEFINED; MPI_Comm_split(MPI_COMM_WORLD, pool_id, diff --git a/tests/performance/P101_si32_lcao/INPUT b/tests/performance/P101_si32_lcao/INPUT index 7555ebfdd4..824e290b9a 100644 --- a/tests/performance/P101_si32_lcao/INPUT +++ b/tests/performance/P101_si32_lcao/INPUT @@ -21,4 +21,4 @@ smearing_sigma 0.002 #Parameters (5.Mixing) mixing_type broyden -mixing_beta 0.3 \ No newline at end of file +mixing_beta 0.3 From 45089fe198443769e44df07bb9932a7cbc03bc21 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sat, 12 Apr 2025 19:19:06 +0800 Subject: [PATCH 6/8] fix a compilation bug --- source/module_hsolver/kernels/cuda/diag_cusolver.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/module_hsolver/kernels/cuda/diag_cusolver.cu b/source/module_hsolver/kernels/cuda/diag_cusolver.cu index a88867d884..0147584641 100644 --- a/source/module_hsolver/kernels/cuda/diag_cusolver.cu +++ b/source/module_hsolver/kernels/cuda/diag_cusolver.cu @@ -5,7 +5,7 @@ Diag_Cusolver_gvd::Diag_Cusolver_gvd(){ // step 1: create cusolver/cublas handle -#ifdef __MPI +#if defined(__MPI) && defined(__CUDA) device_id = base_device::information::set_device_by_rank(); checkCudaErrors( cudaSetDevice(device_id) ); #endif From b5815c62f2175d0a5be47af8569b812e2c351bfa Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sun, 13 Apr 2025 18:14:10 +0800 Subject: [PATCH 7/8] simplify set device --- source/module_hsolver/kernels/cuda/diag_cusolver.cu | 3 +-- source/module_hsolver/kernels/cuda/diag_cusolver.cuh | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/source/module_hsolver/kernels/cuda/diag_cusolver.cu b/source/module_hsolver/kernels/cuda/diag_cusolver.cu index 0147584641..95b73bca5d 100644 --- a/source/module_hsolver/kernels/cuda/diag_cusolver.cu +++ b/source/module_hsolver/kernels/cuda/diag_cusolver.cu @@ -6,8 +6,7 @@ Diag_Cusolver_gvd::Diag_Cusolver_gvd(){ // step 1: create cusolver/cublas handle #if defined(__MPI) && defined(__CUDA) - device_id = base_device::information::set_device_by_rank(); - checkCudaErrors( cudaSetDevice(device_id) ); + base_device::information::set_device_by_rank(); #endif cusolverH = NULL; checkCudaErrors( cusolverDnCreate(&cusolverH) ); diff --git a/source/module_hsolver/kernels/cuda/diag_cusolver.cuh b/source/module_hsolver/kernels/cuda/diag_cusolver.cuh index f6926c2341..c71c085735 100644 --- a/source/module_hsolver/kernels/cuda/diag_cusolver.cuh +++ b/source/module_hsolver/kernels/cuda/diag_cusolver.cuh @@ -22,7 +22,6 @@ class Diag_Cusolver_gvd{ //------------------- cusolverDnHandle_t cusolverH = nullptr; - int device_id = 0; cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; //problem type: A*x = (lambda)*B*x cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; // compute eigenvalues and eigenvectors. From e23f43e6317d27c1c4f9431eae51749972b20495 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 17 Apr 2025 22:51:45 +0800 Subject: [PATCH 8/8] remove global parameters --- source/module_cell/parallel_kpoints.cpp | 10 +++---- source/module_cell/parallel_kpoints.h | 10 +++---- source/module_hsolver/diago_cusolver.cpp | 30 ++++++++++++------- source/module_hsolver/hsolver_lcao.cpp | 10 +++++-- .../kernels/cuda/diag_cusolver.cu | 4 --- source/module_hsolver/parallel_k2d.cpp | 20 ++++++------- source/module_hsolver/parallel_k2d.h | 21 ++++++------- 7 files changed, 58 insertions(+), 47 deletions(-) diff --git a/source/module_cell/parallel_kpoints.cpp b/source/module_cell/parallel_kpoints.cpp index 69dc13b898..54c25ae5aa 100644 --- a/source/module_cell/parallel_kpoints.cpp +++ b/source/module_cell/parallel_kpoints.cpp @@ -5,11 +5,11 @@ // the kpoints here are reduced after symmetry applied. void Parallel_Kpoints::kinfo(int& nkstot_in, - const int& kpar_in, - const int& my_pool_in, - const int& rank_in_pool_in, - const int& nproc_in, - const int& nspin_in) + const int kpar_in, + const int my_pool_in, + const int rank_in_pool_in, + const int nproc_in, + const int nspin_in) { #ifdef __MPI diff --git a/source/module_cell/parallel_kpoints.h b/source/module_cell/parallel_kpoints.h index 971a2849f2..e2e232509f 100644 --- a/source/module_cell/parallel_kpoints.h +++ b/source/module_cell/parallel_kpoints.h @@ -13,11 +13,11 @@ class Parallel_Kpoints ~Parallel_Kpoints(){}; void kinfo(int& nkstot_in, - const int& kpar_in, - const int& my_pool_in, - const int& rank_in_pool_in, - const int& nproc_in, - const int& nspin_in); + const int kpar_in, + const int my_pool_in, + const int rank_in_pool_in, + const int nproc_in, + const int nspin_in); // collect value from each pool to wk. void pool_collection(double& value, const double* wk, const int& ik); diff --git a/source/module_hsolver/diago_cusolver.cpp b/source/module_hsolver/diago_cusolver.cpp index 69b8a0bb07..ee0858885f 100644 --- a/source/module_hsolver/diago_cusolver.cpp +++ b/source/module_hsolver/diago_cusolver.cpp @@ -120,12 +120,15 @@ void DiagoCusolver::diag_pool(hamilt::MatrixBlock& h_mat, { ModuleBase::TITLE("DiagoCusolver", "diag_pool"); ModuleBase::timer::tick("DiagoCusolver", "diag_pool"); - std::vector eigen(PARAM.globalv.nlocal, 0.0); + const int nbands_local = psi.get_nbands(); + const int nbasis = psi.get_nbasis(); + int nbands_global = nbands_local; + std::vector eigen(nbasis, 0.0); std::vector eigenvectors(h_mat.row * h_mat.col); this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data()); - const int size = psi.get_nbands() * psi.get_nbasis(); + const int size = nbands_local * nbasis; BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1); - BlasConnector::copy(PARAM.inp.nbands, eigen.data(), 1, eigenvalue_in, 1); + BlasConnector::copy(nbands_global, eigen.data(), 1, eigenvalue_in, 1); ModuleBase::timer::tick("DiagoCusolver", "diag_pool"); } @@ -140,7 +143,14 @@ void DiagoCusolver::diag(hamilt::Hamilt* phm_in, psi::Psi& psi, Real* e hamilt::MatrixBlock h_mat; hamilt::MatrixBlock s_mat; phm_in->matrix(h_mat, s_mat); - + const int nbands_local = psi.get_nbands(); + const int nbasis = psi.get_nbasis(); + int nbands_global; +#ifdef __MPI + MPI_Allreduce(&nbands_local, &nbands_global, 1, MPI_INT, MPI_SUM, this->ParaV->comm()); +#else + nbands_global = nbands_local; +#endif #ifdef __MPI // global matrix Matrix_g h_mat_g; @@ -159,7 +169,7 @@ void DiagoCusolver::diag(hamilt::Hamilt* phm_in, psi::Psi& psi, Real* e #endif // Allocate memory for eigenvalues - std::vector eigen(PARAM.globalv.nlocal, 0.0); + std::vector eigen(nbasis, 0.0); // Start the timer for the cusolver operation ModuleBase::timer::tick("DiagoCusolver", "cusolver"); @@ -189,23 +199,23 @@ void DiagoCusolver::diag(hamilt::Hamilt* phm_in, psi::Psi& psi, Real* e MPI_Barrier(MPI_COMM_WORLD); // broadcast eigenvalues to all processes - MPI_Bcast(eigen.data(), PARAM.inp.nbands, MPI_DOUBLE, root_proc, MPI_COMM_WORLD); + MPI_Bcast(eigen.data(), nbands_global, MPI_DOUBLE, root_proc, MPI_COMM_WORLD); // distribute psi to all processes distributePsi(this->ParaV->desc_wfc, psi.get_pointer(), psi_g.data()); } else { - // Be careful that h_mat.row * h_mat.col != psi.get_nbands() * psi.get_nbasis() under multi-k situation + // Be careful that h_mat.row * h_mat.col != nbands * nbasis under multi-k situation std::vector eigenvectors(h_mat.row * h_mat.col); this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data()); - const int size = psi.get_nbands() * psi.get_nbasis(); + const int size = nbands_local * nbasis; BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1); } #else std::vector eigenvectors(h_mat.row * h_mat.col); this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data()); - const int size = psi.get_nbands() * psi.get_nbasis(); + const int size = nbands_local * nbasis; BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1); #endif // Stop the timer for the cusolver operation @@ -213,7 +223,7 @@ void DiagoCusolver::diag(hamilt::Hamilt* phm_in, psi::Psi& psi, Real* e // Copy the eigenvalues to the output arrays const int inc = 1; - BlasConnector::copy(PARAM.inp.nbands, eigen.data(), inc, eigenvalue_in, inc); + BlasConnector::copy(nbands_global, eigen.data(), inc, eigenvalue_in, inc); } // Explicit instantiation of the DiagoCusolver class for real and complex numbers diff --git a/source/module_hsolver/hsolver_lcao.cpp b/source/module_hsolver/hsolver_lcao.cpp index 0859e886b2..585836f6ee 100644 --- a/source/module_hsolver/hsolver_lcao.cpp +++ b/source/module_hsolver/hsolver_lcao.cpp @@ -18,6 +18,7 @@ #ifdef __CUDA #include "diago_cusolver.h" +#include "module_base/module_device/device.h" #endif #ifdef __PEXSI @@ -186,6 +187,9 @@ void HSolverLCAO::parakSolve(hamilt::Hamilt* pHamilt, { #ifdef __MPI ModuleBase::timer::tick("HSolverLCAO", "parakSolve"); +#ifdef __CUDA + base_device::information::set_device_by_rank(); +#endif auto k2d = Parallel_K2D(); k2d.set_kpar(kpar); int nbands = this->ParaV->get_nbands(); @@ -194,10 +198,10 @@ void HSolverLCAO::parakSolve(hamilt::Hamilt* pHamilt, int nb2d = this->ParaV->get_block_size(); if(this->method == "cusolver") { - k2d.set_para_env_cusolver(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK, PARAM.inp.nspin); + k2d.set_para_env_cusolver(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK); } else { - k2d.set_para_env(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK, PARAM.inp.nspin); + k2d.set_para_env(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK); } /// set psi_pool const int zero = 0; @@ -272,7 +276,7 @@ void HSolverLCAO::parakSolve(hamilt::Hamilt* pHamilt, #ifdef __CUDA else if (this->method == "cusolver") { - DiagoCusolver cs(nullptr); + DiagoCusolver cs; cs.diag_pool(hk_pool, sk_pool, psi_pool, &(pes->ekb(ik_global, 0)), k2d.POOL_WORLD_K2D); } #endif diff --git a/source/module_hsolver/kernels/cuda/diag_cusolver.cu b/source/module_hsolver/kernels/cuda/diag_cusolver.cu index 95b73bca5d..90548f1c9b 100644 --- a/source/module_hsolver/kernels/cuda/diag_cusolver.cu +++ b/source/module_hsolver/kernels/cuda/diag_cusolver.cu @@ -1,13 +1,9 @@ #include #include "diag_cusolver.cuh" #include "helper_cuda.h" -#include "module_base/module_device/device.h" Diag_Cusolver_gvd::Diag_Cusolver_gvd(){ // step 1: create cusolver/cublas handle -#if defined(__MPI) && defined(__CUDA) - base_device::information::set_device_by_rank(); -#endif cusolverH = NULL; checkCudaErrors( cusolverDnCreate(&cusolverH) ); diff --git a/source/module_hsolver/parallel_k2d.cpp b/source/module_hsolver/parallel_k2d.cpp index 8e003c22d7..c416ac3733 100644 --- a/source/module_hsolver/parallel_k2d.cpp +++ b/source/module_hsolver/parallel_k2d.cpp @@ -7,11 +7,11 @@ template void Parallel_K2D::set_para_env(int nks, - const int& nw, - const int& nb2d, - const int& nproc, - const int& my_rank, - const int& nspin) { + const int nw, + const int nb2d, + const int nproc, + const int my_rank, + const int nspin) { const int kpar = this->get_kpar(); Parallel_Global::divide_mpi_groups(nproc, kpar, @@ -36,11 +36,11 @@ void Parallel_K2D::set_para_env(int nks, template void Parallel_K2D::set_para_env_cusolver(int nks, - const int& nw, - const int& nb2d, - const int& nproc, - const int& my_rank, - const int& nspin) { + const int nw, + const int nb2d, + const int nproc, + const int my_rank, + const int nspin) { const int kpar = this->get_kpar(); if(kpar <= 0 || kpar > nproc) { diff --git a/source/module_hsolver/parallel_k2d.h b/source/module_hsolver/parallel_k2d.h index 620dea5a3a..b5ee4df54b 100644 --- a/source/module_hsolver/parallel_k2d.h +++ b/source/module_hsolver/parallel_k2d.h @@ -26,19 +26,20 @@ class Parallel_K2D { */ /// this function sets the parallel environment for k-points parallelism /// including the glabal and pool 2D parallel distribution + /// nspin doesn't affect anything here, in fact it can be deleted void set_para_env(int nks, - const int& nw, - const int& nb2d, - const int& nproc, - const int& my_rank, - const int& nspin); + const int nw, + const int nb2d, + const int nproc, + const int my_rank, + const int nspin = 1); void set_para_env_cusolver(int nks, - const int& nw, - const int& nb2d, - const int& nproc, - const int& my_rank, - const int& nspin); + const int nw, + const int nb2d, + const int nproc, + const int my_rank, + const int nspin = 1); /// this function distributes the Hk and Sk matrices to hk_pool and sk_pool void distribute_hsk(hamilt::Hamilt* pHamilt,