diff --git a/source/module_base/parallel_common.cpp b/source/module_base/parallel_common.cpp
index 6f8ce79fbc..7be024c0a8 100644
--- a/source/module_base/parallel_common.cpp
+++ b/source/module_base/parallel_common.cpp
@@ -21,44 +21,55 @@ void Parallel_Common::bcast_string(std::string& object) // Peize Lin fix bug 201
     }
     MPI_Bcast(&object[0], size, MPI_CHAR, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
     return;
 }
 
 void Parallel_Common::bcast_string(std::string* object, const int n) // Peize Lin fix bug 2019-03-18
 {
     for (int i = 0; i < n; i++)
+    {
         bcast_string(object[i]);
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
     return;
 }
 
 void Parallel_Common::bcast_complex_double(std::complex<double>& object)
 {
     MPI_Bcast(&object, 1, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
+
 }
 
 void Parallel_Common::bcast_complex_double(std::complex<double>* object, const int n)
 {
     MPI_Bcast(object, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
 }
 
 void Parallel_Common::bcast_double(double& object)
 {
     MPI_Bcast(&object, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
 }
 
 void Parallel_Common::bcast_double(double* object, const int n)
 {
     MPI_Bcast(object, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
 }
 
 void Parallel_Common::bcast_int(int& object)
 {
     MPI_Bcast(&object, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
 }
 
 void Parallel_Common::bcast_int(int* object, const int n)
 {
     MPI_Bcast(object, n, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
 }
 
 void Parallel_Common::bcast_bool(bool& object)
@@ -69,6 +80,7 @@ void Parallel_Common::bcast_bool(bool& object)
     if (my_rank == 0)
         swap = object;
     MPI_Bcast(&swap, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
     if (my_rank != 0)
         object = static_cast<bool>(swap);
 }
@@ -76,6 +88,7 @@ void Parallel_Common::bcast_bool(bool& object)
 void Parallel_Common::bcast_char(char* object, const int n)
 {
     MPI_Bcast(object, n, MPI_CHAR, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
 }
 
 #endif
diff --git a/source/module_cell/parallel_kpoints.cpp b/source/module_cell/parallel_kpoints.cpp
index 69dc13b898..54c25ae5aa 100644
--- a/source/module_cell/parallel_kpoints.cpp
+++ b/source/module_cell/parallel_kpoints.cpp
@@ -5,11 +5,11 @@
 
 // the kpoints here are reduced after symmetry applied.
 void Parallel_Kpoints::kinfo(int& nkstot_in,
-                             const int& kpar_in,
-                             const int& my_pool_in,
-                             const int& rank_in_pool_in,
-                             const int& nproc_in,
-                             const int& nspin_in)
+                             const int kpar_in,
+                             const int my_pool_in,
+                             const int rank_in_pool_in,
+                             const int nproc_in,
+                             const int nspin_in)
 {
 #ifdef __MPI
diff --git a/source/module_cell/parallel_kpoints.h b/source/module_cell/parallel_kpoints.h
index 971a2849f2..e2e232509f 100644
--- a/source/module_cell/parallel_kpoints.h
+++ b/source/module_cell/parallel_kpoints.h
@@ -13,11 +13,11 @@ class Parallel_Kpoints
     ~Parallel_Kpoints(){};
 
     void kinfo(int& nkstot_in,
-               const int& kpar_in,
-               const int& my_pool_in,
-               const int& rank_in_pool_in,
-               const int& nproc_in,
-               const int& nspin_in);
+               const int kpar_in,
+               const int my_pool_in,
+               const int rank_in_pool_in,
+               const int nproc_in,
+               const int nspin_in);
 
     // collect value from each pool to wk.
     void pool_collection(double& value, const double* wk, const int& ik);
diff --git a/source/module_hsolver/diago_cusolver.cpp b/source/module_hsolver/diago_cusolver.cpp
index efaf20a28a..ee0858885f 100644
--- a/source/module_hsolver/diago_cusolver.cpp
+++ b/source/module_hsolver/diago_cusolver.cpp
@@ -111,6 +111,27 @@ static void distributePsi(const int* desc_psi, T* psi, T* psi_g)
     Cpxgemr2d(nrows, ncols, psi_g, 1, 1, descg, psi, 1, 1, descl, ctxt);
 }
 
+template <typename T>
+void DiagoCusolver<T>::diag_pool(hamilt::MatrixBlock<T>& h_mat,
+                                 hamilt::MatrixBlock<T>& s_mat,
+                                 psi::Psi<T>& psi,
+                                 Real* eigenvalue_in,
+                                 MPI_Comm& comm)
+{
+    ModuleBase::TITLE("DiagoCusolver", "diag_pool");
+    ModuleBase::timer::tick("DiagoCusolver", "diag_pool");
+    const int nbands_local = psi.get_nbands();
+    const int nbasis = psi.get_nbasis();
+    int nbands_global = nbands_local;
+    std::vector<double> eigen(nbasis, 0.0);
+    std::vector<T> eigenvectors(h_mat.row * h_mat.col);
+    this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
+    const int size = nbands_local * nbasis;
+    BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
+    BlasConnector::copy(nbands_global, eigen.data(), 1, eigenvalue_in, 1);
+    ModuleBase::timer::tick("DiagoCusolver", "diag_pool");
+}
+
 // Diagonalization function
 template <typename T>
 void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* eigenvalue_in)
@@ -122,7 +143,14 @@ void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* e
     hamilt::MatrixBlock<T> h_mat;
     hamilt::MatrixBlock<T> s_mat;
     phm_in->matrix(h_mat, s_mat);
-
+    const int nbands_local = psi.get_nbands();
+    const int nbasis = psi.get_nbasis();
+    int nbands_global;
+#ifdef __MPI
+    MPI_Allreduce(&nbands_local, &nbands_global, 1, MPI_INT, MPI_SUM, this->ParaV->comm());
+#else
+    nbands_global = nbands_local;
+#endif
 #ifdef __MPI
     // global matrix
     Matrix_g<T> h_mat_g;
@@ -141,7 +169,7 @@ void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* e
 #endif
 
     // Allocate memory for eigenvalues
-    std::vector<double> eigen(PARAM.globalv.nlocal, 0.0);
+    std::vector<double> eigen(nbasis, 0.0);
 
     // Start the timer for the cusolver operation
     ModuleBase::timer::tick("DiagoCusolver", "cusolver");
@@ -171,23 +199,23 @@ void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* e
         MPI_Barrier(MPI_COMM_WORLD);
 
         // broadcast eigenvalues to all processes
-        MPI_Bcast(eigen.data(), PARAM.inp.nbands, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
+        MPI_Bcast(eigen.data(), nbands_global, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
 
         // distribute psi to all processes
         distributePsi(this->ParaV->desc_wfc, psi.get_pointer(), psi_g.data());
     }
     else
     {
-        // Be careful that h_mat.row * h_mat.col != psi.get_nbands() * psi.get_nbasis() under multi-k situation
+        // Be careful that h_mat.row * h_mat.col != nbands * nbasis under multi-k situation
         std::vector<T> eigenvectors(h_mat.row * h_mat.col);
         this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
-        const int size = psi.get_nbands() * psi.get_nbasis();
+        const int size = nbands_local * nbasis;
         BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
     }
 #else
     std::vector<T> eigenvectors(h_mat.row * h_mat.col);
     this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
-    const int size = psi.get_nbands() * psi.get_nbasis();
+    const int size = nbands_local * nbasis;
     BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
 #endif
     // Stop the timer for the cusolver operation
@@ -195,7 +223,7 @@ void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* e
 
     // Copy the eigenvalues to the output arrays
     const int inc = 1;
-    BlasConnector::copy(PARAM.inp.nbands, eigen.data(), inc, eigenvalue_in, inc);
+    BlasConnector::copy(nbands_global, eigen.data(), inc, eigenvalue_in, inc);
 }
 
 // Explicit instantiation of the DiagoCusolver class for real and complex numbers
diff --git a/source/module_hsolver/diago_cusolver.h b/source/module_hsolver/diago_cusolver.h
index ed67357f3f..61f37999be 100644
--- a/source/module_hsolver/diago_cusolver.h
+++ b/source/module_hsolver/diago_cusolver.h
@@ -24,6 +24,13 @@ class DiagoCusolver
     DiagoCusolver(const Parallel_Orbitals* ParaV = nullptr);
     ~DiagoCusolver();
 
+    void diag_pool(
+        hamilt::MatrixBlock<T>& h_mat,
+        hamilt::MatrixBlock<T>& s_mat,
+        psi::Psi<T>& psi,
+        Real* eigenvalue_in,
+        MPI_Comm& comm);
+
     // Override the diag function for CUSOLVER diagonalization
     void diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* eigenvalue_in);
 
diff --git a/source/module_hsolver/hsolver_lcao.cpp b/source/module_hsolver/hsolver_lcao.cpp
index 6a361e08db..585836f6ee 100644
--- a/source/module_hsolver/hsolver_lcao.cpp
+++ b/source/module_hsolver/hsolver_lcao.cpp
@@ -18,6 +18,7 @@
 
 #ifdef __CUDA
 #include "diago_cusolver.h"
+#include "module_base/module_device/device.h"
 #endif
 
 #ifdef __PEXSI
@@ -49,7 +50,8 @@ void HSolverLCAO<T, Device>::solve(hamilt::Hamilt<T>* pHamilt,
     if (this->method != "pexsi")
     {
         if (PARAM.globalv.kpar_lcao > 1
-            && (this->method == "genelpa" || this->method == "elpa" || this->method == "scalapack_gvx"))
+            && (this->method == "genelpa" || this->method == "elpa"
+                || this->method == "scalapack_gvx" || this->method == "cusolver"))
         {
 #ifdef __MPI
             this->parakSolve(pHamilt, psi, pes, PARAM.globalv.kpar_lcao);
@@ -185,17 +187,29 @@ void HSolverLCAO<T, Device>::parakSolve(hamilt::Hamilt<T>* pHamilt,
 {
 #ifdef __MPI
     ModuleBase::timer::tick("HSolverLCAO", "parakSolve");
+#ifdef __CUDA
+    base_device::information::set_device_by_rank();
+#endif
     auto k2d = Parallel_K2D<T>();
     k2d.set_kpar(kpar);
     int nbands = this->ParaV->get_nbands();
     int nks = psi.get_nk();
     int nrow = this->ParaV->get_global_row_size();
     int nb2d = this->ParaV->get_block_size();
-    k2d.set_para_env(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK, PARAM.inp.nspin);
+    if(this->method == "cusolver")
+    {
+        k2d.set_para_env_cusolver(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK);
+    } else
+    {
+        k2d.set_para_env(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK);
+    }
     /// set psi_pool
     const int zero = 0;
-    int ncol_bands_pool
-        = numroc_(&(nbands), &(nb2d), &(k2d.get_p2D_pool()->coord[1]), &zero, &(k2d.get_p2D_pool()->dim1));
+    int ncol_bands_pool = 0;
+    if(k2d.is_in_pool())
+    {
+        ncol_bands_pool = numroc_(&(nbands), &(nb2d), &(k2d.get_p2D_pool()->coord[1]), &zero, &(k2d.get_p2D_pool()->dim1));
+    }
     /// Loop over k points for solve Hamiltonian to charge density
     for (int ik = 0; ik < k2d.get_pKpoints()->get_max_nks_pool(); ++ik)
     {
@@ -222,9 +236,12 @@ void HSolverLCAO<T, Device>::parakSolve(hamilt::Hamilt<T>* pHamilt,
             }
         }
         k2d.distribute_hsk(pHamilt, ik_kpar, nrow);
+        auto psi_pool = psi::Psi<T>();
+        if(k2d.is_in_pool())
+        {
         /// global index of k point
         int ik_global = ik + k2d.get_pKpoints()->startk_pool[k2d.get_my_pool()];
-        auto psi_pool = psi::Psi<T>(1, ncol_bands_pool, k2d.get_p2D_pool()->nrow, k2d.get_p2D_pool()->nrow, true);
+        psi_pool = psi::Psi<T>(1, ncol_bands_pool, k2d.get_p2D_pool()->nrow, k2d.get_p2D_pool()->nrow, true);
         ModuleBase::Memory::record("HSolverLCAO::psi_pool", nrow * ncol_bands_pool * sizeof(T));
         if (ik_global < psi.get_nk() && ik < k2d.get_pKpoints()->nks_pool[k2d.get_my_pool()])
         {
@@ -255,6 +272,13 @@ void HSolverLCAO<T, Device>::parakSolve(hamilt::Hamilt<T>* pHamilt,
                 DiagoElpaNative<T> el;
                 el.diag_pool(hk_pool, sk_pool, psi_pool, &(pes->ekb(ik_global, 0)), k2d.POOL_WORLD_K2D);
             }
+#endif
+#ifdef __CUDA
+            else if (this->method == "cusolver")
+            {
+                DiagoCusolver<T> cs;
+                cs.diag_pool(hk_pool, sk_pool, psi_pool, &(pes->ekb(ik_global, 0)), k2d.POOL_WORLD_K2D);
+            }
 #endif
             else
             {
@@ -262,14 +286,25 @@ void HSolverLCAO<T, Device>::parakSolve(hamilt::Hamilt<T>* pHamilt,
                                          "This type of eigensolver for k-parallelism diagnolization is not supported!");
             }
         }
+        }
         MPI_Barrier(MPI_COMM_WORLD);
         ModuleBase::timer::tick("HSolverLCAO", "collect_psi");
         for (int ipool = 0; ipool < ik_kpar.size(); ++ipool)
         {
-            int source = k2d.get_pKpoints()->get_startpro_pool(ipool);
+            int source = 0;
+            if(this->method != "cusolver")
+            {
+                source = k2d.get_pKpoints()->get_startpro_pool(ipool);
+            } else
+            {
+                source = ipool;
+            }
             MPI_Bcast(&(pes->ekb(ik_kpar[ipool], 0)), nbands, MPI_DOUBLE, source, MPI_COMM_WORLD);
             int desc_pool[9];
-            std::copy(k2d.get_p2D_pool()->desc, k2d.get_p2D_pool()->desc + 9, desc_pool);
+            if(k2d.is_in_pool())
+            {
+                std::copy(k2d.get_p2D_pool()->desc, k2d.get_p2D_pool()->desc + 9, desc_pool);
+            }
             if (k2d.get_my_pool() != ipool)
             {
                 desc_pool[1] = -1;
diff --git a/source/module_hsolver/kernels/cuda/diag_cusolver.cuh b/source/module_hsolver/kernels/cuda/diag_cusolver.cuh
index 557cf9c3bd..c71c085735 100644
--- a/source/module_hsolver/kernels/cuda/diag_cusolver.cuh
+++ b/source/module_hsolver/kernels/cuda/diag_cusolver.cuh
@@ -22,7 +22,7 @@ class Diag_Cusolver_gvd{
     //-------------------
 
     cusolverDnHandle_t cusolverH = nullptr;
-    
+
     cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; //problem type: A*x = (lambda)*B*x
     cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; // compute eigenvalues and eigenvectors.
     cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
diff --git a/source/module_hsolver/parallel_k2d.cpp b/source/module_hsolver/parallel_k2d.cpp
index 91ead02082..c416ac3733 100644
--- a/source/module_hsolver/parallel_k2d.cpp
+++ b/source/module_hsolver/parallel_k2d.cpp
@@ -7,11 +7,11 @@
 
 template <typename TK>
 void Parallel_K2D<TK>::set_para_env(int nks,
-                                    const int& nw,
-                                    const int& nb2d,
-                                    const int& nproc,
-                                    const int& my_rank,
-                                    const int& nspin) {
+                                    const int nw,
+                                    const int nb2d,
+                                    const int nproc,
+                                    const int my_rank,
+                                    const int nspin) {
     const int kpar = this->get_kpar();
     Parallel_Global::divide_mpi_groups(nproc,
                                        kpar,
@@ -34,6 +34,47 @@ void Parallel_K2D<TK>::set_para_env(int nks,
     this->P2D_pool->init(nw, nw, nb2d, this->POOL_WORLD_K2D);
 }
 
+template <typename TK>
+void Parallel_K2D<TK>::set_para_env_cusolver(int nks,
+                                             const int nw,
+                                             const int nb2d,
+                                             const int nproc,
+                                             const int my_rank,
+                                             const int nspin) {
+    const int kpar = this->get_kpar();
+    if(kpar <= 0 || kpar > nproc)
+    {
+        ModuleBase::WARNING_QUIT("Parallel_K2D::set_para_env_cusolver",
+                                 "kpar must be greater than 0 and less than nproc.");
+    }
+    const int pool_id = (my_rank < kpar) ? my_rank : MPI_UNDEFINED;
+    MPI_Comm_split(MPI_COMM_WORLD,
+                   pool_id,
+                   0,
+                   &this->POOL_WORLD_K2D);
+    this->P2D_global = new Parallel_2D;
+    this->P2D_global->init(nw, nw, nb2d, MPI_COMM_WORLD);
+    if(this->POOL_WORLD_K2D != MPI_COMM_NULL)
+    {
+        this->MY_POOL = pool_id;
+        this->RANK_IN_POOL = 0;
+        this->NPROC_IN_POOL = 1;
+        this->in_pool = true;
+        this->P2D_pool = new Parallel_2D;
+        this->P2D_pool->init(nw, nw, nb2d, this->POOL_WORLD_K2D);
+    }
+    else
+    {
+        this->in_pool = false;
+        this->MY_POOL = -1;
+        this->RANK_IN_POOL = -1;
+        this->NPROC_IN_POOL = 0;
+    }
+    this->Pkpoints = new Parallel_Kpoints;
+    this->Pkpoints
+        ->kinfo(nks, kpar, this->MY_POOL, this->RANK_IN_POOL, nproc, nspin);
+}
+
 template <typename TK>
 void Parallel_K2D<TK>::distribute_hsk(hamilt::Hamilt<TK>* pHamilt,
                                       const std::vector<int>& ik_kpar,
@@ -45,13 +86,16 @@ void Parallel_K2D<TK>::distribute_hsk(hamilt::Hamilt<TK>* pHamilt,
         pHamilt->updateHk(ik_kpar[ipool]);
         hamilt::MatrixBlock<TK> HK_global, SK_global;
         pHamilt->matrix(HK_global, SK_global);
-        if (this->MY_POOL == this->Pkpoints->whichpool[ik_kpar[ipool]]) {
+        if (this->in_pool && this->MY_POOL == this->Pkpoints->whichpool[ik_kpar[ipool]]) {
             this->hk_pool.resize(this->P2D_pool->get_local_size(), 0.0);
             this->sk_pool.resize(this->P2D_pool->get_local_size(), 0.0);
         }
         int desc_pool[9];
-        std::copy(this->P2D_pool->desc, this->P2D_pool->desc + 9, desc_pool);
-        if (this->MY_POOL != this->Pkpoints->whichpool[ik_kpar[ipool]]) {
+        if(this->in_pool)
+        {
+            std::copy(this->P2D_pool->desc, this->P2D_pool->desc + 9, desc_pool);
+        }
+        if ( !this->in_pool || this->MY_POOL != this->Pkpoints->whichpool[ik_kpar[ipool]]) {
             desc_pool[1] = -1;
         }
         Cpxgemr2d(nw,
@@ -77,7 +121,6 @@ void Parallel_K2D<TK>::distribute_hsk(hamilt::Hamilt<TK>* pHamilt,
                   desc_pool,
                   this->P2D_global->blacs_ctxt);
     }
-    ModuleBase::Memory::record("Parallel_K2D::hsk_pool", this->P2D_pool->get_local_size() * 2 * sizeof(TK));
     ModuleBase::timer::tick("Parallel_K2D", "distribute_hsk");
     MPI_Barrier(MPI_COMM_WORLD);
 #endif
@@ -97,7 +140,10 @@ void Parallel_K2D<TK>::unset_para_env() {
         delete this->P2D_pool;
         this->P2D_pool = nullptr;
     }
-    MPI_Comm_free(&this->POOL_WORLD_K2D);
+    if(this->POOL_WORLD_K2D != MPI_COMM_NULL)
+    {
+        MPI_Comm_free(&this->POOL_WORLD_K2D);
+    }
 }
 
 template
diff --git a/source/module_hsolver/parallel_k2d.h b/source/module_hsolver/parallel_k2d.h
index cdd7a951c8..b5ee4df54b 100644
--- a/source/module_hsolver/parallel_k2d.h
+++ b/source/module_hsolver/parallel_k2d.h
@@ -18,20 +18,28 @@ template <typename TK>
 class Parallel_K2D {
   public:
     /// private constructor
-    Parallel_K2D() {}
+    Parallel_K2D() {};
    /// private destructor
-    ~Parallel_K2D() {}
+    ~Parallel_K2D() {};
 
     /**
     * Public member functions
    */
     /// this function sets the parallel environment for k-points parallelism
     /// including the glabal and pool 2D parallel distribution
+    /// nspin doesn't affect anything here, in fact it can be deleted
     void set_para_env(int nks,
-                      const int& nw,
-                      const int& nb2d,
-                      const int& nproc,
-                      const int& my_rank,
-                      const int& nspin);
+                      const int nw,
+                      const int nb2d,
+                      const int nproc,
+                      const int my_rank,
+                      const int nspin = 1);
+
+    void set_para_env_cusolver(int nks,
+                               const int nw,
+                               const int nb2d,
+                               const int nproc,
+                               const int my_rank,
+                               const int nspin = 1);
 
     /// this function distributes the Hk and Sk matrices to hk_pool and sk_pool
     void distribute_hsk(hamilt::Hamilt<TK>* pHamilt,
@@ -44,15 +52,17 @@ class Parallel_K2D {
     /// set the number of k-points
     void set_kpar(int kpar);
     /// get the number of k-points
-    int get_kpar() { return this->kpar_; }
+    int get_kpar() { return this->kpar_; };
     /// get my pool
-    int get_my_pool() { return this->MY_POOL; }
+    int get_my_pool() { return this->MY_POOL; };
+    /// check if this proc is in any pool
+    bool is_in_pool() { return this->in_pool; };
     /// get pKpoints
-    Parallel_Kpoints* get_pKpoints() { return this->Pkpoints; }
+    Parallel_Kpoints* get_pKpoints() { return this->Pkpoints; };
     /// get p2D_global
-    Parallel_2D* get_p2D_global() { return this->P2D_global; }
+    Parallel_2D* get_p2D_global() { return this->P2D_global; };
     /// get p2D_pool
-    Parallel_2D* get_p2D_pool() { return this->P2D_pool; }
+    Parallel_2D* get_p2D_pool() { return this->P2D_pool; };
 
     /**
     * the local Hk, Sk matrices in POOL_WORLD_K2D
@@ -69,6 +79,7 @@ class Parallel_K2D {
     * Private member variables
     */
     int kpar_ = 0;
+    bool in_pool = true;
 
     /**
     * mpi info