
Feature: cusolver support for LCAO kpar #6139

Open
wants to merge 12 commits into develop
13 changes: 13 additions & 0 deletions source/module_base/parallel_common.cpp
@@ -21,44 +21,55 @@ void Parallel_Common::bcast_string(std::string& object) // Peize Lin fix bug 201
}

MPI_Bcast(&object[0], size, MPI_CHAR, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
return;
}

void Parallel_Common::bcast_string(std::string* object, const int n) // Peize Lin fix bug 2019-03-18
{
for (int i = 0; i < n; i++)
{
bcast_string(object[i]);
}
MPI_Barrier(MPI_COMM_WORLD);
return;
}

void Parallel_Common::bcast_complex_double(std::complex<double>& object)
{
MPI_Bcast(&object, 1, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);

}

void Parallel_Common::bcast_complex_double(std::complex<double>* object, const int n)
{
MPI_Bcast(object, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}

void Parallel_Common::bcast_double(double& object)
{
MPI_Bcast(&object, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}

void Parallel_Common::bcast_double(double* object, const int n)
{
MPI_Bcast(object, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}

void Parallel_Common::bcast_int(int& object)
{
MPI_Bcast(&object, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}

void Parallel_Common::bcast_int(int* object, const int n)
{
MPI_Bcast(object, n, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}

void Parallel_Common::bcast_bool(bool& object)
@@ -69,13 +80,15 @@ void Parallel_Common::bcast_bool(bool& object)
if (my_rank == 0)
swap = object;
MPI_Bcast(&swap, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
if (my_rank != 0)
object = static_cast<bool>(swap);
}

void Parallel_Common::bcast_char(char* object, const int n)
{
MPI_Bcast(object, n, MPI_CHAR, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}

#endif
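
Note on this file: every MPI_Bcast above is now followed by an MPI_Barrier on MPI_COMM_WORLD, so all ranks leave the broadcast together. A minimal standalone sketch of that pattern (the helper name bcast_int_sync is hypothetical and only for illustration; ABACUS keeps the per-type Parallel_Common::bcast_* functions shown in the diff):

#include <mpi.h>

// Broadcast n integers from rank 0, then synchronize all ranks,
// mirroring the bcast-then-barrier pattern added in parallel_common.cpp.
static void bcast_int_sync(int* data, const int n)
{
    MPI_Bcast(data, n, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
}

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int value = (rank == 0) ? 42 : 0; // only rank 0 holds the real value
    bcast_int_sync(&value, 1);        // afterwards every rank holds 42
    MPI_Finalize();
    return 0;
}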
10 changes: 5 additions & 5 deletions source/module_cell/parallel_kpoints.cpp
@@ -5,11 +5,11 @@

// the k-points here are reduced after symmetry is applied.
void Parallel_Kpoints::kinfo(int& nkstot_in,
const int& kpar_in,
const int& my_pool_in,
const int& rank_in_pool_in,
const int& nproc_in,
const int& nspin_in)
const int kpar_in,
const int my_pool_in,
const int rank_in_pool_in,
const int nproc_in,
const int nspin_in)
{
#ifdef __MPI

10 changes: 5 additions & 5 deletions source/module_cell/parallel_kpoints.h
@@ -13,11 +13,11 @@ class Parallel_Kpoints
~Parallel_Kpoints(){};

void kinfo(int& nkstot_in,
const int& kpar_in,
const int& my_pool_in,
const int& rank_in_pool_in,
const int& nproc_in,
const int& nspin_in);
const int kpar_in,
const int my_pool_in,
const int rank_in_pool_in,
const int nproc_in,
const int nspin_in);

// collect value from each pool to wk.
void pool_collection(double& value, const double* wk, const int& ik);
42 changes: 35 additions & 7 deletions source/module_hsolver/diago_cusolver.cpp
@@ -111,6 +111,27 @@ static void distributePsi(const int* desc_psi, T* psi, T* psi_g)
Cpxgemr2d(nrows, ncols, psi_g, 1, 1, descg, psi, 1, 1, descl, ctxt);
}

template <typename T>
void DiagoCusolver<T>::diag_pool(hamilt::MatrixBlock<T>& h_mat,
hamilt::MatrixBlock<T>& s_mat,
psi::Psi<T>& psi,
Real* eigenvalue_in,
MPI_Comm& comm)
{
ModuleBase::TITLE("DiagoCusolver", "diag_pool");
ModuleBase::timer::tick("DiagoCusolver", "diag_pool");
const int nbands_local = psi.get_nbands();
const int nbasis = psi.get_nbasis();
int nbands_global = nbands_local;
std::vector<double> eigen(nbasis, 0.0);
std::vector<T> eigenvectors(h_mat.row * h_mat.col);
this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
const int size = nbands_local * nbasis;
BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
BlasConnector::copy(nbands_global, eigen.data(), 1, eigenvalue_in, 1);
ModuleBase::timer::tick("DiagoCusolver", "diag_pool");
}

// Diagonalization function
template <typename T>
void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* eigenvalue_in)
@@ -122,7 +143,14 @@ void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* e
hamilt::MatrixBlock<T> h_mat;
hamilt::MatrixBlock<T> s_mat;
phm_in->matrix(h_mat, s_mat);

const int nbands_local = psi.get_nbands();
const int nbasis = psi.get_nbasis();
int nbands_global;
#ifdef __MPI
MPI_Allreduce(&nbands_local, &nbands_global, 1, MPI_INT, MPI_SUM, this->ParaV->comm());
#else
nbands_global = nbands_local;
#endif
#ifdef __MPI
// global matrix
Matrix_g<T> h_mat_g;
@@ -141,7 +169,7 @@ void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* e
#endif

// Allocate memory for eigenvalues
std::vector<double> eigen(PARAM.globalv.nlocal, 0.0);
std::vector<double> eigen(nbasis, 0.0);

// Start the timer for the cusolver operation
ModuleBase::timer::tick("DiagoCusolver", "cusolver");
@@ -171,31 +199,31 @@ void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* e
MPI_Barrier(MPI_COMM_WORLD);

// broadcast eigenvalues to all processes
MPI_Bcast(eigen.data(), PARAM.inp.nbands, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
MPI_Bcast(eigen.data(), nbands_global, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);

// distribute psi to all processes
distributePsi(this->ParaV->desc_wfc, psi.get_pointer(), psi_g.data());
}
else
{
// Be careful that h_mat.row * h_mat.col != psi.get_nbands() * psi.get_nbasis() under multi-k situation
// Be careful that h_mat.row * h_mat.col != nbands * nbasis under multi-k situation
std::vector<T> eigenvectors(h_mat.row * h_mat.col);
this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
const int size = psi.get_nbands() * psi.get_nbasis();
const int size = nbands_local * nbasis;
BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
}
#else
std::vector<T> eigenvectors(h_mat.row * h_mat.col);
this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
const int size = psi.get_nbands() * psi.get_nbasis();
const int size = nbands_local * nbasis;
BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
#endif
// Stop the timer for the cusolver operation
ModuleBase::timer::tick("DiagoCusolver", "cusolver");

// Copy the eigenvalues to the output arrays
const int inc = 1;
BlasConnector::copy(PARAM.inp.nbands, eigen.data(), inc, eigenvalue_in, inc);
BlasConnector::copy(nbands_global, eigen.data(), inc, eigenvalue_in, inc);
}

// Explicit instantiation of the DiagoCusolver class for real and complex numbers
7 changes: 7 additions & 0 deletions source/module_hsolver/diago_cusolver.h
@@ -24,6 +24,13 @@ class DiagoCusolver
DiagoCusolver(const Parallel_Orbitals* ParaV = nullptr);
~DiagoCusolver();

void diag_pool(
hamilt::MatrixBlock<T>& h_mat,
hamilt::MatrixBlock<T>& s_mat,
psi::Psi<T>& psi,
Real* eigenvalue_in,
MPI_Comm& comm);

// Override the diag function for CUSOLVER diagonalization
void diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* eigenvalue_in);

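
The header above exposes the new per-pool entry point. A hedged usage sketch, mirroring the call site added in hsolver_lcao.cpp below; hk_pool, sk_pool, psi_pool, pes, ik_global and k2d are assumed to be prepared by the Parallel_K2D machinery:

// Sketch only: one solver instance per k pool, operating on pool-local data.
DiagoCusolver<T> cs;
cs.diag_pool(hk_pool,                    // pool-local Hamiltonian block
             sk_pool,                    // pool-local overlap block
             psi_pool,                   // pool-local wavefunction storage
             &(pes->ekb(ik_global, 0)),  // eigenvalues for this k point
             k2d.POOL_WORLD_K2D);        // communicator of this k pool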
49 changes: 42 additions & 7 deletions source/module_hsolver/hsolver_lcao.cpp
@@ -18,6 +18,7 @@

#ifdef __CUDA
#include "diago_cusolver.h"
#include "module_base/module_device/device.h"
#endif

#ifdef __PEXSI
@@ -49,7 +50,8 @@ void HSolverLCAO<T, Device>::solve(hamilt::Hamilt<T>* pHamilt,
if (this->method != "pexsi")
{
if (PARAM.globalv.kpar_lcao > 1
&& (this->method == "genelpa" || this->method == "elpa" || this->method == "scalapack_gvx"))
&& (this->method == "genelpa" || this->method == "elpa"
|| this->method == "scalapack_gvx" || this->method == "cusolver"))
{
#ifdef __MPI
this->parakSolve(pHamilt, psi, pes, PARAM.globalv.kpar_lcao);
@@ -185,17 +187,29 @@ void HSolverLCAO<T, Device>::parakSolve(hamilt::Hamilt<T>* pHamilt,
{
#ifdef __MPI
ModuleBase::timer::tick("HSolverLCAO", "parakSolve");
#ifdef __CUDA
base_device::information::set_device_by_rank();
#endif
auto k2d = Parallel_K2D<T>();
k2d.set_kpar(kpar);
int nbands = this->ParaV->get_nbands();
int nks = psi.get_nk();
int nrow = this->ParaV->get_global_row_size();
int nb2d = this->ParaV->get_block_size();
k2d.set_para_env(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK, PARAM.inp.nspin);
if(this->method == "cusolver")
{
k2d.set_para_env_cusolver(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK);
} else
{
k2d.set_para_env(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK);
}
/// set psi_pool
const int zero = 0;
int ncol_bands_pool
= numroc_(&(nbands), &(nb2d), &(k2d.get_p2D_pool()->coord[1]), &zero, &(k2d.get_p2D_pool()->dim1));
int ncol_bands_pool = 0;
if(k2d.is_in_pool())
{
ncol_bands_pool = numroc_(&(nbands), &(nb2d), &(k2d.get_p2D_pool()->coord[1]), &zero, &(k2d.get_p2D_pool()->dim1));
}
/// Loop over k points to solve the Hamiltonian and obtain the charge density
for (int ik = 0; ik < k2d.get_pKpoints()->get_max_nks_pool(); ++ik)
{
@@ -222,9 +236,12 @@ void HSolverLCAO<T, Device>::parakSolve(hamilt::Hamilt<T>* pHamilt,
}
}
k2d.distribute_hsk(pHamilt, ik_kpar, nrow);
auto psi_pool = psi::Psi<T>();
if(k2d.is_in_pool())
{
/// global index of k point
int ik_global = ik + k2d.get_pKpoints()->startk_pool[k2d.get_my_pool()];
auto psi_pool = psi::Psi<T>(1, ncol_bands_pool, k2d.get_p2D_pool()->nrow, k2d.get_p2D_pool()->nrow, true);
psi_pool = psi::Psi<T>(1, ncol_bands_pool, k2d.get_p2D_pool()->nrow, k2d.get_p2D_pool()->nrow, true);
ModuleBase::Memory::record("HSolverLCAO::psi_pool", nrow * ncol_bands_pool * sizeof(T));
if (ik_global < psi.get_nk() && ik < k2d.get_pKpoints()->nks_pool[k2d.get_my_pool()])
{
@@ -255,21 +272,39 @@ void HSolverLCAO<T, Device>::parakSolve(hamilt::Hamilt<T>* pHamilt,
DiagoElpaNative<T> el;
el.diag_pool(hk_pool, sk_pool, psi_pool, &(pes->ekb(ik_global, 0)), k2d.POOL_WORLD_K2D);
}
#endif
#ifdef __CUDA
else if (this->method == "cusolver")
{
DiagoCusolver<T> cs;
cs.diag_pool(hk_pool, sk_pool, psi_pool, &(pes->ekb(ik_global, 0)), k2d.POOL_WORLD_K2D);
}
#endif
else
{
ModuleBase::WARNING_QUIT("HSolverLCAO::solve",
"This type of eigensolver for k-parallelism diagnolization is not supported!");
}
}
}
MPI_Barrier(MPI_COMM_WORLD);
ModuleBase::timer::tick("HSolverLCAO", "collect_psi");
for (int ipool = 0; ipool < ik_kpar.size(); ++ipool)
{
int source = k2d.get_pKpoints()->get_startpro_pool(ipool);
int source = 0;
if(this->method != "cusolver")
{
source = k2d.get_pKpoints()->get_startpro_pool(ipool);
} else
{
source = ipool;
}
MPI_Bcast(&(pes->ekb(ik_kpar[ipool], 0)), nbands, MPI_DOUBLE, source, MPI_COMM_WORLD);
int desc_pool[9];
std::copy(k2d.get_p2D_pool()->desc, k2d.get_p2D_pool()->desc + 9, desc_pool);
if(k2d.is_in_pool())
{
std::copy(k2d.get_p2D_pool()->desc, k2d.get_p2D_pool()->desc + 9, desc_pool);
}
if (k2d.get_my_pool() != ipool)
{
desc_pool[1] = -1;
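
The collection step above chooses a different broadcast root per pool depending on the solver. A condensed sketch of that logic (variable names follow the diff; the enclosing loop over ipool and the final redistribution are elided):

// For each pool, pick the rank that owns the pool's results and broadcast
// its eigenvalues to MPI_COMM_WORLD.
int source = 0;
if (this->method != "cusolver")
{
    // root of pool ipool as laid out by Parallel_Kpoints
    source = k2d.get_pKpoints()->get_startpro_pool(ipool);
}
else
{
    // with the cusolver pool layout, rank ipool serves as the root of pool ipool
    source = ipool;
}
MPI_Bcast(&(pes->ekb(ik_kpar[ipool], 0)), nbands, MPI_DOUBLE, source, MPI_COMM_WORLD);

// Only ranks inside a pool hold a valid ScaLAPACK descriptor; ranks outside
// pool ipool mark desc_pool invalid so the later redistribution ignores them.
int desc_pool[9];
if (k2d.is_in_pool())
{
    std::copy(k2d.get_p2D_pool()->desc, k2d.get_p2D_pool()->desc + 9, desc_pool);
}
if (k2d.get_my_pool() != ipool)
{
    desc_pool[1] = -1;
}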
2 changes: 1 addition & 1 deletion source/module_hsolver/kernels/cuda/diag_cusolver.cuh
@@ -22,7 +22,7 @@ class Diag_Cusolver_gvd{
//-------------------

cusolverDnHandle_t cusolverH = nullptr;

cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; //problem type: A*x = (lambda)*B*x
cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; // compute eigenvalues only (no eigenvectors)
cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
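
For reference, the three enums above (problem type, job mode, fill mode) are the arguments cuSOLVER's dense generalized eigensolvers take. A minimal sketch around cusolverDnDsygvd (real double precision, eigenvalues only); the wrapper name sygvd_eigenvalues is hypothetical, and d_A, d_B, d_W are assumed to be valid device buffers of size n*n, n*n and n:

#include <cuda_runtime.h>
#include <cusolverDn.h>

// Sketch: eigenvalues of the generalized problem A*x = lambda*B*x on the GPU.
// d_A and d_B are overwritten by the factorization; d_W receives the eigenvalues.
void sygvd_eigenvalues(int n, double* d_A, double* d_B, double* d_W)
{
    cusolverDnHandle_t handle = nullptr;
    cusolverDnCreate(&handle);

    const cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1;        // A*x = (lambda)*B*x
    const cusolverEigMode_t jobz  = CUSOLVER_EIG_MODE_NOVECTOR; // eigenvalues only
    const cublasFillMode_t  uplo  = CUBLAS_FILL_MODE_LOWER;

    int lwork = 0;
    cusolverDnDsygvd_bufferSize(handle, itype, jobz, uplo, n, d_A, n, d_B, n, d_W, &lwork);

    double* d_work = nullptr;
    int*    d_info = nullptr;
    cudaMalloc(&d_work, sizeof(double) * lwork);
    cudaMalloc(&d_info, sizeof(int));

    cusolverDnDsygvd(handle, itype, jobz, uplo, n, d_A, n, d_B, n, d_W, d_work, lwork, d_info);
    // on success, d_W holds the n eigenvalues in ascending order

    cudaFree(d_work);
    cudaFree(d_info);
    cusolverDnDestroy(handle);
}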