diff --git a/source/module_base/parallel_common.cpp b/source/module_base/parallel_common.cpp
index 6f8ce79fbc..7be024c0a8 100644
--- a/source/module_base/parallel_common.cpp
+++ b/source/module_base/parallel_common.cpp
@@ -21,44 +21,55 @@ void Parallel_Common::bcast_string(std::string& object) // Peize Lin fix bug 201
     }
     MPI_Bcast(&object[0], size, MPI_CHAR, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
     return;
 }
 
 void Parallel_Common::bcast_string(std::string* object, const int n) // Peize Lin fix bug 2019-03-18
 {
     for (int i = 0; i < n; i++)
+    {
         bcast_string(object[i]);
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
     return;
 }
 
 void Parallel_Common::bcast_complex_double(std::complex<double>& object)
 {
     MPI_Bcast(&object, 1, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
+
 }
 
 void Parallel_Common::bcast_complex_double(std::complex<double>* object, const int n)
 {
     MPI_Bcast(object, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
 }
 
 void Parallel_Common::bcast_double(double& object)
 {
     MPI_Bcast(&object, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
 }
 
 void Parallel_Common::bcast_double(double* object, const int n)
 {
     MPI_Bcast(object, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
 }
 
 void Parallel_Common::bcast_int(int& object)
 {
     MPI_Bcast(&object, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
 }
 
 void Parallel_Common::bcast_int(int* object, const int n)
 {
     MPI_Bcast(object, n, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
 }
 
 void Parallel_Common::bcast_bool(bool& object)
@@ -69,6 +80,7 @@ void Parallel_Common::bcast_bool(bool& object)
     if (my_rank == 0)
         swap = object;
     MPI_Bcast(&swap, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
     if (my_rank != 0)
         object = static_cast<bool>(swap);
 }
@@ -76,6 +88,7 @@ void Parallel_Common::bcast_bool(bool& object)
 void Parallel_Common::bcast_char(char* object, const int n)
 {
     MPI_Bcast(object, n, MPI_CHAR, 0, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
 }
 
 #endif
diff --git a/source/module_cell/parallel_kpoints.cpp b/source/module_cell/parallel_kpoints.cpp
index 69dc13b898..54c25ae5aa 100644
--- a/source/module_cell/parallel_kpoints.cpp
+++ b/source/module_cell/parallel_kpoints.cpp
@@ -5,11 +5,11 @@
 
 // the kpoints here are reduced after symmetry applied.
 void Parallel_Kpoints::kinfo(int& nkstot_in,
-                             const int& kpar_in,
-                             const int& my_pool_in,
-                             const int& rank_in_pool_in,
-                             const int& nproc_in,
-                             const int& nspin_in)
+                             const int kpar_in,
+                             const int my_pool_in,
+                             const int rank_in_pool_in,
+                             const int nproc_in,
+                             const int nspin_in)
 {
 #ifdef __MPI
diff --git a/source/module_cell/parallel_kpoints.h b/source/module_cell/parallel_kpoints.h
index 971a2849f2..e2e232509f 100644
--- a/source/module_cell/parallel_kpoints.h
+++ b/source/module_cell/parallel_kpoints.h
@@ -13,11 +13,11 @@ class Parallel_Kpoints
     ~Parallel_Kpoints(){};
 
     void kinfo(int& nkstot_in,
-               const int& kpar_in,
-               const int& my_pool_in,
-               const int& rank_in_pool_in,
-               const int& nproc_in,
-               const int& nspin_in);
+               const int kpar_in,
+               const int my_pool_in,
+               const int rank_in_pool_in,
+               const int nproc_in,
+               const int nspin_in);
 
     // collect value from each pool to wk.
     void pool_collection(double& value, const double* wk, const int& ik);
diff --git a/source/module_hsolver/diago_cusolver.cpp b/source/module_hsolver/diago_cusolver.cpp
index efaf20a28a..ee0858885f 100644
--- a/source/module_hsolver/diago_cusolver.cpp
+++ b/source/module_hsolver/diago_cusolver.cpp
@@ -111,6 +111,27 @@ static void distributePsi(const int* desc_psi, T* psi, T* psi_g)
     Cpxgemr2d(nrows, ncols, psi_g, 1, 1, descg, psi, 1, 1, descl, ctxt);
 }
 
+template <typename T>
+void DiagoCusolver<T>::diag_pool(hamilt::MatrixBlock<T>& h_mat,
+                                 hamilt::MatrixBlock<T>& s_mat,
+                                 psi::Psi<T>& psi,
+                                 Real* eigenvalue_in,
+                                 MPI_Comm& comm)
+{
+    ModuleBase::TITLE("DiagoCusolver", "diag_pool");
+    ModuleBase::timer::tick("DiagoCusolver", "diag_pool");
+    const int nbands_local = psi.get_nbands();
+    const int nbasis = psi.get_nbasis();
+    int nbands_global = nbands_local;
+    std::vector<double> eigen(nbasis, 0.0);
+    std::vector<T> eigenvectors(h_mat.row * h_mat.col);
+    this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
+    const int size = nbands_local * nbasis;
+    BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
+    BlasConnector::copy(nbands_global, eigen.data(), 1, eigenvalue_in, 1);
+    ModuleBase::timer::tick("DiagoCusolver", "diag_pool");
+}
+
 // Diagonalization function
 template <typename T>
 void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* eigenvalue_in)
@@ -122,7 +143,14 @@ void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* e
     hamilt::MatrixBlock<T> h_mat;
     hamilt::MatrixBlock<T> s_mat;
     phm_in->matrix(h_mat, s_mat);
-
+    const int nbands_local = psi.get_nbands();
+    const int nbasis = psi.get_nbasis();
+    int nbands_global;
+#ifdef __MPI
+    MPI_Allreduce(&nbands_local, &nbands_global, 1, MPI_INT, MPI_SUM, this->ParaV->comm());
+#else
+    nbands_global = nbands_local;
+#endif
 #ifdef __MPI
     // global matrix
     Matrix_g<T> h_mat_g;
@@ -141,7 +169,7 @@ void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* e
 #endif
 
     // Allocate memory for eigenvalues
-    std::vector<double> eigen(PARAM.globalv.nlocal, 0.0);
+    std::vector<double> eigen(nbasis, 0.0);
 
     // Start the timer for the cusolver operation
     ModuleBase::timer::tick("DiagoCusolver", "cusolver");
@@ -171,23 +199,23 @@ void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* e
         MPI_Barrier(MPI_COMM_WORLD);
 
         // broadcast eigenvalues to all processes
-        MPI_Bcast(eigen.data(), PARAM.inp.nbands, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
+        MPI_Bcast(eigen.data(), nbands_global, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
 
         // distribute psi to all processes
         distributePsi(this->ParaV->desc_wfc, psi.get_pointer(), psi_g.data());
     }
     else
     {
-        // Be careful that h_mat.row * h_mat.col != psi.get_nbands() * psi.get_nbasis() under multi-k situation
+        // Be careful that h_mat.row * h_mat.col != nbands * nbasis under multi-k situation
         std::vector<T> eigenvectors(h_mat.row * h_mat.col);
         this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
-        const int size = psi.get_nbands() * psi.get_nbasis();
+        const int size = nbands_local * nbasis;
         BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
     }
 #else
     std::vector<T> eigenvectors(h_mat.row * h_mat.col);
     this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
-    const int size = psi.get_nbands() * psi.get_nbasis();
+    const int size = nbands_local * nbasis;
     BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
 #endif
     // Stop the timer for the cusolver operation
@@ -195,7 +223,7 @@ void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* e
 
     // Copy the eigenvalues to the output arrays
     const int inc = 1;
-    BlasConnector::copy(PARAM.inp.nbands, eigen.data(), inc, eigenvalue_in, inc);
+    BlasConnector::copy(nbands_global, eigen.data(), inc, eigenvalue_in, inc);
 }
 
 // Explicit instantiation of the DiagoCusolver class for real and complex numbers
diff --git a/source/module_hsolver/diago_cusolver.h b/source/module_hsolver/diago_cusolver.h
index ed67357f3f..61f37999be 100644
--- a/source/module_hsolver/diago_cusolver.h
+++ b/source/module_hsolver/diago_cusolver.h
@@ -24,6 +24,13 @@ class DiagoCusolver
     DiagoCusolver(const Parallel_Orbitals* ParaV = nullptr);
     ~DiagoCusolver();
 
+    void diag_pool(
+        hamilt::MatrixBlock<T>& h_mat,
+        hamilt::MatrixBlock<T>& s_mat,
+        psi::Psi<T>& psi,
+        Real* eigenvalue_in,
+        MPI_Comm& comm);
+
     // Override the diag function for CUSOLVER diagonalization
     void diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* eigenvalue_in);
 
diff --git a/source/module_hsolver/hsolver_lcao.cpp b/source/module_hsolver/hsolver_lcao.cpp
index 6a361e08db..585836f6ee 100644
--- a/source/module_hsolver/hsolver_lcao.cpp
+++ b/source/module_hsolver/hsolver_lcao.cpp
@@ -18,6 +18,7 @@
 
 #ifdef __CUDA
 #include "diago_cusolver.h"
+#include "module_base/module_device/device.h"
 #endif
 
 #ifdef __PEXSI
@@ -49,7 +50,8 @@ void HSolverLCAO<T, Device>::solve(hamilt::Hamilt<T>* pHamilt,
     if (this->method != "pexsi")
     {
         if (PARAM.globalv.kpar_lcao > 1
-            && (this->method == "genelpa" || this->method == "elpa" || this->method == "scalapack_gvx"))
+            && (this->method == "genelpa" || this->method == "elpa"
+                || this->method == "scalapack_gvx" || this->method == "cusolver"))
         {
 #ifdef __MPI
             this->parakSolve(pHamilt, psi, pes, PARAM.globalv.kpar_lcao);
@@ -185,17 +187,29 @@ void HSolverLCAO<T, Device>::parakSolve(hamilt::Hamilt<T>* pHamilt,
 {
 #ifdef __MPI
     ModuleBase::timer::tick("HSolverLCAO", "parakSolve");
+#ifdef __CUDA
+    base_device::information::set_device_by_rank();
+#endif
     auto k2d = Parallel_K2D<T>();
     k2d.set_kpar(kpar);
     int nbands = this->ParaV->get_nbands();
     int nks = psi.get_nk();
     int nrow = this->ParaV->get_global_row_size();
     int nb2d = this->ParaV->get_block_size();
-    k2d.set_para_env(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK, PARAM.inp.nspin);
+    if(this->method == "cusolver")
+    {
+        k2d.set_para_env_cusolver(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK);
+    } else
+    {
+        k2d.set_para_env(psi.get_nk(), nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK);
+    }
     /// set psi_pool
     const int zero = 0;
-    int ncol_bands_pool
-        = numroc_(&(nbands), &(nb2d), &(k2d.get_p2D_pool()->coord[1]), &zero, &(k2d.get_p2D_pool()->dim1));
+    int ncol_bands_pool = 0;
+    if(k2d.is_in_pool())
+    {
+        ncol_bands_pool = numroc_(&(nbands), &(nb2d), &(k2d.get_p2D_pool()->coord[1]), &zero, &(k2d.get_p2D_pool()->dim1));
+    }
     /// Loop over k points for solve Hamiltonian to charge density
     for (int ik = 0; ik < k2d.get_pKpoints()->get_max_nks_pool(); ++ik)
     {
@@ -222,9 +236,12 @@ void HSolverLCAO<T, Device>::parakSolve(hamilt::Hamilt<T>* pHamilt,
             }
         }
         k2d.distribute_hsk(pHamilt, ik_kpar, nrow);
+        auto psi_pool = psi::Psi<T>();
+        if(k2d.is_in_pool())
+        {
         /// global index of k point
         int ik_global = ik + k2d.get_pKpoints()->startk_pool[k2d.get_my_pool()];
-        auto psi_pool = psi::Psi<T>(1, ncol_bands_pool, k2d.get_p2D_pool()->nrow, k2d.get_p2D_pool()->nrow, true);
+        psi_pool = psi::Psi<T>(1, ncol_bands_pool, k2d.get_p2D_pool()->nrow, k2d.get_p2D_pool()->nrow, true);
         ModuleBase::Memory::record("HSolverLCAO::psi_pool", nrow * ncol_bands_pool * sizeof(T));
         if (ik_global < psi.get_nk() && ik < k2d.get_pKpoints()->nks_pool[k2d.get_my_pool()])
         {
@@ -255,6 +272,13 @@ void HSolverLCAO<T, Device>::parakSolve(hamilt::Hamilt<T>* pHamilt,
                 DiagoElpaNative<T> el;
                 el.diag_pool(hk_pool, sk_pool, psi_pool, &(pes->ekb(ik_global, 0)), k2d.POOL_WORLD_K2D);
             }
+#endif
+#ifdef __CUDA
+            else if (this->method == "cusolver")
+            {
+                DiagoCusolver<T> cs;
+                cs.diag_pool(hk_pool, sk_pool, psi_pool, &(pes->ekb(ik_global, 0)), k2d.POOL_WORLD_K2D);
+            }
 #endif
             else
             {
@@ -262,14 +286,25 @@ void HSolverLCAO<T, Device>::parakSolve(hamilt::Hamilt<T>* pHamilt,
                                          "This type of eigensolver for k-parallelism diagnolization is not supported!");
             }
         }
+        }
         MPI_Barrier(MPI_COMM_WORLD);
         ModuleBase::timer::tick("HSolverLCAO", "collect_psi");
         for (int ipool = 0; ipool < ik_kpar.size(); ++ipool)
         {
-            int source = k2d.get_pKpoints()->get_startpro_pool(ipool);
+            int source = 0;
+            if(this->method != "cusolver")
+            {
+                source = k2d.get_pKpoints()->get_startpro_pool(ipool);
+            } else
+            {
+                source = ipool;
+            }
             MPI_Bcast(&(pes->ekb(ik_kpar[ipool], 0)), nbands, MPI_DOUBLE, source, MPI_COMM_WORLD);
             int desc_pool[9];
-            std::copy(k2d.get_p2D_pool()->desc, k2d.get_p2D_pool()->desc + 9, desc_pool);
+            if(k2d.is_in_pool())
+            {
+                std::copy(k2d.get_p2D_pool()->desc, k2d.get_p2D_pool()->desc + 9, desc_pool);
+            }
             if (k2d.get_my_pool() != ipool)
             {
                 desc_pool[1] = -1;
diff --git a/source/module_hsolver/kernels/cuda/diag_cusolver.cuh b/source/module_hsolver/kernels/cuda/diag_cusolver.cuh
index 557cf9c3bd..c71c085735 100644
--- a/source/module_hsolver/kernels/cuda/diag_cusolver.cuh
+++ b/source/module_hsolver/kernels/cuda/diag_cusolver.cuh
@@ -22,7 +22,7 @@ class Diag_Cusolver_gvd{
     //-------------------
 
     cusolverDnHandle_t cusolverH = nullptr;
-    
+
     cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; //problem type: A*x = (lambda)*B*x
     cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; // compute eigenvalues and eigenvectors.
     cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
diff --git a/source/module_hsolver/parallel_k2d.cpp b/source/module_hsolver/parallel_k2d.cpp
index 91ead02082..c416ac3733 100644
--- a/source/module_hsolver/parallel_k2d.cpp
+++ b/source/module_hsolver/parallel_k2d.cpp
@@ -7,11 +7,11 @@
 
 template <typename TK>
 void Parallel_K2D<TK>::set_para_env(int nks,
-                                    const int& nw,
-                                    const int& nb2d,
-                                    const int& nproc,
-                                    const int& my_rank,
-                                    const int& nspin) {
+                                    const int nw,
+                                    const int nb2d,
+                                    const int nproc,
+                                    const int my_rank,
+                                    const int nspin) {
     const int kpar = this->get_kpar();
     Parallel_Global::divide_mpi_groups(nproc,
                                        kpar,
@@ -34,6 +34,47 @@ void Parallel_K2D<TK>::set_para_env(int nks,
     this->P2D_pool->init(nw, nw, nb2d, this->POOL_WORLD_K2D);
 }
 
+template <typename TK>
+void Parallel_K2D<TK>::set_para_env_cusolver(int nks,
+                                             const int nw,
+                                             const int nb2d,
+                                             const int nproc,
+                                             const int my_rank,
+                                             const int nspin) {
+    const int kpar = this->get_kpar();
+    if(kpar <= 0 || kpar > nproc)
+    {
+        ModuleBase::WARNING_QUIT("Parallel_K2D::set_para_env_cusolver",
+                                 "kpar must be greater than 0 and less than nproc.");
+    }
+    const int pool_id = (my_rank < kpar) ? my_rank : MPI_UNDEFINED;
+    MPI_Comm_split(MPI_COMM_WORLD,
+                   pool_id,
+                   0,
+                   &this->POOL_WORLD_K2D);
+    this->P2D_global = new Parallel_2D;
+    this->P2D_global->init(nw, nw, nb2d, MPI_COMM_WORLD);
+    if(this->POOL_WORLD_K2D != MPI_COMM_NULL)
+    {
+        this->MY_POOL = pool_id;
+        this->RANK_IN_POOL = 0;
+        this->NPROC_IN_POOL = 1;
+        this->in_pool = true;
+        this->P2D_pool = new Parallel_2D;
+        this->P2D_pool->init(nw, nw, nb2d, this->POOL_WORLD_K2D);
+    }
+    else
+    {
+        this->in_pool = false;
+        this->MY_POOL = -1;
+        this->RANK_IN_POOL = -1;
+        this->NPROC_IN_POOL = 0;
+    }
+    this->Pkpoints = new Parallel_Kpoints;
+    this->Pkpoints
+        ->kinfo(nks, kpar, this->MY_POOL, this->RANK_IN_POOL, nproc, nspin);
+}
+
 template <typename TK>
 void Parallel_K2D<TK>::distribute_hsk(hamilt::Hamilt<TK>* pHamilt,
                                       const std::vector<int>& ik_kpar,
@@ -45,13 +86,16 @@ void Parallel_K2D<TK>::distribute_hsk(hamilt::Hamilt<TK>* pHamilt,
         pHamilt->updateHk(ik_kpar[ipool]);
         hamilt::MatrixBlock<TK> HK_global, SK_global;
         pHamilt->matrix(HK_global, SK_global);
-        if (this->MY_POOL == this->Pkpoints->whichpool[ik_kpar[ipool]]) {
+        if (this->in_pool && this->MY_POOL == this->Pkpoints->whichpool[ik_kpar[ipool]]) {
             this->hk_pool.resize(this->P2D_pool->get_local_size(), 0.0);
             this->sk_pool.resize(this->P2D_pool->get_local_size(), 0.0);
         }
         int desc_pool[9];
-        std::copy(this->P2D_pool->desc, this->P2D_pool->desc + 9, desc_pool);
-        if (this->MY_POOL != this->Pkpoints->whichpool[ik_kpar[ipool]]) {
+        if(this->in_pool)
+        {
+            std::copy(this->P2D_pool->desc, this->P2D_pool->desc + 9, desc_pool);
+        }
+        if ( !this->in_pool || this->MY_POOL != this->Pkpoints->whichpool[ik_kpar[ipool]]) {
             desc_pool[1] = -1;
         }
         Cpxgemr2d(nw,
@@ -77,7 +121,6 @@ void Parallel_K2D<TK>::distribute_hsk(hamilt::Hamilt<TK>* pHamilt,
                   desc_pool,
                   this->P2D_global->blacs_ctxt);
     }
-    ModuleBase::Memory::record("Parallel_K2D::hsk_pool", this->P2D_pool->get_local_size() * 2 * sizeof(TK));
     ModuleBase::timer::tick("Parallel_K2D", "distribute_hsk");
     MPI_Barrier(MPI_COMM_WORLD);
 #endif
@@ -97,7 +140,10 @@ void Parallel_K2D<TK>::unset_para_env() {
         delete this->P2D_pool;
         this->P2D_pool = nullptr;
     }
-    MPI_Comm_free(&this->POOL_WORLD_K2D);
+    if(this->POOL_WORLD_K2D != MPI_COMM_NULL)
+    {
+        MPI_Comm_free(&this->POOL_WORLD_K2D);
+    }
 }
 
 template
diff --git a/source/module_hsolver/parallel_k2d.h b/source/module_hsolver/parallel_k2d.h
index cdd7a951c8..b5ee4df54b 100644
--- a/source/module_hsolver/parallel_k2d.h
+++ b/source/module_hsolver/parallel_k2d.h
@@ -18,20 +18,28 @@ template <typename TK>
 class Parallel_K2D {
   public:
     /// private constructor
-    Parallel_K2D() {}
+    Parallel_K2D() {};
    /// private destructor
-    ~Parallel_K2D() {}
+    ~Parallel_K2D() {};
 
     /**
     * Public member functions
    */
     /// this function sets the parallel environment for k-points parallelism
     /// including the glabal and pool 2D parallel distribution
+    /// nspin doesn't affect anything here, in fact it can be deleted
     void set_para_env(int nks,
-                      const int& nw,
-                      const int& nb2d,
-                      const int& nproc,
-                      const int& my_rank,
-                      const int& nspin);
+                      const int nw,
+                      const int nb2d,
+                      const int nproc,
+                      const int my_rank,
+                      const int nspin = 1);
+
+    void set_para_env_cusolver(int nks,
+                               const int nw,
+                               const int nb2d,
+                               const int nproc,
+                               const int my_rank,
+                               const int nspin = 1);
 
     /// this function distributes the Hk and Sk matrices to hk_pool and sk_pool
     void distribute_hsk(hamilt::Hamilt<TK>* pHamilt,
@@ -44,15 +52,17 @@ class Parallel_K2D {
     /// set the number of k-points
     void set_kpar(int kpar);
     /// get the number of k-points
-    int get_kpar() { return this->kpar_; }
+    int get_kpar() { return this->kpar_; };
     /// get my pool
-    int get_my_pool() { return this->MY_POOL; }
+    int get_my_pool() { return this->MY_POOL; };
+    /// check if this proc is in any pool
+    bool is_in_pool() { return this->in_pool; };
     /// get pKpoints
-    Parallel_Kpoints* get_pKpoints() { return this->Pkpoints; }
+    Parallel_Kpoints* get_pKpoints() { return this->Pkpoints; };
     /// get p2D_global
-    Parallel_2D* get_p2D_global() { return this->P2D_global; }
+    Parallel_2D* get_p2D_global() { return this->P2D_global; };
     /// get p2D_pool
-    Parallel_2D* get_p2D_pool() { return this->P2D_pool; }
+    Parallel_2D* get_p2D_pool() { return this->P2D_pool; };
 
     /**
     * the local Hk, Sk matrices in POOL_WORLD_K2D
@@ -69,6 +79,7 @@ class Parallel_K2D {
     * Private member variables
     */
     int kpar_ = 0;
+    bool in_pool = true;
 
     /**
     * mpi info