diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt
index 91efb88a7..1c7ed6d8c 100644
--- a/src/Drivers/Sparse/CMakeLists.txt
+++ b/src/Drivers/Sparse/CMakeLists.txt
@@ -18,11 +18,10 @@
 add_executable(NlpSparseEx4.exe NlpSparseEx4.cpp NlpSparseEx4Driver.cpp)
 target_link_libraries(NlpSparseEx4.exe HiOp::HiOp)
 
 if(HIOP_USE_RAJA)
-  if(HIOP_USE_GPU AND HIOP_USE_CUDA)
+  if(HIOP_USE_GPU)
     set_source_files_properties(
       NlpSparseRajaEx2.cpp
       NlpSparseRajaEx2Driver.cpp
-      PROPERTIES LANGUAGE CUDA
     )
     add_executable(NlpSparseRajaEx2.exe NlpSparseRajaEx2Driver.cpp NlpSparseRajaEx2.cpp)
diff --git a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp
index 3131fc024..3bc259bae 100644
--- a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp
+++ b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp
@@ -239,8 +239,10 @@ int main(int argc, char **argv)
     nlp.options->SetStringValue("fact_acceptor", "inertia_free");
     nlp.options->SetIntegerValue("ir_outer_maxit", 0);
     if (use_ginkgo_cuda) {
+      nlp.options->SetStringValue("compute_mode", "gpu");
       nlp.options->SetStringValue("ginkgo_exec", "cuda");
     } else if (use_ginkgo_hip) {
+      nlp.options->SetStringValue("compute_mode", "gpu");
       nlp.options->SetStringValue("ginkgo_exec", "hip");
     } else {
       nlp.options->SetStringValue("ginkgo_exec", "reference");
diff --git a/src/Drivers/Sparse/NlpSparseEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseEx2Driver.cpp
index e61b866a0..27e3048d0 100644
--- a/src/Drivers/Sparse/NlpSparseEx2Driver.cpp
+++ b/src/Drivers/Sparse/NlpSparseEx2Driver.cpp
@@ -248,8 +248,10 @@ int main(int argc, char **argv)
     nlp.options->SetStringValue("linsol_mode", "speculative");
     nlp.options->SetStringValue("linear_solver_sparse", "ginkgo");
     if (use_ginkgo_cuda) {
+      nlp.options->SetStringValue("compute_mode", "gpu");
       nlp.options->SetStringValue("ginkgo_exec", "cuda");
     } else if (use_ginkgo_hip) {
+      nlp.options->SetStringValue("compute_mode", "gpu");
       nlp.options->SetStringValue("ginkgo_exec", "hip");
     } else {
       nlp.options->SetStringValue("ginkgo_exec", "reference");
diff --git a/src/Drivers/Sparse/NlpSparseEx4Driver.cpp b/src/Drivers/Sparse/NlpSparseEx4Driver.cpp
index 0200284f6..878a605e9 100644
--- a/src/Drivers/Sparse/NlpSparseEx4Driver.cpp
+++ b/src/Drivers/Sparse/NlpSparseEx4Driver.cpp
@@ -238,8 +238,10 @@ int main(int argc, char **argv)
     nlp.options->SetStringValue("fact_acceptor", "inertia_free");
     nlp.options->SetIntegerValue("ir_outer_maxit", 0);
     if (use_ginkgo_cuda) {
+      nlp.options->SetStringValue("compute_mode", "gpu");
       nlp.options->SetStringValue("ginkgo_exec", "cuda");
     } else if (use_ginkgo_hip) {
+      nlp.options->SetStringValue("compute_mode", "gpu");
       nlp.options->SetStringValue("ginkgo_exec", "hip");
     } else {
       nlp.options->SetStringValue("ginkgo_exec", "reference");
diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp
index a2455ebe1..d079ed46e 100644
--- a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp
+++ b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp
@@ -256,12 +256,24 @@ int main(int argc, char **argv)
     // only support cusolverLU right now, 2023.02.28
     //lsq initialization of the duals fails for this example since the Jacobian is rank deficient
     //use zero initialization
-    nlp.options->SetStringValue("linear_solver_sparse", "resolve");
     if(use_resolve_cuda_rf) {
+      nlp.options->SetStringValue("linear_solver_sparse", "resolve");
       nlp.options->SetStringValue("resolve_refactorization", "rf");
       nlp.options->SetIntegerValue("ir_inner_maxit", 20);
      nlp.options->SetIntegerValue("ir_outer_maxit", 0);
    }
+    if (use_ginkgo) {
+      nlp.options->SetStringValue("linear_solver_sparse", "ginkgo");
+      nlp.options->SetIntegerValue("ir_outer_maxit", 0);
+      if (use_ginkgo_cuda) {
+        nlp.options->SetStringValue("ginkgo_exec", "cuda");
+      } else if (use_ginkgo_hip) {
+        nlp.options->SetStringValue("ginkgo_exec", "hip");
+      } else {
+        nlp.options->SetStringValue("ginkgo_exec", "reference");
+        nlp.options->SetStringValue("compute_mode", "cpu");
+      }
+    }
    nlp.options->SetStringValue("duals_init", "zero");
    nlp.options->SetStringValue("mem_space", "device");
    nlp.options->SetStringValue("fact_acceptor", "inertia_free");
diff --git a/src/LinAlg/hiopLinSolverSparseGinkgo.cpp b/src/LinAlg/hiopLinSolverSparseGinkgo.cpp
index 971ef699a..e463500b2 100644
--- a/src/LinAlg/hiopLinSolverSparseGinkgo.cpp
+++ b/src/LinAlg/hiopLinSolverSparseGinkgo.cpp
@@ -212,13 +212,11 @@ std::shared_ptr<gko::Executor> create_exec(std::string executor_string)
         {"omp", [] { return gko::OmpExecutor::create(); }},
         {"cuda",
          [] {
-           return gko::CudaExecutor::create(0, gko::ReferenceExecutor::create(),
-                                            true);
+           return gko::CudaExecutor::create(0, gko::ReferenceExecutor::create());
          }},
         {"hip",
          [] {
-           return gko::HipExecutor::create(0, gko::ReferenceExecutor::create(),
-                                           true);
+           return gko::HipExecutor::create(0, gko::ReferenceExecutor::create());
          }},
         {"dpcpp",
          [] {
@@ -283,10 +281,19 @@ std::shared_ptr<gko::LinOpFactory> setup_solver_factory(std::shared_ptr<const gko::Executor> exec,
+  if(nlp_->options->GetString("mem_space") == "device") {
+    M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz);
+  }
+}
 
 hiopLinSolverSymSparseGinkgo::~hiopLinSolverSymSparseGinkgo()
 {
+  // If memory space is device, delete allocated host mirrors
+  if(nlp_->options->GetString("mem_space") == "device") {
+    delete M_host_;
+  }
+
   delete [] index_covert_CSR2Triplet_;
   delete [] index_covert_extra_Diag2CSR_;
 }
@@ -304,11 +311,34 @@ std::shared_ptr<gko::LinOpFactory> setup_solver_factory(std::shared_ptr<const gko::Executor> exec,
   auto gmres_restart = nlp_->options->GetInteger("ir_inner_restart");
   iterative_refinement_ = gmres_iter > 0;
 
-  host_mtx_ = transferTripletToCSR(exec_->get_master(), n_, M_, &index_covert_CSR2Triplet_, &index_covert_extra_Diag2CSR_);
+  // If the matrix is on device, copy it to the host mirror
+  std::string mem_space = nlp_->options->GetString("mem_space");
+  auto M = M_;
+  if(mem_space == "device") {
+    auto host = exec_->get_master();
+    auto nnz = M_->numberOfNonzeros();
+    //host->copy_from(exec_.get(), nnz, M_->M(), M_host_->M());
+    auto dv = gko::make_const_array_view(exec_, nnz, M_->M());
+    auto hv = gko::make_array_view(host, nnz, M_host_->M());
+    host->copy_from(exec_.get(), nnz, dv.get_const_data(), hv.get_data());
+    auto di = gko::make_const_array_view(exec_, nnz, M_->i_row());
+    auto hi = gko::make_array_view(host, nnz, M_host_->i_row());
+    host->copy_from(exec_.get(), nnz, di.get_const_data(), hi.get_data());
+    auto dj = gko::make_const_array_view(exec_, nnz, M_->j_col());
+    auto hj = gko::make_array_view(host, nnz, M_host_->j_col());
+    host->copy_from(exec_.get(), nnz, dj.get_const_data(), hj.get_data());
+    //host->copy_from(exec_.get(), nnz, M_->i_row(), M_host_->i_row());
+    //host->copy_from(exec_.get(), nnz, M_->j_col(), M_host_->j_col());
+    M = M_host_;
+  }
+
+  host_mtx_ = transferTripletToCSR(exec_->get_master(), n_, M, &index_covert_CSR2Triplet_, &index_covert_extra_Diag2CSR_);
   mtx_ = exec_ == (exec_->get_master()) ?
          host_mtx_ : gko::clone(exec_, host_mtx_);
   nnz_ = mtx_->get_num_stored_elements();
   reusable_factory_ = setup_solver_factory(exec_, mtx_, alg, gmres_iter, gmres_tol, gmres_restart);
+
+  dense_b_ = gko::matrix::Dense<double>::create(exec_, gko::dim<2>{n_, 1});
 }
 
 int hiopLinSolverSymSparseGinkgo::matrixChanged()
@@ -321,7 +351,15 @@ std::shared_ptr<gko::LinOpFactory> setup_solver_factory(std::shared_ptr<const gko::Executor> exec,
   if( !mtx_ ) {
     this->firstCall();
   } else {
-    update_matrix(M_, mtx_, host_mtx_, index_covert_CSR2Triplet_, index_covert_extra_Diag2CSR_);
+    std::string mem_space = nlp_->options->GetString("mem_space");
+    auto M = M_;
+    if(mem_space == "device") {
+      auto host = exec_->get_master();
+      auto nnz = M_->numberOfNonzeros();
+      host->copy_from(exec_.get(), nnz, M_->M(), M_host_->M());
+      M = M_host_;
+    }
+    update_matrix(M, mtx_, host_mtx_, index_covert_CSR2Triplet_, index_covert_extra_Diag2CSR_);
   }
 
   gko_solver_ = gko::share(reusable_factory_->generate(mtx_));
@@ -350,24 +388,20 @@ std::shared_ptr<gko::LinOpFactory> setup_solver_factory(std::shared_ptr<const gko::Executor> exec,
   nlp_->runStats.linsolv.tmTriuSolves.start();
 
-  hiopVectorPar* x = dynamic_cast<hiopVectorPar*>(&x_);
-  assert(x != NULL);
-  hiopVectorPar* rhs = dynamic_cast<hiopVectorPar*>(x->new_copy());
-  double* dx = x->local_data();
-  double* drhs = rhs->local_data();
+  std::string mem_space = nlp_->options->GetString("mem_space");
+  auto exec = host;
+  if(mem_space == "device") {
+    exec = exec_;
+  }
+
+  double* dx = x_.local_data();
   const auto size = gko::dim<2>{(long unsigned int)n_, 1};
-  auto dense_x_host = vec::create(host, size, arr::view(host, n_, dx), 1);
-  auto dense_x = vec::create(exec_, size);
-  dense_x->copy_from(dense_x_host.get());
-  auto dense_b_host = vec::create(host, size, arr::view(host, n_, drhs), 1);
-  auto dense_b = vec::create(exec_, size);
-  dense_b->copy_from(dense_b_host.get());
-
-  gko_solver_->apply(dense_b.get(), dense_x.get());
+  auto dense_x = vec::create(exec, size, arr::view(exec, n_, dx), 1);
+  dense_b_->copy_from(dense_x.get());
+
+  gko_solver_->apply(dense_b_.get(), dense_x.get());
 
   nlp_->runStats.linsolv.tmTriuSolves.stop();
 
-  dense_x_host->copy_from(dense_x.get());
-  delete rhs; rhs=nullptr;
   return 1;
 }
diff --git a/src/LinAlg/hiopLinSolverSparseGinkgo.hpp b/src/LinAlg/hiopLinSolverSparseGinkgo.hpp
index 058c606a5..d1c7a0419 100644
--- a/src/LinAlg/hiopLinSolverSparseGinkgo.hpp
+++ b/src/LinAlg/hiopLinSolverSparseGinkgo.hpp
@@ -90,12 +90,15 @@ class hiopLinSolverSymSparseGinkgo: public hiopLinSolverSymSparse
   std::shared_ptr<gko::Executor> exec_;
   std::shared_ptr<gko::matrix::Csr<double, int>> mtx_;
   std::shared_ptr<gko::matrix::Csr<double, int>> host_mtx_;
+  std::shared_ptr<gko::matrix::Dense<double>> dense_b_;
   std::shared_ptr<gko::LinOpFactory> reusable_factory_;
   std::shared_ptr<gko::LinOp> gko_solver_;
   bool iterative_refinement_;
 
   static const std::map alg_map_;
 
+  hiopMatrixSparse* M_host_{ nullptr }; ///< Host mirror for the KKT matrix
+
 public:
 
  /** called the very first time a matrix is factored. Allocates space
diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp
index c41a8579a..3cee4380f 100644
--- a/src/Optimization/hiopKKTLinSysSparse.cpp
+++ b/src/Optimization/hiopKKTLinSysSparse.cpp
@@ -317,9 +317,7 @@ namespace hiop
      if( (nullptr == linSys_ && linear_solver == "auto") || linear_solver == "ginkgo") {
        //ma57, pardiso and strumpack are not available or user requested ginkgo
 #ifdef HIOP_USE_GINKGO
-        nlp_->log->printf(hovScalars,
-                          "KKT_SPARSE_XYcYd linsys: alloc GINKGO with matrix size %d (%d cons)\n",
-                          n, neq+nineq);
+        linsol_actual = "GINKGO";
        linSys_ = new hiopLinSolverSymSparseGinkgo(n, nnz, nlp_);
 #endif // HIOP_USE_GINKGO
      }
@@ -376,6 +374,14 @@
        linSys_ = new hiopLinSolverSymSparsePARDISO(n, nnz, nlp_);
 #endif // HIOP_USE_PARDISO
      }
+
+      if( (nullptr == linSys_ && linear_solver == "auto") || linear_solver == "ginkgo") {
+        //ma57, pardiso and strumpack are not available or user requested ginkgo
+#ifdef HIOP_USE_GINKGO
+        linsol_actual = "GINKGO";
+        linSys_ = new hiopLinSolverSymSparseGinkgo(n, nnz, nlp_);
+#endif // HIOP_USE_GINKGO
+      }
 
      if(linSys_) {
        nlp_->log->printf(hovScalars,
@@ -747,6 +753,14 @@
 #endif // HIOP_USE_PARDISO
      }
 
+      if(nullptr == linSys_ && linear_solver == "ginkgo") {
+        //ma57, pardiso and strumpack are not available or user requested ginkgo
+#ifdef HIOP_USE_GINKGO
+        linSys_ = new hiopLinSolverSymSparseGinkgo(n, nnz, nlp_);
+        actual_lin_solver = "GINKGO";
+#endif // HIOP_USE_GINKGO
+      }
+
      if(linSys_) {
        nlp_->log->printf(hovScalars,
                          "KKT_SPARSE_XDYcYd linsys: alloc [%s] size %d (%d cons) (hybrid)\n",
@@ -781,6 +795,14 @@
          }
 #endif
        } //end resolve
+
+        if(nullptr == linSys_ && linear_solver == "ginkgo") {
+          //ma57, pardiso and strumpack are not available or user requested ginkgo
+#ifdef HIOP_USE_GINKGO
+          linSys_ = new hiopLinSolverSymSparseGinkgo(n, nnz, nlp_);
+          actual_lin_solver = "GINKGO";
+#endif // HIOP_USE_GINKGO
+        }
      } // end of compute mode gpu
    }
    assert(linSys_&& "KKT_SPARSE_XDYcYd linsys: cannot instantiate backend linear solver");
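
For anyone exercising the new Ginkgo path outside these example drivers, the sketch below collects the option calls the updated drivers make. It is a minimal illustration under stated assumptions: the helper name configure_ginkgo and the template parameter Nlp are hypothetical, while every option key and value comes from the hunks above (the drivers operate on an object whose options member exposes SetStringValue/SetIntegerValue, such as HiOp's sparse NLP formulation).

#include <string>

// Hypothetical helper (name and template parameter are illustrative only) that
// mirrors the Ginkgo-related options set by the updated drivers.
// `Nlp` stands for any HiOp NLP object exposing the `options` member used in
// the drivers, e.g. the sparse NLP formulation the drivers call `nlp`.
template <typename Nlp>
void configure_ginkgo(Nlp& nlp, const std::string& ginkgo_exec)
{
  nlp.options->SetStringValue("linear_solver_sparse", "ginkgo");
  nlp.options->SetStringValue("fact_acceptor", "inertia_free");
  nlp.options->SetIntegerValue("ir_outer_maxit", 0);
  nlp.options->SetStringValue("ginkgo_exec", ginkgo_exec);  // "cuda", "hip", or "reference"
  if(ginkgo_exec == "cuda" || ginkgo_exec == "hip") {
    // Device executors are paired with HiOp's GPU compute mode, as in
    // NlpSparseEx1/Ex2/Ex4Driver.cpp above.
    nlp.options->SetStringValue("compute_mode", "gpu");
  } else {
    // The reference executor runs on the host, matching the RAJA driver's else branch.
    nlp.options->SetStringValue("compute_mode", "cpu");
  }
}

The RAJA example additionally keeps mem_space set to "device" and duals_init set to "zero"; a device mem_space is what routes hiopLinSolverSymSparseGinkgo through the new M_host_ mirror and the pre-allocated dense_b_ right-hand side during the solve.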