diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp
index e2bdfce96..cf9409f0f 100644
--- a/coreneuron/apps/main1.cpp
+++ b/coreneuron/apps/main1.cpp
@@ -12,8 +12,9 @@
  * @brief File containing main driver routine for CoreNeuron
  */
 
-#include
 #include
+#include
+#include
 #include
 #include
 
@@ -114,11 +115,24 @@ char* prepare_args(int& argc, char**& argv, int use_mpi, const char* arg) {
     // return actual data to be freed
     return first;
 }
-}
+
+}  // extern "C"
 
 namespace coreneuron {
+
 void call_prcellstate_for_prcellgid(int prcellgid, int compute_gpu, int is_init);
 
+
+static std::string check_restore() {
+    auto restore_path = corenrn_param.restorepath;
+    const auto auto_chkpt_path = corenrn_param.outpath + "/_corenrn_ckpt";
+    if (restore_path.empty() && fs_isdir(auto_chkpt_path.c_str())) {
+        restore_path = auto_chkpt_path;
+    }
+    return restore_path;
+}
+
+
 void nrn_init_and_load_data(int argc,
                             char* argv[],
                             bool is_mapping_needed = false,
@@ -168,7 +182,7 @@ void nrn_init_and_load_data(int argc,
     set_globals(corenrn_param.datpath.c_str(), (corenrn_param.seed >= 0), corenrn_param.seed);
 
     // set global variables for start time, timestep and temperature
-    std::string restore_path = corenrn_param.restorepath;
+    std::string restore_path = check_restore();
     t = restore_time(restore_path.c_str());
 
     if (corenrn_param.dt != -1000.) {  // command line arg highest precedence
@@ -402,6 +416,20 @@ std::unique_ptr create_report_handler(ReportConfiguration& config
     return report_handler;
 }
 
+/**
+ * \brief Installs a SIGTERM handler so that we finish the current simulation without losing data
+ * \note The handler only raises the stoprun flag: iostreams are not async-signal-safe
+ */
+static void install_sigterm_handler() {
+    auto sigh = [](int) {
+        // No logging here: only async-signal-safe operations are allowed in a signal handler
+        coreneuron::stoprun = true;
+    };
+    if (std::signal(SIGTERM, sigh) == SIG_ERR) {
+        std::cerr << "Could not install SIGTERM handler" << std::endl;
+    }
+}
+
 }  // namespace coreneuron
 
 /// The following high-level functions are marked as "extern C"
@@ -482,6 +510,9 @@ extern "C" int run_solve_core(int argc, char** argv) {
     if (nrnmpi_myid == 0) {
         mkdir_p(output_dir.c_str());
     }
+
+    install_sigterm_handler();
+
 #if NRNMPI
     nrnmpi_barrier();
 #endif
diff --git a/coreneuron/io/file_utils.cpp b/coreneuron/io/file_utils.cpp
index 96ee47d2f..f81482f0e 100644
--- a/coreneuron/io/file_utils.cpp
+++ b/coreneuron/io/file_utils.cpp
@@ -51,3 +51,13 @@ int mkdir_p(const char* path) {
     delete[] dirpath;
     return 0;
 }
+
+bool fs_exists(const char* path) {
+    struct stat buffer;
+    return (stat(path, &buffer) == 0);
+}
+
+bool fs_isdir(const char* path) {
+    struct stat buffer;
+    return (stat(path, &buffer) == 0 && S_ISDIR(buffer.st_mode));
+}
diff --git a/coreneuron/io/file_utils.hpp b/coreneuron/io/file_utils.hpp
index 2e3fc2b59..881b6d8a7 100644
--- a/coreneuron/io/file_utils.hpp
+++ b/coreneuron/io/file_utils.hpp
@@ -21,4 +21,12 @@
  */
 int mkdir_p(const char* path);
 
+/** @brief Checks an arbitrary path exists
+ */
+bool fs_exists(const char* path);
+
+/** @brief Checks an arbitrary path is an existing directory
+ */
+bool fs_isdir(const char* path);
+
 #endif /* ifndef NRN_FILE_UTILS */
diff --git a/coreneuron/nrnconf.h b/coreneuron/nrnconf.h
index 064cc1547..8953c55de 100644
--- a/coreneuron/nrnconf.h
+++ b/coreneuron/nrnconf.h
@@ -37,7 +37,7 @@ extern double pi;
 extern double t, dt;
 extern int rev_dt;
 extern int secondorder;
-extern bool stoprun;
+extern bool volatile stoprun;
 extern const char* bbcore_write_version;
 #define tstopbit (1 << 15)
 #define tstopset stoprun |= tstopbit
diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp
index 162efc063..b73a820de 100644
--- a/coreneuron/sim/fadvance_core.cpp
+++ b/coreneuron/sim/fadvance_core.cpp
@@ -6,6 +6,7 @@
 # =============================================================================.
 */
 
+#include
 #include
 
 #include "coreneuron/coreneuron.hpp"
@@ -23,12 +24,19 @@
 #include "coreneuron/utils/progressbar/progressbar.h"
 #include "coreneuron/utils/profile/profiler_interface.h"
 #include "coreneuron/io/nrn2core_direct.h"
+#include "coreneuron/io/nrn_checkpoint.hpp"
+
+// Do an auto checkpoint only if execution lasted longer than this var (secs)
+#define CHECKPOINT_MIN_RUNTIME (4 * 3600)  // 4h
 
 namespace coreneuron {
 
 extern corenrn_parameters corenrn_param;
 static void* nrn_fixed_step_thread(NrnThread*);
 static void* nrn_fixed_step_group_thread(NrnThread*, int, int, int&);
 
+static bool nrn_auto_checkpoint();
+static const time_t sim_start_time = time(nullptr);  // taken at static init (process start)
+
 void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */
     if (adt != nrn_threads[0]._dt) {
@@ -109,6 +117,7 @@ void nrn_fixed_single_steps_minimal(int total_sim_steps, double tstop) {
 #endif
         nrn_fixed_step_minimal();
         if (stoprun) {
+            nrn_auto_checkpoint();
             break;
         }
         current_steps++;
@@ -141,6 +150,7 @@ void nrn_fixed_step_group_minimal(int total_sim_steps) {
         nrn_flush_reports(nrn_threads[0]._t);
 #endif
         if (stoprun) {
+            nrn_auto_checkpoint();
             break;
         }
         current_steps++;
@@ -377,4 +387,29 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) {
     return nullptr;
 }
 
+
+/**
+ * \brief Does a checkpoint of the simulation if enough time has passed
+ * \return True if a checkpoint was written and moved into place. False otherwise (not enough
+ *         elapsed time, or the final rm/mv command reported a failure)
+ */
+static bool nrn_auto_checkpoint() {
+    const time_t cur_time = time(nullptr);
+    // difftime(end, beginning): the later timestamp must be the first argument
+    const double elapsed_secs = difftime(cur_time, sim_start_time);
+    if (elapsed_secs < CHECKPOINT_MIN_RUNTIME) {
+        return false;
+    }
+    // Write to tmp location first because allocated time may not be enough to complete
+    const auto ckpt_tmp = corenrn_param.outpath + "/_corenrn_ckpt_dirty",
+               ckpt_dir = corenrn_param.outpath + "/_corenrn_ckpt";
+    Instrumentor::phase p("Checkpointing");
+    write_checkpoint(nrn_threads, nrn_nthread, ckpt_tmp.c_str());
+    // Swap the finished checkpoint into place; a non-zero status means the swap did not succeed
+    const auto swap_cmd =
+        "/bin/rm -rf '" + ckpt_dir + "'; " + "/bin/mv '" + ckpt_tmp + "' '" + ckpt_dir + "'";
+    return system(swap_cmd.c_str()) == 0;
+}
+
+
 }  // namespace coreneuron
diff --git a/coreneuron/utils/nrnoc_aux.cpp b/coreneuron/utils/nrnoc_aux.cpp
index 1efc7d395..ba7f87816 100644
--- a/coreneuron/utils/nrnoc_aux.cpp
+++ b/coreneuron/utils/nrnoc_aux.cpp
@@ -15,7 +15,7 @@
 #include "coreneuron/utils/nrnoc_aux.hpp"
 
 namespace coreneuron {
-bool stoprun;
+bool volatile stoprun;
 int v_structure_change;
 int diam_changed;
 #define MAXERRCOUNT 5