From 0721b3299f6ad3efb6c9181c2de6189136d53afe Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 14 Dec 2021 18:29:29 +0100 Subject: [PATCH 01/11] Added stream id vector in NrnThread --- coreneuron/io/phase2.cpp | 4 +++- coreneuron/sim/multicore.cpp | 3 +++ coreneuron/sim/multicore.hpp | 7 ++++--- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/coreneuron/io/phase2.cpp b/coreneuron/io/phase2.cpp index e54e1db81..157b69d61 100644 --- a/coreneuron/io/phase2.cpp +++ b/coreneuron/io/phase2.cpp @@ -875,7 +875,9 @@ void Phase2::populate(NrnThread& nt, const UserParams& userParams) { * greater than number of omp threads. */ #if defined(_OPENMP) - nt.stream_id = omp_get_thread_num(); + const auto omp_thread_num = omp_get_thread_num(); + nt.stream_id = omp_thread_num; + nt.streams[nt.stream_id] = omp_thread_num; #endif int shadow_rhs_cnt = 0; diff --git a/coreneuron/sim/multicore.cpp b/coreneuron/sim/multicore.cpp index d5368a29c..d11ab790a 100644 --- a/coreneuron/sim/multicore.cpp +++ b/coreneuron/sim/multicore.cpp @@ -106,6 +106,9 @@ void nrn_threads_create(int n) { for (int i = 0; i < nrn_nthread; ++i) { NrnThread& nt = nrn_threads[i]; nt.id = i; +#if defined(_OPENMP) + nt.streams.reserve(omp_get_num_threads()); +#endif for (int j = 0; j < BEFORE_AFTER_SIZE; ++j) { nt.tbl[j] = nullptr; } diff --git a/coreneuron/sim/multicore.hpp b/coreneuron/sim/multicore.hpp index c108e2431..56f8d3af8 100644 --- a/coreneuron/sim/multicore.hpp +++ b/coreneuron/sim/multicore.hpp @@ -130,9 +130,10 @@ struct NrnThread: public MemoryManaged { NrnThreadBAList* tbl[BEFORE_AFTER_SIZE]; /* wasteful since almost all empty */ - int shadow_rhs_cnt = 0; /* added to facilitate the NrnThread transfer to GPU */ - int compute_gpu = 0; /* define whether to compute with gpus */ - int stream_id = 0; /* define where the kernel will be launched on GPU stream */ + int shadow_rhs_cnt = 0; /* added to facilitate the NrnThread transfer to GPU */ + int compute_gpu = 0; /* define whether to compute with gpus */ + int stream_id = 0; /* define where the kernel will be launched on GPU stream */ + std::vector streams; /* vector of stream ids needed for async execution of OpenMP in multiple streams */ int _net_send_buffer_size = 0; int _net_send_buffer_cnt = 0; int* _net_send_buffer = nullptr; From a36d21c1a1452df913b946981de7dfd7be0f03c6 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 17 Dec 2021 13:20:23 +0100 Subject: [PATCH 02/11] Fixed openacc async clauses --- coreneuron/gpu/nrn_acc_manager.cpp | 8 ++++---- coreneuron/mechanism/capac.cpp | 4 ++-- coreneuron/mechanism/eion.cpp | 14 ++++++-------- coreneuron/network/netcvode.cpp | 8 ++++---- coreneuron/network/partrans.cpp | 8 ++++---- coreneuron/permute/cellorder.cpp | 8 ++++---- coreneuron/sim/fadvance_core.cpp | 20 ++++++++++---------- coreneuron/sim/fast_imem.cpp | 4 ++-- coreneuron/sim/solve_core.cpp | 8 ++++---- coreneuron/sim/treeset_core.cpp | 12 ++++++------ 10 files changed, 46 insertions(+), 48 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index d5e723527..4a1583f8b 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -326,8 +326,8 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* Here is the example of using OpenACC data enter/exit * Remember that we are not allowed to use nt->_data but we have to use: * double *dtmp = nt->_data; // now use dtmp! - #pragma acc enter data copyin(dtmp[0:nt->_ndata]) async(nt->stream_id) - #pragma acc wait(nt->stream_id) + #pragma acc enter data copyin(dtmp[0:nt->_ndata]) async(nt->streams[nt->stream_id]) + #pragma acc wait(nt->streams[nt->stream_id]) */ /*update d_nt._data to point to device copy */ @@ -736,7 +736,7 @@ void update_net_receive_buffer(NrnThread* nt) { nrb->_nrb_flag[:nrb->_cnt], nrb->_displ[:nrb->_displ_cnt + 1], nrb->_nrb_index[:nrb->_cnt]) - async(nt->stream_id)) + async(nt->streams[nt->stream_id])) nrn_pragma_omp(target update to(nrb->_cnt, nrb->_displ_cnt, nrb->_pnt_index[:nrb->_cnt], @@ -749,7 +749,7 @@ void update_net_receive_buffer(NrnThread* nt) { } } } - nrn_pragma_acc(wait(nt->stream_id)) + nrn_pragma_acc(wait(nt->streams[nt->stream_id])) } void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { diff --git a/coreneuron/mechanism/capac.cpp b/coreneuron/mechanism/capac.cpp index 42c65cb18..0cb59c4b1 100644 --- a/coreneuron/mechanism/capac.cpp +++ b/coreneuron/mechanism/capac.cpp @@ -70,7 +70,7 @@ void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) { nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm], ni [0:_cntml_actual], _vec_d [0:_nt->end]) if (_nt->compute_gpu) - async(_nt->stream_id)) + async(_nt->streams[_nt->stream_id]) nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (_iml = 0; _iml < _cntml_actual; _iml++) { _vec_d[ni[_iml]] += cfac * cm; @@ -116,7 +116,7 @@ void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) { nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm], ni [0:_cntml_actual], _vec_rhs [0:_nt->end]) if (_nt->compute_gpu) - async(_nt->stream_id)) + async(_nt->streams[_nt->stream_id]) nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; _iml++) { i_cap = cfac * cm * _vec_rhs[ni[_iml]]; diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp index 8b58e858d..2f95e0f9c 100644 --- a/coreneuron/mechanism/eion.cpp +++ b/coreneuron/mechanism/eion.cpp @@ -263,13 +263,11 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) { int _cntml_padded = ml->_nodecount_padded; pd = ml->data; ppd = ml->pdata; - // clang-format off - nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5], - nrn_ion_global_map[0:nrn_ion_global_map_size] - [0:ion_global_map_member_size]) - if (nt->compute_gpu) - async(nt->stream_id)) - // clang-format on + nrn_pragma_acc(parallel loop present( + pd [0:_cntml_padded * 5], + nrn_ion_global_map + [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu) + async(nt->streams[nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { dcurdv = 0.; @@ -342,7 +340,7 @@ void second_order_cur(NrnThread* _nt, int secondorder) { nrn_pragma_acc(parallel loop present(pd [0:_cntml_padded * 5], ni [0:_cntml_actual], _vec_rhs [0:_nt->end]) if (_nt->compute_gpu) - async(_nt->stream_id)) + async(_nt->streams[_nt->stream_id]) nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { cur += dcurdv * (_vec_rhs[ni[_iml]]); diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp index 4fb1d165f..b209c1e09 100644 --- a/coreneuron/network/netcvode.cpp +++ b/coreneuron/network/netcvode.cpp @@ -533,7 +533,7 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method nrn_pragma_acc(parallel loop present( nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end]) - copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id)) + copy(net_send_buf_count) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu)) for (int i = 0; i < nt->ncell; ++i) { PreSyn* ps = presyns + i; @@ -561,15 +561,15 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method nt->_net_send_buffer[idx] = i; } } - nrn_pragma_acc(wait(nt->stream_id)) + nrn_pragma_acc(wait(nt->streams[nt->stream_id])) nt->_net_send_buffer_cnt = net_send_buf_count; if (nt->compute_gpu && nt->_net_send_buffer_cnt) { #ifdef CORENEURON_ENABLE_GPU int* nsbuffer = nt->_net_send_buffer; #endif - nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->stream_id)) - nrn_pragma_acc(wait(nt->stream_id)) + nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->streams[nt->stream_id])) + nrn_pragma_acc(wait(nt->streams[nt->stream_id])) nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt])) } diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index 4c517e999..a2f9af8f0 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -56,13 +56,13 @@ void nrnmpi_v_transfer() { nrn_pragma_acc(parallel loop present(src_indices [0:n_src_gather], src_data [0:nt->_ndata], src_gather [0:n_src_gather]) if (nt->compute_gpu) - async(nt->stream_id)) + async(nt->streams[nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = 0; i < n_src_gather; ++i) { src_gather[i] = src_data[src_indices[i]]; } nrn_pragma_acc(update host(src_gather [0:n_src_gather]) if (nt->compute_gpu) - async(nt->stream_id)) + async(nt->streams[nt->stream_id])) nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu)) } @@ -71,7 +71,7 @@ void nrnmpi_v_transfer() { for (int tid = 0; tid < nrn_nthread; ++tid) { if (nrn_threads[tid].compute_gpu) { compute_gpu = true; - nrn_pragma_acc(wait(nrn_threads[tid].stream_id)) + nrn_pragma_acc(wait(nrn_threads[tid].streams[nrn_threads[tid].stream_id])) } TransferThreadData& ttd = transfer_thread_data_[tid]; size_t n_outsrc_indices = ttd.outsrc_indices.size(); @@ -122,7 +122,7 @@ void nrnthread_v_transfer(NrnThread* _nt) { nrn_pragma_acc(parallel loop present(insrc_indices [0:ntar], tar_data [0:ndata], insrc_buf_ [0:n_insrc_buf]) if (_nt->compute_gpu) - async(_nt->stream_id)) + async(_nt->streams[_nt->stream_id]) nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu)) for (size_t i = 0; i < ntar; ++i) { tar_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]]; diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index 14feb31de..9814c792b 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -606,7 +606,7 @@ void solve_interleaved2(int ith) { ncycles [0:nwarp], stridedispl [0:nwarp + 1], rootbegin [0:nwarp + 1], - nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id)) + nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int icore = 0; icore < ncore; ++icore) { int iwarp = icore / warpsize; // figure out the >> value @@ -626,7 +626,7 @@ void solve_interleaved2(int ith) { } // serial test mode #endif } - nrn_pragma_acc(wait(nt->stream_id)) + nrn_pragma_acc(wait(nt->streams[nt->stream_id])) #ifdef _OPENACC } #endif @@ -660,14 +660,14 @@ void solve_interleaved1(int ith) { firstnode [0:ncell], lastnode [0:ncell], cellsize [0:ncell]) if (nt->compute_gpu) - async(nt->stream_id)) + async(nt->streams[nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int icell = 0; icell < ncell; ++icell) { int icellsize = cellsize[icell]; triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode); bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode); } - nrn_pragma_acc(wait(nt->stream_id)) + nrn_pragma_acc(wait(nt->streams[nt->stream_id])) } void solve_interleaved(int ith) { diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index ab6fc4bfb..6dfab8ae4 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -79,7 +79,7 @@ void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */ nt->cj = 1.0 / dt; } nrn_pragma_acc(update device(nt->_t, nt->_dt, nt->cj) - async(nt->stream_id) if (nt->compute_gpu)) + async(nt->streams[nt->stream_id]) if (nt->compute_gpu)) // clang-format off nrn_pragma_omp(target update to(nt->_t, nt->_dt, nt->cj) if(nt->compute_gpu)) @@ -206,14 +206,14 @@ void update(NrnThread* _nt) { /* do not need to worry about linmod or extracellular*/ if (secondorder) { nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu) - async(_nt->stream_id)) + async(_nt->streams[_nt->stream_id]) nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < i2; ++i) { vec_v[i] += 2. * vec_rhs[i]; } } else { nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu) - async(_nt->stream_id)) + async(_nt->streams[_nt->stream_id]) nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < i2; ++i) { vec_v[i] += vec_rhs[i]; @@ -295,7 +295,7 @@ void nrncore2nrn_send_values(NrnThread* nth) { assert(vs < tr->bsize); nrn_pragma_acc(parallel loop present(tr [0:1]) if (nth->compute_gpu) - async(nth->stream_id)) + async(nth->streams[nth->stream_id])) nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu)) for (int i = 0; i < tr->n_trajec; ++i) { tr->varrays[i][vs] = *tr->gather[i]; @@ -316,10 +316,10 @@ void nrncore2nrn_send_values(NrnThread* nth) { for (int i = 0; i < tr->n_trajec; ++i) { double* gather_i = tr->gather[i]; nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu) - async(nth->stream_id)) + async(nth->streams[nth->stream_id])) nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu)) } - nrn_pragma_acc(wait(nth->stream_id)) + nrn_pragma_acc(wait(nth->streams[nth->stream_id))) for (int i = 0; i < tr->n_trajec; ++i) { *(tr->scatter[i]) = *(tr->gather[i]); } @@ -341,8 +341,8 @@ static void* nrn_fixed_step_thread(NrnThread* nth) { if (nth->ncell) { /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can launch kernel) */ - nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id)) - nrn_pragma_acc(wait(nth->stream_id)) + nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) + nrn_pragma_acc(wait(nth->streams[nth->stream_id))) nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); @@ -377,8 +377,8 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) { if (nth->ncell) { /*@todo: do we need to update nth->_t on GPU */ - nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id)) - nrn_pragma_acc(wait(nth->stream_id)) + nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) + nrn_pragma_acc(wait(nth->streams[nth->stream_id))) nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); nonvint(nth); diff --git a/coreneuron/sim/fast_imem.cpp b/coreneuron/sim/fast_imem.cpp index 1218b7967..d579e7678 100644 --- a/coreneuron/sim/fast_imem.cpp +++ b/coreneuron/sim/fast_imem.cpp @@ -52,7 +52,7 @@ void nrn_calc_fast_imem(NrnThread* nt) { double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs; nrn_pragma_acc( parallel loop present(vec_rhs, vec_area, fast_imem_d, fast_imem_rhs) if (nt->compute_gpu) - async(nt->stream_id)) + async(nt->streams[nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_rhs[i] = (fast_imem_d[i] * vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01; @@ -69,7 +69,7 @@ void nrn_calc_fast_imem_init(NrnThread* nt) { double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs; nrn_pragma_acc(parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu) - async(nt->stream_id)) + async(nt->streams[nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_rhs[i] = (vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01; diff --git a/coreneuron/sim/solve_core.cpp b/coreneuron/sim/solve_core.cpp index 60ba2b660..69de841e4 100644 --- a/coreneuron/sim/solve_core.cpp +++ b/coreneuron/sim/solve_core.cpp @@ -41,7 +41,7 @@ static void triang(NrnThread* _nt) { nrn_pragma_acc(parallel loop seq present( vec_a [0:i3], vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3]) - async(_nt->stream_id) if (_nt->compute_gpu)) + async(_nt->streams[_nt->stream_id]) if (_nt->compute_gpu)) for (int i = i3 - 1; i >= i2; --i) { double p = vec_a[i] / vec_d[i]; vec_d[parent_index[i]] -= p * vec_b[i]; @@ -61,21 +61,21 @@ static void bksub(NrnThread* _nt) { int* parent_index = _nt->_v_parent_index; nrn_pragma_acc(parallel loop seq present(vec_d [0:i2], vec_rhs [0:i2]) - async(_nt->stream_id) if (_nt->compute_gpu)) + async(_nt->streams[_nt->stream_id]) if (_nt->compute_gpu)) for (int i = i1; i < i2; ++i) { vec_rhs[i] /= vec_d[i]; } nrn_pragma_acc( parallel loop seq present(vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3]) - async(_nt->stream_id) if (_nt->compute_gpu)) + async(_nt->streams[_nt->stream_id]) if (_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { vec_rhs[i] -= vec_b[i] * vec_rhs[parent_index[i]]; vec_rhs[i] /= vec_d[i]; } if (_nt->compute_gpu) { - nrn_pragma_acc(wait(_nt->stream_id)) + nrn_pragma_acc(wait(_nt->streams[_nth->stream_id])) } } } // namespace coreneuron diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp index 208058fe1..a7de3a1be 100644 --- a/coreneuron/sim/treeset_core.cpp +++ b/coreneuron/sim/treeset_core.cpp @@ -33,7 +33,7 @@ static void nrn_rhs(NrnThread* _nt) { int* parent_index = _nt->_v_parent_index; nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu) - async(_nt->stream_id)) + async(_nt->streams[_nt->stream_id]) nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { vec_rhs[i] = 0.; @@ -45,7 +45,7 @@ static void nrn_rhs(NrnThread* _nt) { double* fast_imem_rhs = _nt->nrn_fast_imem->nrn_sav_rhs; nrn_pragma_acc( parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) - async(_nt->stream_id)) + async(_nt->streams[_nt->stream_id]) nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_d[i] = 0.; @@ -75,7 +75,7 @@ static void nrn_rhs(NrnThread* _nt) { */ double* p = _nt->nrn_fast_imem->nrn_sav_rhs; nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu) - async(_nt->stream_id)) + async(_nt->streams[_nt->stream_id]) nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] -= vec_rhs[i]; @@ -92,7 +92,7 @@ static void nrn_rhs(NrnThread* _nt) { vec_b [0:i3], vec_v [0:i3], parent_index [0:i3]) if (_nt->compute_gpu) - async(_nt->stream_id)) + async(_nt->streams[_nt->stream_id]) nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { double dv = vec_v[parent_index[i]] - vec_v[i]; @@ -152,7 +152,7 @@ static void nrn_lhs(NrnThread* _nt) { so here we transform so it only has membrane current contribution */ double* p = _nt->nrn_fast_imem->nrn_sav_d; - nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id)) + nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id]) nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] += vec_d[i]; @@ -162,7 +162,7 @@ static void nrn_lhs(NrnThread* _nt) { /* now add the axial currents */ nrn_pragma_acc(parallel loop present( vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu) - async(_nt->stream_id)) + async(_nt->streams[_nt->stream_id]) nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { nrn_pragma_acc(atomic update) From 3f3f77302b3bbdff0278be1669da89b40aac1c96 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 17 Dec 2021 13:23:35 +0100 Subject: [PATCH 03/11] Updated nmodl and mod2c submodules --- external/mod2c | 2 +- external/nmodl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/external/mod2c b/external/mod2c index 9c1ca6ed3..a33e8df74 160000 --- a/external/mod2c +++ b/external/mod2c @@ -1 +1 @@ -Subproject commit 9c1ca6ed3fcefad9230b4cdf2f5e7fde6cce6d08 +Subproject commit a33e8df74f4f6bee44f8de5b397c3ff4338202c3 diff --git a/external/nmodl b/external/nmodl index 46f8baf2b..8355747b3 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit 46f8baf2bbeaa0d21559d6306ec37b94c601f1ee +Subproject commit 8355747b38b9b285cdf44ca39a383a8de648144f From 6060f2c6f8e1d4239730bf6a4c5468c16e676ec8 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 20 Dec 2021 12:34:09 +0100 Subject: [PATCH 04/11] Fixed issues with missing parenthesis --- coreneuron/mechanism/capac.cpp | 4 ++-- coreneuron/mechanism/eion.cpp | 2 +- coreneuron/network/partrans.cpp | 2 +- coreneuron/sim/fadvance_core.cpp | 6 +++--- coreneuron/sim/multicore.cpp | 4 ++++ coreneuron/sim/solve_core.cpp | 2 +- coreneuron/sim/treeset_core.cpp | 12 ++++++------ 7 files changed, 18 insertions(+), 14 deletions(-) diff --git a/coreneuron/mechanism/capac.cpp b/coreneuron/mechanism/capac.cpp index 0cb59c4b1..c132a1bfd 100644 --- a/coreneuron/mechanism/capac.cpp +++ b/coreneuron/mechanism/capac.cpp @@ -70,7 +70,7 @@ void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) { nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm], ni [0:_cntml_actual], _vec_d [0:_nt->end]) if (_nt->compute_gpu) - async(_nt->streams[_nt->stream_id]) + async(_nt->streams[_nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (_iml = 0; _iml < _cntml_actual; _iml++) { _vec_d[ni[_iml]] += cfac * cm; @@ -116,7 +116,7 @@ void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) { nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm], ni [0:_cntml_actual], _vec_rhs [0:_nt->end]) if (_nt->compute_gpu) - async(_nt->streams[_nt->stream_id]) + async(_nt->streams[_nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; _iml++) { i_cap = cfac * cm * _vec_rhs[ni[_iml]]; diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp index 2f95e0f9c..23ab6dc9b 100644 --- a/coreneuron/mechanism/eion.cpp +++ b/coreneuron/mechanism/eion.cpp @@ -340,7 +340,7 @@ void second_order_cur(NrnThread* _nt, int secondorder) { nrn_pragma_acc(parallel loop present(pd [0:_cntml_padded * 5], ni [0:_cntml_actual], _vec_rhs [0:_nt->end]) if (_nt->compute_gpu) - async(_nt->streams[_nt->stream_id]) + async(_nt->streams[_nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { cur += dcurdv * (_vec_rhs[ni[_iml]]); diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index a2f9af8f0..3f3c589f9 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -122,7 +122,7 @@ void nrnthread_v_transfer(NrnThread* _nt) { nrn_pragma_acc(parallel loop present(insrc_indices [0:ntar], tar_data [0:ndata], insrc_buf_ [0:n_insrc_buf]) if (_nt->compute_gpu) - async(_nt->streams[_nt->stream_id]) + async(_nt->streams[_nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu)) for (size_t i = 0; i < ntar; ++i) { tar_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]]; diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index 6dfab8ae4..4f6da406f 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -206,14 +206,14 @@ void update(NrnThread* _nt) { /* do not need to worry about linmod or extracellular*/ if (secondorder) { nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu) - async(_nt->streams[_nt->stream_id]) + async(_nt->streams[_nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < i2; ++i) { vec_v[i] += 2. * vec_rhs[i]; } } else { nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu) - async(_nt->streams[_nt->stream_id]) + async(_nt->streams[_nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < i2; ++i) { vec_v[i] += vec_rhs[i]; @@ -319,7 +319,7 @@ void nrncore2nrn_send_values(NrnThread* nth) { async(nth->streams[nth->stream_id])) nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu)) } - nrn_pragma_acc(wait(nth->streams[nth->stream_id))) + nrn_pragma_acc(wait(nth->streams[nth->stream_id)) for (int i = 0; i < tr->n_trajec; ++i) { *(tr->scatter[i]) = *(tr->gather[i]); } diff --git a/coreneuron/sim/multicore.cpp b/coreneuron/sim/multicore.cpp index d11ab790a..aeee91352 100644 --- a/coreneuron/sim/multicore.cpp +++ b/coreneuron/sim/multicore.cpp @@ -15,6 +15,10 @@ #include "coreneuron/coreneuron.hpp" #include "coreneuron/utils/nrnoc_aux.hpp" +#if defined(_OPENMP) +#include +#endif + /* Now that threads have taken over the actual_v, v_node, etc, it might be a good time to regularize the method of freeing, allocating, and diff --git a/coreneuron/sim/solve_core.cpp b/coreneuron/sim/solve_core.cpp index 69de841e4..9c6631434 100644 --- a/coreneuron/sim/solve_core.cpp +++ b/coreneuron/sim/solve_core.cpp @@ -75,7 +75,7 @@ static void bksub(NrnThread* _nt) { } if (_nt->compute_gpu) { - nrn_pragma_acc(wait(_nt->streams[_nth->stream_id])) + nrn_pragma_acc(wait(_nt->streams[_nt->stream_id])) } } } // namespace coreneuron diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp index a7de3a1be..3a62c0c02 100644 --- a/coreneuron/sim/treeset_core.cpp +++ b/coreneuron/sim/treeset_core.cpp @@ -33,7 +33,7 @@ static void nrn_rhs(NrnThread* _nt) { int* parent_index = _nt->_v_parent_index; nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu) - async(_nt->streams[_nt->stream_id]) + async(_nt->streams[_nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { vec_rhs[i] = 0.; @@ -45,7 +45,7 @@ static void nrn_rhs(NrnThread* _nt) { double* fast_imem_rhs = _nt->nrn_fast_imem->nrn_sav_rhs; nrn_pragma_acc( parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) - async(_nt->streams[_nt->stream_id]) + async(_nt->streams[_nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_d[i] = 0.; @@ -75,7 +75,7 @@ static void nrn_rhs(NrnThread* _nt) { */ double* p = _nt->nrn_fast_imem->nrn_sav_rhs; nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu) - async(_nt->streams[_nt->stream_id]) + async(_nt->streams[_nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] -= vec_rhs[i]; @@ -92,7 +92,7 @@ static void nrn_rhs(NrnThread* _nt) { vec_b [0:i3], vec_v [0:i3], parent_index [0:i3]) if (_nt->compute_gpu) - async(_nt->streams[_nt->stream_id]) + async(_nt->streams[_nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { double dv = vec_v[parent_index[i]] - vec_v[i]; @@ -152,7 +152,7 @@ static void nrn_lhs(NrnThread* _nt) { so here we transform so it only has membrane current contribution */ double* p = _nt->nrn_fast_imem->nrn_sav_d; - nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id]) + nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] += vec_d[i]; @@ -162,7 +162,7 @@ static void nrn_lhs(NrnThread* _nt) { /* now add the axial currents */ nrn_pragma_acc(parallel loop present( vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu) - async(_nt->streams[_nt->stream_id]) + async(_nt->streams[_nt->stream_id])) nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { nrn_pragma_acc(atomic update) From 60e3d3aa1728e8e3a65c8ffed0d01bd807997edc Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 20 Dec 2021 12:44:26 +0100 Subject: [PATCH 05/11] More small fixes --- coreneuron/sim/fadvance_core.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index 4f6da406f..271b77191 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -319,7 +319,7 @@ void nrncore2nrn_send_values(NrnThread* nth) { async(nth->streams[nth->stream_id])) nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu)) } - nrn_pragma_acc(wait(nth->streams[nth->stream_id)) + nrn_pragma_acc(wait(nth->streams[nth->stream_id])) for (int i = 0; i < tr->n_trajec; ++i) { *(tr->scatter[i]) = *(tr->gather[i]); } @@ -342,7 +342,7 @@ static void* nrn_fixed_step_thread(NrnThread* nth) { /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can launch kernel) */ nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) - nrn_pragma_acc(wait(nth->streams[nth->stream_id))) + nrn_pragma_acc(wait(nth->streams[nth->stream_id])) nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); @@ -378,7 +378,7 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) { if (nth->ncell) { /*@todo: do we need to update nth->_t on GPU */ nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) - nrn_pragma_acc(wait(nth->streams[nth->stream_id))) + nrn_pragma_acc(wait(nth->streams[nth->stream_id])) nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); nonvint(nth); From d6bf37c9505e4ca10a09d8bd9127376745e15eeb Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 20 Dec 2021 15:35:55 +0100 Subject: [PATCH 06/11] Fixed openacc async --- coreneuron/gpu/nrn_acc_manager.cpp | 2 +- coreneuron/network/netcvode.cpp | 4 ++-- coreneuron/network/partrans.cpp | 2 +- coreneuron/permute/cellorder.cpp | 4 ++-- coreneuron/sim/fadvance_core.cpp | 6 +++--- coreneuron/sim/solve_core.cpp | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index 4a1583f8b..c4f74b559 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -749,7 +749,7 @@ void update_net_receive_buffer(NrnThread* nt) { } } } - nrn_pragma_acc(wait(nt->streams[nt->stream_id])) + nrn_pragma_acc(wait async(nt->streams[nt->stream_id])) } void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp index b209c1e09..526a6d92e 100644 --- a/coreneuron/network/netcvode.cpp +++ b/coreneuron/network/netcvode.cpp @@ -561,7 +561,7 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method nt->_net_send_buffer[idx] = i; } } - nrn_pragma_acc(wait(nt->streams[nt->stream_id])) + nrn_pragma_acc(wait async(nt->streams[nt->stream_id])) nt->_net_send_buffer_cnt = net_send_buf_count; if (nt->compute_gpu && nt->_net_send_buffer_cnt) { @@ -569,7 +569,7 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method int* nsbuffer = nt->_net_send_buffer; #endif nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->streams[nt->stream_id])) - nrn_pragma_acc(wait(nt->streams[nt->stream_id])) + nrn_pragma_acc(wait async(nt->streams[nt->stream_id])) nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt])) } diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index 3f3c589f9..f77f74ed9 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -71,7 +71,7 @@ void nrnmpi_v_transfer() { for (int tid = 0; tid < nrn_nthread; ++tid) { if (nrn_threads[tid].compute_gpu) { compute_gpu = true; - nrn_pragma_acc(wait(nrn_threads[tid].streams[nrn_threads[tid].stream_id])) + nrn_pragma_acc(wait async(nrn_threads[tid].streams[nrn_threads[tid].stream_id])) } TransferThreadData& ttd = transfer_thread_data_[tid]; size_t n_outsrc_indices = ttd.outsrc_indices.size(); diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index 9814c792b..79251c091 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -626,7 +626,7 @@ void solve_interleaved2(int ith) { } // serial test mode #endif } - nrn_pragma_acc(wait(nt->streams[nt->stream_id])) + nrn_pragma_acc(wait async(nt->streams[nt->stream_id])) #ifdef _OPENACC } #endif @@ -667,7 +667,7 @@ void solve_interleaved1(int ith) { triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode); bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode); } - nrn_pragma_acc(wait(nt->streams[nt->stream_id])) + nrn_pragma_acc(wait async(nt->streams[nt->stream_id])) } void solve_interleaved(int ith) { diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index 271b77191..f7ed350a7 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -319,7 +319,7 @@ void nrncore2nrn_send_values(NrnThread* nth) { async(nth->streams[nth->stream_id])) nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu)) } - nrn_pragma_acc(wait(nth->streams[nth->stream_id])) + nrn_pragma_acc(wait async(nth->streams[nth->stream_id])) for (int i = 0; i < tr->n_trajec; ++i) { *(tr->scatter[i]) = *(tr->gather[i]); } @@ -342,7 +342,7 @@ static void* nrn_fixed_step_thread(NrnThread* nth) { /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can launch kernel) */ nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) - nrn_pragma_acc(wait(nth->streams[nth->stream_id])) + nrn_pragma_acc(wait async(nth->streams[nth->stream_id])) nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); @@ -378,7 +378,7 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) { if (nth->ncell) { /*@todo: do we need to update nth->_t on GPU */ nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) - nrn_pragma_acc(wait(nth->streams[nth->stream_id])) + nrn_pragma_acc(wait async(nth->streams[nth->stream_id])) nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); nonvint(nth); diff --git a/coreneuron/sim/solve_core.cpp b/coreneuron/sim/solve_core.cpp index 9c6631434..2ee416242 100644 --- a/coreneuron/sim/solve_core.cpp +++ b/coreneuron/sim/solve_core.cpp @@ -75,7 +75,7 @@ static void bksub(NrnThread* _nt) { } if (_nt->compute_gpu) { - nrn_pragma_acc(wait(_nt->streams[_nt->stream_id])) + nrn_pragma_acc(wait async(_nt->streams[_nt->stream_id])) } } } // namespace coreneuron From aac0915f300d8071fd03b6a73679eacf03cf2be7 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 20 Dec 2021 16:45:35 +0100 Subject: [PATCH 07/11] First working commit of openmp async execution --- coreneuron/mechanism/capac.cpp | 4 ++-- coreneuron/mechanism/eion.cpp | 4 ++-- coreneuron/network/netcvode.cpp | 3 ++- coreneuron/network/partrans.cpp | 2 +- coreneuron/permute/cellorder.cpp | 6 ++++-- coreneuron/sim/fadvance_core.cpp | 10 ++++++---- coreneuron/sim/fast_imem.cpp | 4 ++-- coreneuron/sim/treeset_core.cpp | 12 ++++++------ 8 files changed, 25 insertions(+), 20 deletions(-) diff --git a/coreneuron/mechanism/capac.cpp b/coreneuron/mechanism/capac.cpp index c132a1bfd..6a03e1f53 100644 --- a/coreneuron/mechanism/capac.cpp +++ b/coreneuron/mechanism/capac.cpp @@ -71,7 +71,7 @@ void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) { ni [0:_cntml_actual], _vec_d [0:_nt->end]) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait) for (_iml = 0; _iml < _cntml_actual; _iml++) { _vec_d[ni[_iml]] += cfac * cm; } @@ -117,7 +117,7 @@ void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) { ni [0:_cntml_actual], _vec_rhs [0:_nt->end]) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait) for (int _iml = 0; _iml < _cntml_actual; _iml++) { i_cap = cfac * cm * _vec_rhs[ni[_iml]]; } diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp index 23ab6dc9b..de3a92611 100644 --- a/coreneuron/mechanism/eion.cpp +++ b/coreneuron/mechanism/eion.cpp @@ -268,7 +268,7 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) { nrn_ion_global_map [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { dcurdv = 0.; cur = 0.; @@ -341,7 +341,7 @@ void second_order_cur(NrnThread* _nt, int secondorder) { ni [0:_cntml_actual], _vec_rhs [0:_nt->end]) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { cur += dcurdv * (_vec_rhs[ni[_iml]]); } diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp index 526a6d92e..07b7b7c2a 100644 --- a/coreneuron/network/netcvode.cpp +++ b/coreneuron/network/netcvode.cpp @@ -534,7 +534,7 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method nrn_pragma_acc(parallel loop present( nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end]) copy(net_send_buf_count) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) for (int i = 0; i < nt->ncell; ++i) { PreSyn* ps = presyns + i; PreSynHelper* psh = presyns_helper + i; @@ -562,6 +562,7 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method } } nrn_pragma_acc(wait async(nt->streams[nt->stream_id])) + nrn_pragma_omp(taskwait) nt->_net_send_buffer_cnt = net_send_buf_count; if (nt->compute_gpu && nt->_net_send_buffer_cnt) { diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index f77f74ed9..c415d3005 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -123,7 +123,7 @@ void nrnthread_v_transfer(NrnThread* _nt) { tar_data [0:ndata], insrc_buf_ [0:n_insrc_buf]) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait) for (size_t i = 0; i < ntar; ++i) { tar_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]]; } diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index 79251c091..e1ee3fd39 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -607,7 +607,7 @@ void solve_interleaved2(int ith) { stridedispl [0:nwarp + 1], rootbegin [0:nwarp + 1], nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) for (int icore = 0; icore < ncore; ++icore) { int iwarp = icore / warpsize; // figure out the >> value int ic = icore & (warpsize - 1); // figure out the & mask @@ -627,6 +627,7 @@ void solve_interleaved2(int ith) { #endif } nrn_pragma_acc(wait async(nt->streams[nt->stream_id])) + nrn_pragma_omp(taskwait) #ifdef _OPENACC } #endif @@ -661,13 +662,14 @@ void solve_interleaved1(int ith) { lastnode [0:ncell], cellsize [0:ncell]) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) for (int icell = 0; icell < ncell; ++icell) { int icellsize = cellsize[icell]; triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode); bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode); } nrn_pragma_acc(wait async(nt->streams[nt->stream_id])) + nrn_pragma_omp(taskwait) } void solve_interleaved(int ith) { diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index f7ed350a7..38f60cd01 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -82,7 +82,7 @@ void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */ async(nt->streams[nt->stream_id]) if (nt->compute_gpu)) // clang-format off nrn_pragma_omp(target update to(nt->_t, nt->_dt, nt->cj) - if(nt->compute_gpu)) + if(nt->compute_gpu)) // clang-format on } } @@ -207,14 +207,14 @@ void update(NrnThread* _nt) { if (secondorder) { nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait) for (int i = 0; i < i2; ++i) { vec_v[i] += 2. * vec_rhs[i]; } } else { nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait) for (int i = 0; i < i2; ++i) { vec_v[i] += vec_rhs[i]; } @@ -296,7 +296,7 @@ void nrncore2nrn_send_values(NrnThread* nth) { nrn_pragma_acc(parallel loop present(tr [0:1]) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) - nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu) depend(inout: nth->streams[nth->stream_id]) nowait) for (int i = 0; i < tr->n_trajec; ++i) { tr->varrays[i][vs] = *tr->gather[i]; } @@ -344,6 +344,7 @@ static void* nrn_fixed_step_thread(NrnThread* nth) { nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) nrn_pragma_acc(wait async(nth->streams[nth->stream_id])) nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) + nrn_pragma_omp(taskwait) fixed_play_continuous(nth); { @@ -380,6 +381,7 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) { nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) nrn_pragma_acc(wait async(nth->streams[nth->stream_id])) nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) + nrn_pragma_omp(taskwait) fixed_play_continuous(nth); nonvint(nth); nrncore2nrn_send_values(nth); diff --git a/coreneuron/sim/fast_imem.cpp b/coreneuron/sim/fast_imem.cpp index d579e7678..52b118b53 100644 --- a/coreneuron/sim/fast_imem.cpp +++ b/coreneuron/sim/fast_imem.cpp @@ -53,7 +53,7 @@ void nrn_calc_fast_imem(NrnThread* nt) { nrn_pragma_acc( parallel loop present(vec_rhs, vec_area, fast_imem_d, fast_imem_rhs) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) for (int i = i1; i < i3; ++i) { fast_imem_rhs[i] = (fast_imem_d[i] * vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01; } @@ -70,7 +70,7 @@ void nrn_calc_fast_imem_init(NrnThread* nt) { double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs; nrn_pragma_acc(parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) for (int i = i1; i < i3; ++i) { fast_imem_rhs[i] = (vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01; } diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp index 3a62c0c02..7f6f1d3af 100644 --- a/coreneuron/sim/treeset_core.cpp +++ b/coreneuron/sim/treeset_core.cpp @@ -34,7 +34,7 @@ static void nrn_rhs(NrnThread* _nt) { nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait) for (int i = i1; i < i3; ++i) { vec_rhs[i] = 0.; vec_d[i] = 0.; @@ -46,7 +46,7 @@ static void nrn_rhs(NrnThread* _nt) { nrn_pragma_acc( parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait) for (int i = i1; i < i3; ++i) { fast_imem_d[i] = 0.; fast_imem_rhs[i] = 0.; @@ -76,7 +76,7 @@ static void nrn_rhs(NrnThread* _nt) { double* p = _nt->nrn_fast_imem->nrn_sav_rhs; nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait) for (int i = i1; i < i3; ++i) { p[i] -= vec_rhs[i]; } @@ -93,7 +93,7 @@ static void nrn_rhs(NrnThread* _nt) { vec_v [0:i3], parent_index [0:i3]) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait) for (int i = i2; i < i3; ++i) { double dv = vec_v[parent_index[i]] - vec_v[i]; /* our connection coefficients are negative so */ @@ -153,7 +153,7 @@ static void nrn_lhs(NrnThread* _nt) { */ double* p = _nt->nrn_fast_imem->nrn_sav_d; nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait) for (int i = i1; i < i3; ++i) { p[i] += vec_d[i]; } @@ -163,7 +163,7 @@ static void nrn_lhs(NrnThread* _nt) { nrn_pragma_acc(parallel loop present( vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait) for (int i = i2; i < i3; ++i) { nrn_pragma_acc(atomic update) nrn_pragma_omp(atomic update) From 7a230ba3d6f80ed82d86f776498a2c35c2c37744 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 21 Dec 2021 14:03:33 +0100 Subject: [PATCH 08/11] Added depend in update from clauses --- coreneuron/network/netcvode.cpp | 3 ++- coreneuron/network/partrans.cpp | 2 +- coreneuron/sim/fadvance_core.cpp | 4 +--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp index 07b7b7c2a..d7e743edd 100644 --- a/coreneuron/network/netcvode.cpp +++ b/coreneuron/network/netcvode.cpp @@ -571,7 +571,8 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method #endif nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->streams[nt->stream_id])) nrn_pragma_acc(wait async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt])) + nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt]) depend(inout: nt->streams[nt->stream_id]) nowait) + nrn_pragma_omp(taskwait) } // on CPU... diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index c415d3005..066ca15bb 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -63,7 +63,7 @@ void nrnmpi_v_transfer() { } nrn_pragma_acc(update host(src_gather [0:n_src_gather]) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu)) + nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) } // copy gathered source values to outsrc_buf_ diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index 38f60cd01..b1dce24e4 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -317,7 +317,7 @@ void nrncore2nrn_send_values(NrnThread* nth) { double* gather_i = tr->gather[i]; nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) - nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu)) + nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu) depend(inout: nth->streams[nth->stream_id]) nowait) } nrn_pragma_acc(wait async(nth->streams[nth->stream_id])) for (int i = 0; i < tr->n_trajec; ++i) { @@ -344,7 +344,6 @@ static void* nrn_fixed_step_thread(NrnThread* nth) { nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) nrn_pragma_acc(wait async(nth->streams[nth->stream_id])) nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) - nrn_pragma_omp(taskwait) fixed_play_continuous(nth); { @@ -381,7 +380,6 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) { nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) nrn_pragma_acc(wait async(nth->streams[nth->stream_id])) nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) - nrn_pragma_omp(taskwait) fixed_play_continuous(nth); nonvint(nth); nrncore2nrn_send_values(nth); From 645283775aeb6e8ad6fead9d780e33be7f42b311 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 21 Dec 2021 15:01:55 +0100 Subject: [PATCH 09/11] Small indentation fix --- coreneuron/sim/fadvance_core.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index b1dce24e4..1d6ffdfcb 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -82,7 +82,7 @@ void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */ async(nt->streams[nt->stream_id]) if (nt->compute_gpu)) // clang-format off nrn_pragma_omp(target update to(nt->_t, nt->_dt, nt->cj) - if(nt->compute_gpu)) + if(nt->compute_gpu)) // clang-format on } } From 79d0cfc7811b81d1f85add2918e6ab005474e8e6 Mon Sep 17 00:00:00 2001 From: Ioannis Date: Tue, 21 Dec 2021 16:16:28 +0200 Subject: [PATCH 10/11] Fixed clang-format --- coreneuron/network/netcvode.cpp | 14 ++++++++++---- coreneuron/network/partrans.cpp | 5 ++++- coreneuron/permute/cellorder.cpp | 20 ++++++++++++-------- coreneuron/sim/fadvance_core.cpp | 11 ++++++++--- coreneuron/sim/multicore.hpp | 9 +++++---- coreneuron/sim/treeset_core.cpp | 6 ++++-- 6 files changed, 43 insertions(+), 22 deletions(-) diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp index d7e743edd..60192e8af 100644 --- a/coreneuron/network/netcvode.cpp +++ b/coreneuron/network/netcvode.cpp @@ -533,8 +533,10 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method nrn_pragma_acc(parallel loop present( nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end]) - copy(net_send_buf_count) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) + copy(net_send_buf_count) if (nt->compute_gpu) + async(nt->streams[nt->stream_id])) + nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) + if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) for (int i = 0; i < nt->ncell; ++i) { PreSyn* ps = presyns + i; PreSynHelper* psh = presyns_helper + i; @@ -569,9 +571,13 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method #ifdef CORENEURON_ENABLE_GPU int* nsbuffer = nt->_net_send_buffer; #endif - nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->streams[nt->stream_id])) + nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) + async(nt->streams[nt->stream_id])) nrn_pragma_acc(wait async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt]) depend(inout: nt->streams[nt->stream_id]) nowait) + // clang-format off + nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt]) + depend(inout: nt->streams[nt->stream_id]) nowait) + // clang-format on nrn_pragma_omp(taskwait) } diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index 066ca15bb..a39458f25 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -63,7 +63,10 @@ void nrnmpi_v_transfer() { } nrn_pragma_acc(update host(src_gather [0:n_src_gather]) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) + // clang-format off + nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu) + depend(inout: nt->streams[nt->stream_id]) nowait) + // clang-format on } // copy gathered source values to outsrc_buf_ diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index e1ee3fd39..c9a93bcd9 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -600,14 +600,18 @@ void solve_interleaved2(int ith) { defined(_OPENACC) int nstride = stridedispl[nwarp]; #endif - nrn_pragma_acc(parallel loop gang vector vector_length( - warpsize) present(nt [0:1], - strides [0:nstride], - ncycles [0:nwarp], - stridedispl [0:nwarp + 1], - rootbegin [0:nwarp + 1], - nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) + nrn_pragma_acc(parallel loop gang vector vector_length(warpsize) + present(nt [0:1], + strides [0:nstride], + ncycles [0:nwarp], + stridedispl [0:nwarp + 1], + rootbegin [0:nwarp + 1], + nodebegin [0:nwarp + 1]) if (nt->compute_gpu) + async(nt->streams[nt->stream_id])) + // clang-format off + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) + depend(inout: nt->streams[nt->stream_id]) nowait) + // clang-format on for (int icore = 0; icore < ncore; ++icore) { int iwarp = icore / warpsize; // figure out the >> value int ic = icore & (warpsize - 1); // figure out the & mask diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index 1d6ffdfcb..ad71f4c7a 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -317,7 +317,10 @@ void nrncore2nrn_send_values(NrnThread* nth) { double* gather_i = tr->gather[i]; nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) - nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu) depend(inout: nth->streams[nth->stream_id]) nowait) + // clang-format off + nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu) + depend(inout: nth->streams[nth->stream_id]) nowait) + // clang-format on } nrn_pragma_acc(wait async(nth->streams[nth->stream_id])) for (int i = 0; i < tr->n_trajec; ++i) { @@ -341,7 +344,8 @@ static void* nrn_fixed_step_thread(NrnThread* nth) { if (nth->ncell) { /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can launch kernel) */ - nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) + nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) + async(nth->streams[nth->stream_id])) nrn_pragma_acc(wait async(nth->streams[nth->stream_id])) nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); @@ -377,7 +381,8 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) { if (nth->ncell) { /*@todo: do we need to update nth->_t on GPU */ - nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) + nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) + async(nth->streams[nth->stream_id])) nrn_pragma_acc(wait async(nth->streams[nth->stream_id])) nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); diff --git a/coreneuron/sim/multicore.hpp b/coreneuron/sim/multicore.hpp index 56f8d3af8..44189191d 100644 --- a/coreneuron/sim/multicore.hpp +++ b/coreneuron/sim/multicore.hpp @@ -130,10 +130,11 @@ struct NrnThread: public MemoryManaged { NrnThreadBAList* tbl[BEFORE_AFTER_SIZE]; /* wasteful since almost all empty */ - int shadow_rhs_cnt = 0; /* added to facilitate the NrnThread transfer to GPU */ - int compute_gpu = 0; /* define whether to compute with gpus */ - int stream_id = 0; /* define where the kernel will be launched on GPU stream */ - std::vector streams; /* vector of stream ids needed for async execution of OpenMP in multiple streams */ + int shadow_rhs_cnt = 0; /* added to facilitate the NrnThread transfer to GPU */ + int compute_gpu = 0; /* define whether to compute with gpus */ + int stream_id = 0; /* define where the kernel will be launched on GPU stream */ + std::vector streams; /* vector of stream ids needed for async execution of OpenMP in + multiple streams */ int _net_send_buffer_size = 0; int _net_send_buffer_cnt = 0; int* _net_send_buffer = nullptr; diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp index 7f6f1d3af..42de967d0 100644 --- a/coreneuron/sim/treeset_core.cpp +++ b/coreneuron/sim/treeset_core.cpp @@ -152,8 +152,10 @@ static void nrn_lhs(NrnThread* _nt) { so here we transform so it only has membrane current contribution */ double* p = _nt->nrn_fast_imem->nrn_sav_d; - nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait) + nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) + async(_nt->streams[_nt->stream_id])) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) + depend(inout: _nt->streams[_nt->stream_id]) nowait) for (int i = i1; i < i3; ++i) { p[i] += vec_d[i]; } From 6b909139b2926747bf604e7d7b3b8290541d908e Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Thu, 23 Dec 2021 12:19:59 +0100 Subject: [PATCH 11/11] Update NMODL after rebase. --- external/nmodl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/nmodl b/external/nmodl index 8355747b3..afd70c5bb 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit 8355747b38b9b285cdf44ca39a383a8de648144f +Subproject commit afd70c5bb258835661963efa091c8a55718ee2c0