This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Support async execution in OpenMP wherever it's supported #725

Draft
wants to merge 11 commits into master
8 changes: 4 additions & 4 deletions coreneuron/gpu/nrn_acc_manager.cpp
@@ -326,8 +326,8 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
/* Here is the example of using OpenACC data enter/exit
* Remember that we are not allowed to use nt->_data but we have to use:
* double *dtmp = nt->_data; // now use dtmp!
-#pragma acc enter data copyin(dtmp[0:nt->_ndata]) async(nt->stream_id)
-#pragma acc wait(nt->stream_id)
+#pragma acc enter data copyin(dtmp[0:nt->_ndata]) async(nt->streams[nt->stream_id])
+#pragma acc wait(nt->streams[nt->stream_id])
*/

/*update d_nt._data to point to device copy */
@@ -736,7 +736,7 @@ void update_net_receive_buffer(NrnThread* nt) {
nrb->_nrb_flag[:nrb->_cnt],
nrb->_displ[:nrb->_displ_cnt + 1],
nrb->_nrb_index[:nrb->_cnt])
-                            async(nt->stream_id))
+                            async(nt->streams[nt->stream_id]))
nrn_pragma_omp(target update to(nrb->_cnt,
nrb->_displ_cnt,
nrb->_pnt_index[:nrb->_cnt],
@@ -749,7 +749,7 @@ void update_net_receive_buffer(NrnThread* nt) {
}
}
}
-    nrn_pragma_acc(wait(nt->stream_id))
+    nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
}

void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) {
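
Note on the hunks above: OpenACC `async(q)` enqueues a transfer or kernel on queue `q` and returns to the host immediately, while `wait(q)` blocks the host until that queue drains; this PR derives `q` from a per-thread `streams` array instead of the bare `stream_id`. A minimal sketch of those queue semantics, as a standalone example (`buf`, `n`, `queue` are illustrative names, not PR code):

// Work submitted with async(q) executes in order within queue q but
// may overlap with work submitted to other queues.
void async_queue_sketch(double* buf, int n, int queue) {
#pragma acc enter data copyin(buf[0:n]) async(queue)  // H2D copy enqueued on `queue`
#pragma acc parallel loop present(buf[0:n]) async(queue)  // kernel ordered after the copy
    for (int i = 0; i < n; ++i) {
        buf[i] *= 2.0;
    }
#pragma acc update host(buf[0:n]) async(queue)  // D2H copy, still on `queue`
#pragma acc wait(queue)  // host blocks here until `queue` drains
#pragma acc exit data delete(buf[0:n])
}
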
4 changes: 3 additions & 1 deletion coreneuron/io/phase2.cpp
@@ -875,7 +875,9 @@ void Phase2::populate(NrnThread& nt, const UserParams& userParams) {
* greater than number of omp threads.
*/
#if defined(_OPENMP)
-    nt.stream_id = omp_get_thread_num();
+    const auto omp_thread_num = omp_get_thread_num();
+    nt.stream_id = omp_thread_num;
+    nt.streams[nt.stream_id] = omp_thread_num;
#endif

int shadow_rhs_cnt = 0;
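
For context on this hunk: Phase2::populate runs once per NrnThread from inside an OpenMP parallel region, so omp_get_thread_num() identifies the host thread that owns the NrnThread, and the PR records that id both as the stream id and as the dependence-slot value. A sketch of the assignment under that assumption (ThreadSketch and assign_stream_ids are illustrative, not CoreNEURON API):

#include <omp.h>
#include <vector>

// Hypothetical stand-in for NrnThread, reduced to the two fields involved.
struct ThreadSketch {
    int stream_id = 0;
    std::vector<int> streams;
};

void assign_stream_ids(std::vector<ThreadSketch>& threads) {
    for (auto& t : threads)
        t.streams.resize(omp_get_max_threads());
    // One NrnThread per OpenMP worker: inside the parallel loop,
    // omp_get_thread_num() yields a distinct id per worker.
#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < static_cast<int>(threads.size()); ++i) {
        const int tid = omp_get_thread_num();
        threads[i].stream_id = tid;
        threads[i].streams[tid] = tid;
    }
}
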
8 changes: 4 additions & 4 deletions coreneuron/mechanism/capac.cpp
@@ -70,8 +70,8 @@ void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
ni [0:_cntml_actual],
_vec_d [0:_nt->end]) if (_nt->compute_gpu)
-                   async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+                   async(_nt->streams[_nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
for (_iml = 0; _iml < _cntml_actual; _iml++) {
_vec_d[ni[_iml]] += cfac * cm;
}
@@ -116,8 +116,8 @@ void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
ni [0:_cntml_actual],
_vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
-                   async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+                   async(_nt->streams[_nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
for (int _iml = 0; _iml < _cntml_actual; _iml++) {
i_cap = cfac * cm * _vec_rhs[ni[_iml]];
}
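
The OpenMP half of these hunks is the core of the PR: OpenMP has no queue ids, so `nowait` turns each `target` region into a deferred task, and `depend(inout: ...)` on a common sentinel serializes tasks that name the same slot, much like kernels on one CUDA stream, while tasks on other slots stay free to overlap. A minimal sketch under those assumptions (`stream_token` and the function are illustrative names):

#include <omp.h>

// Illustrative sentinel array: one slot per logical stream. Only the
// slot's address participates in the dependence graph; the runtime
// never reads its value.
static int stream_token[64];

void stream_emulation_sketch(double* a, double* b, int n, int sid) {
    // Task 1: deferred (nowait), owns sentinel slot sid.
#pragma omp target teams distribute parallel for \
        map(tofrom: a[0:n]) depend(inout: stream_token[sid]) nowait
    for (int i = 0; i < n; ++i)
        a[i] += 1.0;

    // Task 2: same sentinel, so it is ordered after task 1; other
    // "streams" (other sid values) can still run concurrently.
#pragma omp target teams distribute parallel for \
        map(to: a[0:n]) map(from: b[0:n]) depend(inout: stream_token[sid]) nowait
    for (int i = 0; i < n; ++i)
        b[i] = a[i];

#pragma omp taskwait  // join point: host blocks until both tasks finish
}
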
18 changes: 8 additions & 10 deletions coreneuron/mechanism/eion.cpp
@@ -263,14 +263,12 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) {
int _cntml_padded = ml->_nodecount_padded;
pd = ml->data;
ppd = ml->pdata;
-    // clang-format off
-    nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5],
-                                         nrn_ion_global_map[0:nrn_ion_global_map_size]
-                                                           [0:ion_global_map_member_size])
-                   if (nt->compute_gpu)
-                   async(nt->stream_id))
-    // clang-format on
-    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
+    nrn_pragma_acc(parallel loop present(
+        pd [0:_cntml_padded * 5],
+        nrn_ion_global_map
+        [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu)
+                   async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
dcurdv = 0.;
cur = 0.;
@@ -342,8 +340,8 @@ void second_order_cur(NrnThread* _nt, int secondorder) {
nrn_pragma_acc(parallel loop present(pd [0:_cntml_padded * 5],
ni [0:_cntml_actual],
_vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
-                   async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+                   async(_nt->streams[_nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
cur += dcurdv * (_vec_rhs[ni[_iml]]);
}
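
These hunks also switch the blocking `wait(q)` to `wait async(q)`. As we read the OpenACC spec, adding an `async` clause makes the wait itself an enqueued operation rather than a host-side stall. A sketch of the three forms involved (`q1` and `q2` are illustrative queue ids):

void wait_forms_sketch(int q1, int q2) {
    // Old form: the host thread blocks until queue q1 drains.
#pragma acc wait(q1)

    // Cross-queue form: enqueue on q2 an operation that waits for q1;
    // the host thread itself continues immediately.
#pragma acc wait(q1) async(q2)

    // Form adopted by this PR (no queue list): q2 waits for all other
    // outstanding async activity, again without blocking the host.
#pragma acc wait async(q2)
}
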
20 changes: 14 additions & 6 deletions coreneuron/network/netcvode.cpp
@@ -533,8 +533,10 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method

nrn_pragma_acc(parallel loop present(
nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end])
-    copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu))
+    copy(net_send_buf_count) if (nt->compute_gpu)
+    async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count)
+                   if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
for (int i = 0; i < nt->ncell; ++i) {
PreSyn* ps = presyns + i;
PreSynHelper* psh = presyns_helper + i;
@@ -561,16 +563,22 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method
nt->_net_send_buffer[idx] = i;
}
}
-    nrn_pragma_acc(wait(nt->stream_id))
+    nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(taskwait)
nt->_net_send_buffer_cnt = net_send_buf_count;

if (nt->compute_gpu && nt->_net_send_buffer_cnt) {
#ifdef CORENEURON_ENABLE_GPU
int* nsbuffer = nt->_net_send_buffer;
#endif
-        nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->stream_id))
-        nrn_pragma_acc(wait(nt->stream_id))
-        nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt]))
+        nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt])
+                       async(nt->streams[nt->stream_id]))
+        nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
+        // clang-format off
+        nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt])
+                       depend(inout: nt->streams[nt->stream_id]) nowait)
+        // clang-format on
+        nrn_pragma_omp(taskwait)
}

// on CPU...
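
check_thresh shows the readback discipline the deferred updates require: the device-to-host `target update` joins the stream's dependence chain with `nowait`, and a `taskwait` fences it before the host consumes the buffer. A condensed sketch of that pattern (names are illustrative; it assumes `send_buf` was already mapped to the device by enclosing code):

#include <omp.h>

static int readback_token[64];  // illustrative sentinel, as in earlier sketches

void readback_sketch(int* send_buf, int count, int sid) {
    // Deferred D2H copy, ordered after earlier kernels chained on the
    // same sentinel slot.
#pragma omp target update from(send_buf[0:count]) \
        depend(inout: readback_token[sid]) nowait

    // Fence: without this, the host loop below could read stale data.
#pragma omp taskwait

    long sum = 0;
    for (int i = 0; i < count; ++i)
        sum += send_buf[i];  // stand-in for host-side event delivery
    (void) sum;
}
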
15 changes: 9 additions & 6 deletions coreneuron/network/partrans.cpp
@@ -56,22 +56,25 @@ void nrnmpi_v_transfer() {
nrn_pragma_acc(parallel loop present(src_indices [0:n_src_gather],
src_data [0:nt->_ndata],
src_gather [0:n_src_gather]) if (nt->compute_gpu)
-                   async(nt->stream_id))
+                   async(nt->streams[nt->stream_id]))
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
for (int i = 0; i < n_src_gather; ++i) {
src_gather[i] = src_data[src_indices[i]];
}
nrn_pragma_acc(update host(src_gather [0:n_src_gather]) if (nt->compute_gpu)
-                   async(nt->stream_id))
-    nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu))
+                   async(nt->streams[nt->stream_id]))
+    // clang-format off
+    nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu)
+                   depend(inout: nt->streams[nt->stream_id]) nowait)
+    // clang-format on
}

// copy gathered source values to outsrc_buf_
bool compute_gpu = false;
for (int tid = 0; tid < nrn_nthread; ++tid) {
if (nrn_threads[tid].compute_gpu) {
compute_gpu = true;
-            nrn_pragma_acc(wait(nrn_threads[tid].stream_id))
+            nrn_pragma_acc(wait async(nrn_threads[tid].streams[nrn_threads[tid].stream_id]))
}
TransferThreadData& ttd = transfer_thread_data_[tid];
size_t n_outsrc_indices = ttd.outsrc_indices.size();
@@ -122,8 +125,8 @@ void nrnthread_v_transfer(NrnThread* _nt) {
nrn_pragma_acc(parallel loop present(insrc_indices [0:ntar],
tar_data [0:ndata],
insrc_buf_ [0:n_insrc_buf]) if (_nt->compute_gpu)
-                   async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu))
+                   async(_nt->streams[_nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
for (size_t i = 0; i < ntar; ++i) {
tar_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]];
}
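
nrnmpi_v_transfer is where the overlap pays off: each NrnThread's gather kernel and small device-to-host copy run on that thread's own queue, and only the fan-out loop over all threads synchronizes before the MPI exchange. A sketch of one thread's gather half, assuming a blocking final wait and that the arrays were mapped at setup (parameter names are illustrative):

// Pack scattered voltages into a contiguous staging buffer on the
// device, start the small D2H copy on this thread's queue, then
// synchronize before the host fans the values out for MPI exchange.
void gather_sketch(const double* src_data, int ndata,
                   const int* src_indices, double* src_gather,
                   int n_src_gather, int queue) {
#pragma acc parallel loop present(src_data[0:ndata], \
                                  src_indices[0:n_src_gather], \
                                  src_gather[0:n_src_gather]) async(queue)
    for (int i = 0; i < n_src_gather; ++i)
        src_gather[i] = src_data[src_indices[i]];

#pragma acc update host(src_gather[0:n_src_gather]) async(queue)
#pragma acc wait(queue)  // host must see src_gather before touching it
}
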
30 changes: 18 additions & 12 deletions coreneuron/permute/cellorder.cpp
@@ -600,14 +600,18 @@ void solve_interleaved2(int ith) {
defined(_OPENACC)
int nstride = stridedispl[nwarp];
#endif
-    nrn_pragma_acc(parallel loop gang vector vector_length(
-        warpsize) present(nt [0:1],
-                          strides [0:nstride],
-                          ncycles [0:nwarp],
-                          stridedispl [0:nwarp + 1],
-                          rootbegin [0:nwarp + 1],
-                          nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
+    nrn_pragma_acc(parallel loop gang vector vector_length(warpsize)
+                   present(nt [0:1],
+                           strides [0:nstride],
+                           ncycles [0:nwarp],
+                           stridedispl [0:nwarp + 1],
+                           rootbegin [0:nwarp + 1],
+                           nodebegin [0:nwarp + 1]) if (nt->compute_gpu)
+                   async(nt->streams[nt->stream_id]))
+    // clang-format off
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)
+                   depend(inout: nt->streams[nt->stream_id]) nowait)
+    // clang-format on
for (int icore = 0; icore < ncore; ++icore) {
int iwarp = icore / warpsize; // figure out the >> value
int ic = icore & (warpsize - 1); // figure out the & mask
@@ -626,7 +630,8 @@ void solve_interleaved2(int ith) {
} // serial test mode
#endif
}
-    nrn_pragma_acc(wait(nt->stream_id))
+    nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(taskwait)
#ifdef _OPENACC
}
#endif
@@ -660,14 +665,15 @@ void solve_interleaved1(int ith) {
firstnode [0:ncell],
lastnode [0:ncell],
cellsize [0:ncell]) if (nt->compute_gpu)
-                   async(nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
+                   async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
for (int icell = 0; icell < ncell; ++icell) {
int icellsize = cellsize[icell];
triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode);
bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode);
}
-    nrn_pragma_acc(wait(nt->stream_id))
+    nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(taskwait)
}

void solve_interleaved(int ith) {
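
Aside on the loop body in solve_interleaved2: the inline comments ("figure out the >> value", "figure out the & mask") refer to the usual power-of-two strength reduction, where division becomes a right shift and modulo becomes a bit mask. Spelled out as a small sketch (warpsize 32 assumed for illustration):

#include <cassert>

void warp_index_sketch(int icore) {
    constexpr int warpsize = 32;
    static_assert((warpsize & (warpsize - 1)) == 0, "power of two required");
    const int iwarp = icore / warpsize;     // the ">> value": icore >> 5
    const int ic = icore & (warpsize - 1);  // the "& mask": icore % 32
    assert(iwarp == (icore >> 5));
    assert(ic == icore % warpsize);
}
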
33 changes: 19 additions & 14 deletions coreneuron/sim/fadvance_core.cpp
@@ -79,7 +79,7 @@ void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */
nt->cj = 1.0 / dt;
}
nrn_pragma_acc(update device(nt->_t, nt->_dt, nt->cj)
-                   async(nt->stream_id) if (nt->compute_gpu))
+                   async(nt->streams[nt->stream_id]) if (nt->compute_gpu))
// clang-format off
nrn_pragma_omp(target update to(nt->_t, nt->_dt, nt->cj)
if(nt->compute_gpu))
@@ -206,15 +206,15 @@ void update(NrnThread* _nt) {
/* do not need to worry about linmod or extracellular*/
if (secondorder) {
nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu)
-                       async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+                       async(_nt->streams[_nt->stream_id]))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
for (int i = 0; i < i2; ++i) {
vec_v[i] += 2. * vec_rhs[i];
}
} else {
nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu)
-                       async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+                       async(_nt->streams[_nt->stream_id]))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
for (int i = 0; i < i2; ++i) {
vec_v[i] += vec_rhs[i];
}
@@ -295,8 +295,8 @@ void nrncore2nrn_send_values(NrnThread* nth) {
assert(vs < tr->bsize);

nrn_pragma_acc(parallel loop present(tr [0:1]) if (nth->compute_gpu)
-                   async(nth->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu))
+                   async(nth->streams[nth->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu) depend(inout: nth->streams[nth->stream_id]) nowait)
for (int i = 0; i < tr->n_trajec; ++i) {
tr->varrays[i][vs] = *tr->gather[i];
}
@@ -316,10 +316,13 @@ void nrncore2nrn_send_values(NrnThread* nth) {
for (int i = 0; i < tr->n_trajec; ++i) {
double* gather_i = tr->gather[i];
nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu)
-                       async(nth->stream_id))
-        nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu))
+                       async(nth->streams[nth->stream_id]))
+        // clang-format off
+        nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu)
+                       depend(inout: nth->streams[nth->stream_id]) nowait)
+        // clang-format on
}
-    nrn_pragma_acc(wait(nth->stream_id))
+    nrn_pragma_acc(wait async(nth->streams[nth->stream_id]))
for (int i = 0; i < tr->n_trajec; ++i) {
*(tr->scatter[i]) = *(tr->gather[i]);
}
Expand All @@ -341,8 +344,9 @@ static void* nrn_fixed_step_thread(NrnThread* nth) {
if (nth->ncell) {
/*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can
launch kernel) */
-        nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id))
-        nrn_pragma_acc(wait(nth->stream_id))
+        nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu)
+                       async(nth->streams[nth->stream_id]))
+        nrn_pragma_acc(wait async(nth->streams[nth->stream_id]))
nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu))
fixed_play_continuous(nth);

@@ -377,8 +381,9 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) {

if (nth->ncell) {
/*@todo: do we need to update nth->_t on GPU */
-        nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id))
-        nrn_pragma_acc(wait(nth->stream_id))
+        nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu)
+                       async(nth->streams[nth->stream_id]))
+        nrn_pragma_acc(wait async(nth->streams[nth->stream_id]))
nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu))
fixed_play_continuous(nth);
nonvint(nth);
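
In the fixed-step driver the scalar `nth->_t` updates stay synchronous on the OpenMP side (no `nowait` clause in these hunks), while the OpenACC side enqueues them on the thread's queue. A sketch contrasting the blocking form used here with the deferred form used elsewhere in the PR (`step_token` is an illustrative sentinel; the sketch assumes `t` is already device-resident):

#include <omp.h>

static int step_token[64];  // illustrative sentinel

void scalar_update_sketch(double* t, int sid) {
    // Blocking form, as for nth->_t in this file: no nowait, so the
    // transfer completes before the host continues; no fence needed.
#pragma omp target update to(t[0:1])

    // Deferred alternative: joins the stream's dependence chain and
    // returns immediately; a taskwait must fence it before reuse.
#pragma omp target update to(t[0:1]) depend(inout: step_token[sid]) nowait
#pragma omp taskwait
}
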
8 changes: 4 additions & 4 deletions coreneuron/sim/fast_imem.cpp
@@ -52,8 +52,8 @@ void nrn_calc_fast_imem(NrnThread* nt) {
double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
nrn_pragma_acc(
parallel loop present(vec_rhs, vec_area, fast_imem_d, fast_imem_rhs) if (nt->compute_gpu)
-        async(nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
+        async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
for (int i = i1; i < i3; ++i) {
fast_imem_rhs[i] = (fast_imem_d[i] * vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;
}
Expand All @@ -69,8 +69,8 @@ void nrn_calc_fast_imem_init(NrnThread* nt) {

double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
nrn_pragma_acc(parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu)
-                   async(nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
+                   async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
for (int i = i1; i < i3; ++i) {
fast_imem_rhs[i] = (vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;
}
7 changes: 7 additions & 0 deletions coreneuron/sim/multicore.cpp
@@ -15,6 +15,10 @@
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"

+#if defined(_OPENMP)
+#include <omp.h>
+#endif

/*
Now that threads have taken over the actual_v, v_node, etc, it might
be a good time to regularize the method of freeing, allocating, and
@@ -106,6 +110,9 @@ void nrn_threads_create(int n) {
for (int i = 0; i < nrn_nthread; ++i) {
NrnThread& nt = nrn_threads[i];
nt.id = i;
+#if defined(_OPENMP)
+        nt.streams.reserve(omp_get_num_threads());
+#endif
for (int j = 0; j < BEFORE_AFTER_SIZE; ++j) {
nt.tbl[j] = nullptr;
}
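
One caveat worth flagging on this hunk: nrn_threads_create appears to run outside any parallel region, where omp_get_num_threads() returns 1, and reserve() only allocates capacity without creating elements for the later streams[stream_id] writes. A sketch of the setup this hunk seems to intend, assuming one slot per possible host thread (StreamsSketch is illustrative, not CoreNEURON API):

#include <omp.h>
#include <vector>

struct StreamsSketch {
    std::vector<int> streams;
};

void init_streams_sketch(StreamsSketch& nt) {
#if defined(_OPENMP)
    // omp_get_max_threads() is valid outside a parallel region, and
    // resize() (unlike reserve()) creates the elements, so
    // streams[stream_id] is a live element for every worker id.
    nt.streams.resize(omp_get_max_threads());
#else
    nt.streams.resize(1);
#endif
}
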