diff --git a/configure.ac b/configure.ac index 929ec70e8b0..4bc2042a29c 100644 --- a/configure.ac +++ b/configure.ac @@ -286,6 +286,24 @@ AS_IF([test "$enable_oshmem" != "no"], [project_oshmem_amc=true], [project_oshme m4_ifndef([project_oshmem], [project_oshmem_amc=false]) AM_CONDITIONAL([PROJECT_OSHMEM], [test "$project_oshmem_amc" = "true"]) +# Enable/Disable Software-Based Performance Counters Capability +AC_ARG_ENABLE(spc, + AC_HELP_STRING([--enable-spc], + [Enable software-based performance counters capability (default: disabled)])) +if test "$enable_spc" = "yes"; then + AC_MSG_RESULT([yes]) + SOFTWARE_EVENTS_ENABLE=1 +else + AC_MSG_RESULT([no]) + SOFTWARE_EVENTS_ENABLE=0 +fi +AC_DEFINE_UNQUOTED([SOFTWARE_EVENTS_ENABLE], + [$SOFTWARE_EVENTS_ENABLE], + [If the software-based performance counters capability should be enabled.]) +AM_CONDITIONAL(SOFTWARE_EVENTS_ENABLE, test "$SOFTWARE_EVENTS_ENABLE" = "1") + +AS_IF([test "$enable_spc" != "no"], [project_spc_amc=true], [project_spc_amc=false]) + if test "$enable_binaries" = "no" && test "$enable_dist" = "yes"; then AC_MSG_WARN([--disable-binaries is incompatible with --enable dist]) AC_MSG_ERROR([Cannot continue]) diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index eaf3ab26e0f..fc2eed6793f 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -36,6 +36,7 @@ #include "opal_stdint.h" #include "opal/mca/btl/btl.h" #include "opal/mca/btl/base/base.h" +#include "ompi/runtime/ompi_software_events.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/pml/base/base.h" @@ -195,6 +196,7 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm) mca_pml_ob1_recv_frag_t *frag, *next_frag; mca_pml_ob1_comm_proc_t* pml_proc; mca_pml_ob1_match_hdr_t* hdr; + opal_timer_t usecs = 0; if (NULL == pml_comm) { return OMPI_ERR_OUT_OF_RESOURCE; @@ -264,15 +266,17 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm) * situation as the cant_match is only checked when a new fragment is received from * the network. */ + SW_EVENT_TIMER_START(OMPI_OOS_MATCH_TIME, &usecs); OPAL_LIST_FOREACH(frag, &pml_proc->frags_cant_match, mca_pml_ob1_recv_frag_t) { - hdr = &frag->hdr.hdr_match; - /* If the message has the next expected seq from that proc... */ - if(hdr->hdr_seq != pml_proc->expected_sequence) - continue; - - opal_list_remove_item(&pml_proc->frags_cant_match, (opal_list_item_t*)frag); - goto add_fragment_to_unexpected; - } + hdr = &frag->hdr.hdr_match; + /* If the message has the next expected seq from that proc... */ + if(hdr->hdr_seq != pml_proc->expected_sequence) + continue; + + opal_list_remove_item(&pml_proc->frags_cant_match, (opal_list_item_t*)frag); + goto add_fragment_to_unexpected; + } + SW_EVENT_TIMER_STOP(OMPI_OOS_MATCH_TIME, &usecs); } else { opal_list_append( &pml_proc->frags_cant_match, (opal_list_item_t*)frag ); } diff --git a/ompi/mca/pml/ob1/pml_ob1_isend.c b/ompi/mca/pml/ob1/pml_ob1_isend.c index 3a5b0c2d7a0..d1f2289869a 100644 --- a/ompi/mca/pml/ob1/pml_ob1_isend.c +++ b/ompi/mca/pml/ob1/pml_ob1_isend.c @@ -28,6 +28,7 @@ #include "pml_ob1_sendreq.h" #include "pml_ob1_recvreq.h" #include "ompi/peruse/peruse-internal.h" +#include "ompi/runtime/ompi_software_events.h" /** * Single usage request. As we allow recursive calls (as an @@ -119,6 +120,16 @@ static inline int mca_pml_ob1_send_inline (const void *buf, size_t count, rc = mca_bml_base_sendi (bml_btl, &convertor, &match, OMPI_PML_OB1_MATCH_HDR_LEN, size, MCA_BTL_NO_ORDER, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, MCA_PML_OB1_HDR_TYPE_MATCH, NULL); + + if(OPAL_LIKELY(rc == OPAL_SUCCESS)){ + if(tag >= 0){ + SW_EVENT_RECORD(OMPI_BYTES_SENT_USER, size); + } + else{ + SW_EVENT_RECORD(OMPI_BYTES_SENT_MPI, size); + } + } + if (count > 0) { opal_convertor_cleanup (&convertor); } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index 1b59e3aae16..6d83182abf8 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -39,6 +39,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/peruse/peruse-internal.h" #include "ompi/memchecker.h" +#include "ompi/runtime/ompi_software_events.h" #include "pml_ob1.h" #include "pml_ob1_comm.h" @@ -231,6 +232,9 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, &iov_count, &bytes_received ); match->req_bytes_received = bytes_received; + + SW_EVENT_USER_OR_MPI(match->req_recv.req_base.req_tag, (long long)bytes_received, OMPI_BYTES_RECEIVED_USER, OMPI_BYTES_RECEIVED_MPI); + /* * Unpacking finished, make the user buffer unaccessable again. */ @@ -536,6 +540,11 @@ match_one(mca_btl_base_module_t *btl, mca_pml_ob1_comm_proc_t *proc, mca_pml_ob1_recv_frag_t* frag) { +#if SOFTWARE_EVENTS_ENABLE == 1 + opal_timer_t usecs = 0; +#endif + SW_EVENT_TIMER_START(OMPI_MATCH_TIME, &usecs); + mca_pml_ob1_recv_request_t *match; mca_pml_ob1_comm_t *comm = (mca_pml_ob1_comm_t *)comm_ptr->c_pml_comm; @@ -573,19 +582,28 @@ match_one(mca_btl_base_module_t *btl, num_segments); /* this frag is already processed, so we want to break out of the loop and not end up back on the unexpected queue. */ + SW_EVENT_TIMER_STOP(OMPI_MATCH_TIME, &usecs); + return NULL; } PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_MSG_MATCH_POSTED_REQ, &(match->req_recv.req_base), PERUSE_RECV); + SW_EVENT_TIMER_STOP(OMPI_MATCH_TIME, &usecs); + return match; } /* if no match found, place on unexpected queue */ append_frag_to_list(&proc->unexpected_frags, btl, hdr, segments, num_segments, frag); + + SW_EVENT_RECORD(OMPI_UNEXPECTED, 1); + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm_ptr, hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); + SW_EVENT_TIMER_STOP(OMPI_MATCH_TIME, &usecs); + return NULL; } while(true); } @@ -593,6 +611,11 @@ match_one(mca_btl_base_module_t *btl, static mca_pml_ob1_recv_frag_t* check_cantmatch_for_match(mca_pml_ob1_comm_proc_t *proc) { mca_pml_ob1_recv_frag_t *frag; +#if SOFTWARE_EVENTS_ENABLE == 1 + opal_timer_t usecs = 0; +#endif + + SW_EVENT_TIMER_START(OMPI_OOS_MATCH_TIME, &usecs); /* search the list for a fragment from the send with sequence * number next_msg_seq_expected @@ -609,9 +632,14 @@ static mca_pml_ob1_recv_frag_t* check_cantmatch_for_match(mca_pml_ob1_comm_proc_ continue; opal_list_remove_item(&proc->frags_cant_match, (opal_list_item_t*)frag); + + SW_EVENT_TIMER_STOP(OMPI_OOS_MATCH_TIME, &usecs); + return frag; } + SW_EVENT_TIMER_STOP(OMPI_OOS_MATCH_TIME, &usecs); + return NULL; } @@ -776,6 +804,9 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl, */ append_frag_to_list(&proc->frags_cant_match, btl, hdr, segments, num_segments, NULL); + + SW_EVENT_RECORD(OMPI_OUT_OF_SEQUENCE, 1); + OB1_MATCHING_UNLOCK(&comm->matching_lock); return OMPI_SUCCESS; } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index bbc90e1e471..d4d2cba8176 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -29,6 +29,7 @@ #include "opal/mca/mpool/mpool.h" #include "opal/util/arch.h" +#include "ompi/runtime/ompi_software_events.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/bml/bml.h" #include "pml_ob1_comm.h" @@ -198,7 +199,10 @@ static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t r assert ((uint64_t) rdma_size == frag->rdma_length); /* check completion status */ + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, rdma_size); + SW_EVENT_USER_OR_MPI(recvreq->req_recv.req_base.req_tag, (long long)rdma_size, OMPI_BYTES_RECEIVED_USER, OMPI_BYTES_RECEIVED_MPI); + if (recv_request_pml_complete_check(recvreq) == false && recvreq->req_rdma_offset < recvreq->req_send_offset) { /* schedule additional rdma operations */ @@ -242,6 +246,8 @@ int mca_pml_ob1_recv_request_ack_send_btl( des->des_cbfunc = mca_pml_ob1_recv_ctl_completion; rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_ACK); + SW_EVENT_RECORD(OMPI_BYTES_RECEIVED_MPI, (long long)size); + if( OPAL_LIKELY( rc >= 0 ) ) { return OMPI_SUCCESS; } @@ -374,6 +380,7 @@ static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_ } else { /* is receive request complete */ OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length); + SW_EVENT_USER_OR_MPI(recvreq->req_recv.req_base.req_tag, (long long)frag->rdma_length, OMPI_BYTES_RECEIVED_USER, OMPI_BYTES_RECEIVED_MPI); /* TODO: re-add order */ mca_pml_ob1_send_fin (recvreq->req_recv.req_base.req_proc, bml_btl, frag->rdma_hdr.hdr_rget.hdr_frag, @@ -429,6 +436,10 @@ static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag) /* send rdma request to peer */ rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT); + + /* Increment counter for bytes_put even though they probably haven't all been received yet */ + SW_EVENT_RECORD(OMPI_BYTES_PUT, frag->rdma_length); + if (OPAL_UNLIKELY(rc < 0)) { mca_bml_base_free (bml_btl, ctl); return rc; @@ -470,6 +481,10 @@ int mca_pml_ob1_recv_request_get_frag (mca_pml_ob1_rdma_frag_t *frag) rc = mca_bml_base_get (bml_btl, frag->local_address, frag->remote_address, local_handle, (mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length, 0, MCA_BTL_NO_ORDER, mca_pml_ob1_rget_completion, frag); + + /* Increment counter for bytes_get even though they probably haven't all been received yet */ + SW_EVENT_RECORD(OMPI_BYTES_GET, frag->rdma_length); + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE); } @@ -525,6 +540,8 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq ); OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); + SW_EVENT_USER_OR_MPI(recvreq->req_recv.req_base.req_tag, (long long)bytes_received, OMPI_BYTES_RECEIVED_USER, OMPI_BYTES_RECEIVED_MPI); + /* check completion status */ if(recv_request_pml_complete_check(recvreq) == false && recvreq->req_rdma_offset < recvreq->req_send_offset) { @@ -602,6 +619,7 @@ void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl, des->des_cbfunc(NULL, NULL, des, 0); OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); + SW_EVENT_USER_OR_MPI(recvreq->req_recv.req_base.req_tag, (long long)bytes_received, OMPI_BYTES_RECEIVED_USER, OMPI_BYTES_RECEIVED_MPI); /* check completion status */ if(recv_request_pml_complete_check(recvreq) == false && @@ -816,6 +834,7 @@ void mca_pml_ob1_recv_request_progress_rndv( mca_pml_ob1_recv_request_t* recvreq recvreq->req_recv.req_base.req_datatype); ); OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); + SW_EVENT_USER_OR_MPI(recvreq->req_recv.req_base.req_tag, (long long)bytes_received, OMPI_BYTES_RECEIVED_USER, OMPI_BYTES_RECEIVED_MPI); } /* check completion status */ if(recv_request_pml_complete_check(recvreq) == false && @@ -886,6 +905,9 @@ void mca_pml_ob1_recv_request_progress_match( mca_pml_ob1_recv_request_t* recvre * for this request. */ recvreq->req_bytes_received += bytes_received; + + SW_EVENT_USER_OR_MPI(recvreq->req_recv.req_base.req_tag, (long long)bytes_received, OMPI_BYTES_RECEIVED_USER, OMPI_BYTES_RECEIVED_MPI); + recv_request_pml_complete(recvreq); } diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index f358d733dab..3e2f4e1582f 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -29,6 +29,7 @@ #include "ompi_config.h" #include "opal/prefetch.h" #include "opal/mca/mpool/mpool.h" +#include "ompi/runtime/ompi_software_events.h" #include "ompi/constants.h" #include "ompi/mca/pml/pml.h" #include "pml_ob1.h" @@ -39,7 +40,6 @@ #include "ompi/mca/bml/base/base.h" #include "ompi/memchecker.h" - OBJ_CLASS_INSTANCE(mca_pml_ob1_send_range_t, opal_free_list_item_t, NULL, NULL); @@ -206,6 +206,8 @@ mca_pml_ob1_rndv_completion_request( mca_bml_base_btl_t* bml_btl, } OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); + SW_EVENT_USER_OR_MPI(sendreq->req_send.req_base.req_tag, req_bytes_delivered, + OMPI_BYTES_SENT_USER, OMPI_BYTES_SENT_MPI); /* advance the request */ OPAL_THREAD_ADD32(&sendreq->req_state, -1); @@ -262,6 +264,8 @@ mca_pml_ob1_rget_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length) /* count bytes of user data actually delivered and check for request completion */ if (OPAL_LIKELY(0 < rdma_length)) { OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, (size_t) rdma_length); + SW_EVENT_USER_OR_MPI(sendreq->req_send.req_base.req_tag, rdma_length, + OMPI_BYTES_SENT_USER, OMPI_BYTES_SENT_MPI); } send_request_pml_complete_check(sendreq); @@ -315,6 +319,8 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl, OPAL_THREAD_ADD32(&sendreq->req_pipeline_depth, -1); OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); + SW_EVENT_USER_OR_MPI(sendreq->req_send.req_base.req_tag, req_bytes_delivered, + OMPI_BYTES_SENT_USER, OMPI_BYTES_SENT_MPI); if(send_request_pml_complete_check(sendreq) == false) { mca_pml_ob1_send_request_schedule(sendreq); @@ -350,6 +356,7 @@ mca_pml_ob1_copy_frag_completion( mca_btl_base_module_t* btl, * we just abort. In theory, a new queue could be created to hold this * fragment and then attempt to send it out on another BTL. */ rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG); + if(OPAL_UNLIKELY(rc < 0)) { opal_output(0, "%s:%d FATAL", __FILE__, __LINE__); ompi_rte_abort(-1, NULL); @@ -449,6 +456,7 @@ int mca_pml_ob1_send_request_start_buffered( /* send */ rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RNDV); + if( OPAL_LIKELY( rc >= 0 ) ) { if( OPAL_LIKELY( 1 == rc ) ) { mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered); @@ -495,8 +503,10 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, MCA_PML_OB1_HDR_TYPE_MATCH, &des); + if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) { /* signal request completion */ + SW_EVENT_USER_OR_MPI(sendreq->req_send.req_base.req_tag, size, OMPI_BYTES_SENT_USER, OMPI_BYTES_SENT_MPI); send_request_pml_complete(sendreq); return OMPI_SUCCESS; } @@ -567,6 +577,10 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq, /* send */ rc = mca_bml_base_send_status(bml_btl, des, MCA_PML_OB1_HDR_TYPE_MATCH); + + SW_EVENT_USER_OR_MPI(sendreq->req_send.req_base.req_tag, size, + OMPI_BYTES_SENT_USER, OMPI_BYTES_SENT_MPI); + if( OPAL_LIKELY( rc >= OPAL_SUCCESS ) ) { if( OPAL_LIKELY( 1 == rc ) ) { mca_pml_ob1_match_completion_free_request( bml_btl, sendreq ); @@ -627,6 +641,10 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq, /* send */ rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_MATCH); + + SW_EVENT_USER_OR_MPI(sendreq->req_send.req_base.req_tag, size, + OMPI_BYTES_SENT_USER, OMPI_BYTES_SENT_MPI); + if( OPAL_LIKELY( rc >= OPAL_SUCCESS ) ) { if( OPAL_LIKELY( 1 == rc ) ) { mca_pml_ob1_match_completion_free_request( bml_btl, sendreq ); @@ -731,6 +749,7 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq, /* send */ rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RGET); + if (OPAL_UNLIKELY(rc < 0)) { mca_bml_base_free(bml_btl, des); return rc; @@ -811,6 +830,7 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq, /* send */ rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RNDV); + if( OPAL_LIKELY( rc >= 0 ) ) { if( OPAL_LIKELY( 1 == rc ) ) { mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, size ); @@ -1055,6 +1075,7 @@ mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq) /* initiate send - note that this may complete before the call returns */ rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG); + if( OPAL_LIKELY(rc >= 0) ) { /* update state */ range->range_btls[btl_idx].length -= size; @@ -1127,6 +1148,8 @@ static void mca_pml_ob1_put_completion (mca_btl_base_module_t* btl, struct mca_b /* check for request completion */ OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length); + SW_EVENT_USER_OR_MPI(sendreq->req_send.req_base.req_tag, frag->rdma_length, + OMPI_BYTES_SENT_USER, OMPI_BYTES_SENT_MPI); send_request_pml_complete_check(sendreq); } else { @@ -1177,6 +1200,10 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag ) rc = mca_bml_base_put (bml_btl, frag->local_address, frag->remote_address, local_handle, (mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length, 0, MCA_BTL_NO_ORDER, mca_pml_ob1_put_completion, frag); + + /* Count the bytes put even though they probably haven't been sent yet */ + SW_EVENT_RECORD(OMPI_BYTES_PUT, frag->rdma_length); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { mca_pml_ob1_send_request_put_frag_failed (frag, rc); return rc; diff --git a/ompi/mpi/c/allgather.c b/ompi/mpi/c/allgather.c index 41df7adf386..07d62deb282 100644 --- a/ompi/mpi/c/allgather.c +++ b/ompi/mpi/c/allgather.c @@ -32,6 +32,7 @@ #include "ompi/errhandler/errhandler.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/memchecker.h" +#include "ompi/runtime/ompi_software_events.h" #if OMPI_BUILD_MPI_PROFILING #if OPAL_HAVE_WEAK_SYMBOLS @@ -49,6 +50,8 @@ int MPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, { int err; + SW_EVENT_RECORD(OMPI_ALLGATHER, 1); + MEMCHECKER( int rank; ptrdiff_t ext; diff --git a/ompi/mpi/c/allreduce.c b/ompi/mpi/c/allreduce.c index edfb7020c00..f485e202965 100644 --- a/ompi/mpi/c/allreduce.c +++ b/ompi/mpi/c/allreduce.c @@ -31,6 +31,7 @@ #include "ompi/datatype/ompi_datatype.h" #include "ompi/op/op.h" #include "ompi/memchecker.h" +#include "ompi/runtime/ompi_software_events.h" #if OMPI_BUILD_MPI_PROFILING #if OPAL_HAVE_WEAK_SYMBOLS @@ -47,6 +48,8 @@ int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, { int err; + SW_EVENT_RECORD(OMPI_ALLREDUCE, 1); + MEMCHECKER( memchecker_datatype(datatype); memchecker_comm(comm); diff --git a/ompi/mpi/c/alltoall.c b/ompi/mpi/c/alltoall.c index c31bb724205..5001293d117 100644 --- a/ompi/mpi/c/alltoall.c +++ b/ompi/mpi/c/alltoall.c @@ -33,6 +33,7 @@ #include "ompi/errhandler/errhandler.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/memchecker.h" +#include "ompi/runtime/ompi_software_events.h" #if OMPI_BUILD_MPI_PROFILING #if OPAL_HAVE_WEAK_SYMBOLS @@ -51,6 +52,8 @@ int MPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, int err; size_t recvtype_size; + SW_EVENT_RECORD(OMPI_ALLTOALL, 1); + MEMCHECKER( memchecker_comm(comm); if (MPI_IN_PLACE != sendbuf) { diff --git a/ompi/mpi/c/bcast.c b/ompi/mpi/c/bcast.c index 6715aff90de..ea0613e4ad6 100644 --- a/ompi/mpi/c/bcast.c +++ b/ompi/mpi/c/bcast.c @@ -26,6 +26,7 @@ #include "ompi/errhandler/errhandler.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/memchecker.h" +#include "ompi/runtime/ompi_software_events.h" #if OMPI_BUILD_MPI_PROFILING #if OPAL_HAVE_WEAK_SYMBOLS @@ -36,12 +37,13 @@ static const char FUNC_NAME[] = "MPI_Bcast"; - int MPI_Bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { int err; + SW_EVENT_RECORD(OMPI_BCAST, 1); + MEMCHECKER( memchecker_datatype(datatype); memchecker_comm(comm); @@ -110,5 +112,6 @@ int MPI_Bcast(void *buffer, int count, MPI_Datatype datatype, err = comm->c_coll->coll_bcast(buffer, count, datatype, root, comm, comm->c_coll->coll_bcast_module); + OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME); } diff --git a/ompi/mpi/c/finalize.c b/ompi/mpi/c/finalize.c index b640c6cec11..a8e503d0050 100644 --- a/ompi/mpi/c/finalize.c +++ b/ompi/mpi/c/finalize.c @@ -23,6 +23,7 @@ #include "ompi/mpi/c/bindings.h" #include "ompi/runtime/params.h" #include "ompi/errhandler/errhandler.h" +#include "ompi/runtime/ompi_software_events.h" #if OMPI_BUILD_MPI_PROFILING #if OPAL_HAVE_WEAK_SYMBOLS @@ -36,6 +37,53 @@ static const char FUNC_NAME[] = "MPI_Finalize"; int MPI_Finalize(void) { + /* If --with-spc was specified, print all of the final SPC values + * aggregated across the whole MPI run. + */ +#if SOFTWARE_EVENTS_ENABLE == 1 + int i, j, rank, world_size, offset; + long long *recv_buffer, *send_buffer; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + if(rank == 0){ + send_buffer = (long long*)malloc(OMPI_NUM_COUNTERS * sizeof(long long)); + recv_buffer = (long long*)malloc(world_size * OMPI_NUM_COUNTERS * sizeof(long long)); + for(i = 0; i < OMPI_NUM_COUNTERS; i++){ + send_buffer[i] = events[i].value; + } + MPI_Gather(send_buffer, OMPI_NUM_COUNTERS, MPI_LONG_LONG, recv_buffer, OMPI_NUM_COUNTERS, MPI_LONG_LONG, 0, MPI_COMM_WORLD); + } + else{ + send_buffer = (long long*)malloc(OMPI_NUM_COUNTERS * sizeof(long long)); + for(i = 0; i < OMPI_NUM_COUNTERS; i++){ + send_buffer[i] = events[i].value; + } + MPI_Gather(send_buffer, OMPI_NUM_COUNTERS, MPI_LONG_LONG, recv_buffer, OMPI_NUM_COUNTERS, MPI_LONG_LONG, 0, MPI_COMM_WORLD); + } + + if(rank == 0){ + fprintf(stdout, "OMPI Software Counters:\n"); + offset = 0; + for(j = 0; j < world_size; j++){ + fprintf(stdout, "World Rank %d:\n", j); + for(i = 0; i < OMPI_NUM_COUNTERS; i++){ + fprintf(stdout, "%s -> %lld\n", events[i].name, recv_buffer[offset+i]); + } + fprintf(stdout, "\n"); + offset += OMPI_NUM_COUNTERS; + } + free(recv_buffer); + free(send_buffer); + } + else{ + free(send_buffer); + } + + MPI_Barrier(MPI_COMM_WORLD); +#endif + OPAL_CR_FINALIZE_LIBRARY(); if (MPI_PARAM_CHECK) { diff --git a/ompi/mpi/c/gather.c b/ompi/mpi/c/gather.c index 03a7bf63860..4fd06446609 100644 --- a/ompi/mpi/c/gather.c +++ b/ompi/mpi/c/gather.c @@ -32,6 +32,7 @@ #include "ompi/errhandler/errhandler.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/memchecker.h" +#include "ompi/runtime/ompi_software_events.h" #if OMPI_BUILD_MPI_PROFILING #if OPAL_HAVE_WEAK_SYMBOLS @@ -49,6 +50,8 @@ int MPI_Gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, { int err; + SW_EVENT_RECORD(OMPI_GATHER, 1); + MEMCHECKER( int rank; ptrdiff_t ext; diff --git a/ompi/mpi/c/init.c b/ompi/mpi/c/init.c index d316fb743d2..da378dbf048 100644 --- a/ompi/mpi/c/init.c +++ b/ompi/mpi/c/init.c @@ -25,6 +25,7 @@ #include #include "opal/util/show_help.h" +#include "ompi/runtime/ompi_software_events.h" #include "ompi/mpi/c/bindings.h" #include "ompi/communicator/communicator.h" #include "ompi/errhandler/errhandler.h" @@ -83,5 +84,7 @@ int MPI_Init(int *argc, char ***argv) OPAL_CR_INIT_LIBRARY(); + SW_EVENT_INIT(); + return MPI_SUCCESS; } diff --git a/ompi/mpi/c/init_thread.c b/ompi/mpi/c/init_thread.c index 38c6d7b7a81..061d6d99c4b 100644 --- a/ompi/mpi/c/init_thread.c +++ b/ompi/mpi/c/init_thread.c @@ -26,6 +26,7 @@ #include "ompi_config.h" #include "opal/util/show_help.h" +#include "ompi/runtime/ompi_software_events.h" #include "ompi/mpi/c/bindings.h" #include "ompi/runtime/params.h" #include "ompi/communicator/communicator.h" @@ -48,6 +49,8 @@ int MPI_Init_thread(int *argc, char ***argv, int required, { int err; + SW_EVENT_INIT(); + ompi_hook_base_mpi_init_thread_top(argc, argv, required, provided); if ( MPI_PARAM_CHECK ) { diff --git a/ompi/mpi/c/irecv.c b/ompi/mpi/c/irecv.c index 3e66e365eb1..5b40bf0bf64 100644 --- a/ompi/mpi/c/irecv.c +++ b/ompi/mpi/c/irecv.c @@ -27,6 +27,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/request/request.h" #include "ompi/memchecker.h" +#include "ompi/runtime/ompi_software_events.h" #if OMPI_BUILD_MPI_PROFILING #if OPAL_HAVE_WEAK_SYMBOLS @@ -37,12 +38,13 @@ static const char FUNC_NAME[] = "MPI_Irecv"; - int MPI_Irecv(void *buf, int count, MPI_Datatype type, int source, int tag, MPI_Comm comm, MPI_Request *request) { int rc = MPI_SUCCESS; + SW_EVENT_RECORD(OMPI_IRECV, 1); + MEMCHECKER( memchecker_datatype(type); memchecker_comm(comm); @@ -78,5 +80,6 @@ int MPI_Irecv(void *buf, int count, MPI_Datatype type, int source, memchecker_call(&opal_memchecker_base_mem_noaccess, buf, count, type); ); rc = MCA_PML_CALL(irecv(buf,count,type,source,tag,comm,request)); + OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME); } diff --git a/ompi/mpi/c/isend.c b/ompi/mpi/c/isend.c index 5e56deed67e..fe808f3e4a4 100644 --- a/ompi/mpi/c/isend.c +++ b/ompi/mpi/c/isend.c @@ -31,6 +31,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/request/request.h" #include "ompi/memchecker.h" +#include "ompi/runtime/ompi_software_events.h" #if OMPI_BUILD_MPI_PROFILING #if OPAL_HAVE_WEAK_SYMBOLS @@ -41,12 +42,13 @@ static const char FUNC_NAME[] = "MPI_Isend"; - int MPI_Isend(const void *buf, int count, MPI_Datatype type, int dest, int tag, MPI_Comm comm, MPI_Request *request) { int rc = MPI_SUCCESS; + SW_EVENT_RECORD(OMPI_ISEND, 1); + MEMCHECKER( memchecker_datatype(type); memchecker_call(&opal_memchecker_base_isdefined, buf, count, type); diff --git a/ompi/mpi/c/recv.c b/ompi/mpi/c/recv.c index 864fdd2cdbb..118ca9e8395 100644 --- a/ompi/mpi/c/recv.c +++ b/ompi/mpi/c/recv.c @@ -27,6 +27,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/memchecker.h" #include "ompi/request/request.h" +#include "ompi/runtime/ompi_software_events.h" #if OMPI_BUILD_MPI_PROFILING #if OPAL_HAVE_WEAK_SYMBOLS @@ -37,12 +38,13 @@ static const char FUNC_NAME[] = "MPI_Recv"; - int MPI_Recv(void *buf, int count, MPI_Datatype type, int source, int tag, MPI_Comm comm, MPI_Status *status) { int rc = MPI_SUCCESS; + SW_EVENT_RECORD(OMPI_RECV, 1); + MEMCHECKER( memchecker_datatype(type); memchecker_call(&opal_memchecker_base_isaddressable, buf, count, type); diff --git a/ompi/mpi/c/reduce.c b/ompi/mpi/c/reduce.c index 92cb8024d75..76e0354a023 100644 --- a/ompi/mpi/c/reduce.c +++ b/ompi/mpi/c/reduce.c @@ -31,6 +31,7 @@ #include "ompi/datatype/ompi_datatype.h" #include "ompi/op/op.h" #include "ompi/memchecker.h" +#include "ompi/runtime/ompi_software_events.h" #if OMPI_BUILD_MPI_PROFILING #if OPAL_HAVE_WEAK_SYMBOLS @@ -47,6 +48,8 @@ int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, { int err; + SW_EVENT_RECORD(OMPI_REDUCE, 1); + MEMCHECKER( memchecker_datatype(datatype); memchecker_comm(comm); diff --git a/ompi/mpi/c/scatter.c b/ompi/mpi/c/scatter.c index 91cbf30c3dd..3564d233282 100644 --- a/ompi/mpi/c/scatter.c +++ b/ompi/mpi/c/scatter.c @@ -32,6 +32,7 @@ #include "ompi/errhandler/errhandler.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/memchecker.h" +#include "ompi/runtime/ompi_software_events.h" #if OMPI_BUILD_MPI_PROFILING #if OPAL_HAVE_WEAK_SYMBOLS @@ -49,6 +50,8 @@ int MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, { int err; + SW_EVENT_RECORD(OMPI_SCATTER, 1); + MEMCHECKER( memchecker_comm(comm); if(OMPI_COMM_IS_INTRA(comm)) { diff --git a/ompi/mpi/c/send.c b/ompi/mpi/c/send.c index d5b859aede4..b85b6803657 100644 --- a/ompi/mpi/c/send.c +++ b/ompi/mpi/c/send.c @@ -30,6 +30,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/memchecker.h" +#include "ompi/runtime/ompi_software_events.h" #if OMPI_BUILD_MPI_PROFILING #if OPAL_HAVE_WEAK_SYMBOLS @@ -40,12 +41,13 @@ static const char FUNC_NAME[] = "MPI_Send"; - int MPI_Send(const void *buf, int count, MPI_Datatype type, int dest, int tag, MPI_Comm comm) { int rc = MPI_SUCCESS; + SW_EVENT_RECORD(OMPI_SEND, 1); + MEMCHECKER( memchecker_datatype(type); memchecker_call(&opal_memchecker_base_isdefined, buf, count, type); diff --git a/ompi/runtime/Makefile.am b/ompi/runtime/Makefile.am index 427abba2674..0aa5e999c32 100644 --- a/ompi/runtime/Makefile.am +++ b/ompi/runtime/Makefile.am @@ -25,9 +25,11 @@ dist_ompidata_DATA += runtime/help-mpi-runtime.txt headers += \ runtime/mpiruntime.h \ - runtime/ompi_cr.h \ + runtime/ompi_cr.h \ runtime/params.h \ - runtime/ompi_info_support.h + runtime/ompi_info_support.h \ + runtime/ompi_software_events.h \ + runtime/papi_sde_interface.h lib@OMPI_LIBMPI_NAME@_la_SOURCES += \ runtime/ompi_mpi_abort.c \ @@ -36,5 +38,7 @@ lib@OMPI_LIBMPI_NAME@_la_SOURCES += \ runtime/ompi_mpi_finalize.c \ runtime/ompi_mpi_params.c \ runtime/ompi_mpi_preconnect.c \ - runtime/ompi_cr.c \ - runtime/ompi_info_support.c + runtime/ompi_cr.c \ + runtime/ompi_info_support.c \ + runtime/ompi_software_events.c \ + runtime/papi_sde_interface.c diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index f8376db633d..77a368d36f7 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -75,6 +75,8 @@ bool ompi_async_mpi_finalize = false; uint32_t ompi_add_procs_cutoff = OMPI_ADD_PROCS_CUTOFF_DEFAULT; bool ompi_mpi_dynamics_enabled = true; +char *ompi_mpi_spc_enable_string = NULL; + static bool show_default_mca_params = false; static bool show_file_mca_params = false; static bool show_enviro_mca_params = false; @@ -315,6 +317,14 @@ int ompi_mpi_register_params(void) MCA_BASE_VAR_SYN_FLAG_DEPRECATED); } + ompi_mpi_spc_enable_string = NULL; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_enable", + "A comma delimeted string listing the SPC counters to enable.", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_enable_string); + return OMPI_SUCCESS; } diff --git a/ompi/runtime/ompi_software_events.c b/ompi/runtime/ompi_software_events.c new file mode 100644 index 00000000000..14a077dad7c --- /dev/null +++ b/ompi/runtime/ompi_software_events.c @@ -0,0 +1,261 @@ +#include "ompi_software_events.h" + +opal_timer_t sys_clock_freq_mhz = 0; + +/* Array for converting from SPC indices to MPI_T indices */ +OMPI_DECLSPEC int mpi_t_indices[OMPI_NUM_COUNTERS] = {0}; + +/* Array of names for each counter. Used for MPI_T and PAPI sde initialization */ +OMPI_DECLSPEC const char *counter_names[OMPI_NUM_COUNTERS] = { + "OMPI_SEND", + "OMPI_RECV", + "OMPI_ISEND", + "OMPI_IRECV", + "OMPI_BCAST", + "OMPI_REDUCE", + "OMPI_ALLREDUCE", + "OMPI_SCATTER", + "OMPI_GATHER", + "OMPI_ALLTOALL", + "OMPI_ALLGATHER", + "OMPI_BYTES_RECEIVED_USER", + "OMPI_BYTES_RECEIVED_MPI", + "OMPI_BYTES_SENT_USER", + "OMPI_BYTES_SENT_MPI", + "OMPI_BYTES_PUT", + "OMPI_BYTES_GET", + "OMPI_UNEXPECTED", + "OMPI_OUT_OF_SEQUENCE", + "OMPI_MATCH_TIME", + "OMPI_OOS_MATCH_TIME" +}; + +/* Array of descriptions for each counter. Used for MPI_T and PAPI sde initialization */ +OMPI_DECLSPEC const char *counter_descriptions[OMPI_NUM_COUNTERS] = { + "The number of times MPI_Send was called.", + "The number of times MPI_Recv was called.", + "The number of times MPI_Isend was called.", + "The number of times MPI_Irecv was called.", + "The number of times MPI_Bcast was called.", + "The number of times MPI_Reduce was called.", + "The number of times MPI_Allreduce was called.", + "The number of times MPI_Scatter was called.", + "The number of times MPI_Gather was called.", + "The number of times MPI_Alltoall was called.", + "The number of times MPI_Allgather was called.", + "The number of bytes received by the user through point-to-point communications. Note: Excludes RDMA operations.", + "The number of bytes received by MPI through collective, control, or other internal communications.", + "The number of bytes sent by the user through point-to-point communications. Note: Excludes RDMA operations.", + "The number of bytes sent by MPI through collective, control, or other internal communications.", + "The number of bytes sent/received using RDMA Put operations.", + "The number of bytes sent/received using RDMA Get operations.", + "The number of messages that arrived as unexpected messages.", + "The number of messages that arrived out of the proper sequence.", + "The number of microseconds spent matching unexpected messages.", + "The number of microseconds spent matching out of sequence messages." +}; + +/* An array of integer values to denote whether an event is activated (1) or not (0) */ +OMPI_DECLSPEC unsigned int attached_event[OMPI_NUM_COUNTERS] = { 0 }; +/* An array of event structures to store the event data (name and value) */ +OMPI_DECLSPEC ompi_event_t *events = NULL; + +/* ############################################################## + * ################# Begin MPI_T Functions ###################### + * ############################################################## + */ +static int ompi_sw_event_notify(mca_base_pvar_t *pvar, mca_base_pvar_event_t event, void *obj_handle, int *count) +{ + (void)obj_handle; + + int i; + + /* For this event, we need to set count to the number of long long type + * values for this counter. All SPC counters are one long long, so we + * always set count to 1. + */ + if(MCA_BASE_PVAR_HANDLE_BIND == event) + *count = 1; + /* For this event, we need to turn on the counter */ + else if(MCA_BASE_PVAR_HANDLE_START == event){ + /* Loop over the mpi_t_inddices array and find the correct SPC index to turn on */ + for(i = 0; i < OMPI_NUM_COUNTERS; i++){ + if(pvar->pvar_index == mpi_t_indices[i]){ + attached_event[i] = 1; + break; + } + } + } + /* For this event, we need to turn off the counter */ + else if(MCA_BASE_PVAR_HANDLE_STOP == event){ + /* Loop over the mpi_t_inddices array and find the correct SPC index to turn off */ + for(i = 0; i < OMPI_NUM_COUNTERS; i++){ + if(pvar->pvar_index == mpi_t_indices[i]){ + attached_event[i] = 0; + break; + } + } + } + + return MPI_SUCCESS; +} + +/* ############################################################## + * ################# Begin SPC Functions ######################## + * ############################################################## + */ + +/* This function returns the current count of an SPC counter that has been retistered + * as an MPI_T pvar. The MPI_T index is not necessarily the same as the SPC index, + * so we need to convert from MPI_T index to SPC index and then set the 'value' argument + * to the correct value for this pvar. + */ +static int ompi_sw_event_get_count(const struct mca_base_pvar_t *pvar, void *value, void *obj_handle) +{ + (void) obj_handle; + + int i; + long long *counter_value = (long long*)value; + + for(i = 0; i < OMPI_NUM_COUNTERS; i++){ + if(pvar->pvar_index == mpi_t_indices[i]){ + /* If this is a timer-based counter, we need to convert from cycles to microseconds */ + if(i == OMPI_MATCH_TIME || i == OMPI_OOS_MATCH_TIME) + *counter_value = events[i].value / sys_clock_freq_mhz; + else + *counter_value = events[i].value; + return MPI_SUCCESS; + } + } + /* If all else fails, simply set value to 0 */ + *counter_value = 0; + return MPI_SUCCESS; +} + +/* Initializes the events data structure and allocates memory for it if needed. */ +void events_init() +{ + int i; + + /* If the events data structure hasn't been allocated yet, allocate memory for it */ + if(events == NULL){ + events = (ompi_event_t*)malloc(OMPI_NUM_COUNTERS * sizeof(ompi_event_t)); + } + /* The data structure has been allocated, so we simply initialize all of the counters + * with their names and an initial count of 0. + */ + for(i = 0; i < OMPI_NUM_COUNTERS; i++){ + events[i].name = counter_names[i]; + events[i].value = 0; + } +} + +/* Initializes the SPC data structures and registers all counters as MPI_T pvars. + * Turns on only the counters that were specified in the mpi_spc_enable MCA parameter. + */ +void ompi_sw_event_init() +{ + int i, j, ret, found = 0, all_on = 0; + + /* Initialize the clock frequency variable as the CPU's frequency in MHz */ + sys_clock_freq_mhz = opal_timer_base_get_freq() / 1000000; + + events_init(); + + /* Get the MCA params string of counters to turn on */ + char **arg_strings = opal_argv_split(ompi_mpi_spc_enable_string, ','); + int num_args = opal_argv_count(arg_strings); + + /* If there is only one argument and it is 'all', then all counters + * should be turned on. If the size is 0, then no counters will be enabled. + */ + if(num_args == 1){ + if(strcmp(arg_strings[0], "all") == 0) + all_on = 1; + } + + /* Turn on only the counters that were specified in the MCA parameter */ + for(i = 0; i < OMPI_NUM_COUNTERS; i++){ + if(all_on) + attached_event[i] = 1; + else{ + /* Note: If no arguments were given, this will be skipped */ + for(j = 0; j < num_args && found < num_args; j++){ + if(strcmp(counter_names[i], arg_strings[j]) == 0){ + attached_event[i] = 1; + found++; + } + } + } + + /* Registers the current counter as an MPI_T pvar regardless of whether it's been turned on or not */ + ret = mca_base_pvar_register("ompi", "runtime", "software_events", counter_names[i], counter_descriptions[i], + OPAL_INFO_LVL_4, MPI_T_PVAR_CLASS_SIZE, + MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG, NULL, MPI_T_BIND_NO_OBJECT, + MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, + ompi_sw_event_get_count, NULL, ompi_sw_event_notify, NULL); + /* Initialize the mpi_t_indices array with the MPI_T indices. + * The array index indicates the SPC index, while the value indicates + * the MPI_T index. + */ + if(ret != OPAL_ERROR){ + mpi_t_indices[i] = ret; + } else{ + mpi_t_indices[i] = -1; + } + } +} + +/* Frees any dynamically alocated OMPI software events data structures */ +void ompi_sw_event_fini() +{ + free(events); +} + +/* Records an update to a counter using an atomic add operation. */ +void ompi_sw_event_record(unsigned int event_id, long long value) +{ + /* Denoted unlikely because counters will often be turned off. */ + if(OPAL_UNLIKELY(attached_event[event_id] == 1)){ + OPAL_THREAD_ADD64(&(events[event_id].value), value); + } +} + +/* Starts cycle-precision timer and stores the start value in the 'cycles' argument. + * Note: This assumes that the 'cycles' argument is initialized to 0 if the timer + * hasn't been started yet. + */ +void ompi_sw_event_timer_start(unsigned int event_id, opal_timer_t *cycles) +{ + /* Check whether cycles == 0.0 to make sure the timer hasn't started yet. + * This is denoted unlikely because the counters will often be turned off. + */ + if(OPAL_UNLIKELY(attached_event[event_id] == 1 && *cycles == 0)){ + *cycles = opal_timer_base_get_cycles(); + } +} + +/* Stops a cycle-precision timer and calculates the total elapsed time + * based on the starting time in 'cycles' and stores the result in the + * 'cycles' argument. + */ +void ompi_sw_event_timer_stop(unsigned int event_id, opal_timer_t *cycles) +{ + /* This is denoted unlikely because the counters will often be turned off. */ + if(OPAL_UNLIKELY(attached_event[event_id] == 1)){ + *cycles = opal_timer_base_get_cycles() - *cycles; + OPAL_THREAD_ADD64(&events[event_id].value, (long long)*cycles); + } +} + +/* Checks a tag, and records the user version of the counter if it's greater + * than or equal to 0 and records the mpi version of the counter otherwise. + */ +void ompi_sw_event_user_or_mpi(int tag, long long value, unsigned int user_enum, unsigned int mpi_enum) +{ + if(tag >= 0){ + SW_EVENT_RECORD(user_enum, value); + } else { + SW_EVENT_RECORD(mpi_enum, value); + } +} diff --git a/ompi/runtime/ompi_software_events.h b/ompi/runtime/ompi_software_events.h new file mode 100644 index 00000000000..a0274f3df31 --- /dev/null +++ b/ompi/runtime/ompi_software_events.h @@ -0,0 +1,140 @@ +#ifndef OMPI_SOFTWARE_EVENT +#define OMPI_SOFTWARE_EVENT + +#include +#include +#include +#include + +#include "ompi/include/mpi.h" +#include "ompi/include/ompi_config.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/runtime/params.h" +#include "opal/mca/timer/timer.h" +#include "opal/mca/base/mca_base_pvar.h" +#include "opal/util/argv.h" + +#include MCA_timer_IMPLEMENTATION_HEADER + +/* INSTRUCTIONS FOR ADDING COUNTERS + * 1.) Add a new counter name in the OMPI_COUNTERS enum before + * OMPI_NUM_COUNTERS below. + * 2.) Add corresponding counter name and descriptions to the + * counter_names and counter_descriptions arrays in + * ompi_software_events.c NOTE: The names and descriptions + * MUST be in the same array location as where you added the + * counter name in step 1. + * 3.) If this counter is based on a timer, add its enum name to + * the logic for timer-based counters in the ompi_sw_event_get_count + * function in ompi_software_events.c + * 4.) Instrument the Open MPI code base where it makes sense for + * your counter to be modified using the SW_EVENT_RECORD macro. + * Note: If your counter is timer-based you should use the + * SW_EVENT_TIMER_START and SW_EVENT_TIMER_STOP macros to record + * the time in cycles to then be converted to microseconds later + * in the ompi_sw_event_get_count function when requested by MPI_T + */ + +/* This enumeration serves as event ids for the various events */ +enum OMPI_COUNTERS{ + OMPI_SEND, + OMPI_RECV, + OMPI_ISEND, + OMPI_IRECV, + OMPI_BCAST, + OMPI_REDUCE, + OMPI_ALLREDUCE, + OMPI_SCATTER, + OMPI_GATHER, + OMPI_ALLTOALL, + OMPI_ALLGATHER, + OMPI_BYTES_RECEIVED_USER, + OMPI_BYTES_RECEIVED_MPI, + OMPI_BYTES_SENT_USER, + OMPI_BYTES_SENT_MPI, + OMPI_BYTES_PUT, + OMPI_BYTES_GET, + OMPI_UNEXPECTED, + OMPI_OUT_OF_SEQUENCE, + OMPI_MATCH_TIME, + OMPI_OOS_MATCH_TIME, + OMPI_NUM_COUNTERS /* This serves as the number of counters. It must be last. */ +}; + +/* A structure for storing the event data */ +typedef struct ompi_event_s{ + char *name; + long long value; +} ompi_event_t; + +OMPI_DECLSPEC extern unsigned int attached_event[OMPI_NUM_COUNTERS]; +OMPI_DECLSPEC extern ompi_event_t *events; + +/* Events data structure initialization function */ +void events_init(void); + +/* OMPI software event (SPC) utility functions */ +void ompi_sw_event_init(void); +void ompi_sw_event_fini(void); +void ompi_sw_event_record(unsigned int event_id, long long value); +void ompi_sw_event_timer_start(unsigned int event_id, opal_timer_t *usec); +void ompi_sw_event_timer_stop(unsigned int event_id, opal_timer_t *usec); +void ompi_sw_event_user_or_mpi(int tag, long long value, unsigned int user_enum, unsigned int mpi_enum); +void ompi_sw_event_print_all(void); + +/* MPI_T utility functions */ +static int ompi_sw_event_notify(mca_base_pvar_t *pvar, mca_base_pvar_event_t event, void *obj_handle, int *count); +long long ompi_sw_event_get_counter(int counter_id); + +/* Macros for using the software event utility functions throughout the codebase. + * If SOFTWARE_EVENTS_ENABLE is not 1, the macros become no-ops. + */ +#if SOFTWARE_EVENTS_ENABLE == 1 + +#define SW_EVENT_INIT() \ + ompi_sw_event_init() + +#define SW_EVENT_FINI() \ + ompi_sw_event_fini() + +#define SW_EVENT_RECORD(event_id, value) \ + ompi_sw_event_record(event_id, value) + +#define SW_EVENT_TIMER_START(event_id, usec) \ + ompi_sw_event_timer_start(event_id, usec) + +#define SW_EVENT_TIMER_STOP(event_id, usec) \ + ompi_sw_event_timer_stop(event_id, usec) + +#define SW_EVENT_USER_OR_MPI(tag, value, enum_if_user, enum_if_mpi) \ + ompi_sw_event_user_or_mpi(tag, value, enum_if_user, enum_if_mpi) + +#define SW_EVENT_PRINT_ALL() \ + ompi_sw_event_print_all() + +#else /* Software events are not enabled */ + +#define SW_EVENT_INIT() \ + do {} while (0) + +#define SW_EVENT_FINI() \ + do {} while (0) + +#define SW_EVENT_RECORD(event_id, value) \ + do {} while (0) + +#define SW_EVENT_TIMER_START(event_id, usec) \ + do {} while (0) + +#define SW_EVENT_TIMER_STOP(event_id, usec) \ + do {} while (0) + +#define SW_EVENT_USER_OR_MPI(tag, value, enum_if_user, enum_if_mpi) \ + do {} while (0) + +#define SW_EVENT_PRINT_ALL() \ + do {} while (0) + +#endif + +#endif diff --git a/ompi/runtime/papi_sde_interface.c b/ompi/runtime/papi_sde_interface.c new file mode 100644 index 00000000000..4e8b6820156 --- /dev/null +++ b/ompi/runtime/papi_sde_interface.c @@ -0,0 +1,30 @@ +#include +#include +#include "papi_sde_interface.h" + +#pragma weak papi_sde_init +#pragma weak papi_sde_register_counter +#pragma weak papi_sde_describe_counter + +OMPI_DECLSPEC papi_handle_t +__attribute__((weak)) +papi_sde_init(char *name_of_library, int *event_count) +{ + printf("Weak papi_sde_init called from %s. Weak functions will be called papi sde functionality. If you aren't using the papi sde component, disregard this message.\n", __FILE__); + void * ptr = NULL; + return ptr; +} + +OMPI_DECLSPEC void +__attribute__((weak)) +papi_sde_register_counter(papi_handle_t handle, char *event_name, long long int *counter) +{ + /*printf("weak papi_sde_register_counter called from %s\n", __FILE__);*/ +} + +OMPI_DECLSPEC void +__attribute__((weak)) +papi_sde_describe_counter(papi_handle_t handle, char *event_name, char *event_description) +{ + /*printf("weak papi_sde_describe_counter called from %s\n", __FILE__);*/ +} diff --git a/ompi/runtime/papi_sde_interface.h b/ompi/runtime/papi_sde_interface.h new file mode 100644 index 00000000000..54154f8a186 --- /dev/null +++ b/ompi/runtime/papi_sde_interface.h @@ -0,0 +1,15 @@ +#ifndef PAPI_SDE_INTERFACE_H +#define PAPI_SDE_INTERFACE_H + +#include "ompi/include/ompi_config.h" + +// interface to papi SDE functions +typedef void* papi_handle_t; +papi_handle_t papi_sde_init(char *name_of_library, int *event_count); +void papi_sde_register_counter(papi_handle_t handle, char *event_name, long long int *counter); +void papi_sde_describe_counter(papi_handle_t handle, char *event_name, char *event_description ); + +// required for papi_native_avail +void* papi_sde_hook_list_events( void ); + +#endif diff --git a/ompi/runtime/params.h b/ompi/runtime/params.h index 5716e142523..399b4ac6da4 100644 --- a/ompi/runtime/params.h +++ b/ompi/runtime/params.h @@ -141,6 +141,14 @@ OMPI_DECLSPEC extern bool ompi_async_mpi_init; /* EXPERIMENTAL: do not perform an RTE barrier at the beginning of MPI_Finalize */ OMPI_DECLSPEC extern bool ompi_async_mpi_finalize; +/** + * Whether or not to print the MCA parameters to a file or to stdout + * + * If this argument is set then it is used when parameters are dumped + * when the mpi_show_mca_params is set. + */ +OMPI_DECLSPEC extern char * ompi_mpi_spc_enable_string; + /** * Register MCA parameters used by the MPI layer. diff --git a/opal/mca/base/mca_base_pvar.h b/opal/mca/base/mca_base_pvar.h index 44f23b3dfc1..e305cbcbfd6 100644 --- a/opal/mca/base/mca_base_pvar.h +++ b/opal/mca/base/mca_base_pvar.h @@ -319,7 +319,7 @@ OPAL_DECLSPEC int mca_base_pvar_register (const char *project, const char *frame * associated with a component. * * While quite similar to mca_base_pvar_register(), there is one key - * difference: pvars registered this this function will automatically + * difference: pvars registered with this function will automatically * be unregistered / made unavailable when that component is closed by * its framework. */ diff --git a/opal/runtime/Makefile.am b/opal/runtime/Makefile.am index fab8ead6104..5bef9322ae7 100644 --- a/opal/runtime/Makefile.am +++ b/opal/runtime/Makefile.am @@ -46,3 +46,4 @@ lib@OPAL_LIB_PREFIX@open_pal_la_SOURCES += \ runtime/opal_cr.c \ runtime/opal_info_support.c \ runtime/opal_progress_threads.c + diff --git a/opal/threads/wait_sync.c b/opal/threads/wait_sync.c index 92b6096406c..9b5d75ece71 100644 --- a/opal/threads/wait_sync.c +++ b/opal/threads/wait_sync.c @@ -94,8 +94,10 @@ int ompi_sync_wait_mt(ompi_wait_sync_t *sync) /* In case I am the progress manager, pass the duties on */ if( sync == wait_sync_list ) { wait_sync_list = (sync == sync->next) ? NULL : sync->next; - if( NULL != wait_sync_list ) + if( NULL != wait_sync_list ){ + /* This is a possible placement for an MPI_T progress switch counter */ WAIT_SYNC_PASS_OWNERSHIP(wait_sync_list); + } } OPAL_THREAD_UNLOCK(&wait_sync_lock);