diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4 index fd7816e3ea7..68cad854513 100644 --- a/config/opal_check_cuda.m4 +++ b/config/opal_check_cuda.m4 @@ -55,6 +55,8 @@ AS_IF([test "$with_cuda" = "no" || test "x$with_cuda" = "x"], AC_MSG_ERROR([Cannot continue])], [AC_MSG_RESULT([found]) opal_check_cuda_happy=yes + opal_cuda_prefix=/usr/local/ + opal_cuda_libdir=/usr/local/cuda/lib64 opal_cuda_incdir=/usr/local/cuda/include])], [AS_IF([test ! -d "$with_cuda"], [AC_MSG_RESULT([not found]) @@ -66,10 +68,14 @@ AS_IF([test "$with_cuda" = "no" || test "x$with_cuda" = "x"], AC_MSG_WARN([Could not find cuda.h in $with_cuda/include or $with_cuda]) AC_MSG_ERROR([Cannot continue])], [opal_check_cuda_happy=yes + opal_cuda_prefix=$with_cuda opal_cuda_incdir=$with_cuda + opal_cuda_libdir="$with_cuda/lib64" AC_MSG_RESULT([found ($with_cuda/cuda.h)])])], [opal_check_cuda_happy=yes + opal_cuda_prefix="$with_cuda" opal_cuda_incdir="$with_cuda/include" + opal_cuda_libdir="$with_cuda/lib64" AC_MSG_RESULT([found ($opal_cuda_incdir/cuda.h)])])])])]) dnl We cannot have CUDA support without dlopen support. HOWEVER, at @@ -119,6 +125,8 @@ if test "$opal_check_cuda_happy" = "yes"; then CUDA_SUPPORT=1 opal_datatype_cuda_CPPFLAGS="-I$opal_cuda_incdir" AC_SUBST([opal_datatype_cuda_CPPFLAGS]) + opal_datatype_cuda_LDFLAGS="-L$opal_cuda_libdir" + AC_SUBST([opal_datatype_cuda_LDFLAGS]) else AC_MSG_RESULT([no]) CUDA_SUPPORT=0 @@ -144,6 +152,14 @@ AM_CONDITIONAL([OPAL_cuda_gdr_support], [test "x$CUDA_VERSION_60_OR_GREATER" = " AC_DEFINE_UNQUOTED([OPAL_CUDA_GDR_SUPPORT],$CUDA_VERSION_60_OR_GREATER, [Whether we have CUDA GDR support available]) +# Checking for nvcc +AC_MSG_CHECKING([nvcc in $opal_cuda_prefix/bin]) +if test -x "$opal_cuda_prefix/bin/nvcc"; then + AC_MSG_RESULT([found]) + AC_DEFINE_UNQUOTED([NVCC], ["$opal_cuda_prefix/bin/nvcc"], [Path to nvcc binary]) +fi + +AC_SUBST([NVCC],[$opal_cuda_prefix/bin/nvcc]) ]) dnl diff --git a/configure.ac b/configure.ac index f1b53d166a0..24ce9a8b7b3 100644 --- a/configure.ac +++ b/configure.ac @@ -1416,6 +1416,10 @@ m4_ifdef([project_oshmem], opal_show_subtitle "Final output" +if test "$OPAL_cuda_support" != "0"; then + AC_CONFIG_FILES([opal/datatype/cuda/Makefile]) +fi + AC_CONFIG_FILES([ Makefile diff --git a/ompi/mca/bml/bml.h b/ompi/mca/bml/bml.h index df731a64a04..3770dbd4584 100644 --- a/ompi/mca/bml/bml.h +++ b/ompi/mca/bml/bml.h @@ -361,6 +361,15 @@ static inline void mca_bml_base_deregister_mem (mca_bml_base_btl_t* bml_btl, mca btl->btl_deregister_mem (btl, handle); } +static inline void mca_bml_base_register_convertor (mca_bml_base_btl_t* bml_btl, mca_btl_base_registration_handle_t *handle, opal_convertor_t *convertor) +{ + mca_btl_base_module_t* btl = bml_btl->btl; + + if (btl->btl_register_convertor != NULL) { + btl->btl_register_convertor (btl, handle, convertor); + } +} + /* * BML component interface functions and datatype. 
*/ diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c index e922c18d8f2..d038630a14d 100644 --- a/ompi/mca/pml/ob1/pml_ob1_component.c +++ b/ompi/mca/pml/ob1/pml_ob1_component.c @@ -184,7 +184,7 @@ static int mca_pml_ob1_component_register(void) mca_pml_ob1_param_register_int("free_list_max", -1, &mca_pml_ob1.free_list_max); mca_pml_ob1_param_register_int("free_list_inc", 64, &mca_pml_ob1.free_list_inc); mca_pml_ob1_param_register_int("priority", 20, &mca_pml_ob1.priority); - mca_pml_ob1_param_register_sizet("send_pipeline_depth", 3, &mca_pml_ob1.send_pipeline_depth); + mca_pml_ob1_param_register_sizet("send_pipeline_depth", 4, &mca_pml_ob1.send_pipeline_depth); mca_pml_ob1_param_register_sizet("recv_pipeline_depth", 4, &mca_pml_ob1.recv_pipeline_depth); /* NTH: we can get into a live-lock situation in the RDMA failure path so disable diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 8f3985a0cb1..ca3fa2713be 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -37,11 +37,22 @@ #include "ompi/mca/bml/base/base.h" #include "ompi/memchecker.h" +#include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/common/cuda/common_cuda.h" + size_t mca_pml_ob1_rdma_cuda_btls( mca_bml_base_endpoint_t* bml_endpoint, unsigned char* base, size_t size, mca_pml_ob1_com_btl_t* rdma_btls); + +int mca_pml_ob1_rdma_cuda_btl_register_data( + mca_bml_base_endpoint_t* bml_endpoint, + mca_pml_ob1_com_btl_t* rdma_btls, + uint32_t num_btls_used, + struct opal_convertor_t *pack_convertor); + +size_t mca_pml_ob1_rdma_cuda_avail(mca_bml_base_endpoint_t* bml_endpoint); int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -54,18 +65,21 @@ void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl, int32_t */ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_btl, - size_t size) { + size_t size) +{ + struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); int rc; -#if OPAL_CUDA_GDR_SUPPORT - /* With some BTLs, switch to RNDV from RGET at large messages */ - if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) && - (sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) { - return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); - } -#endif /* OPAL_CUDA_GDR_SUPPORT */ sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; + if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) { +#if OPAL_CUDA_GDR_SUPPORT + /* With some BTLs, switch to RNDV from RGET at large messages */ + if ((sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) { + sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; + return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); + } +#endif /* OPAL_CUDA_GDR_SUPPORT */ unsigned char *base; opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base ); /* Set flag back */ @@ -75,6 +89,14 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, base, sendreq->req_send.req_bytes_packed, sendreq->req_rdma))) { + + rc = mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_endpoint, + sendreq->req_rdma, sendreq->req_rdma_cnt, + convertor); + if (rc != 0) { + OPAL_OUTPUT_VERBOSE((0, mca_common_cuda_output, "Failed to register convertor, rc= %d\n", 
rc)); + return rc; + } rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { @@ -91,14 +113,90 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, } else { /* Do not send anything with first rendezvous message as copying GPU * memory into RNDV message is expensive. */ + unsigned char *base; + size_t buffer_size = 0; sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; + + /* cuda kernel support is not enabled */ + if (opal_datatype_cuda_kernel_support == 0) { + rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); + return rc; + } + /* cuda kernel support is enabled */ + if ((bml_btl->btl->btl_cuda_ddt_allow_rdma == 1) && + (mca_pml_ob1_rdma_cuda_avail(sendreq->req_endpoint) != 0)) { + + if (convertor->local_size > bml_btl->btl->btl_cuda_ddt_pipeline_size) { + buffer_size = bml_btl->btl->btl_cuda_ddt_pipeline_size * bml_btl->btl->btl_cuda_ddt_pipeline_depth; + } else { + buffer_size = convertor->local_size; + } + base = opal_cuda_malloc_gpu_buffer(buffer_size, 0); + if (NULL == base) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + convertor->gpu_buffer_ptr = base; + convertor->gpu_buffer_size = buffer_size; + sendreq->req_send.req_bytes_packed = convertor->local_size; + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, + "RDMA malloc GPU BUFFER %p for pack, local size %lu, " + "pipeline size %lu, depth %d\n", + base, convertor->local_size, bml_btl->btl->btl_cuda_ddt_pipeline_size, + bml_btl->btl->btl_cuda_ddt_pipeline_depth)); + if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( + sendreq->req_endpoint, + base, + sendreq->req_send.req_bytes_packed, + sendreq->req_rdma))) { + + rc = mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_endpoint, + sendreq->req_rdma, sendreq->req_rdma_cnt, + convertor); + if (rc != 0) { + OPAL_OUTPUT_VERBOSE((0, mca_common_cuda_output, "Failed to register convertor, rc= %d\n", rc)); + return rc; + } + convertor->flags |= CONVERTOR_CUDA_ASYNC; + rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, + sendreq->req_send.req_bytes_packed); + + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_pml_ob1_free_rdma_resources(sendreq); + } + return rc; /* ready to return */ + } else { + /* We failed to use the last GPU buffer, release it and realloc it with the new size */ + opal_cuda_free_gpu_buffer(base, 0); + } + } + /* In all other cases fall-back on copy in/out protocol */ + if (bml_btl->btl->btl_cuda_max_send_size != 0) { + convertor->pipeline_size = bml_btl->btl->btl_cuda_max_send_size; + } else { + convertor->pipeline_size = bml_btl->btl->btl_max_send_size; + } + convertor->pipeline_depth = mca_pml_ob1.send_pipeline_depth; + if (convertor->local_size > convertor->pipeline_size) { + buffer_size = convertor->pipeline_size * convertor->pipeline_depth; + } else { + buffer_size = convertor->local_size; + } + base = opal_cuda_malloc_gpu_buffer(buffer_size, 0); + if (NULL == base) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, + "Copy in/out malloc GPU buffer %p, pipeline_size %ld\n", + base, convertor->pipeline_size)); + convertor->gpu_buffer_ptr = base; + convertor->gpu_buffer_size = buffer_size; + convertor->pipeline_seq = 0; rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); } + return rc; } - - size_t mca_pml_ob1_rdma_cuda_btls( mca_bml_base_endpoint_t* bml_endpoint, unsigned char* base, @@ 
-152,6 +250,55 @@ size_t mca_pml_ob1_rdma_cuda_btls( return num_btls_used; } +int mca_pml_ob1_rdma_cuda_btl_register_data( + mca_bml_base_endpoint_t* bml_endpoint, + mca_pml_ob1_com_btl_t* rdma_btls, + uint32_t num_btls_used, + struct opal_convertor_t *pack_convertor) +{ + uint32_t i; + for (i = 0; i < num_btls_used; i++) { + mca_btl_base_registration_handle_t *handle = rdma_btls[i].btl_reg; + mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, i); + mca_bml_base_register_convertor(bml_btl, handle, pack_convertor); + } + return 0; +} + +/* return how many btl can have RDMA support */ +size_t mca_pml_ob1_rdma_cuda_avail(mca_bml_base_endpoint_t* bml_endpoint) +{ + int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); + double weight_total = 0; + int num_btls_used = 0, n; + + /* shortcut when there are no rdma capable btls */ + if(num_btls == 0) { + return 0; + } + + /* check if GET is supported by the BTL */ + for(n = 0; + (n < num_btls) && (num_btls_used < mca_pml_ob1.max_rdma_per_request); + n++) { + mca_bml_base_btl_t* bml_btl = + mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n); + + if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) { + weight_total += bml_btl->btl_weight; + num_btls_used++; + } + } + + /* if we don't use leave_pinned and all BTLs that already have this memory + * registered amount to less then half of available bandwidth - fall back to + * pipeline protocol */ + if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5)) + return 0; + + return num_btls_used; +} + int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl) { diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index c8cbbdfa491..cf92ae14b60 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -547,16 +547,50 @@ void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvr size_t num_segments, mca_btl_base_descriptor_t* des) { - int result; size_t bytes_received = 0, data_offset = 0; size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_OB1_RECV_REQUEST_UNPACK */ mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; + opal_convertor_t *convertor = &(recvreq)->req_recv.req_base.req_convertor; + void *cuda_stream = NULL; + int opal_datatype_use_kernel = 0; + int result; OPAL_OUTPUT((-1, "start_frag_copy frag=%p", (void *)des)); + data_offset = hdr->hdr_frag.hdr_frag_offset; bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments, sizeof(mca_pml_ob1_frag_hdr_t)); - data_offset = hdr->hdr_frag.hdr_frag_offset; + + if (opal_datatype_cuda_kernel_support && (convertor->flags & CONVERTOR_CUDA_ASYNC)) { + convertor->flags &= ~CONVERTOR_CUDA; + if (opal_convertor_need_buffers(convertor) == true) { + opal_datatype_use_kernel = 1; + convertor->stream = mca_common_cuda_get_htod_stream(); + /* some how async support is just enabled, part of convertor is unpacked */ + if (convertor->pipeline_depth == 0 && convertor->gpu_buffer_ptr != NULL) { + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); + convertor->gpu_buffer_ptr = NULL; + } + if (convertor->gpu_buffer_ptr == NULL) { + size_t buffer_size = 0; + convertor->pipeline_size = btl->btl_max_send_size; + convertor->pipeline_depth = mca_pml_ob1.recv_pipeline_depth; + if (convertor->local_size > convertor->pipeline_size) { + buffer_size = convertor->pipeline_size * convertor->pipeline_depth; + } else { + 
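+                /* whole message fits in a single pipeline fragment, so only its exact size is needed */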
buffer_size = convertor->local_size; + } + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Malloc GPU buffer size %lu for frag_copy_start\n", buffer_size)); + convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(buffer_size, 0); + if (NULL == convertor->gpu_buffer_ptr) { + return; + } + convertor->gpu_buffer_size = buffer_size; + convertor->pipeline_seq = 0; + } + } + convertor->flags |= CONVERTOR_CUDA; + } MCA_PML_OB1_RECV_REQUEST_UNPACK( recvreq, segments, @@ -565,6 +599,11 @@ void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvr data_offset, bytes_received, bytes_delivered ); + + if (opal_datatype_use_kernel == 1) { + convertor->pipeline_seq ++; + convertor->pipeline_seq = convertor->pipeline_seq % convertor->pipeline_depth; + } /* Store the receive request in unused context pointer. */ des->des_context = (void *)recvreq; /* Store the amount of bytes in unused cbdata pointer */ @@ -572,7 +611,8 @@ void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvr /* Then record an event that will get triggered by a PML progress call which * checks the stream events. If we get an error, abort. Should get message * from CUDA code about what went wrong. */ - result = mca_common_cuda_record_htod_event("pml", des); + result = mca_common_cuda_record_htod_event("pml", des, cuda_stream); + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Record h2d cuda event\n")); if (OMPI_SUCCESS != result) { opal_output(0, "%s:%d FATAL", __FILE__, __LINE__); ompi_rte_abort(-1, NULL); @@ -608,6 +648,15 @@ void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl, /* schedule additional rdma operations */ mca_pml_ob1_recv_request_schedule(recvreq, NULL); } + if(recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed) { + opal_convertor_t *convertor = &(recvreq)->req_recv.req_base.req_convertor; + if (convertor->gpu_buffer_ptr != NULL) { + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, + "Free GPU pack/unpack buffer %p\n", convertor->gpu_buffer_ptr)); + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); + convertor->gpu_buffer_ptr = NULL; + } + } } #endif /* OPAL_CUDA_SUPPORT */ @@ -644,8 +693,11 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq if (mca_pml_ob1_cuda_need_buffers(recvreq, btl)) #endif /* OPAL_CUDA_SUPPORT */ { - mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0); - return; + /* need more careful check here */ + if (! (recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA)) { + mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0); + return; + } } } @@ -702,6 +754,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq mca_bml_base_register_mem (rdma_bml, data_ptr, bytes_remaining, flags, &recvreq->local_handle); /* It is not an error if the memory region can not be registered here. The registration will * be attempted again for each get fragment. */ + mca_bml_base_register_convertor(rdma_bml, recvreq->local_handle, &recvreq->req_recv.req_base.req_convertor); } /* The while loop adds a fragmentation mechanism. 
The variable bytes_remaining holds the num diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 96bfa16ddb5..b92e07ebff8 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -1,4 +1,3 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -668,10 +667,25 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq, MCA_PML_OB1_HDR_FLAGS_PIN); } +#if OPAL_CUDA_SUPPORT + if ( (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA)) { + sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; + if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == true) { + data_ptr = sendreq->req_send.req_base.req_convertor.gpu_buffer_ptr; + } else { + opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr); + } + /* Set flag back */ + sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; + } else { + opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr); + } +#else /* at this time ob1 does not support non-contiguous gets. the convertor represents a * contiguous block of memory */ opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr); - +#endif + local_handle = sendreq->req_rdma[0].btl_reg; /* allocate an rdma fragment to keep track of the request size for use in the fin message */ diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am index 6002a739f20..472103c26c3 100644 --- a/opal/datatype/Makefile.am +++ b/opal/datatype/Makefile.am @@ -22,6 +22,8 @@ # $HEADER$ # +DIST_SUBDIRS = cuda + headers = \ opal_convertor.h \ opal_convertor_internal.h \ @@ -77,4 +79,5 @@ endif if OPAL_cuda_support libdatatype_la_SOURCES += opal_datatype_cuda.c headers += opal_datatype_cuda.h +SUBDIRS = . cuda endif diff --git a/opal/datatype/cuda/Makefile.in b/opal/datatype/cuda/Makefile.in new file mode 100644 index 00000000000..6ca57a58288 --- /dev/null +++ b/opal/datatype/cuda/Makefile.in @@ -0,0 +1,62 @@ +@SET_MAKE@ + +AM_CPPFLAGS = @common_cuda_CPPFLAGS@ +srcdir = @srcdir@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ + +NVCC = @NVCC@ +ARCH = @AR@ +ARCHFLAGS = cr +STLIB ?= opal_datatype_cuda_kernel.a +DYLIB ?= opal_datatype_cuda_kernel.so +EXTLIB = -L$(top_builddir)/opal/datatype/.libs -ldatatype +subdir = opal/datatype/cuda + +CFLAGS = -I$(top_builddir)/opal/include -I$(top_srcdir)/opal/include -I$(top_builddir) -I$(top_srcdir) --compiler-options '-fPIC -g' +LDFLAGS = -shared --compiler-options '-fPIC @LDFLAGS@' + +SRC := \ + opal_datatype_cuda.cu \ + opal_datatype_pack_cuda_kernel.cu \ + opal_datatype_pack_cuda_wrapper.cu \ + opal_datatype_unpack_cuda_kernel.cu \ + opal_datatype_unpack_cuda_wrapper.cu + +OBJ := $(SRC:.cu=.o) + +.PHONY: all clean cleanall + +all: Makefile $(STLIB) $(DYLIB) + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(STLIB): $(OBJ) + $(ARCH) $(ARCHFLAGS) $@ $(OBJ) + @RANLIB@ $@ + +$(DYLIB): $(OBJ) + $(NVCC) $(LDFLAGS) $(EXTLIB) -o $(DYLIB) $(OBJ) + +%.o: %.cu + $(NVCC) $(CFLAGS) $(EXTLIB) $(INC) -c $< -o $@ + +install: $(DYLIB) + mkdir -p @OMPI_WRAPPER_LIBDIR@ + cp -f $(DYLIB) @OMPI_WRAPPER_LIBDIR@/ + +clean: + rm -f $(OBJ) $(STLIB) $(DYLIB) + +cleanall: clean + rm -f $(STLIB) $(DYLIB) + +check: diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu new file mode 100644 index 00000000000..edd88ecbc48 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -0,0 +1,853 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2016 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + */ + +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" +#include "opal/util/output.h" + +#include "opal_datatype_cuda_internal.cuh" +#include "opal_datatype_cuda.cuh" +#include +#include +#include + +ddt_cuda_list_t *cuda_free_list; +ddt_cuda_device_t *cuda_devices; +ddt_cuda_device_t *current_cuda_device; +uint32_t cuda_iov_cache_enabled; + +extern size_t opal_datatype_cuda_buffer_size; + +static inline ddt_cuda_buffer_t* obj_ddt_cuda_buffer_new() +{ + ddt_cuda_buffer_t *p = (ddt_cuda_buffer_t *)malloc(sizeof(ddt_cuda_buffer_t)); + p->next = NULL; + p->prev = NULL; + p->size = 0; + p->gpu_addr = NULL; + return p; +} + +static inline void obj_ddt_cuda_buffer_chop(ddt_cuda_buffer_t *p) +{ + p->next = NULL; + p->prev = NULL; +} + +static inline void obj_ddt_cuda_buffer_reset(ddt_cuda_buffer_t *p) +{ + p->size = 0; + p->gpu_addr = NULL; +} + +static ddt_cuda_list_t* init_cuda_free_list() +{ + ddt_cuda_list_t *list = NULL; + ddt_cuda_buffer_t *p, *prev; + int i; + list = (ddt_cuda_list_t *)malloc(sizeof(ddt_cuda_list_t)); + p = obj_ddt_cuda_buffer_new(); + list->head = p; + prev = p; + for (i = 1; i < DT_CUDA_FREE_LIST_SIZE; i++) { + p = obj_ddt_cuda_buffer_new(); + prev->next = p; + p->prev = prev; + prev = p; + } + list->tail = p; + list->nb_elements = DT_CUDA_FREE_LIST_SIZE; + return list; +} + +static inline ddt_cuda_buffer_t* cuda_list_pop_tail(ddt_cuda_list_t *list) +{ + ddt_cuda_buffer_t *p = list->tail; + if (NULL != p) { + list->nb_elements--; + if (list->head == p) { + list->head = NULL; + list->tail = NULL; + } else { + list->tail = p->prev; + p->prev->next = NULL; + obj_ddt_cuda_buffer_chop(p); + } + } + return p; +} + +static inline void cuda_list_push_head(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item) +{ + assert(item->next == NULL && item->prev == NULL); + item->next = list->head; + if (NULL == list->head) { + list->tail = item; + } else { + list->head->prev = item; + } + list->head = item; + list->nb_elements++; +} + +static inline void cuda_list_push_tail(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item) +{ + assert(item->next == NULL && item->prev == NULL); + item->prev = list->tail; + if (NULL == list->tail) { + list->head = item; + } else { + list->tail->next = item; + } + list->tail = item; + list->nb_elements++; +} + +static inline void cuda_list_delete(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item) +{ + if (item->prev == 
NULL && item->next == NULL) { + list->head = NULL; + list->tail = NULL; + } else if (item->prev == NULL && item->next != NULL) { + list->head = item->next; + item->next->prev = NULL; + } else if (item->next == NULL && item->prev != NULL) { + list->tail = item->prev; + item->prev->next = NULL; + } else { + item->prev->next = item->next; + item->next->prev = item->prev; + } + list->nb_elements--; + obj_ddt_cuda_buffer_chop(item); +} + +static inline void cuda_list_insert_before(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item, ddt_cuda_buffer_t *next) +{ + assert(item->next == NULL && item->prev == NULL); + item->next = next; + item->prev = next->prev; + if (next->prev != NULL) { + next->prev->next = item; + } + next->prev = item; + if (list->head == next) { + list->head = item; + } + list->nb_elements++; +} + +/** + * Collapse the list of free buffers by mergining consecutive buffers. As the property of this list + * is continously maintained, we only have to parse it up to the newest inserted elements. + */ +static inline void cuda_list_item_merge_by_addr(ddt_cuda_list_t *list, ddt_cuda_buffer_t* last) +{ + ddt_cuda_buffer_t *current = list->head; + ddt_cuda_buffer_t *next = NULL; + void* stop_addr = last->gpu_addr; + + while(1) { /* loop forever, the exit conditions are inside */ + if( NULL == (next = current->next) ) return; + if ((current->gpu_addr + current->size) == next->gpu_addr) { + current->size += next->size; + cuda_list_delete(list, next); + free(next); /* release the element, and try to continue merging */ + continue; + } + current = current->next; + if( NULL == current ) return; + if( current->gpu_addr > stop_addr ) return; + } +} + +int32_t opal_datatype_cuda_kernel_init(void) +{ + uint32_t j; + int device; + cudaError cuda_err; + + cuda_err = cudaGetDevice(&device); + if( cudaSuccess != cuda_err ) { + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "Cannot retrieve the device being used. 
Drop CUDA support!\n")); + return OPAL_ERROR; + } + + cuda_free_list = init_cuda_free_list(); + + /* init cuda_iov */ + cuda_iov_cache_enabled = 1; + + /* init device */ + cuda_devices = (ddt_cuda_device_t *)malloc(sizeof(ddt_cuda_device_t)); + + unsigned char *gpu_ptr = NULL; + if (cudaMalloc((void **)(&gpu_ptr), sizeof(char) * opal_datatype_cuda_buffer_size) != cudaSuccess) { + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "cudaMalloc is failed in GPU %d\n", device)); + return OPAL_ERROR; + } + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "DDT engine cudaMalloc buffer %p in GPU %d\n", gpu_ptr, device)); + cudaMemset(gpu_ptr, 0, sizeof(char) * opal_datatype_cuda_buffer_size); + cuda_devices[0].gpu_buffer = gpu_ptr; + + cuda_devices[0].buffer_free_size = opal_datatype_cuda_buffer_size; + ddt_cuda_buffer_t *p = obj_ddt_cuda_buffer_new(); + p->size = opal_datatype_cuda_buffer_size; + p->gpu_addr = gpu_ptr; + cuda_devices[0].buffer_free.head = p; + cuda_devices[0].buffer_free.tail = cuda_devices[0].buffer_free.head; + cuda_devices[0].buffer_free.nb_elements = 1; + + cuda_devices[0].buffer_used.head = NULL; + cuda_devices[0].buffer_used.tail = NULL; + cuda_devices[0].buffer_used_size = 0; + cuda_devices[0].buffer_used.nb_elements = 0; + + cuda_devices[0].device_id = device; + + /* init cuda stream */ + ddt_cuda_stream_t *cuda_streams = (ddt_cuda_stream_t *)malloc(sizeof(ddt_cuda_stream_t)); + for (j = 0; j < NB_STREAMS; j++) { + cuda_err = cudaStreamCreate(&(cuda_streams->ddt_cuda_stream[j])); + CUDA_ERROR_CHECK(cuda_err); + } + + cuda_streams->current_stream_id = 0; + cuda_devices[0].cuda_streams = cuda_streams; + cuda_err = cudaEventCreate(&(cuda_devices[0].memcpy_event), cudaEventDisableTiming); + CUDA_ERROR_CHECK(cuda_err); + + /* init iov pipeline blocks */ + ddt_cuda_iov_pipeline_block_non_cached_t *cuda_iov_pipeline_block_non_cached = NULL; + for (j = 0; j < NB_PIPELINE_NON_CACHED_BLOCKS; j++) { + if (!cuda_iov_cache_enabled) { + cuda_iov_pipeline_block_non_cached = (ddt_cuda_iov_pipeline_block_non_cached_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_non_cached_t)); + cuda_err = cudaMallocHost((void **)(&(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + CUDA_ERROR_CHECK(cuda_err); + cuda_err = cudaMalloc((void **)(&(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + CUDA_ERROR_CHECK(cuda_err); + cuda_err = cudaEventCreateWithFlags(&(cuda_iov_pipeline_block_non_cached->cuda_event), cudaEventDisableTiming); + CUDA_ERROR_CHECK(cuda_err); + cuda_iov_pipeline_block_non_cached->cuda_stream = NULL; + } + cuda_devices[0].cuda_iov_pipeline_block_non_cached[j] = cuda_iov_pipeline_block_non_cached; + cuda_devices[0].cuda_iov_pipeline_block_non_cached_first_avail = 0; + } + + /* init iov block for cached */ + ddt_cuda_iov_process_block_cached_t *cuda_iov_process_block_cached = NULL; + for (j = 0; j < NB_CACHED_BLOCKS; j++) { + if (cuda_iov_cache_enabled) { + cuda_iov_process_block_cached = (ddt_cuda_iov_process_block_cached_t *)malloc(sizeof(ddt_cuda_iov_process_block_cached_t)); + cuda_iov_process_block_cached->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + cuda_err = cudaEventCreateWithFlags(&(cuda_iov_process_block_cached->cuda_event), cudaEventDisableTiming); + CUDA_ERROR_CHECK(cuda_err); + 
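+            /* no stream yet: opal_datatype_cuda_cache_cuda_iov() attaches either the convertor's stream or the current DDT stream when this cached block is used */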
cuda_iov_process_block_cached->cuda_stream = NULL; + } + cuda_devices[0].cuda_iov_process_block_cached[j] = cuda_iov_process_block_cached; + cuda_devices[0].cuda_iov_process_block_cached_first_avail = 0; + } + current_cuda_device = &(cuda_devices[0]); + + cuda_err = cudaDeviceSynchronize(); + CUDA_ERROR_CHECK(cuda_err); + return OPAL_SUCCESS; +} + +int32_t opal_datatype_cuda_kernel_fini(void) +{ + uint32_t j; + cudaError_t cuda_err; + + /* free gpu buffer */ + cuda_err = cudaFree(cuda_devices[0].gpu_buffer); + CUDA_ERROR_CHECK(cuda_err); + /* destory cuda stream and iov*/ + for (j = 0; j < NB_STREAMS; j++) { + cuda_err = cudaStreamDestroy(cuda_devices[0].cuda_streams->ddt_cuda_stream[j]); + CUDA_ERROR_CHECK(cuda_err); + } + free(cuda_devices[0].cuda_streams); + + ddt_cuda_iov_pipeline_block_non_cached_t *cuda_iov_pipeline_block_non_cached = NULL; + for (j = 0; j < NB_PIPELINE_NON_CACHED_BLOCKS; j++) { + if( NULL != (cuda_iov_pipeline_block_non_cached = cuda_devices[0].cuda_iov_pipeline_block_non_cached[j]) ) { + if (cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d != NULL) { + cuda_err = cudaFree(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d); + CUDA_ERROR_CHECK(cuda_err); + cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d = NULL; + cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h = NULL; + } + cuda_err = cudaEventDestroy(cuda_iov_pipeline_block_non_cached->cuda_event); + CUDA_ERROR_CHECK(cuda_err); + cuda_iov_pipeline_block_non_cached->cuda_stream = NULL; + free(cuda_iov_pipeline_block_non_cached); + cuda_iov_pipeline_block_non_cached = NULL; + } + } + + ddt_cuda_iov_process_block_cached_t *cuda_iov_process_block_cached = NULL; + for (j = 0; j < NB_CACHED_BLOCKS; j++) { + if( NULL != (cuda_iov_process_block_cached = cuda_devices[0].cuda_iov_process_block_cached[j]) ) { + if (cuda_iov_process_block_cached->cuda_iov_dist_cached_h != NULL) { + free(cuda_iov_process_block_cached->cuda_iov_dist_cached_h); + cuda_iov_process_block_cached->cuda_iov_dist_cached_h = NULL; + } + cuda_err = cudaEventDestroy(cuda_iov_process_block_cached->cuda_event); + CUDA_ERROR_CHECK(cuda_err); + cuda_iov_process_block_cached->cuda_stream = NULL; + free(cuda_iov_process_block_cached); + cuda_iov_process_block_cached = NULL; + } + } + cuda_devices[0].cuda_streams = NULL; + cuda_err = cudaEventDestroy(cuda_devices[0].memcpy_event); + CUDA_ERROR_CHECK(cuda_err); + + free(cuda_devices); + cuda_devices = NULL; + current_cuda_device = NULL; + + return OPAL_SUCCESS; +} + +void* opal_datatype_cuda_cached_cuda_iov_init(uint32_t size) +{ +#if OPAL_DATATYPE_CUDA_IOV_CACHE + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)malloc(sizeof(ddt_cuda_iov_total_cached_t) + + size * sizeof(uint32_t)); + if( NULL != tmp ) { + tmp->cuda_iov_dist_d = NULL; + tmp->cuda_iov_count = size; + tmp->cuda_iov_is_cached = 0; + tmp->nb_bytes_h = (uint32_t*)((char*)tmp + sizeof(ddt_cuda_iov_total_cached_t)); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, nb_bytes_h %p, size %d.\n", tmp, tmp->nb_bytes_h, size)); + return (void*)tmp; + } + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "Malloc cuda_iov_dist_cached for ddt is failed.\n")); +#else + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "cuda iov cache is not enabled.\n")); +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + return NULL; +} + +void opal_datatype_cuda_cached_cuda_iov_fini(void* cached_cuda_iov) +{ +#if 
OPAL_DATATYPE_CUDA_IOV_CACHE + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *) cached_cuda_iov; + if (NULL != tmp) { + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Free cuda_iov_dist for ddt is successed %p.\n", cached_cuda_iov)); + if (NULL != tmp->cuda_iov_dist_d) { + cudaError_t cuda_err = cudaFree(tmp->cuda_iov_dist_d); + CUDA_ERROR_CHECK(cuda_err); + tmp->cuda_iov_dist_d = NULL; + } + tmp->nb_bytes_h = NULL; + free(tmp); + } +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ +} + +static inline int32_t +opal_datatype_cuda_cached_cuda_iov_isfull(ddt_cuda_iov_total_cached_t *cached_cuda_iov, + ddt_cuda_iov_dist_cached_t **cuda_iov_dist_h, + uint32_t nb_blocks_used) +{ + if (nb_blocks_used < cached_cuda_iov->cuda_iov_count) { + return 0; + } +realloc_cuda_iov: + cached_cuda_iov->nb_bytes_h = (uint32_t *)realloc(cached_cuda_iov->nb_bytes_h, sizeof(uint32_t)*cached_cuda_iov->cuda_iov_count*2); + assert(cached_cuda_iov->nb_bytes_h != NULL); + cached_cuda_iov->cuda_iov_count *= 2; + if (nb_blocks_used >= cached_cuda_iov->cuda_iov_count) { + goto realloc_cuda_iov; + } + return 1; +} + +/* cached_cuda_iov_d is not ready until explicitly sync with current cuda stream */ +int32_t opal_datatype_cuda_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t thread_per_block, nb_blocks_used; + size_t length_per_iovec; + uint32_t alignment; + ddt_cuda_iov_process_block_cached_t *cuda_iov_process_block_cached = NULL; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t *cached_cuda_iov_dist_d = NULL; + ddt_cuda_iov_dist_cached_t *cuda_iov_dist_h = NULL; + cudaStream_t cuda_stream_iov = NULL; + cudaError_t cuda_err; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t ncontig_disp_base; + size_t contig_disp = 0; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + + opal_datatype_t *datatype = (opal_datatype_t *)pConvertor->pDesc; + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "Can not get ddt iov\n")); + return OPAL_ERROR; + } + + cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)opal_datatype_cuda_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); + if (cached_cuda_iov == NULL) { + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "Can not init cuda iov\n")); + return OPAL_ERROR; + } + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + nb_blocks_used = 0; + cuda_iov_process_block_cached = current_cuda_device->cuda_iov_process_block_cached[current_cuda_device->cuda_iov_process_block_cached_first_avail]; + current_cuda_device->cuda_iov_process_block_cached_first_avail ++; + if (current_cuda_device->cuda_iov_process_block_cached_first_avail >= NB_CACHED_BLOCKS) { + current_cuda_device->cuda_iov_process_block_cached_first_avail = 0; + } + cuda_err = cudaEventSynchronize(cuda_iov_process_block_cached->cuda_event); + CUDA_ERROR_CHECK(cuda_err); + + if (pConvertor->stream == NULL) { + cuda_iov_process_block_cached->cuda_stream = cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]; + } else { + cuda_iov_process_block_cached->cuda_stream = (cudaStream_t)pConvertor->stream; + } + cuda_iov_dist_h = cuda_iov_process_block_cached->cuda_iov_dist_cached_h; + cuda_stream_iov = cuda_iov_process_block_cached->cuda_stream; + thread_per_block = 
CUDA_WARP_SIZE * 64; + + for (i = 0; i < ddt_iov_count; i++) { + length_per_iovec = ddt_iov[i].iov_len; + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + + /* block size is either multiple of ALIGNMENT_DOUBLE or residue */ + alignment = ALIGNMENT_DOUBLE * 1; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + OPAL_OUTPUT_VERBOSE((10, opal_datatype_cuda_output, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description)); + if (opal_datatype_cuda_cached_cuda_iov_isfull(cached_cuda_iov, &(cuda_iov_process_block_cached->cuda_iov_dist_cached_h), nb_blocks_used + nb_blocks_per_description + 1)) { + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + cuda_iov_dist_h = (ddt_cuda_iov_dist_cached_t *)realloc(cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*cached_cuda_iov->cuda_iov_count); + assert(cuda_iov_dist_h != NULL); + cuda_iov_process_block_cached->cuda_iov_dist_cached_h = cuda_iov_dist_h; + } + + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; + } else { + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + OPAL_OUTPUT_VERBOSE((12, opal_datatype_cuda_output, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used])); + nb_blocks_used ++; + } + + /* handle residue */ + if (residue_desc != 0) { + cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + OPAL_OUTPUT_VERBOSE((12, opal_datatype_cuda_output, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used])); + nb_blocks_used ++; + } + } + /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; + cudaMalloc((void **)(&cached_cuda_iov_dist_d), sizeof(ddt_cuda_iov_dist_cached_t) * (nb_blocks_used+1)); + if (cached_cuda_iov_dist_d == NULL) { + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "Can not malloc cuda iov in GPU\n")); + return OPAL_ERROR; + } + cuda_err = cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), + 
cudaMemcpyHostToDevice, cuda_stream_iov); + CUDA_ERROR_CHECK(cuda_err); + cached_cuda_iov->cuda_iov_dist_d = cached_cuda_iov_dist_d; + datatype->cached_iovec->cached_cuda_iov = (void*)cached_cuda_iov; + *cuda_iov_count = nb_blocks_used; + + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_iovec->cached_cuda_iov; + tmp->cuda_iov_count = *cuda_iov_count; + tmp->cuda_iov_is_cached = 1; + + cuda_err = cudaEventRecord(cuda_iov_process_block_cached->cuda_event, cuda_stream_iov); + CUDA_ERROR_CHECK(cuda_err); + return OPAL_SUCCESS; +} + +uint8_t opal_datatype_cuda_iov_to_cuda_iov(opal_convertor_t* pConvertor, + const struct iovec *ddt_iov, + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, + uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, + size_t *buffer_size, uint32_t *nb_blocks_used, + size_t *total_converted, size_t *contig_disp_out, uint32_t *current_ddt_iov_pos) +{ + size_t ncontig_disp_base, contig_disp = 0, current_cuda_iov_length = 0; + uint32_t count_desc, nb_blocks_per_description, residue_desc, thread_per_block; + uint8_t buffer_isfull = 0, alignment; + size_t length_per_iovec; + uint32_t i, j; + + thread_per_block = CUDA_WARP_SIZE * 5; + + for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base) + ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; + length_per_iovec = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + length_per_iovec = ddt_iov[i].iov_len; + } + if (*buffer_size < length_per_iovec) { + pConvertor->current_iov_pos = i; + pConvertor->current_iov_partial_length = length_per_iovec - *buffer_size; + length_per_iovec = *buffer_size; + buffer_isfull = 1; + } + *buffer_size -= length_per_iovec; + *total_converted += length_per_iovec; + + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + if ((*nb_blocks_used + nb_blocks_per_description + 1) > (CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK)) { + break; + } + OPAL_OUTPUT_VERBOSE((10, opal_datatype_cuda_output, "DDT IOV to CUDA IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description)); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + current_cuda_iov_length = thread_per_block * alignment; + } else { + current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + OPAL_OUTPUT_VERBOSE((12, opal_datatype_cuda_output, "DDT IOV to CUDA IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", *nb_blocks_used, cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[*nb_blocks_used].contig_disp, current_cuda_iov_length)); + (*nb_blocks_used) ++; + assert (*nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + 
cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + OPAL_OUTPUT_VERBOSE((12, opal_datatype_cuda_output, "DDT IOV to CUDA IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", *nb_blocks_used, cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[*nb_blocks_used].contig_disp, current_cuda_iov_length)); + (*nb_blocks_used) ++; + assert (*nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + *contig_disp_out = contig_disp; + *current_ddt_iov_pos = i; + return buffer_isfull; +} + +void opal_datatype_cuda_get_cached_cuda_iov(struct opal_convertor_t *convertor, + ddt_cuda_iov_total_cached_t **cached_cuda_iov) +{ + *cached_cuda_iov = NULL; + if (NULL != convertor->pDesc->cached_iovec) { + *cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)convertor->pDesc->cached_iovec->cached_cuda_iov; + } +} + +uint8_t opal_datatype_cuda_cuda_iov_is_cached(struct opal_convertor_t *convertor) +{ + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + if (NULL == datatype->cached_iovec) { + return 0; + } + if (NULL == datatype->cached_iovec->cached_cuda_iov) { + return 0; + } + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_iovec->cached_cuda_iov; + return tmp->cuda_iov_is_cached; +} + +void opal_datatype_cuda_set_cuda_iov_position(struct opal_convertor_t *convertor, + size_t ddt_offset, + const uint32_t *cached_cuda_iov_nb_bytes_list_h, + const uint32_t cuda_iov_count) +{ + size_t iov_size = 0, ddt_size; + uint32_t i; + + convertor->current_iov_partial_length = 0; + convertor->current_cuda_iov_pos = 0; + convertor->current_count = 0; + if (ddt_offset == 0) + return; + + opal_datatype_type_size(convertor->pDesc, &ddt_size); + convertor->current_count = ddt_offset / ddt_size; + ddt_offset = ddt_offset % ddt_size; + for(i = 0; i < cuda_iov_count; i++) { + iov_size += cached_cuda_iov_nb_bytes_list_h[i]; + if (iov_size >= ddt_offset) { + convertor->current_iov_partial_length = iov_size - ddt_offset; + convertor->current_cuda_iov_pos = i; + if (iov_size == ddt_offset) + convertor->current_cuda_iov_pos++; + return; + } + } +} + +void opal_datatype_cuda_set_ddt_iov_position(struct opal_convertor_t *convertor, + size_t ddt_offset, + const struct iovec *ddt_iov, + const uint32_t ddt_iov_count) +{ + size_t iov_size = 0, ddt_size; + uint32_t i; + + convertor->current_iov_partial_length = 0; + convertor->current_iov_pos = 0; + convertor->current_count = 0; + if (ddt_offset == 0) + return; + + opal_datatype_type_size(convertor->pDesc, &ddt_size); + convertor->current_count = ddt_offset / ddt_size; + ddt_offset = ddt_offset % ddt_size; + for(i = 0; i < ddt_iov_count; i++) { + iov_size += ddt_iov[i].iov_len; + if (iov_size >= ddt_offset) { + convertor->current_iov_partial_length = iov_size - ddt_offset; + convertor->current_iov_pos = i; + if (iov_size == ddt_offset) + convertor->current_iov_pos++; + return; + } + } +} + +/* following function will be called outside the cuda kernel lib */ +int32_t opal_datatype_cuda_is_gpu_buffer(const void *ptr) +{ + CUmemorytype memType; + 
CUdeviceptr dbuf = (CUdeviceptr)ptr; + int res; + + res = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf); + if (res != CUDA_SUCCESS) { + /* If we cannot determine it is device pointer, + * just assume it is not. */ + OPAL_OUTPUT_VERBOSE((1, opal_datatype_cuda_output, "!!!!!!! %p is not a gpu buffer. Take no-CUDA path!\n", ptr)); + return 0; + } + /* Anything but CU_MEMORYTYPE_DEVICE is not a GPU memory */ + return (memType == CU_MEMORYTYPE_DEVICE) ? 1 : 0; +} + +void* opal_datatype_cuda_malloc_gpu_buffer(size_t size, int gpu_id) +{ + ddt_cuda_device_t *device = &cuda_devices[gpu_id]; + int dev_id = device->device_id; + if (device->buffer_free_size < size) { + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "No GPU buffer for pack/unpack at device %d, if program crashes, please set --mca opal_opal_opal_datatype_cuda_buffer_size to larger size\n", dev_id)); + return NULL; + } + ddt_cuda_buffer_t *ptr = device->buffer_free.head; + while (ptr != NULL) { + if (ptr->size < size) { /* Not enough room in this buffer, check next */ + ptr = ptr->next; + continue; + } + void *addr = ptr->gpu_addr; + ptr->size -= size; + if (ptr->size == 0) { + cuda_list_delete(&device->buffer_free, ptr); + obj_ddt_cuda_buffer_reset(ptr); + /* hold on this ptr object, we will reuse it right away */ + } else { + ptr->gpu_addr += size; + ptr = cuda_list_pop_tail(cuda_free_list); + if( NULL == ptr ) + ptr = obj_ddt_cuda_buffer_new(); + } + assert(NULL != ptr); + ptr->size = size; + ptr->gpu_addr = (unsigned char*)addr; + cuda_list_push_head(&device->buffer_used, ptr); + device->buffer_used_size += size; + device->buffer_free_size -= size; + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Malloc GPU buffer %p, size %lu, dev_id %d.\n", addr, size, dev_id)); + return addr; + } + return NULL; +} + +void opal_datatype_cuda_free_gpu_buffer(void *addr, int gpu_id) +{ + ddt_cuda_device_t *device = &cuda_devices[gpu_id]; + ddt_cuda_buffer_t *ptr = device->buffer_used.head; + + /* Find the holder of this GPU allocation */ + for( ; (NULL != ptr) && (ptr->gpu_addr != addr); ptr = ptr->next ); + if (NULL == ptr) { /* we could not find it. 
something went wrong */ + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "addr %p is not managed.\n", addr)); + return; + } + cuda_list_delete(&device->buffer_used, ptr); + /* Insert the element in the list of free buffers ordered by the addr */ + ddt_cuda_buffer_t *ptr_next = device->buffer_free.head; + while (ptr_next != NULL) { + if (ptr_next->gpu_addr > addr) { + break; + } + ptr_next = ptr_next->next; + } + if (ptr_next == NULL) { /* buffer_free is empty, or insert to last one */ + cuda_list_push_tail(&device->buffer_free, ptr); + } else { + cuda_list_insert_before(&device->buffer_free, ptr, ptr_next); + } + size_t size = ptr->size; + cuda_list_item_merge_by_addr(&device->buffer_free, ptr); + device->buffer_free_size += size; + device->buffer_used_size -= size; + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Free GPU buffer %p, size %lu\n", addr, size)); +} + +void opal_datatype_cuda_d2dcpy_async(void* dst, const void* src, size_t count, void* stream) +{ + cudaError_t cuda_err = cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, (cudaStream_t)stream); + CUDA_ERROR_CHECK(cuda_err); +} + +void opal_datatype_cuda_d2dcpy(void* dst, const void* src, size_t count, void* stream) +{ + cudaError_t cuda_err = cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, (cudaStream_t)stream); + CUDA_ERROR_CHECK(cuda_err); + cuda_err = cudaStreamSynchronize((cudaStream_t)stream); + CUDA_ERROR_CHECK(cuda_err); +} + +void* opal_datatype_cuda_get_cuda_stream_by_id(int stream_id) +{ + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + return (void*)cuda_streams->ddt_cuda_stream[stream_id]; +} + +void *opal_datatype_cuda_get_current_cuda_stream() +{ + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + return (void*)cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]; +} + +void opal_datatype_cuda_sync_current_cuda_stream() +{ + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + cudaError_t cuda_err = cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + CUDA_ERROR_CHECK(cuda_err); +} + +void opal_datatype_cuda_sync_cuda_stream(int stream_id) +{ + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + cudaError cuda_err = cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[stream_id]); + CUDA_ERROR_CHECK(cuda_err); +} + +void* opal_datatype_cuda_alloc_event(int32_t nb_events, int32_t *loc) +{ + *loc = 0; + ddt_cuda_event_t *event_list = (ddt_cuda_event_t *)malloc(sizeof(ddt_cuda_event_t) * nb_events); + cudaError_t cuda_err; + for (int i = 0; i < nb_events; i++) { + cuda_err = cudaEventCreateWithFlags(&(event_list[i].cuda_event), cudaEventDisableTiming); + CUDA_ERROR_CHECK(cuda_err); + } + return (void*)event_list; +} + +void opal_datatype_cuda_free_event(void *cuda_event_list, int32_t nb_events) +{ + ddt_cuda_event_t *event_list = (ddt_cuda_event_t *)cuda_event_list; + cudaError_t cuda_err; + for (int i = 0; i < nb_events; i++) { + cuda_err = cudaEventDestroy(event_list[i].cuda_event); + CUDA_ERROR_CHECK(cuda_err); + } + free (event_list); + return; +} + +int32_t opal_datatype_cuda_event_query(void *cuda_event_list, int32_t i) +{ + ddt_cuda_event_t *event_list = (ddt_cuda_event_t *)cuda_event_list; + cudaError_t rv = cudaEventQuery(event_list[i].cuda_event); + if (rv == cudaSuccess) { + return 1; + } else if (rv == cudaErrorNotReady) { + return 0; + } else { + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "cuda event query error.\n")); + return -1; + } +} + 
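/*
 * Minimal usage sketch of the event helpers defined in this file. The caller,
 * function name and variables below are hypothetical (not taken from the
 * smcuda code); it assumes a caller-managed pipeline of `depth` in-flight
 * pack/unpack stages.
 */
#if 0
static void example_event_pipeline(int depth)
{
    int32_t loc;
    void *events = opal_datatype_cuda_alloc_event(depth, &loc);
    void *stream = opal_datatype_cuda_get_current_cuda_stream();

    for (int i = 0; i < depth; i++) {
        /* ... enqueue the i-th pack/unpack kernel on `stream` ... */
        opal_datatype_cuda_event_record(events, i, stream);
    }
    for (int i = 0; i < depth; i++) {
        if (1 != opal_datatype_cuda_event_query(events, i)) {
            /* stage i not finished yet: block on it (returns 1 on success) */
            opal_datatype_cuda_event_sync(events, i);
        }
    }
    opal_datatype_cuda_free_event(events, depth);
}
#endif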
+int32_t opal_datatype_cuda_event_sync(void *cuda_event_list, int32_t i) +{ + ddt_cuda_event_t *event_list = (ddt_cuda_event_t *)cuda_event_list; + cudaError_t rv = cudaEventSynchronize(event_list[i].cuda_event); + if (rv == cudaSuccess) { + return 1; + } + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "cuda event sync error.\n")); + return -1; +} + +int32_t opal_datatype_cuda_event_record(void *cuda_event_list, int32_t i, void* stream) +{ + ddt_cuda_event_t *event_list = (ddt_cuda_event_t *)cuda_event_list; + cudaError_t rv = cudaEventRecord(event_list[i].cuda_event, (cudaStream_t)stream); + if (rv == cudaSuccess) { + return 1; + } + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "cuda event record error.\n")); + return -1; +} + +void opal_dump_cuda_list(ddt_cuda_list_t *list) +{ + ddt_cuda_buffer_t *ptr = NULL; + ptr = list->head; + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "DUMP cuda list %p, nb_elements %zu\n", list, list->nb_elements)); + while (ptr != NULL) { + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "\titem addr %p, size %ld.\n", ptr->gpu_addr, ptr->size)); + ptr = ptr->next; + } +} + diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh new file mode 100644 index 00000000000..24792f7a2da --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -0,0 +1,124 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2016 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + */ + +#ifndef OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED +#define OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED + +BEGIN_C_DECLS + +/* init functions of GPU datatype engine */ +int32_t opal_datatype_cuda_kernel_init(void); + +/* fini function of GPU datatype engine */ +int32_t opal_datatype_cuda_kernel_fini(void); + +/* iov pack function */ +int32_t opal_datatype_cuda_generic_simple_pack_function_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +/* iov unpack function */ +int32_t opal_datatype_cuda_generic_simple_unpack_function_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +/* iov pack without cache */ +int32_t opal_datatype_cuda_generic_simple_pack_function_iov_non_cached( opal_convertor_t* pConvertor, + unsigned char *destination, + size_t buffer_size, + size_t *total_packed); + +/* iov unpack without cache */ +int32_t opal_datatype_cuda_generic_simple_unpack_function_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked); + +/* iov pack with cache */ +int32_t opal_datatype_cuda_generic_simple_pack_function_iov_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed); + +/* iov unpack with cache */ +int32_t opal_datatype_cuda_generic_simple_unpack_function_iov_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked); + +/* check if ptr is gpu memory */ +int32_t opal_datatype_cuda_is_gpu_buffer(const void *ptr); + +/* malloc gpu buffer for pack/unpack */ +void* opal_datatype_cuda_malloc_gpu_buffer(size_t size, int gpu_id); + +/* free gpu buffer used for pack/unpack */ +void opal_datatype_cuda_free_gpu_buffer(void *addr, int gpu_id); + +/* async cuda memory movement */ +void opal_datatype_cuda_d2dcpy_async(void* dst, const void* src, size_t count, void* stream); + +/* sync cuda memory 
movement */ +void opal_datatype_cuda_d2dcpy(void* dst, const void* src, size_t count, void* stream); + +void opal_dump_cuda_list(ddt_cuda_list_t *list); + +/* init the cuda iov used for caching */ +void* opal_datatype_cuda_cached_cuda_iov_init(void); + +/* clean up cached cuda iov */ +void opal_datatype_cuda_cached_cuda_iov_fini(void *cached_cuda_iov); + +/* get cached cuda iov */ +void opal_datatype_cuda_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov); + +/* check if cuda iov is cached or not */ +uint8_t opal_datatype_cuda_cuda_iov_is_cached(struct opal_convertor_t *convertor); + +/* move cuda iov position */ +void opal_datatype_cuda_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); + +/* move cpu iov position */ +void opal_datatype_cuda_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count); + +/* cache cuda iov */ +int32_t opal_datatype_cuda_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count); + +/* convertor cpu iov to cuda iov */ +uint8_t opal_datatype_cuda_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, + uint32_t ddt_iov_start_pos, + uint32_t ddt_iov_end_pos, + size_t *buffer_size, + uint32_t *nb_blocks_used, + size_t *total_packed, + size_t *contig_disp_out, + uint32_t *current_ddt_iov_pos); + +/* get cuda stream whose id is stream_id */ +void* opal_datatype_cuda_get_cuda_stream_by_id(int stream_id); + +/* get current cuda stream */ +void *opal_datatype_cuda_get_current_cuda_stream(); + +/* sync current cuda stream */ +void opal_datatype_cuda_sync_current_cuda_stream(); + +/* sync cuda stream (id) */ +void opal_datatype_cuda_sync_cuda_stream(int stream_id); + +/* alloc event for smcuda pack/unpack */ +void* opal_datatype_cuda_alloc_event(int32_t nb_events, int32_t *loc); + +/* free events used for smcuda pack/unpack */ +void opal_datatype_cuda_free_event(void *cuda_event_list, int32_t nb_events); + +/* query the event i */ +int32_t opal_datatype_cuda_event_query(void *cuda_event_list, int32_t i); + +/* sync the event i */ +int32_t opal_datatype_cuda_event_sync(void *cuda_event_list, int32_t i); + +/* record the event i */ +int32_t opal_datatype_cuda_event_record(void *cuda_event_list, int32_t i, void* stream); + +END_C_DECLS + +#endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh new file mode 100644 index 00000000000..f10f0e457bf --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -0,0 +1,157 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2016 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ */ + +#ifndef OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED +#define OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED + +#include +#include +#include +#include +#include + +/* OPAL_CUDA */ +#define OPAL_DATATYPE_CUDA_DEBUG 1 +#define OPAL_DATATYPE_CUDA_TIMING +#define OPAL_DATATYPE_USE_ZEROCPY 0 +#define OPAL_DATATYPE_CUDA_IOV_CACHE 1 + +#define DT_CUDA_FREE_LIST_SIZE 50 + +#define THREAD_PER_BLOCK 32 +#define CUDA_WARP_SIZE 32 +#define NB_STREAMS 4 +#define NB_PIPELINE_NON_CACHED_BLOCKS 4 +#define NB_CACHED_BLOCKS 4 +#define CUDA_MAX_NB_BLOCKS 1024 +#define CUDA_IOV_MAX_TASK_PER_BLOCK 400 +#define ALIGNMENT_DOUBLE 8 +#define ALIGNMENT_FLOAT 4 +#define ALIGNMENT_CHAR 1 +#define NUM_CUDA_IOV_PER_DDT 150000 +#define IOV_PIPELINE_SIZE 1000 +#define KERNEL_UNROLL 16 +#define UNROLL_16 16 +#define UNROLL_8 8 +#define UNROLL_4 4 + +#define TIMER_DATA_TYPE struct timeval +#define GET_TIME(TV) gettimeofday( &(TV), NULL ) +#define ELAPSED_TIME(TSTART, TEND) (((TEND).tv_sec - (TSTART).tv_sec) * 1000000 + ((TEND).tv_usec - (TSTART).tv_usec)) + + +typedef struct { + cudaEvent_t cuda_event; + int32_t event_type; +} ddt_cuda_event_t; + +typedef struct { + cudaStream_t ddt_cuda_stream[NB_STREAMS]; + int32_t current_stream_id; +} ddt_cuda_stream_t; + +typedef struct { + size_t ncontig_disp; + size_t contig_disp; +} ddt_cuda_iov_dist_cached_t; + +typedef struct { + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d; + uint32_t cuda_iov_count; + uint32_t* nb_bytes_h; + uint8_t cuda_iov_is_cached; +} ddt_cuda_iov_total_cached_t; + +typedef struct { + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_h; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_d; + cudaStream_t cuda_stream; + cudaEvent_t cuda_event; +} ddt_cuda_iov_pipeline_block_non_cached_t; + +typedef struct { + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; + cudaStream_t cuda_stream; + cudaEvent_t cuda_event; +} ddt_cuda_iov_process_block_cached_t; + +typedef struct ddt_cuda_buffer{ + unsigned char* gpu_addr; + size_t size; + struct ddt_cuda_buffer *next; + struct ddt_cuda_buffer *prev; +} ddt_cuda_buffer_t; + +typedef struct { + ddt_cuda_buffer_t *head; + ddt_cuda_buffer_t *tail; + size_t nb_elements; +} ddt_cuda_list_t; + +typedef struct { + int device_id; + unsigned char* gpu_buffer; + ddt_cuda_list_t buffer_free; + ddt_cuda_list_t buffer_used; + size_t buffer_free_size; + size_t buffer_used_size; + ddt_cuda_stream_t *cuda_streams; + ddt_cuda_iov_pipeline_block_non_cached_t *cuda_iov_pipeline_block_non_cached[NB_PIPELINE_NON_CACHED_BLOCKS]; + ddt_cuda_iov_process_block_cached_t *cuda_iov_process_block_cached[NB_CACHED_BLOCKS]; + uint32_t cuda_iov_process_block_cached_first_avail; + uint32_t cuda_iov_pipeline_block_non_cached_first_avail; + cudaEvent_t memcpy_event; +} ddt_cuda_device_t; + +extern ddt_cuda_list_t *cuda_free_list; +extern ddt_cuda_device_t *cuda_devices; +extern ddt_cuda_device_t *current_cuda_device; +extern uint32_t cuda_iov_cache_enabled; + +extern int opal_datatype_cuda_output; +extern size_t opal_datatype_cuda_buffer_size; + + +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, + uint32_t cuda_iov_pos, + uint32_t cuda_iov_count, + uint32_t ddt_extent, + uint32_t current_count, + int nb_blocks_used, + unsigned char* source_base, + unsigned char* destination_base); + +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, + uint32_t cuda_iov_pos, + uint32_t cuda_iov_count, + uint32_t ddt_extent, + uint32_t 
current_count, + int nb_blocks_used, + unsigned char* destination_base, + unsigned char* source_base, + size_t cuda_iov_partial_length_start, + size_t cuda_iov_partial_length_end); + +#define CUDA_ERROR_CHECK(err) \ + if (err != cudaSuccess) { \ + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "CUDA calls error %s\n", cudaGetErrorString(err))); \ + } \ + +extern "C" +{ +int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor, size_t* position ); + +int32_t opal_convertor_raw( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* iov_count, + size_t* length ); + +int opal_convertor_raw_cached(struct opal_convertor_t *convertor, + const struct iovec **iov, + uint32_t* iov_count); +} + +#endif /* OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu new file mode 100644 index 00000000000..15329fe080e --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -0,0 +1,266 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + +#include "opal_datatype_cuda_internal.cuh" +#include +#include + +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +{ + uint32_t i, j; + uint32_t _nb_bytes; + size_t src_offset, dst_offset; + unsigned char *_source_tmp, *_destination_tmp; + uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; + + __shared__ uint32_t nb_tasks_per_block; + __shared__ uint32_t WARP_SIZE; + __shared__ uint32_t nb_warp_per_block; + uint32_t copy_count; + uint8_t alignment; + uint64_t tmp_var_64[KERNEL_UNROLL]; + uint32_t tmp_var_32[KERNEL_UNROLL]; + unsigned char tmp_var_8[KERNEL_UNROLL]; + uint32_t u, k; + uint32_t copy_count_16, copy_count_8, copy_count_left; + + if (threadIdx.x == 0) { + nb_tasks_per_block = nb_blocks_used / gridDim.x; + if (blockIdx.x < (nb_blocks_used % gridDim.x)) { + nb_tasks_per_block ++; + } + WARP_SIZE = 32; + nb_warp_per_block = blockDim.x / WARP_SIZE; + } + __syncthreads(); + + const uint32_t warp_id_per_block = threadIdx.x / WARP_SIZE; + const uint32_t tid_per_warp = threadIdx.x & (WARP_SIZE - 1); + + for (i = warp_id_per_block; i < nb_tasks_per_block; i+= nb_warp_per_block) { + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + dst_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - destination_disp; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; + + _source_tmp = source_base + src_offset; + _destination_tmp = destination_base + dst_offset; + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && 
(uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + + copy_count = _nb_bytes / alignment; + + if (alignment == ALIGNMENT_DOUBLE) { + uint64_t *_source_base_64, *_destination_base_64; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_64 = (uint64_t *)(source_base + src_offset); + _destination_base_64 = (uint64_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_16; + _destination_base_64 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_8; + _destination_base_64 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_64[u] = *(_source_base_64 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_64 + j) = tmp_var_64[u]; + } else { + break; + } + } + } + } else if (alignment == ALIGNMENT_FLOAT) { + uint32_t *_source_base_32, *_destination_base_32; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_32 = (uint32_t *)(source_base + src_offset); + _destination_base_32 = (uint32_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_16; + _destination_base_32 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_8; + _destination_base_32 += copy_count_8; + + copy_count_left = copy_count 
- copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_32[u] = *(_source_base_32 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_32 + j) = tmp_var_32[u]; + } else { + break; + } + } + } + } else { + unsigned char *_source_base_8, *_destination_base_8; + + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_8 = (unsigned char *)(source_base + src_offset); + _destination_base_8 = (unsigned char *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_16; + _destination_base_8 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_8; + _destination_base_8 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_8[u] = *(_source_base_8 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_8 + j) = tmp_var_8[u]; + } else { + break; + } + } + } + } + } +} \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu new file mode 100644 index 00000000000..48ac9baac8f --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -0,0 +1,322 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" +#include "opal/util/output.h" + +#include "opal_datatype_cuda_internal.cuh" +#include "opal_datatype_cuda.cuh" + +#include +#include + +int32_t opal_datatype_cuda_generic_simple_pack_function_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + size_t buffer_size; + unsigned char *destination; + size_t total_packed; + uint8_t transfer_required, free_required; + cudaStream_t working_stream = NULL; + cudaError_t cuda_err; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + + if ((iov[0].iov_base == NULL) || opal_datatype_cuda_is_gpu_buffer(iov[0].iov_base)) { + assert (iov[0].iov_len != 0); + buffer_size = iov[0].iov_len; + + if (iov[0].iov_base == NULL) { + iov[0].iov_base = (unsigned char *)opal_datatype_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char 
*)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; + pConvertor->gpu_buffer_size = buffer_size; + free_required = 1; + } else { + destination = (unsigned char *)iov[0].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + if (OPAL_DATATYPE_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + cuda_err = cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); + if (cuda_err != cudaSuccess) { + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "Zero copy is not supported\n")); + return 0; + } + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_datatype_cuda_malloc_gpu_buffer(buffer_size, 0); + pConvertor->gpu_buffer_size = buffer_size; + } + transfer_required = 1; + free_required = 1; + destination = pConvertor->gpu_buffer_ptr + pConvertor->pipeline_size * pConvertor->pipeline_seq; + } + } + + total_packed = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + + /* start pack */ + if (cuda_iov_cache_enabled) { + opal_datatype_cuda_generic_simple_pack_function_iov_cached(pConvertor, destination, buffer_size, &total_packed); + } else { + opal_datatype_cuda_generic_simple_pack_function_iov_non_cached(pConvertor, destination, buffer_size, &total_packed); + } + + pConvertor->bConverted += total_packed; + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Pack total packed %ld\n", total_packed)); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + if (pConvertor->stream == NULL) { + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + working_stream = cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]; + } else { + working_stream = (cudaStream_t)pConvertor->stream; + } + cuda_err = cudaMemcpyAsync(iov[0].iov_base, destination, total_packed, cudaMemcpyDeviceToHost, working_stream); + CUDA_ERROR_CHECK(cuda_err); + if (!(pConvertor->flags & CONVERTOR_CUDA_ASYNC)) { + cuda_err = cudaStreamSynchronize(working_stream); + CUDA_ERROR_CHECK(cuda_err); + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d, pipeline_size %lu, pipeline_seq %lu\n", move_time, transfer_required, pConvertor->pipeline_size, pConvertor->pipeline_seq )); +#endif + + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time )); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required && !(pConvertor->flags & CONVERTOR_CUDA_ASYNC)) { + opal_datatype_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; +} + +int32_t opal_datatype_cuda_generic_simple_pack_function_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed) +{ + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + unsigned char *destination_base, *source_base; + uint8_t buffer_isfull = 0; + 
cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_non_cached_t *cuda_iov_pipeline_block_non_cached; + cudaStream_t cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t contig_disp = 0; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos, current_ddt_iov_pos; + OPAL_PTRDIFF_TYPE ddt_extent; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end; + long total_time; +#endif + + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Pack using IOV non cached, convertor %p, GPU base %p, pack to buffer %p\n", pConvertor, pConvertor->pBaseBuf, destination)); + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "Can not get ddt iov\n")); + return OPAL_ERROR; + } + + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + source_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + destination_base = destination; + + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + nb_blocks_used = 0; + ddt_iov_start_pos = pConvertor->current_iov_pos; + ddt_iov_end_pos = ddt_iov_start_pos + IOV_PIPELINE_SIZE; + if (ddt_iov_end_pos > ddt_iov_count) { + ddt_iov_end_pos = ddt_iov_count; + } + cuda_iov_pipeline_block_non_cached = current_cuda_device->cuda_iov_pipeline_block_non_cached[current_cuda_device->cuda_iov_pipeline_block_non_cached_first_avail]; + if (pConvertor->stream == NULL) { + cuda_iov_pipeline_block_non_cached->cuda_stream = cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]; + } else { + cuda_iov_pipeline_block_non_cached->cuda_stream = (cudaStream_t)pConvertor->stream; + } + cuda_iov_dist_h_current = cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d; + cuda_stream_iov = cuda_iov_pipeline_block_non_cached->cuda_stream; + cuda_err = cudaEventSynchronize(cuda_iov_pipeline_block_non_cached->cuda_event); + CUDA_ERROR_CHECK(cuda_err); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + buffer_isfull = opal_datatype_cuda_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, total_packed, &contig_disp, &current_ddt_iov_pos); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "[Timing]: Pack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used)); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, cuda_stream_iov); + opal_generic_simple_pack_cuda_iov_cached_kernel<<<nb_blocks, thread_per_block, 0, cuda_stream_iov>>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block_non_cached->cuda_event, cuda_stream_iov); + CUDA_ERROR_CHECK(cuda_err); + current_cuda_device->cuda_iov_pipeline_block_non_cached_first_avail ++; + if 
(current_cuda_device->cuda_iov_pipeline_block_non_cached_first_avail >= NB_PIPELINE_NON_CACHED_BLOCKS) { + current_cuda_device->cuda_iov_pipeline_block_non_cached_first_avail = 0; + } + destination_base += contig_disp; + + if (!buffer_isfull) { + pConvertor->current_iov_pos = current_ddt_iov_pos; + if (current_ddt_iov_pos == ddt_iov_count) { + pConvertor->current_count ++; + pConvertor->current_iov_pos = 0; + source_base += ddt_extent; + } + } + + } + + return OPAL_SUCCESS; +} + +int32_t opal_datatype_cuda_generic_simple_pack_function_iov_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed) +{ + uint32_t i; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + unsigned char *destination_base, *source_base; + uint8_t buffer_isfull = 0; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + cudaStream_t cuda_stream_iov = NULL; + uint32_t cuda_iov_start_pos, cuda_iov_end_pos; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + uint32_t cached_cuda_iov_count = 0; + opal_datatype_count_t convertor_current_count; + OPAL_PTRDIFF_TYPE ddt_extent; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end; + long total_time; +#endif + + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Pack using IOV cached, convertor %p, GPU base %p, pack to buffer %p\n", pConvertor, pConvertor->pBaseBuf, destination)); + + destination_base = destination; + thread_per_block = CUDA_WARP_SIZE * 8; + nb_blocks = 64; + source_base = (unsigned char*)pConvertor->pBaseBuf; + + /* cuda iov is not cached, start to cache iov */ + if(opal_datatype_cuda_cuda_iov_is_cached(pConvertor) == 0) { +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_datatype_cuda_cache_cuda_iov(pConvertor, &nb_blocks_used) == OPAL_SUCCESS) { + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Pack cuda iov is cached, count %d\n", nb_blocks_used)); + } else { + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "Pack cache cuda iov is failed\n")); + return OPAL_ERROR; + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "[Timing]: Pack cuda iov is cached in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used)); +#endif + } + + /* now we use cached cuda iov */ + opal_datatype_cuda_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + if (pConvertor->stream == NULL) { + cuda_stream_iov = cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]; + } else { + cuda_stream_iov = (cudaStream_t)pConvertor->stream; + } + convertor_current_count = pConvertor->current_count; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { + if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { + *total_packed += cached_cuda_iov_nb_bytes_list_h[i]; + buffer_size 
-= cached_cuda_iov_nb_bytes_list_h[i]; + nb_blocks_used++; + } else { + buffer_isfull = 1; + break; + } + } + if (!buffer_isfull) { + pConvertor->current_count ++; + cuda_iov_start_pos = 0; + cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used)); +#endif + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Pack kernel launched src_base %p, dst_base %p, nb_blocks %d, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent)); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + opal_generic_simple_pack_cuda_iov_cached_kernel<<<nb_blocks, thread_per_block, 0, cuda_stream_iov>>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, source_base, destination_base); + pConvertor->current_cuda_iov_pos += nb_blocks_used; + pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "[Timing]: Pack kernel %ld microsec\n", total_time)); +#endif + return OPAL_SUCCESS; +} + diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu new file mode 100644 index 00000000000..f9af26b3287 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -0,0 +1,277 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + +#include "opal_datatype_cuda_internal.cuh" +#include +#include + +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +{ + uint32_t i, j; + size_t dst_offset, src_offset; + unsigned char *_source_tmp, *_destination_tmp; + uint32_t _nb_bytes; + uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t source_partial_disp = 0; + size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; + + __shared__ uint32_t nb_tasks_per_block; + __shared__ uint32_t WARP_SIZE; + __shared__ uint32_t nb_warp_per_block; + uint32_t copy_count; + uint8_t alignment; + uint64_t tmp_var_64[KERNEL_UNROLL]; + uint32_t tmp_var_32[KERNEL_UNROLL]; + unsigned char tmp_var_8[KERNEL_UNROLL]; + uint32_t u, k; + uint32_t copy_count_16, copy_count_8, copy_count_left; + + if (threadIdx.x == 0) { + nb_tasks_per_block = nb_blocks_used / gridDim.x; + if (blockIdx.x < nb_blocks_used % gridDim.x) { + nb_tasks_per_block ++; + } + WARP_SIZE = 32; + nb_warp_per_block = blockDim.x / WARP_SIZE; + } + __syncthreads(); + + const uint32_t warp_id_per_block = threadIdx.x / WARP_SIZE; + const uint32_t tid_per_warp = threadIdx.x & (WARP_SIZE - 1); + + if 
(cuda_iov_partial_length_start != 0) { + source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + } + + for (i = warp_id_per_block; i < nb_tasks_per_block; i+= nb_warp_per_block) { + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp - source_partial_disp; + dst_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; + + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp; + dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; + _nb_bytes = cuda_iov_partial_length_start; + } else if (i == nb_tasks_per_block-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { + _nb_bytes = cuda_iov_partial_length_end; + } + + _destination_tmp = destination_base + dst_offset; + _source_tmp = source_base + src_offset; + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + copy_count = _nb_bytes / alignment; + + if (alignment == ALIGNMENT_DOUBLE) { + uint64_t *_source_base_64, *_destination_base_64; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_64 = (uint64_t *)(source_base + src_offset); + _destination_base_64 = (uint64_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_16; + _destination_base_64 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_8; + _destination_base_64 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_64[u] = *(_source_base_64 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = 
tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_64 + j) = tmp_var_64[u]; + } else { + break; + } + } + } + } else if (alignment == ALIGNMENT_FLOAT) { + uint32_t *_source_base_32, *_destination_base_32; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_32 = (uint32_t *)(source_base + src_offset); + _destination_base_32 = (uint32_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_16; + _destination_base_32 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_8; + _destination_base_32 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_32[u] = *(_source_base_32 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_32 + j) = tmp_var_32[u]; + } else { + break; + } + } + } + } else { + unsigned char *_source_base_8, *_destination_base_8; + + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_8 = (unsigned char *)(source_base + src_offset); + _destination_base_8 = (unsigned char *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_16; + _destination_base_8 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_8; + _destination_base_8 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_8[u] = *(_source_base_8 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < 
UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_8 + j) = tmp_var_8[u]; + } else { + break; + } + } + } + } + } +} diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu new file mode 100644 index 00000000000..06a0b26b2ab --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -0,0 +1,329 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" +#include "opal/util/output.h" + +#include "opal_datatype_cuda_internal.cuh" +#include "opal_datatype_cuda.cuh" + +#include +#include + +int32_t opal_datatype_cuda_generic_simple_unpack_function_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + size_t buffer_size; + unsigned char *source; + size_t total_unpacked; + uint8_t free_required = 0; + uint8_t gpu_rdma = 0; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + cudaStream_t working_stream; + cudaError_t cuda_err; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; + GET_TIME(start_total); +#endif + + if (pConvertor->stream == NULL) { + working_stream = cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]; + } else { + working_stream = (cudaStream_t)pConvertor->stream; + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_datatype_cuda_is_gpu_buffer(iov[0].iov_base)) { + source = (unsigned char*)iov[0].iov_base; + free_required = 0; + gpu_rdma = 1; + } else { + if (OPAL_DATATYPE_USE_ZEROCPY) { + cuda_err = cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); + if (cuda_err != cudaSuccess) { + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "Zero copy is not supported\n")); + return 0; + } + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_datatype_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + pConvertor->gpu_buffer_size = iov[0].iov_len; + } + source = pConvertor->gpu_buffer_ptr + pConvertor->pipeline_size * pConvertor->pipeline_seq; + cuda_err = cudaMemcpyAsync(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice, working_stream); + CUDA_ERROR_CHECK(cuda_err); + if (!(pConvertor->flags & CONVERTOR_CUDA_ASYNC)) { + cuda_err = cudaStreamSynchronize(working_stream); + CUDA_ERROR_CHECK(cuda_err); + } + free_required = 1; + } + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "[Timing]: HtoD memcpy in %ld microsec, free required %d, pipeline_size %lu, pipeline_seq %lu\n", move_time, free_required, pConvertor->pipeline_size, pConvertor->pipeline_seq)); +#endif + + + buffer_size = iov[0].iov_len; + total_unpacked = 0; + + /* start unpack */ + if (cuda_iov_cache_enabled) { + opal_datatype_cuda_generic_simple_unpack_function_iov_cached(pConvertor, source, buffer_size, &total_unpacked); + } else { + opal_datatype_cuda_generic_simple_unpack_function_iov_non_cached(pConvertor, source, buffer_size, &total_unpacked); + } + + pConvertor->bConverted += total_unpacked; + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Unpack total unpacked %ld\n", total_unpacked)); + + iov[0].iov_len = total_unpacked; + *max_data = total_unpacked; + *out_size = 1; + +#if 
defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time)); +#endif + + if (gpu_rdma == 0 && !(pConvertor->flags & CONVERTOR_CUDA_ASYNC)) { + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Unpack sync cuda stream\n")); + cuda_err = cudaStreamSynchronize(working_stream); + CUDA_ERROR_CHECK(cuda_err); + } + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required && !(pConvertor->flags & CONVERTOR_CUDA_ASYNC)) { + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Unpack free buffer %p\n", pConvertor->gpu_buffer_ptr)); + opal_datatype_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; +} + +int32_t opal_datatype_cuda_generic_simple_unpack_function_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked) +{ + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + unsigned char *source_base, *destination_base; + uint8_t buffer_isfull = 0; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_non_cached_t *cuda_iov_pipeline_block_non_cached; + cudaStream_t cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t contig_disp = 0; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos, current_ddt_iov_pos; + OPAL_PTRDIFF_TYPE ddt_extent; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end; + long total_time; +#endif + + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Unpack using IOV non cached, convertor %p, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor, pConvertor->pBaseBuf, source, buffer_size)); + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + OPAL_OUTPUT_VERBOSE((0, opal_datatype_cuda_output, "Can not get ddt iov\n")); + return OPAL_ERROR; + } + + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + source_base = source; + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + opal_datatype_cuda_set_ddt_iov_position(pConvertor, pConvertor->bConverted, ddt_iov, ddt_iov_count); + destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + nb_blocks_used = 0; + ddt_iov_start_pos = pConvertor->current_iov_pos; + ddt_iov_end_pos = ddt_iov_start_pos + IOV_PIPELINE_SIZE; + if (ddt_iov_end_pos > ddt_iov_count) { + ddt_iov_end_pos = ddt_iov_count; + } + cuda_iov_pipeline_block_non_cached = current_cuda_device->cuda_iov_pipeline_block_non_cached[current_cuda_device->cuda_iov_pipeline_block_non_cached_first_avail]; + if (pConvertor->stream == NULL) { + cuda_iov_pipeline_block_non_cached->cuda_stream = cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]; + } else { + cuda_iov_pipeline_block_non_cached->cuda_stream = (cudaStream_t)pConvertor->stream; + } + cuda_iov_dist_h_current = cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = 
cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d; + cuda_stream_iov = cuda_iov_pipeline_block_non_cached->cuda_stream; + cuda_err = cudaEventSynchronize(cuda_iov_pipeline_block_non_cached->cuda_event); + CUDA_ERROR_CHECK(cuda_err); + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + buffer_isfull = opal_datatype_cuda_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, total_unpacked, &contig_disp, &current_ddt_iov_pos); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "[Timing]: Unpack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used)); +#endif + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<<nb_blocks, thread_per_block, 0, cuda_stream_iov>>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, destination_base, source_base, 0, 0); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block_non_cached->cuda_event, cuda_stream_iov); + CUDA_ERROR_CHECK(cuda_err); + current_cuda_device->cuda_iov_pipeline_block_non_cached_first_avail ++; + if (current_cuda_device->cuda_iov_pipeline_block_non_cached_first_avail >= NB_PIPELINE_NON_CACHED_BLOCKS) { + current_cuda_device->cuda_iov_pipeline_block_non_cached_first_avail = 0; + } + source_base += contig_disp; + if (!buffer_isfull) { + pConvertor->current_iov_pos = current_ddt_iov_pos; + if (current_ddt_iov_pos == ddt_iov_count) { + pConvertor->current_count ++; + pConvertor->current_iov_pos = 0; + destination_base += ddt_extent; + } + } + } + + return OPAL_SUCCESS; +} + +int32_t opal_datatype_cuda_generic_simple_unpack_function_iov_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked) +{ + uint32_t i; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + unsigned char *source_base, *destination_base; + uint8_t buffer_isfull = 0; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + cudaStream_t cuda_stream_iov = NULL; + uint32_t cuda_iov_start_pos, cuda_iov_end_pos; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + uint32_t cached_cuda_iov_count = 0; + size_t cuda_iov_partial_length_start = 0; + size_t cuda_iov_partial_length_end = 0; + opal_datatype_count_t convertor_current_count; + OPAL_PTRDIFF_TYPE ddt_extent; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end; + long total_time; + GET_TIME(start); +#endif + + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Unpack using IOV cached, convertor %p, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor, pConvertor->pBaseBuf, source, buffer_size)); + + source_base = source; + thread_per_block = CUDA_WARP_SIZE * 8; + nb_blocks = 64; + destination_base = (unsigned char*)pConvertor->pBaseBuf; + + /* cuda iov is not cached, start to cache iov */ + if(opal_datatype_cuda_cuda_iov_is_cached(pConvertor) == 0) { +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_datatype_cuda_cache_cuda_iov(pConvertor, &nb_blocks_used) == OPAL_SUCCESS) { + OPAL_OUTPUT_VERBOSE((2, 
opal_datatype_cuda_output, "Unpack cuda iov is cached, count %d\n", nb_blocks_used)); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "[Timing]: Unpack cuda iov is cached in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used)); +#endif + } + + /* now we use cached cuda iov */ + opal_datatype_cuda_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + opal_datatype_cuda_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + if (pConvertor->stream == NULL) { + cuda_stream_iov = cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]; + } else { + cuda_stream_iov = (cudaStream_t)pConvertor->stream; + } + convertor_current_count = pConvertor->current_count; + + if (pConvertor->current_iov_partial_length > 0) { + cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; + *total_unpacked += cuda_iov_partial_length_start; + buffer_size -= cuda_iov_partial_length_start; + pConvertor->current_iov_partial_length = 0; + cuda_iov_start_pos ++; + nb_blocks_used ++; + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { + if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { + *total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; + buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; + nb_blocks_used ++; + } else { + if (buffer_size > 0) { + cuda_iov_partial_length_end = buffer_size; + *total_unpacked += cuda_iov_partial_length_end; + nb_blocks_used ++; + } + buffer_size = 0; + buffer_isfull = 1; + break; + } + } + if (!buffer_isfull) { + pConvertor->current_count ++; + cuda_iov_start_pos = 0; + cuda_iov_end_pos = cached_cuda_iov_count; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_streams->current_stream_id, nb_blocks_used)); +#endif + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "Unpack kernel launched src_base %p, dst_base %p, nb_blocks %d\n", source_base, destination_base, nb_blocks_used)); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + opal_generic_simple_unpack_cuda_iov_cached_kernel<<<nb_blocks, thread_per_block, 0, cuda_stream_iov>>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + OPAL_OUTPUT_VERBOSE((2, opal_datatype_cuda_output, "[Timing]: Unpack kernel %ld microsec\n", total_time)); +#endif + + return OPAL_SUCCESS; +} \ No newline at end of 
file diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 46aff829723..62cad379a64 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -552,8 +552,11 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, convertor->flags |= CONVERTOR_RECV; #if OPAL_CUDA_SUPPORT - mca_cuda_convertor_init(convertor, pUserBuf); -#endif + mca_cuda_convertor_init(convertor, pUserBuf, datatype); + convertor->pipeline_depth = 0; + convertor->pipeline_seq = 0; + convertor->pipeline_size = 0; +#endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -564,7 +567,12 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { convertor->fAdvance = opal_unpack_homogeneous_contig_checksum; } else { - convertor->fAdvance = opal_generic_simple_unpack_checksum; + if ((convertor->flags & CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { + convertor->fAdvance = opal_generic_simple_unpack_cuda_checksum; + convertor->gpu_buffer_ptr = NULL; + } else { + convertor->fAdvance = opal_generic_simple_unpack_checksum; + } } } } else { @@ -574,7 +582,12 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { convertor->fAdvance = opal_unpack_homogeneous_contig; } else { - convertor->fAdvance = opal_generic_simple_unpack; + if ((convertor->flags & CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { + convertor->fAdvance = opal_generic_simple_unpack_cuda; + convertor->gpu_buffer_ptr = NULL; + } else { + convertor->fAdvance = opal_generic_simple_unpack; + } } } } @@ -589,8 +602,11 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, { convertor->flags |= CONVERTOR_SEND; #if OPAL_CUDA_SUPPORT - mca_cuda_convertor_init(convertor, pUserBuf); -#endif + mca_cuda_convertor_init(convertor, pUserBuf, datatype); + convertor->pipeline_depth = 0; + convertor->pipeline_seq = 0; + convertor->pipeline_size = 0; +#endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -605,8 +621,13 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, else convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps_checksum; } else { - convertor->fAdvance = opal_generic_simple_pack_checksum; - } + if ((convertor->flags & CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { + convertor->fAdvance = opal_generic_simple_pack_cuda_checksum; + convertor->gpu_buffer_ptr = NULL; + } else { + convertor->fAdvance = opal_generic_simple_pack_checksum; + } + } } } else { if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) { @@ -619,7 +640,12 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, else convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps; } else { - convertor->fAdvance = opal_generic_simple_pack; + if ((convertor->flags & CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { + convertor->fAdvance = opal_generic_simple_pack_cuda; + convertor->gpu_buffer_ptr = NULL; + } else { + convertor->fAdvance = opal_generic_simple_pack; + } } } } diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 7c5de1af39b..bdb965abc9c 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana 
University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -79,6 +79,8 @@ typedef struct dt_stack_t dt_stack_t; */ #define DT_STATIC_STACK_SIZE 5 /**< This should be sufficient for most applications */ +#define MAX_IPC_EVENT_HANDLE 10 + struct opal_convertor_t { opal_object_t super; /**< basic superclass */ uint32_t remoteArch; /**< the remote architecture */ @@ -109,6 +111,16 @@ struct opal_convertor_t { #if OPAL_CUDA_SUPPORT memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ void * stream; /**< CUstream for async copy */ + + unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ + size_t gpu_buffer_size; + size_t pipeline_depth; + size_t pipeline_seq; + size_t pipeline_size; + uint32_t current_cuda_iov_pos; + uint32_t current_iov_pos; + size_t current_iov_partial_length; + opal_datatype_count_t current_count; #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ @@ -276,7 +288,22 @@ opal_convertor_raw( opal_convertor_t* convertor, /* [IN/OUT] */ struct iovec* iov, /* [IN/OUT] */ uint32_t* iov_count, /* [IN/OUT] */ size_t* length ); /* [OUT] */ +OPAL_DECLSPEC void +opal_convertor_to_iov(struct opal_convertor_t *convertor, + struct iovec **iov, + uint32_t *iov_count, + size_t *max_data); +/** + * A straighforward description of the datatype in terms of a NULL + * based iovec (so basically displacements from the begining of a pointer, + * will be generated and stored in the datatype itself. This description + * can be used to pack/unpack the data manually. + */ +OPAL_DECLSPEC int +opal_convertor_raw_cached(struct opal_convertor_t *convertor, + const struct iovec **iov, + uint32_t* iov_count); /* * Upper level does not need to call the _nocheck function directly. */ diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index ce0eaf33305..d1f99ecf994 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. @@ -35,8 +35,8 @@ */ int32_t opal_convertor_raw( opal_convertor_t* pConvertor, - struct iovec* iov, uint32_t* iov_count, - size_t* length ) + struct iovec* iov, uint32_t* iov_count, + size_t* length ) { const opal_datatype_t *pData = pConvertor->pDesc; dt_stack_t* pStack; /* pointer to the position on the stack */ @@ -75,9 +75,9 @@ opal_convertor_raw( opal_convertor_t* pConvertor, description = pConvertor->use_desc->desc; /* For the first step we have to add both displacement to the source. After in the - * main while loop we will set back the source_base to the correct value. This is - * due to the fact that the convertor can stop in the middle of a data with a count - */ + * main while loop we will set back the source_base to the correct value. 
This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ pStack = pConvertor->pStack + pConvertor->stack_pos; pos_desc = pStack->index; source_base = pConvertor->pBaseBuf + pStack->disp; @@ -99,7 +99,7 @@ opal_convertor_raw( opal_convertor_t* pConvertor, blength *= count_desc; /* now here we have a basic datatype */ OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, - pConvertor->pDesc, pConvertor->count ); + pConvertor->pDesc, pConvertor->count ); DO_DEBUG( opal_output( 0, "raw 1. iov[%d] = {base %p, length %lu}\n", index, (void*)source_base, (unsigned long)blength ); ); iov[index].iov_base = (IOVBASE_TYPE *) source_base; @@ -112,7 +112,7 @@ opal_convertor_raw( opal_convertor_t* pConvertor, } else { for( i = count_desc; (i > 0) && (index < *iov_count); i--, index++ ) { OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, - pConvertor->pDesc, pConvertor->count ); + pConvertor->pDesc, pConvertor->count ); DO_DEBUG( opal_output( 0, "raw 2. iov[%d] = {base %p, length %lu}\n", index, (void*)source_base, (unsigned long)blength ); ); iov[index].iov_base = (IOVBASE_TYPE *) source_base; @@ -139,8 +139,8 @@ opal_convertor_raw( opal_convertor_t* pConvertor, if( --(pStack->count) == 0 ) { /* end of loop */ if( pConvertor->stack_pos == 0 ) { /* we lie about the size of the next element in order to - * make sure we exit the main loop. - */ + * make sure we exit the main loop. + */ *iov_count = index; goto complete_loop; /* completed */ } @@ -172,7 +172,7 @@ opal_convertor_raw( opal_convertor_t* pConvertor, source_base += end_loop->first_elem_disp; for( i = count_desc; (i > 0) && (index < *iov_count); i--, index++ ) { OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, end_loop->size, pConvertor->pBaseBuf, - pConvertor->pDesc, pConvertor->count ); + pConvertor->pDesc, pConvertor->count ); iov[index].iov_base = (IOVBASE_TYPE *) source_base; iov[index].iov_len = end_loop->size; source_base += pElem->loop.extent; @@ -189,14 +189,14 @@ opal_convertor_raw( opal_convertor_t* pConvertor, PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, pStack->disp + local_disp); pos_desc++; - update_loop_description: /* update the current state */ + update_loop_description: /* update the current state */ source_base = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); continue; } } -complete_loop: + complete_loop: pConvertor->bConverted += raw_data; /* update the already converted bytes */ *length = raw_data; *iov_count = index; @@ -211,3 +211,66 @@ opal_convertor_raw( opal_convertor_t* pConvertor, pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } + +#define IOVEC_INITIAL_SIZE 64 + +void +opal_convertor_to_iov(struct opal_convertor_t *convertor, + struct iovec **iov, + uint32_t *iov_count, + size_t *max_data) +{ + uint32_t temp_count = IOVEC_INITIAL_SIZE; + struct iovec *iovec; + size_t temp_data; + + *iov_count = 0; + *max_data = 0; + + *iov = iovec = (struct iovec*) malloc(temp_count * sizeof(struct iovec)); + while(1) { + int ret = opal_convertor_raw(convertor, iovec, &temp_count, &temp_data); + *iov_count += temp_count; + *max_data += temp_data; + if(ret) + break; + + *iov = (struct iovec*)realloc(*iov, (*iov_count + IOVEC_INITIAL_SIZE) * sizeof(struct iovec)); + temp_count = IOVEC_INITIAL_SIZE; + iovec = 
&((*iov)[*iov_count]); + } +} + +int opal_convertor_raw_cached(struct opal_convertor_t *convertor, + const struct iovec **iov, + uint32_t* iov_count) +{ + if( NULL == convertor->pDesc->cached_iovec ) { + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + datatype->cached_iovec = (opal_datatype_caching_iovec_t *)malloc(sizeof(opal_datatype_caching_iovec_t)); + datatype->cached_iovec->cached_iovec = NULL; + datatype->cached_iovec->cached_iovec_count = 0; + + struct opal_convertor_t conv; + size_t max_data; + + OBJ_CONSTRUCT(&conv, opal_convertor_t); + conv.remoteArch = convertor->remoteArch; + conv.stack_pos = 0; + conv.flags = convertor->flags; + conv.master = convertor->master; + opal_convertor_prepare_for_send(&conv, convertor->pDesc, 1, NULL); + opal_convertor_get_packed_size(&conv, &max_data); + opal_convertor_to_iov(&conv, (struct iovec **)&(datatype->cached_iovec->cached_iovec), + (uint32_t *)&(datatype->cached_iovec->cached_iovec_count), &max_data); +#if OPAL_CUDA_SUPPORT + datatype->cached_iovec->cached_cuda_iov = NULL; +#endif /* OPAL_CUDA_SUPPORT */ + + OBJ_DESTRUCT(&conv); + } + *iov = convertor->pDesc->cached_iovec->cached_iovec; + *iov_count = convertor->pDesc->cached_iovec->cached_iovec_count; + + return OPAL_SUCCESS; +} diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index 25f014ead0d..49ea82d321c 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -95,6 +95,13 @@ struct dt_type_desc_t { }; typedef struct dt_type_desc_t dt_type_desc_t; +typedef struct opal_datatype_caching_iovec_t { + struct iovec* cached_iovec; + uint32_t cached_iovec_count; +#if OPAL_CUDA_SUPPORT + void* cached_cuda_iov; +#endif /* OPAL_CUDA_SUPPORT */ +} opal_datatype_caching_iovec_t; /* * The datatype description. @@ -107,29 +114,28 @@ struct opal_datatype_t { size_t size; /**< total size in bytes of the memory used by the data if the data is put on a contiguous buffer */ OPAL_PTRDIFF_TYPE true_lb; /**< the true lb of the data without user defined lb and ub */ + /* --- cacheline 1 boundary (64 bytes) --- */ OPAL_PTRDIFF_TYPE true_ub; /**< the true ub of the data without user defined lb and ub */ OPAL_PTRDIFF_TYPE lb; /**< lower bound in memory */ OPAL_PTRDIFF_TYPE ub; /**< upper bound in memory */ - /* --- cacheline 1 boundary (64 bytes) --- */ size_t nbElems; /**< total number of elements inside the datatype */ - uint32_t align; /**< data should be aligned to */ /* Attribute fields */ char name[OPAL_MAX_OBJECT_NAME]; /**< name of the datatype */ - /* --- cacheline 2 boundary (128 bytes) was 8-12 bytes ago --- */ + /* --- cacheline 2 boundary (128 bytes) was 40 bytes ago --- */ dt_type_desc_t desc; /**< the data description */ dt_type_desc_t opt_desc; /**< short description of the data used when conversion is useless or in the send case (without conversion) */ + uint32_t align; /**< data should be aligned to */ uint32_t btypes[OPAL_DATATYPE_MAX_SUPPORTED]; /**< basic elements count used to compute the size of the datatype for remote nodes. The length of the array is dependent on the maximum number of datatypes of all top layers. Reason being is that Fortran is not at the OPAL layer. 
*/ - /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */ + /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */ - /* size: 352, cachelines: 6, members: 15 */ - /* last cacheline: 28-32 bytes */ + opal_datatype_caching_iovec_t* cached_iovec; }; typedef struct opal_datatype_t opal_datatype_t; diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index e64e1f04190..ca1e819600b 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -27,6 +27,10 @@ #include "opal/datatype/opal_datatype_internal.h" #include "limits.h" #include "opal/prefetch.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_convertor.h" +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ static void opal_datatype_construct( opal_datatype_t* pData ) { @@ -53,6 +57,8 @@ static void opal_datatype_construct( opal_datatype_t* pData ) pData->opt_desc.length = 0; pData->opt_desc.used = 0; + pData->cached_iovec = NULL; + for( i = 0; i < OPAL_DATATYPE_MAX_SUPPORTED; i++ ) pData->btypes[i] = 0; } @@ -82,6 +88,22 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) /* make sure the name is set to empty */ datatype->name[0] = '\0'; + + if( NULL != datatype->cached_iovec ) { + if (datatype->cached_iovec->cached_iovec != NULL) { + free(datatype->cached_iovec->cached_iovec); + } +#if OPAL_CUDA_SUPPORT + /* free cuda iov */ + if (opal_datatype_cuda_kernel_support == 1 && datatype->cached_iovec->cached_cuda_iov != NULL) { + opal_cached_cuda_iov_fini((void*)datatype->cached_iovec->cached_cuda_iov); + datatype->cached_iovec->cached_cuda_iov = NULL; + } +#endif /* OPAL_CUDA_SUPPORT */ + + free(datatype->cached_iovec); + datatype->cached_iovec = NULL; + } } OBJ_CLASS_INSTANCE(opal_datatype_t, opal_object_t, opal_datatype_construct, opal_datatype_destruct); diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 71b60e60801..7feadf06672 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -12,11 +12,13 @@ #include #include #include +#include #include "opal/align.h" #include "opal/util/output.h" #include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/installdirs/installdirs.h" static bool initialized = false; int opal_cuda_verbose = 0; @@ -26,6 +28,28 @@ static void opal_cuda_support_init(void); static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL; static opal_common_cuda_function_table_t ftable; +/* folowing variables are used for cuda ddt kernel support */ +static opal_datatype_cuda_kernel_function_table_t cuda_kernel_table; +static void *opal_datatype_cuda_kernel_handle = NULL; +static char *opal_datatype_cuda_kernel_lib = NULL; +int32_t opal_datatype_cuda_kernel_support = 0; +int opal_datatype_cuda_output = 0; +int opal_datatype_cuda_verbose = 0; +int opal_datatype_cuda_kernel_support_enabled = 1; +size_t opal_datatype_cuda_buffer_size = 64*1024*1024; + +#define 
OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN(handle, fname) \ + do { \ + char* _error; \ + *(void **)(&(cuda_kernel_table.fname ## _p)) = dlsym((handle), # fname); \ + if(NULL != (_error = dlerror()) ) { \ + opal_output(0, "Finding %s error: %s\n", # fname, _error); \ + cuda_kernel_table.fname ## _p = NULL; \ + return OPAL_ERROR; \ + } \ + } while (0) + + /* This function allows the common cuda code to register an * initialization function that gets called the first time an attempt * is made to send or receive a GPU pointer. This allows us to delay @@ -41,7 +65,7 @@ void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function * is enabled or not. If CUDA is not enabled, then short circuit out * for all future calls. */ -void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) +void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, const struct opal_datatype_t* datatype) { /* Only do the initialization on the first GPU access */ if (!initialized) { @@ -59,7 +83,23 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) { convertor->flags |= CONVERTOR_CUDA; + if (OPAL_SUCCESS != opal_cuda_kernel_support_init()) { + opal_cuda_kernel_support_fini(); + } } + + convertor->stream = NULL; + + convertor->current_cuda_iov_pos = 0; + convertor->current_iov_pos = 0; + convertor->current_iov_partial_length = 0; + convertor->current_count = 0; + + convertor->pipeline_depth = 0; + convertor->pipeline_seq = 0; + convertor->pipeline_size = 0; + convertor->gpu_buffer_ptr = NULL; + convertor->gpu_buffer_size = 0; } /* Checks the type of pointer @@ -80,9 +120,8 @@ bool opal_cuda_check_bufs(char *dest, char *src) if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) { return true; - } else { - return false; } + return false; } /* @@ -109,9 +148,8 @@ void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_ opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, (int)size); abort(); - } else { - return dest; } + return dest; } /* @@ -127,9 +165,8 @@ void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size) opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, (int)size); abort(); - } else { - return dest; } + return dest; } /* @@ -191,3 +228,254 @@ void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream convertor->flags |= CONVERTOR_CUDA_ASYNC; convertor->stream = stream; } + +/* following functions are used for cuda ddt kernel support */ +int32_t opal_cuda_kernel_support_init(void) +{ + if (0 == opal_datatype_cuda_kernel_support_enabled) { + return OPAL_SUCCESS; + } + + if (opal_datatype_cuda_kernel_handle == NULL) { + + /* If the library name was initialized but the load failed, we have another chance to change it */ + if( NULL != opal_datatype_cuda_kernel_lib ) + free(opal_datatype_cuda_kernel_lib); + asprintf(&opal_datatype_cuda_kernel_lib, "%s/%s", opal_install_dirs.libdir, "opal_datatype_cuda_kernel.so"); + + opal_datatype_cuda_kernel_handle = dlopen(opal_datatype_cuda_kernel_lib , RTLD_LAZY); + if (!opal_datatype_cuda_kernel_handle) { + opal_output( 0, "Failed to load %s library: error %s\n", opal_datatype_cuda_kernel_lib, dlerror()); + opal_datatype_cuda_kernel_handle = NULL; + return OPAL_ERROR; + } + + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, 
opal_datatype_cuda_kernel_init ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_kernel_fini ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_generic_simple_pack_function_iov ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_generic_simple_unpack_function_iov ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_free_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_malloc_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_d2dcpy_async ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_d2dcpy ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_cached_cuda_iov_fini ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_get_cuda_stream_by_id ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_get_current_cuda_stream ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_sync_current_cuda_stream ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_sync_cuda_stream ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_alloc_event ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_free_event ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_event_query ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_event_sync ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_event_record ); + + /* set output verbose */ + opal_datatype_cuda_output = opal_output_open(NULL); + opal_output_set_verbosity(opal_datatype_cuda_output, opal_datatype_cuda_verbose); + + if (OPAL_SUCCESS != cuda_kernel_table.opal_datatype_cuda_kernel_init_p()) { + return OPAL_ERROR; + } + opal_datatype_cuda_kernel_support = 1; + opal_output( 0, "opal_cuda_kernel_support_init done\n"); + } + return OPAL_SUCCESS; +} + +int32_t opal_cuda_kernel_support_fini(void) +{ + if (opal_datatype_cuda_kernel_handle != NULL) { + cuda_kernel_table.opal_datatype_cuda_kernel_fini_p(); + /* Reset all functions to NULL */ + cuda_kernel_table.opal_datatype_cuda_kernel_init_p = NULL; + cuda_kernel_table.opal_datatype_cuda_kernel_fini_p = NULL; + cuda_kernel_table.opal_datatype_cuda_generic_simple_pack_function_iov_p = NULL; + cuda_kernel_table.opal_datatype_cuda_generic_simple_unpack_function_iov_p = NULL; + cuda_kernel_table.opal_datatype_cuda_free_gpu_buffer_p = NULL; + cuda_kernel_table.opal_datatype_cuda_malloc_gpu_buffer_p = NULL; + cuda_kernel_table.opal_datatype_cuda_d2dcpy_async_p = NULL; + cuda_kernel_table.opal_datatype_cuda_d2dcpy_p = NULL; + cuda_kernel_table.opal_datatype_cuda_cached_cuda_iov_fini_p = NULL; + cuda_kernel_table.opal_datatype_cuda_get_cuda_stream_by_id_p = NULL; + cuda_kernel_table.opal_datatype_cuda_get_current_cuda_stream_p = NULL; + 
cuda_kernel_table.opal_datatype_cuda_sync_current_cuda_stream_p = NULL; + cuda_kernel_table.opal_datatype_cuda_sync_cuda_stream_p = NULL; + cuda_kernel_table.opal_datatype_cuda_alloc_event_p = NULL; + cuda_kernel_table.opal_datatype_cuda_free_event_p = NULL; + cuda_kernel_table.opal_datatype_cuda_event_query_p = NULL; + cuda_kernel_table.opal_datatype_cuda_event_sync_p = NULL; + cuda_kernel_table.opal_datatype_cuda_event_record_p = NULL; + + dlclose(opal_datatype_cuda_kernel_handle); + opal_datatype_cuda_kernel_handle = NULL; + + if( NULL != opal_datatype_cuda_kernel_lib ) + free(opal_datatype_cuda_kernel_lib); + opal_datatype_cuda_kernel_lib = NULL; + opal_datatype_cuda_kernel_support = 0; + + /* close output verbose */ + opal_output_close(opal_datatype_cuda_output); + opal_output( 0, "opal_cuda_kernel_support_fini done\n"); + } + return OPAL_SUCCESS; +} + +int32_t opal_cuda_sync_all_events(void *cuda_event_list, int32_t nb_events) +{ + for (int i = 0; i < nb_events; i++) { + opal_cuda_event_sync(cuda_event_list, i); + } + return OPAL_SUCCESS; +} + +int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + if (cuda_kernel_table.opal_datatype_cuda_generic_simple_pack_function_iov_p != NULL) { + return cuda_kernel_table.opal_datatype_cuda_generic_simple_pack_function_iov_p(pConvertor, iov, out_size, max_data); + } + opal_output(0, "opal_datatype_cuda_generic_simple_pack_function_iov function pointer is NULL\n"); + return -1; +} + +int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + if (cuda_kernel_table.opal_datatype_cuda_generic_simple_unpack_function_iov_p != NULL) { + return cuda_kernel_table.opal_datatype_cuda_generic_simple_unpack_function_iov_p(pConvertor, iov, out_size, max_data); + } + opal_output(0, "opal_datatype_cuda_generic_simple_unpack_function_iov function pointer is NULL\n"); + return -1; +} + +void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) +{ + if (cuda_kernel_table.opal_datatype_cuda_malloc_gpu_buffer_p != NULL) { + return cuda_kernel_table.opal_datatype_cuda_malloc_gpu_buffer_p(size, gpu_id); + } + opal_output(0, "opal_datatype_cuda_malloc_gpu_buffer function pointer is NULL\n"); + return NULL; +} + +void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) +{ + if (cuda_kernel_table.opal_datatype_cuda_free_gpu_buffer_p != NULL) { + cuda_kernel_table.opal_datatype_cuda_free_gpu_buffer_p(addr, gpu_id); + } else { + opal_output(0, "opal_datatype_cuda_free_gpu_buffer function pointer is NULL\n"); + } +} + +void opal_cuda_d2dcpy(void* dst, const void* src, size_t count, void* stream) +{ + if (cuda_kernel_table.opal_datatype_cuda_d2dcpy_p != NULL) { + cuda_kernel_table.opal_datatype_cuda_d2dcpy_p(dst, src, count, stream); + } else { + opal_output(0, "opal_datatype_cuda_d2dcpy function pointer is NULL\n"); + } +} + +void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count, void* stream) +{ + if (cuda_kernel_table.opal_datatype_cuda_d2dcpy_async_p != NULL) { + cuda_kernel_table.opal_datatype_cuda_d2dcpy_async_p(dst, src, count, stream); + } else { + opal_output(0, "opal_datatype_cuda_d2dcpy_async function pointer is NULL\n"); + } +} + +void opal_cached_cuda_iov_fini(void *cached_cuda_iov) +{ + if (cuda_kernel_table.opal_datatype_cuda_cached_cuda_iov_fini_p != NULL) { + cuda_kernel_table.opal_datatype_cuda_cached_cuda_iov_fini_p(cached_cuda_iov); + } else { + 
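+        /* Kernel support is not (or no longer) loaded, so the cached CUDA iov cannot be released here; the message makes the otherwise silent no-op visible. */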
opal_output(0, "opal_datatype_cuda_cached_cuda_iov_fini function pointer is NULL\n"); + } +} + +void* opal_cuda_get_cuda_stream_by_id(int stream_id) +{ + if (cuda_kernel_table.opal_datatype_cuda_get_current_cuda_stream_p != NULL) { + return cuda_kernel_table.opal_datatype_cuda_get_cuda_stream_by_id_p(stream_id); + } + opal_output(0, "opal_datatype_cuda_get_current_cuda_stream function pointer is NULL\n"); + return NULL; +} + +void* opal_cuda_get_current_cuda_stream(void) +{ + if (cuda_kernel_table.opal_datatype_cuda_get_current_cuda_stream_p != NULL) { + return cuda_kernel_table.opal_datatype_cuda_get_current_cuda_stream_p(); + } + opal_output(0, "opal_datatype_cuda_get_current_cuda_stream function pointer is NULL\n"); + return NULL; +} + +void opal_cuda_sync_current_cuda_stream(void) +{ + if (cuda_kernel_table.opal_datatype_cuda_sync_current_cuda_stream_p != NULL) { + cuda_kernel_table.opal_datatype_cuda_sync_current_cuda_stream_p(); + } else { + opal_output(0, "opal_datatype_cuda_sync_current_cuda_stream function pointer is NULL\n"); + } +} + +void opal_cuda_sync_cuda_stream(int stream_id) +{ + if (cuda_kernel_table.opal_datatype_cuda_sync_cuda_stream_p != NULL) { + cuda_kernel_table.opal_datatype_cuda_sync_cuda_stream_p(stream_id); + } else { + opal_output(0, "opal_datatype_cuda_sync_cuda_stream function pointer is NULL\n"); + } +} + +void* opal_cuda_alloc_event(int32_t nb_events, int32_t *loc) +{ + if (cuda_kernel_table.opal_datatype_cuda_alloc_event_p != NULL) { + return cuda_kernel_table.opal_datatype_cuda_alloc_event_p(nb_events, loc); + } + opal_output(0, "opal_datatype_cuda_alloc_event function pointer is NULL\n"); + return NULL; +} + +void opal_cuda_free_event(void *cuda_event_list, int32_t nb_events) +{ + if (cuda_kernel_table.opal_datatype_cuda_free_event_p != NULL) { + cuda_kernel_table.opal_datatype_cuda_free_event_p(cuda_event_list, nb_events); + } else { + opal_output(0, "opal_datatype_cuda_free_event function pointer is NULL\n"); + } +} + +int32_t opal_cuda_event_query(void *cuda_event_list, int32_t i) +{ + if (cuda_kernel_table.opal_datatype_cuda_event_query_p != NULL) { + return cuda_kernel_table.opal_datatype_cuda_event_query_p(cuda_event_list, i); + } + opal_output(0, "opal_datatype_cuda_event_query function pointer is NULL\n"); + return -2; +} + +int32_t opal_cuda_event_sync(void *cuda_event_list, int32_t i) +{ + if (cuda_kernel_table.opal_datatype_cuda_event_sync_p != NULL) { + return cuda_kernel_table.opal_datatype_cuda_event_sync_p(cuda_event_list, i); + } + opal_output(0, "opal_datatype_cuda_event_sync function pointer is NULL\n"); + return -2; +} + +int32_t opal_cuda_event_record(void *cuda_event_list, int32_t i, void* stream) +{ + if (cuda_kernel_table.opal_datatype_cuda_event_record_p != NULL) { + return cuda_kernel_table.opal_datatype_cuda_event_record_p(cuda_event_list, i, stream); + } + opal_output(0, "opal_datatype_cuda_event_record function pointer is NULL\n"); + return -2; +} + diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index 676af80273b..1822cd67ad4 100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -10,6 +10,8 @@ #ifndef _OPAL_DATATYPE_CUDA_H #define _OPAL_DATATYPE_CUDA_H +#define OPAL_DATATYPE_CUDA_VERBOSE_LEVEL 5 + /* Structure to hold CUDA support functions that gets filled in when the * common cuda code is initialized. This removes any dependency on * in the opal cuda datatype code. 
*/ @@ -21,7 +23,30 @@ struct opal_common_cuda_function_table { }; typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t; -void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf); +struct opal_datatype_cuda_kernel_function_table { + int32_t (*opal_datatype_cuda_kernel_init_p)(void); + int32_t (*opal_datatype_cuda_kernel_fini_p)(void); + void (*opal_datatype_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); + void* (*opal_datatype_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); + void (*opal_datatype_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count, void* stream); + void (*opal_datatype_cuda_d2dcpy_p)(void* dst, const void* src, size_t count, void* stream); + void (*opal_datatype_cuda_cached_cuda_iov_fini_p)(void *cached_cuda_iov); + void* (*opal_datatype_cuda_get_cuda_stream_by_id_p)(int stream_id); + void* (*opal_datatype_cuda_get_current_cuda_stream_p)(void); + void (*opal_datatype_cuda_sync_current_cuda_stream_p)(void); + void (*opal_datatype_cuda_sync_cuda_stream_p)(int stream_id); + void* (*opal_datatype_cuda_alloc_event_p)(int32_t nb_events, int32_t *loc); + void (*opal_datatype_cuda_free_event_p)(void *cuda_event_list, int32_t nb_events); + int32_t (*opal_datatype_cuda_event_query_p)(void *cuda_event_list, int32_t i); + int32_t (*opal_datatype_cuda_event_sync_p)(void *cuda_event_list, int32_t i); + int32_t (*opal_datatype_cuda_event_record_p)(void *cuda_event_list, int32_t i, void* stream); + int32_t (*opal_datatype_cuda_generic_simple_pack_function_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_datatype_cuda_generic_simple_unpack_function_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +}; +typedef struct opal_datatype_cuda_kernel_function_table opal_datatype_cuda_kernel_function_table_t; +extern int32_t opal_datatype_cuda_kernel_support; + +void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, const struct opal_datatype_t* datatype); bool opal_cuda_check_bufs(char *dest, char *src); void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor); void* opal_cuda_memcpy_sync(void * dest, const void * src, size_t size); @@ -29,4 +54,26 @@ void* opal_cuda_memmove(void * dest, void * src, size_t size); void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)); void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream); +int32_t opal_cuda_kernel_support_init(void); +int32_t opal_cuda_kernel_support_fini(void); +int32_t opal_cuda_sync_all_events(void *cuda_event_list, int32_t nb_events); + +int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); +void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); +void opal_cuda_d2dcpy(void* dst, const void* src, size_t count, void* stream); +void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count, void* stream); +void* opal_cached_cuda_iov_init(void); +void opal_cached_cuda_iov_fini(void *cached_cuda_iov); +void* opal_cuda_get_cuda_stream_by_id(int stream_id); +void* opal_cuda_get_current_cuda_stream(void); +void opal_cuda_sync_current_cuda_stream(void); 
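Taken together, these prototypes are the public face of the dlopen-based kernel dispatch: each wrapper checks the corresponding *_p slot in cuda_kernel_table and falls back to an opal_output() warning when opal_datatype_cuda_kernel.so could not be loaded, so the feature degrades gracefully. The following is only a minimal caller-side sketch of the intended call ordering; the helper name pack_one_block, its block_size parameter, the use of GPU id 0 and stream 0, and the trimmed error handling are all assumptions, not code from this patch.

#include <sys/uio.h>
#include "opal/constants.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_cuda.h"

/* Hypothetical helper: pack one block of a GPU-resident datatype into a GPU
 * staging buffer through the dlopen'd kernel wrappers declared above. */
static int pack_one_block(opal_convertor_t *convertor, size_t block_size)
{
    struct iovec iov;
    uint32_t iov_count = 1;
    size_t max_data = block_size;
    int32_t loc, rc;
    void *events;

    if (OPAL_SUCCESS != opal_cuda_kernel_support_init()) {
        return OPAL_ERROR;                 /* opal_datatype_cuda_kernel.so missing */
    }

    /* Stage the packed bytes in GPU memory and pack on stream 0. */
    convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(block_size, 0);
    if (NULL == convertor->gpu_buffer_ptr) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    convertor->stream = opal_cuda_get_cuda_stream_by_id(0);
    iov.iov_base = convertor->gpu_buffer_ptr;
    iov.iov_len  = block_size;
    rc = opal_generic_simple_pack_function_cuda_iov(convertor, &iov, &iov_count, &max_data);

    /* Track completion of the asynchronous pack with a single CUDA event. */
    events = opal_cuda_alloc_event(1, &loc);
    opal_cuda_event_record(events, 0, convertor->stream);
    opal_cuda_event_sync(events, 0);
    opal_cuda_free_event(events, 1);

    opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0);
    convertor->gpu_buffer_ptr = NULL;
    return (rc < 0) ? OPAL_ERROR : OPAL_SUCCESS;
}

In the patch itself the staging buffer, stream and events are owned by the smcuda/openib BTLs and mca_common_cuda, and the packed buffer is handed to the transfer path rather than freed immediately, so the sketch illustrates call ordering only, not any real call site.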
+void opal_cuda_sync_cuda_stream(int stream_id); +void* opal_cuda_alloc_event(int32_t nb_events, int32_t *loc); +void opal_cuda_free_event(void *cuda_event_list, int32_t nb_events); +int32_t opal_cuda_event_query(void *cuda_event_list, int32_t i); +int32_t opal_cuda_event_sync(void *cuda_event_list, int32_t i); +int32_t opal_cuda_event_record(void *cuda_event_list, int32_t i, void* stream); + #endif diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 7de8fae5b08..206659f1189 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -33,6 +33,9 @@ #include "opal/datatype/opal_datatype.h" #include "opal/datatype/opal_convertor_internal.h" #include "opal/mca/base/mca_base_var.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ /* by default the debuging is turned off */ int opal_datatype_dfd = -1; @@ -42,6 +45,9 @@ bool opal_position_debug = false; bool opal_copy_debug = false; extern int opal_cuda_verbose; +extern int opal_datatype_cuda_verbose; +extern size_t opal_datatype_cuda_buffer_size; +extern int opal_datatype_cuda_kernel_support_enabled; /* Using this macro implies that at this point _all_ informations needed * to fill up the datatype are known. @@ -187,6 +193,36 @@ int opal_datatype_register_params(void) if (0 > ret) { return ret; } + + /* Set different levels of verbosity in the cuda datatype related code. */ + ret = mca_base_var_register ("opal", "opal", NULL, "datatype_cuda_verbose", + "Set level of opal datatype cuda verbosity", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, + &opal_datatype_cuda_verbose); + if (0 > ret) { + return ret; + } + + /* Set cuda kernel datatype engine buffer size. */ + ret = mca_base_var_register ("opal", "opal", NULL, "opal_datatype_cuda_buffer_size", + "Set cuda datatype engine buffer size", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, + &opal_datatype_cuda_buffer_size); + if (0 > ret) { + return ret; + } + + /* Set cuda kernel datatype engine enable or not. 
*/ + ret = mca_base_var_register ("opal", "opal", NULL, "opal_datatype_cuda_kernel_support_enabled", + "Set cuda kernel datatype engine enable or not", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, + &opal_datatype_cuda_kernel_support_enabled); + if (0 > ret) { + return ret; + } #endif #endif /* OPAL_ENABLE_DEBUG */ @@ -248,6 +284,10 @@ int32_t opal_datatype_finalize( void ) /* clear all master convertors */ opal_convertor_destroy_masters(); +#if OPAL_CUDA_SUPPORT + opal_cuda_kernel_support_fini(); +#endif /* OPAL_CUDA_SUPPORT */ + return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index 5b66e4df595..be27af568c6 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -303,5 +303,6 @@ int32_t opal_datatype_commit( opal_datatype_t * pData ) pLast->first_elem_disp = first_elem_disp; pLast->size = pData->size; } + return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 08ae1ecf7ac..fba9356068a 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -22,6 +22,7 @@ #include "opal_config.h" #include +#include #include "opal/datatype/opal_convertor_internal.h" #include "opal/datatype/opal_datatype_internal.h" @@ -37,17 +38,22 @@ #include "opal/datatype/opal_datatype_checksum.h" #include "opal/datatype/opal_datatype_pack.h" #include "opal/datatype/opal_datatype_prototypes.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ #if defined(CHECKSUM) #define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig_checksum #define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps_checksum #define opal_generic_simple_pack_function opal_generic_simple_pack_checksum #define opal_pack_general_function opal_pack_general_checksum +#define opal_generic_simple_pack_cuda_function opal_generic_simple_pack_cuda_checksum #else #define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig #define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps #define opal_generic_simple_pack_function opal_generic_simple_pack #define opal_pack_general_function opal_pack_general +#define opal_generic_simple_pack_cuda_function opal_generic_simple_pack_cuda #endif /* defined(CHECKSUM) */ @@ -585,3 +591,11 @@ opal_pack_general_function( opal_convertor_t* pConvertor, pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } + +int32_t +opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ) +{ + return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); +} diff --git a/opal/datatype/opal_datatype_prototypes.h b/opal/datatype/opal_datatype_prototypes.h index 668397112b8..cd264775362 100644 --- a/opal/datatype/opal_datatype_prototypes.h +++ b/opal/datatype/opal_datatype_prototypes.h @@ -68,6 +68,14 @@ opal_generic_simple_pack_checksum( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t +opal_generic_simple_pack_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); +int32_t +opal_generic_simple_pack_cuda_checksum( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data 
); +int32_t opal_unpack_homogeneous_contig( opal_convertor_t* pConv, struct iovec* iov, uint32_t* out_size, size_t* max_data ); @@ -83,6 +91,14 @@ int32_t opal_generic_simple_unpack_checksum( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +int32_t +opal_generic_simple_unpack_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); +int32_t +opal_generic_simple_unpack_cuda_checksum( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); END_C_DECLS diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index 195bca48f1e..c59682511a1 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -39,15 +39,20 @@ #include "opal/datatype/opal_datatype_checksum.h" #include "opal/datatype/opal_datatype_unpack.h" #include "opal/datatype/opal_datatype_prototypes.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ #if defined(CHECKSUM) #define opal_unpack_general_function opal_unpack_general_checksum #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_checksum #define opal_generic_simple_unpack_function opal_generic_simple_unpack_checksum +#define opal_generic_simple_unpack_cuda_function opal_generic_simple_unpack_cuda_checksum #else #define opal_unpack_general_function opal_unpack_general #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig #define opal_generic_simple_unpack_function opal_generic_simple_unpack +#define opal_generic_simple_unpack_cuda_function opal_generic_simple_unpack_cuda #endif /* defined(CHECKSUM) */ @@ -581,3 +586,11 @@ opal_unpack_general_function( opal_convertor_t* pConvertor, pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } + +int32_t +opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ) +{ + return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); +} diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 48564b573ed..65b8c90f4d1 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -189,6 +189,9 @@ typedef uint8_t mca_btl_base_tag_t; #define MCA_BTL_TAG_IB (MCA_BTL_TAG_BTL + 0) #define MCA_BTL_TAG_UDAPL (MCA_BTL_TAG_BTL + 1) #define MCA_BTL_TAG_SMCUDA (MCA_BTL_TAG_BTL + 2) +#define MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK (MCA_BTL_TAG_BTL + 3) +#define MCA_BTL_TAG_SMCUDA_DATATYPE_PACK (MCA_BTL_TAG_BTL + 4) +#define MCA_BTL_TAG_SMCUDA_DATATYPE_PUT (MCA_BTL_TAG_BTL + 5) /* prefered protocol */ #define MCA_BTL_FLAGS_SEND 0x0001 @@ -871,6 +874,20 @@ typedef struct mca_btl_base_registration_handle_t *(*mca_btl_base_module_registe typedef int (*mca_btl_base_module_deregister_mem_fn_t)( struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle); + +/** + * @brief register a convertor + * + * @param btl (IN) BTL module region was registered with + * @param handle (IN) BTL registration handle to register + * @param convertor (IN) convertor needs to be registered + * + * This function register the necessary convertor information. 
No need to + * deregister since handle will be deregistered by mem deregisteration + */ +typedef int (*mca_btl_base_module_register_convertor_fn_t)( + struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle, struct opal_convertor_t *convertor); + /** * Initiate an asynchronous send. * Completion Semantics: the descriptor has been queued for a send operation @@ -1217,6 +1234,9 @@ struct mca_btl_base_module_t { /* new memory registration functions */ mca_btl_base_module_register_mem_fn_t btl_register_mem; /**< memory registration function (NULL if not needed) */ mca_btl_base_module_deregister_mem_fn_t btl_deregister_mem; /**< memory deregistration function (NULL if not needed) */ + + /* convertor registration functions */ + mca_btl_base_module_register_convertor_fn_t btl_register_convertor; /**< convertor registration function (NULL if not needed) */ /** the mpool associated with this btl (optional) */ mca_mpool_base_module_t* btl_mpool; @@ -1230,6 +1250,9 @@ struct mca_btl_base_module_t { #endif /* OPAL_CUDA_GDR_SUPPORT */ #if OPAL_CUDA_SUPPORT size_t btl_cuda_max_send_size; /**< set if CUDA max send_size is different from host max send size */ + int32_t btl_cuda_ddt_allow_rdma; + size_t btl_cuda_ddt_pipeline_size; + int32_t btl_cuda_ddt_pipeline_depth; #endif /* OPAL_CUDA_SUPPORT */ }; typedef struct mca_btl_base_module_t mca_btl_base_module_t; diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index 0f021ce3041..7118948aad0 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c @@ -1589,6 +1589,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( uint32_t iov_count = 1; size_t max_data = *size; void *ptr; + void *cuda_stream = NULL; assert(MCA_BTL_NO_ORDER == order); @@ -1606,14 +1607,30 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( iov.iov_len = max_data; iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve ); - (void) opal_convertor_pack(convertor, &iov, &iov_count, &max_data); + if (opal_datatype_cuda_kernel_support && (convertor->flags & CONVERTOR_CUDA_ASYNC)) { + convertor->flags &= ~CONVERTOR_CUDA; + if (opal_convertor_need_buffers(convertor) == true) { + convertor->stream = mca_common_cuda_get_dtoh_stream(); + } + convertor->flags |= CONVERTOR_CUDA; + } + opal_convertor_pack(convertor, &iov, &iov_count, &max_data); + if (opal_datatype_cuda_kernel_support && (convertor->flags & CONVERTOR_CUDA_ASYNC)) { + convertor->flags &= ~CONVERTOR_CUDA; + if (opal_convertor_need_buffers(convertor) == true && convertor->pipeline_depth != 0) { + convertor->pipeline_seq ++; + convertor->pipeline_seq = convertor->pipeline_seq % convertor->pipeline_depth; + } + convertor->flags |= CONVERTOR_CUDA; + } #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */ /* If the convertor is copying the data asynchronously, then record an event * that will trigger the callback when it completes. Mark descriptor as async. * No need for this in the case we are not sending any GPU data. 
*/ if ((convertor->flags & CONVERTOR_CUDA_ASYNC) && (0 != max_data)) { - mca_common_cuda_record_dtoh_event("btl_openib", (mca_btl_base_descriptor_t *)frag); + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Record d2h cuda event\n")); + mca_common_cuda_record_dtoh_event("btl_openib", (mca_btl_base_descriptor_t *)frag, convertor, cuda_stream); to_base_frag(frag)->base.des_flags = flags | MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC; } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index cb741816ceb..404d895feca 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -69,7 +69,10 @@ #include "opal/mca/mpool/base/base.h" #include "opal/mca/rcache/rcache.h" #include "opal/mca/rcache/base/base.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" #include "opal/mca/common/cuda/common_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ #include "opal/mca/common/verbs/common_verbs.h" #include "opal/runtime/opal_params.h" #include "opal/runtime/opal.h" @@ -3780,7 +3783,15 @@ static int btl_openib_component_progress(void) { int local_count = 0; mca_btl_base_descriptor_t *frag; - while (local_count < 10 && (1 == progress_one_cuda_dtoh_event(&frag))) { + opal_convertor_t *convertor = NULL; + while (local_count < 10 && (1 == progress_one_cuda_dtoh_event(&frag, &convertor))) { + if (convertor != NULL) { + if ((convertor->flags & CONVERTOR_COMPLETED) && (convertor->gpu_buffer_ptr != NULL)) { + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Free GPU pack buffer %p in openib dtoh\n", convertor->gpu_buffer_ptr)); + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); + convertor->gpu_buffer_ptr = NULL; + } + } OPAL_OUTPUT((-1, "btl_openib: event completed on frag=%p", (void *)frag)); frag->des_cbfunc(NULL, NULL, frag, OPAL_SUCCESS); local_count++; diff --git a/opal/mca/btl/openib/btl_openib_mca.c b/opal/mca/btl/openib/btl_openib_mca.c index 700ccb27634..3450674b5cf 100644 --- a/opal/mca/btl/openib/btl_openib_mca.c +++ b/opal/mca/btl/openib/btl_openib_mca.c @@ -651,6 +651,7 @@ int btl_openib_register_mca_params(void) mca_btl_openib_module.super.btl_cuda_rdma_limit = 0; /* Unused */ } #endif /* OPAL_CUDA_GDR_SUPPORT */ + mca_btl_openib_module.super.btl_cuda_ddt_allow_rdma = 0; #endif /* OPAL_CUDA_SUPPORT */ CHECK(mca_btl_base_param_register( &mca_btl_openib_component.super.btl_version, diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 6208ea5399d..d58dfe58057 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -57,6 +57,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" +#include "opal/datatype/opal_datatype_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ #include "opal/mca/mpool/base/base.h" #include "opal/mca/rcache/base/base.h" @@ -80,6 +81,18 @@ static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle); + +static int mca_btl_smcuda_register_convertor (struct mca_btl_base_module_t* btl, + struct mca_btl_base_registration_handle_t *handle, + struct opal_convertor_t* convertor); + +inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *pack_convertor, + struct 
opal_convertor_t *unpack_convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, int remote_device, int local_device); #endif mca_btl_smcuda_t mca_btl_smcuda = { @@ -94,6 +107,7 @@ mca_btl_smcuda_t mca_btl_smcuda = { #if OPAL_CUDA_SUPPORT .btl_register_mem = mca_btl_smcuda_register_mem, .btl_deregister_mem = mca_btl_smcuda_deregister_mem, + .btl_register_convertor = mca_btl_smcuda_register_convertor, #endif /* OPAL_CUDA_SUPPORT */ .btl_send = mca_btl_smcuda_send, .btl_sendi = mca_btl_smcuda_sendi, @@ -482,6 +496,14 @@ create_sm_endpoint(int local_proc, struct opal_proc_t *proc) /* Create a remote memory pool on the endpoint. The rgpusm component * does not take any resources. They are filled in internally. */ ep->rcache = mca_rcache_base_module_create ("rgpusm", NULL, NULL); + /* alloc array for pack/unpack use */ + ep->smcuda_ddt_clone = NULL; + ep->smcuda_ddt_clone = (cuda_ddt_clone_t *)malloc(sizeof(cuda_ddt_clone_t) * SMCUDA_DT_CLONE_SIZE); + ep->smcuda_ddt_clone_size = SMCUDA_DT_CLONE_SIZE; + ep->smcuda_ddt_clone_avail = SMCUDA_DT_CLONE_SIZE; + for (int i = 0; i < ep->smcuda_ddt_clone_size; i++) { + ep->smcuda_ddt_clone[i].lindex = -1; + } #endif /* OPAL_CUDA_SUPPORT */ return ep; } @@ -686,11 +708,17 @@ int mca_btl_smcuda_del_procs( struct opal_proc_t **procs, struct mca_btl_base_endpoint_t **peers) { + struct mca_btl_base_endpoint_t * ep; for (size_t i = 0 ; i < nprocs ; ++i) { if (peers[i]->rcache) { mca_rcache_base_module_destroy (peers[i]->rcache); peers[i]->rcache = NULL; } + ep = peers[i]; + if (ep->smcuda_ddt_clone != NULL) { + free(ep->smcuda_ddt_clone); + ep->smcuda_ddt_clone = NULL; + } } return OPAL_SUCCESS; @@ -808,6 +836,9 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src( iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)(frag->segment.seg_addr.pval)) + reserve); + if (opal_datatype_cuda_kernel_support) { + convertor->flags &= ~CONVERTOR_CUDA_ASYNC; + } rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); if( OPAL_UNLIKELY(rc < 0) ) { MCA_BTL_SMCUDA_FRAG_RETURN(frag); @@ -1039,6 +1070,33 @@ static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl, return OPAL_SUCCESS; } +static int mca_btl_smcuda_register_convertor (struct mca_btl_base_module_t* btl, + struct mca_btl_base_registration_handle_t *handle, + struct opal_convertor_t *convertor) +{ + mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t *)((intptr_t) handle - offsetof (mca_rcache_common_cuda_reg_t, data)); + + int32_t local_device = 0; + if (convertor->flags & CONVERTOR_CUDA) { + + int rc = mca_common_cuda_get_device(&local_device); + if (rc != 0) { + opal_output(0, "Failed to get the GPU device ID, rc= %d\n", rc); + return rc; + } + convertor->flags &= ~CONVERTOR_CUDA; + if (opal_convertor_need_buffers(convertor) == false) { + cuda_reg->data.pack_unpack_required = 0; + } else { + cuda_reg->data.pack_unpack_required = 1; + } + convertor->flags |= CONVERTOR_CUDA; + cuda_reg->data.gpu_device = local_device; + cuda_reg->data.convertor = convertor; + } + return OPAL_SUCCESS; +} + int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, @@ -1109,7 +1167,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, offset = (size_t) ((intptr_t) remote_address - (intptr_t) reg_ptr->base.base); remote_memory_address = (unsigned char *)reg_ptr->base.alloc_base + offset; if (0 
!= offset) { - opal_output(-1, "OFFSET=%d", (int)offset); + opal_output(-1, "OFFSET %d, ra %p, base %p, remote %p\n", (int)offset, (void*)remote_address, (void*)reg_ptr->base.base, remote_memory_address); } /* The remote side posted an IPC event to make sure we do not start our @@ -1118,18 +1176,101 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, * on the IPC event that we received. Note that we pull it from * rget_reg, not reg_ptr, as we do not cache the event. */ mca_common_wait_stream_synchronize(&rget_reg); - - rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, - "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag, - &done); - if (OPAL_SUCCESS != rc) { - /* Out of resources can be handled by upper layers. */ - if (OPAL_ERR_OUT_OF_RESOURCE != rc) { - opal_output(0, "Failed to cuMemcpy GPU memory, rc=%d", rc); + + /* datatype RDMA */ + opal_convertor_t* unpack_convertor = local_handle->reg_data.convertor; + uint8_t unpack_required = local_handle->reg_data.pack_unpack_required; + + if (unpack_convertor->flags & CONVERTOR_CUDA) { + uint8_t pack_required = remote_handle->reg_data.pack_unpack_required; + int lindex = -1; + int remote_device = remote_handle->reg_data.gpu_device; + opal_convertor_t* pack_convertor = remote_handle->reg_data.convertor; + int local_device = 0; + rc = mca_common_cuda_get_device(&local_device); + if (rc != 0) { + opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); + return rc; + } + if(unpack_required) { + if (remote_device != local_device && !OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { + unpack_convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(mca_btl_smcuda.super.btl_cuda_ddt_pipeline_depth * mca_btl_smcuda_component.cuda_ddt_pipeline_size, 0); + if (NULL == unpack_convertor->gpu_buffer_ptr) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + } else { + unpack_convertor->gpu_buffer_ptr = remote_memory_address; + } + if (pack_required) { + lindex = mca_btl_smcuda_alloc_cuda_ddt_clone(ep); + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); + done = 0; + } else { + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data; + unpack_convertor->stream = opal_cuda_get_cuda_stream_by_id(0); + if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && remote_device != local_device) { + opal_cuda_free_gpu_buffer(unpack_convertor->gpu_buffer_ptr, 0); + if (NULL == unpack_convertor->gpu_buffer_ptr) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + unpack_convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(size, 0); + opal_cuda_d2dcpy_async(unpack_convertor->gpu_buffer_ptr, remote_memory_address, size, unpack_convertor->stream); + iov.iov_base = unpack_convertor->gpu_buffer_ptr; + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, unpack_convertor->gpu_buffer_ptr, size)); + } else { + iov.iov_base = unpack_convertor->gpu_buffer_ptr; + } + iov.iov_len = size; + max_data = size; + opal_convertor_unpack(unpack_convertor, &iov, &iov_count, &max_data ); + opal_cuda_sync_cuda_stream(0); + opal_cuda_free_gpu_buffer(unpack_convertor->gpu_buffer_ptr, 0); + done = 1; + } + } else { + if (pack_required) { + lindex = mca_btl_smcuda_alloc_cuda_ddt_clone(ep); + if (remote_device == local_device || OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { + /* now we are able to let sender pack directly to my memory */ + mca_rcache_common_cuda_reg_t loc_reg; + 
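+                    /* cuda_getmemhandle() fills loc_reg with the CUDA IPC handle of the local receive buffer; that handle is copied into put_msg below so the sender can pack directly into this GPU memory. */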
mca_rcache_common_cuda_reg_t *loc_reg_ptr = &loc_reg; + cuda_ddt_put_hdr_t put_msg; + if (OPAL_SUCCESS != cuda_getmemhandle(local_address, size, (mca_rcache_base_registration_t *)&loc_reg, NULL)) { + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); + } + memcpy(put_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); + put_msg.remote_address = local_address; + put_msg.remote_base = loc_reg.base.base; + put_msg.lindex = lindex; + put_msg.pack_convertor = pack_convertor; + mca_btl_smcuda_cuda_ddt_clone(ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, 0, 0); + mca_btl_smcuda_send_cuda_ddt_sig(btl, ep, &put_msg, sizeof(cuda_ddt_put_hdr_t), MCA_BTL_TAG_SMCUDA_DATATYPE_PUT); + } else { + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); + } + done = 0; + } else { + rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, + "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag, + &done); + if (OPAL_SUCCESS != rc) { + /* Out of resources can be handled by upper layers. */ + if (OPAL_ERR_OUT_OF_RESOURCE != rc) { + opal_output(0, "Failed to cuMemcpy GPU memory, rc=%d", rc); + } + return rc; + } + } } - return rc; } + if (OPAL_UNLIKELY(1 == done)) { cbfunc (btl, ep, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); @@ -1219,6 +1360,105 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b } +int mca_btl_smcuda_send_cuda_ddt_sig(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + void* msg, size_t msglen, + int tag) +{ + mca_btl_smcuda_frag_t* frag; + int rc; + + /* allocate a fragment, giving up if we can't get one */ + MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); + if( OPAL_UNLIKELY(NULL == frag) ) { + opal_output(0, "no frag for send unpack sig\n"); + return OPAL_ERR_OUT_OF_RESOURCE; + } + + /* Fill in fragment fields. 
*/ + frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + memcpy(frag->segment.seg_addr.pval, msg, msglen); + + rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, tag); + return rc; +} + +inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, int remote_device, int local_device) +{ + cuda_ddt_hdr_t send_msg; + mca_btl_smcuda_cuda_ddt_clone(endpoint, pack_convertor, unpack_convertor, remote_gpu_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); + send_msg.lindex = lindex; + send_msg.packed_size = 0; + send_msg.seq = 0; + send_msg.msg_type = CUDA_DDT_PACK_START; + send_msg.pack_convertor = pack_convertor; + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, + "smcuda btl start pack, remote_gpu_address %p, frag %p, lindex %d, remote_device %d, local_device %d\n", + (void*)remote_gpu_address, (void*)frag, lindex, remote_device, local_device)); + mca_btl_smcuda_send_cuda_ddt_sig(btl, endpoint, &send_msg, sizeof(cuda_ddt_hdr_t), MCA_BTL_TAG_SMCUDA_DATATYPE_PACK); + return OPAL_SUCCESS; +} + +int mca_btl_smcuda_alloc_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint) +{ + int i; + if (endpoint->smcuda_ddt_clone_avail > 0) { + for (i = 0; i < endpoint->smcuda_ddt_clone_size; i++) { + if (endpoint->smcuda_ddt_clone[i].lindex == -1) { + endpoint->smcuda_ddt_clone_avail --; + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, + "Alloc cuda ddt clone array success, lindex %d\n",i)); + return i; + } + } + } else { + endpoint->smcuda_ddt_clone = realloc(endpoint->smcuda_ddt_clone, endpoint->smcuda_ddt_clone_size + SMCUDA_DT_CLONE_SIZE); + endpoint->smcuda_ddt_clone_avail = SMCUDA_DT_CLONE_SIZE - 1; + endpoint->smcuda_ddt_clone_size += SMCUDA_DT_CLONE_SIZE; + return endpoint->smcuda_ddt_clone_size - SMCUDA_DT_CLONE_SIZE; + } + return -1; +} + +void mca_btl_smcuda_free_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) +{ + assert(endpoint->smcuda_ddt_clone[lindex].lindex == lindex); + cuda_ddt_smfrag_event_list_t *ddt_cuda_events = &(endpoint->smcuda_ddt_clone[lindex].ddt_cuda_events); + opal_cuda_free_event(ddt_cuda_events->cuda_kernel_event_list, ddt_cuda_events->nb_events); + ddt_cuda_events->cuda_kernel_event_list = NULL; + ddt_cuda_events->loc = -1; + ddt_cuda_events->nb_events = -1; + endpoint->smcuda_ddt_clone[lindex].lindex = -1; + endpoint->smcuda_ddt_clone_avail ++; +} + +void mca_btl_smcuda_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, int remote_device, int local_device) +{ + cuda_ddt_smfrag_event_list_t *ddt_cuda_events = &(endpoint->smcuda_ddt_clone[lindex].ddt_cuda_events); + endpoint->smcuda_ddt_clone[lindex].pack_convertor = pack_convertor; + endpoint->smcuda_ddt_clone[lindex].unpack_convertor = unpack_convertor; + endpoint->smcuda_ddt_clone[lindex].current_unpack_convertor_pBaseBuf = unpack_convertor->pBaseBuf; + endpoint->smcuda_ddt_clone[lindex].remote_gpu_address = remote_gpu_address; + endpoint->smcuda_ddt_clone[lindex].lindex = lindex; + endpoint->smcuda_ddt_clone[lindex].remote_device = remote_device; + 
endpoint->smcuda_ddt_clone[lindex].local_device = local_device; + endpoint->smcuda_ddt_clone[lindex].frag = frag; + ddt_cuda_events->cuda_kernel_event_list = opal_cuda_alloc_event(mca_btl_smcuda.super.btl_cuda_ddt_pipeline_depth, &(ddt_cuda_events->loc)); + ddt_cuda_events->nb_events = mca_btl_smcuda.super.btl_cuda_ddt_pipeline_depth; +} + #endif /* OPAL_CUDA_SUPPORT */ /** diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 807d9081161..3ffcc3ca7e3 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -41,6 +41,8 @@ #include "opal/mca/btl/btl.h" #include "opal/mca/common/sm/common_sm.h" +#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 0 + BEGIN_C_DECLS /* @@ -205,6 +207,7 @@ struct mca_btl_smcuda_component_t { int cuda_ipc_output; int use_cuda_ipc; int use_cuda_ipc_same_gpu; + int cuda_ddt_pipeline_size; #endif /* OPAL_CUDA_SUPPORT */ unsigned long mpool_min_size; char *allocator; @@ -510,6 +513,74 @@ enum ipcState { IPC_BAD }; +/* cuda datatype pack/unpack message */ +typedef struct { + int lindex; + int seq; + int msg_type; + int packed_size; + struct opal_convertor_t *pack_convertor; +} cuda_ddt_hdr_t; + +/* cuda datatype put message */ +typedef struct { + int lindex; + void *remote_address; + void *remote_base; + uint64_t mem_handle[8]; + struct opal_convertor_t *pack_convertor; +} cuda_ddt_put_hdr_t; + +#define CUDA_DDT_UNPACK_FROM_BLOCK 0 +#define CUDA_DDT_COMPLETE 1 +#define CUDA_DDT_COMPLETE_ACK 2 +#define CUDA_DDT_CLEANUP 3 +#define CUDA_DDT_PACK_START 4 +#define CUDA_DDT_PACK_TO_BLOCK 5 +#define CUDA_UNPACK_NO 6 + + +/* event for pack/unpack */ +typedef struct { + int32_t loc; + int32_t nb_events; + void *cuda_kernel_event_list; +} cuda_ddt_smfrag_event_list_t; + +/* package save pack/unpack convertor and cbfunc */ +typedef struct { + struct opal_convertor_t *pack_convertor; + struct opal_convertor_t *unpack_convertor; + unsigned char *current_unpack_convertor_pBaseBuf; + void *remote_gpu_address; + int lindex; + int remote_device; + int local_device; + mca_btl_base_descriptor_t *frag; + cuda_ddt_smfrag_event_list_t ddt_cuda_events; +} cuda_ddt_clone_t; + +typedef struct { + mca_btl_base_module_t* btl; + struct mca_btl_base_endpoint_t *endpoint; + cuda_ddt_hdr_t sig_msg; +} btl_smcuda_ddt_callback_t; + +#define SMCUDA_DT_CLONE_SIZE 20 + +int mca_btl_smcuda_send_cuda_ddt_sig(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + void* msg, size_t msglen, + int tag); +int mca_btl_smcuda_alloc_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint); +void mca_btl_smcuda_free_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); +void mca_btl_smcuda_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, int remote_device, int local_device); + #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 8aedf9f1d7a..394d5971df6 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -54,6 +54,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" +#include "opal/datatype/opal_datatype_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ #if OPAL_ENABLE_FT_CR == 1 #include "opal/runtime/opal_cr.h" @@ -148,7 +149,7 @@ static int smcuda_register(void) OPAL_INFO_LVL_9, 
MCA_BASE_VAR_SCOPE_READONLY, &mca_btl_smcuda_component.mpool_min_size); - mca_btl_smcuda_param_register_int("free_list_num", 8, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_num); + mca_btl_smcuda_param_register_int("free_list_num", 16, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_num); mca_btl_smcuda_param_register_int("free_list_max", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_max); mca_btl_smcuda_param_register_int("free_list_inc", 64, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_inc); mca_btl_smcuda_param_register_int("max_procs", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_max_procs); @@ -179,11 +180,15 @@ static int smcuda_register(void) mca_btl_smcuda_param_register_int("use_cuda_ipc", 1, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.use_cuda_ipc); mca_btl_smcuda_param_register_int("use_cuda_ipc_same_gpu", 1, OPAL_INFO_LVL_4,&mca_btl_smcuda_component.use_cuda_ipc_same_gpu); mca_btl_smcuda_param_register_int("cuda_ipc_verbose", 0, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ipc_verbose); + mca_btl_smcuda_param_register_int("cuda_ddt_pipeline_size", 1024*1024*4, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ddt_pipeline_size); mca_btl_smcuda_component.cuda_ipc_output = opal_output_open(NULL); opal_output_set_verbosity(mca_btl_smcuda_component.cuda_ipc_output, mca_btl_smcuda_component.cuda_ipc_verbose); #else /* OPAL_CUDA_SUPPORT */ mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW; #endif /* OPAL_CUDA_SUPPORT */ + mca_btl_smcuda.super.btl_cuda_ddt_pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; + mca_btl_smcuda.super.btl_cuda_ddt_pipeline_depth = 4; + mca_btl_smcuda.super.btl_cuda_ddt_allow_rdma = 1; mca_btl_smcuda.super.btl_eager_limit = 4*1024; mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024; mca_btl_smcuda.super.btl_max_send_size = 32*1024; @@ -194,6 +199,7 @@ static int smcuda_register(void) mca_btl_smcuda.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); mca_btl_smcuda.super.btl_bandwidth = 9000; /* Mbs */ mca_btl_smcuda.super.btl_latency = 1; /* Microsecs */ + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "SMCUDA BTL pipeline size %lu\n", mca_btl_smcuda.super.btl_cuda_ddt_pipeline_size)); /* Call the BTL based to register its MCA params */ mca_btl_base_param_register(&mca_btl_smcuda_component.super.btl_version, @@ -821,6 +827,265 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl, } } +static void btl_smcuda_datatype_pack_event_callback(btl_smcuda_ddt_callback_t *pack_callback_data) +{ + cuda_ddt_hdr_t *send_msg = &(pack_callback_data->sig_msg); + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, + "Pack cuda event call back, seq %d\n", send_msg->seq)); + mca_btl_smcuda_send_cuda_ddt_sig(pack_callback_data->btl, pack_callback_data->endpoint, + send_msg, sizeof(cuda_ddt_hdr_t), MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); +} + +static void btl_smcuda_datatype_unpack_event_callback(btl_smcuda_ddt_callback_t *unpack_callback_data) +{ + cuda_ddt_hdr_t *send_msg = &(unpack_callback_data->sig_msg); + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, + "Unpack cuda event call back, seq %d\n", send_msg->seq)); + mca_btl_smcuda_send_cuda_ddt_sig(unpack_callback_data->btl, unpack_callback_data->endpoint, + send_msg, sizeof(cuda_ddt_hdr_t), MCA_BTL_TAG_SMCUDA_DATATYPE_PACK); +} + +/* for receiver */ +static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, + 
mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, void* cbdata) +{ + struct mca_btl_base_endpoint_t *endpoint = NULL; + cuda_ddt_hdr_t recv_msg; + mca_btl_base_segment_t* segments = des->des_segments; + memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_hdr_t)); + int seq = recv_msg.seq; + int lindex = recv_msg.lindex; + size_t packed_size = recv_msg.packed_size; + int msg_type = recv_msg.msg_type; + mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; + cuda_ddt_clone_t *my_cuda_dt_clone; + btl_smcuda_ddt_callback_t *unpack_callback_data = NULL; + + /* We can find the endoint back from the rank embedded in the header */ + endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; + my_cuda_dt_clone = &endpoint->smcuda_ddt_clone[lindex]; + assert(my_cuda_dt_clone->lindex == lindex); + + cuda_ddt_hdr_t send_msg; + send_msg.lindex = lindex; + send_msg.pack_convertor = my_cuda_dt_clone->pack_convertor; + struct opal_convertor_t *convertor = NULL; + cuda_ddt_smfrag_event_list_t *ddt_cuda_events = NULL; + + if (msg_type == CUDA_DDT_CLEANUP) { + ddt_cuda_events = &(my_cuda_dt_clone->ddt_cuda_events); + opal_cuda_sync_all_events(ddt_cuda_events->cuda_kernel_event_list, ddt_cuda_events->nb_events); + if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { + convertor = my_cuda_dt_clone->unpack_convertor; + if (convertor->gpu_buffer_ptr != NULL) { + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); + convertor->gpu_buffer_ptr = NULL; + } + } + + mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; + mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t) frag_recv->base.des_cbfunc; + cbfunc (btl, endpoint, frag_recv->segment.seg_addr.pval, frag_recv->local_handle, frag_recv->base.des_context, frag_recv->base.des_cbdata, OPAL_SUCCESS); + mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag_recv); + mca_btl_smcuda_free_cuda_ddt_clone(endpoint, lindex); + } else if (msg_type == CUDA_DDT_UNPACK_FROM_BLOCK || msg_type == CUDA_DDT_COMPLETE){ + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data; + + send_msg.seq = seq; + if (msg_type == CUDA_DDT_COMPLETE) { + send_msg.msg_type = CUDA_DDT_COMPLETE_ACK; + } else { + send_msg.msg_type = CUDA_DDT_PACK_TO_BLOCK; + } + /* fill out callback data */ + unpack_callback_data = (btl_smcuda_ddt_callback_t *)malloc(sizeof(btl_smcuda_ddt_callback_t)); + unpack_callback_data->btl = btl; + unpack_callback_data->endpoint = endpoint; + unpack_callback_data->sig_msg = send_msg; + + convertor = my_cuda_dt_clone->unpack_convertor; + size_t pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; + convertor->flags &= ~CONVERTOR_CUDA; + unsigned char *remote_address = NULL; + unsigned char *local_address = NULL; + if (opal_convertor_need_buffers(convertor) == false) { /* do not unpack */ + convertor->flags |= CONVERTOR_CUDA; + local_address = my_cuda_dt_clone->current_unpack_convertor_pBaseBuf; + remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "No unpack is needed, start D2D copy local %p, remote %p, size %ld, stream id %d, seq %d\n", local_address, remote_address, packed_size, seq, seq)); + convertor->stream = opal_cuda_get_cuda_stream_by_id(seq); + opal_cuda_d2dcpy_async(local_address, remote_address, packed_size, convertor->stream); + 
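Both ends of this exchange locate a pipeline fragment from its sequence number alone: block seq sits at offset seq * pipeline_size in the staging buffer, and only the final block may carry fewer than pipeline_size bytes (the packed_size field of cuda_ddt_hdr_t). Below is a minimal, self-contained sketch of that addressing in plain C; the 4 MB constant is only a stand-in for the btl_smcuda_cuda_ddt_pipeline_size default, and no OMPI or CUDA calls are made.

    #include <stdio.h>
    #include <stddef.h>

    /* Hypothetical constant mirroring the 4 MB default of cuda_ddt_pipeline_size. */
    #define PIPELINE_SIZE ((size_t)4 * 1024 * 1024)

    int main(void)
    {
        size_t total = (size_t)10 * 1024 * 1024;   /* total packed bytes to move */
        size_t moved = 0;
        int seq = 0;

        while (moved < total) {
            size_t chunk = total - moved;
            if (chunk > PIPELINE_SIZE) {
                chunk = PIPELINE_SIZE;             /* only the final block is short */
            }
            /* sender and receiver both derive the block's position from seq */
            size_t offset = (size_t)seq * PIPELINE_SIZE;
            printf("seq %d: offset %zu, size %zu\n", seq, offset, chunk);
            moved += chunk;
            seq++;
        }
        return 0;
    }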
my_cuda_dt_clone->current_unpack_convertor_pBaseBuf += packed_size; + mca_common_cuda_record_unpack_event(NULL, (void*)unpack_callback_data, convertor->stream); + } else { /* unpack */ + convertor->flags |= CONVERTOR_CUDA; + max_data = packed_size; + iov.iov_len = packed_size; + + convertor->stream = opal_cuda_get_cuda_stream_by_id(seq); + if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { + local_address = convertor->gpu_buffer_ptr + seq * pipeline_size; + remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; + opal_cuda_d2dcpy_async(local_address, remote_address, packed_size, convertor->stream); + /* if a cudamemcpy is required, cuda event record after memcpy */ + mca_common_cuda_record_unpack_event(NULL, (void*)unpack_callback_data, convertor->stream); + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, "Unpack is needed, start D2D copy src %p, dst %p, size %lu, stream id %d, seq %d\n", remote_address, convertor->gpu_buffer_ptr, packed_size, seq, seq)); + iov.iov_base = local_address; + opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + ddt_cuda_events = &(my_cuda_dt_clone->ddt_cuda_events); + opal_cuda_event_record(ddt_cuda_events->cuda_kernel_event_list, seq, convertor->stream); + } else { + local_address = convertor->gpu_buffer_ptr + seq * pipeline_size; + iov.iov_base = local_address; + opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + /* cudamemcpy is not required, so cuda event record after unpack */ + mca_common_cuda_record_unpack_event(NULL, (void*)unpack_callback_data, convertor->stream); + } + } + } +} + +/* for sender */ +static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, void* cbdata) +{ + struct mca_btl_base_endpoint_t *endpoint = NULL; + cuda_ddt_hdr_t recv_msg; + mca_btl_base_segment_t* segments = des->des_segments; + memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_hdr_t)); + int seq = recv_msg.seq; + int lindex = recv_msg.lindex; + int msg_type = recv_msg.msg_type; + struct opal_convertor_t *convertor = recv_msg.pack_convertor; + mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; + cuda_ddt_hdr_t send_msg; + + btl_smcuda_ddt_callback_t *pack_callback_data = NULL; + + /* We can find the endoint back from the rank embedded in the header */ + endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; + + + uint32_t iov_count = 1; + int rv_dt = 0; + size_t max_data = 0; + size_t packed_size = 0; + + send_msg.lindex = lindex; + if (msg_type == CUDA_DDT_COMPLETE_ACK) { + send_msg.packed_size = 0; + send_msg.seq = -2; + send_msg.msg_type = CUDA_DDT_CLEANUP; + mca_btl_smcuda_send_cuda_ddt_sig(btl, endpoint, &send_msg, sizeof(cuda_ddt_hdr_t), MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); + if (convertor->gpu_buffer_ptr != NULL) { + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); + convertor->gpu_buffer_ptr = NULL; + } + } else if (msg_type == CUDA_DDT_PACK_TO_BLOCK) { + if (convertor->bConverted < convertor->local_size) { + struct iovec iov; + iov.iov_base = convertor->gpu_buffer_ptr + seq * mca_btl_smcuda_component.cuda_ddt_pipeline_size; + iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; + convertor->stream = opal_cuda_get_cuda_stream_by_id(seq); + rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + packed_size = max_data; + send_msg.packed_size = packed_size; + send_msg.seq = seq; + if 
(rv_dt == 1) { + send_msg.msg_type = CUDA_DDT_COMPLETE; + } else { + send_msg.msg_type = CUDA_DDT_UNPACK_FROM_BLOCK; + } + pack_callback_data = (btl_smcuda_ddt_callback_t *)malloc(sizeof(btl_smcuda_ddt_callback_t)); + pack_callback_data->btl = btl; + pack_callback_data->endpoint = endpoint; + pack_callback_data->sig_msg = send_msg; + mca_common_cuda_record_pack_event(NULL, (void*)pack_callback_data, convertor->stream); + } + } else if (msg_type == CUDA_DDT_PACK_START) { + struct iovec iov; + iov.iov_base = convertor->gpu_buffer_ptr; + iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; + seq = 0; + while (rv_dt != 1 && convertor->gpu_buffer_size > 0) { + convertor->stream = opal_cuda_get_cuda_stream_by_id(seq); + rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + iov.iov_base = (void*)((unsigned char*)iov.iov_base + mca_btl_smcuda_component.cuda_ddt_pipeline_size); + convertor->gpu_buffer_size -= mca_btl_smcuda_component.cuda_ddt_pipeline_size; + send_msg.packed_size = max_data; + send_msg.seq = seq; + if (rv_dt == 1) { + send_msg.msg_type = CUDA_DDT_COMPLETE; + } else { + send_msg.msg_type = CUDA_DDT_UNPACK_FROM_BLOCK; + } + pack_callback_data = (btl_smcuda_ddt_callback_t *)malloc(sizeof(btl_smcuda_ddt_callback_t)); + pack_callback_data->btl = btl; + pack_callback_data->endpoint = endpoint; + pack_callback_data->sig_msg = send_msg; + mca_common_cuda_record_pack_event(NULL, (void*)pack_callback_data, convertor->stream); + seq ++; + } + } else { + opal_output(0, "unknown message\n"); + } +} + +/* for sender */ +static void btl_smcuda_datatype_put(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, void* cbdata) +{ + struct mca_btl_base_endpoint_t *endpoint = NULL; + mca_btl_base_segment_t* segments = des->des_segments; + cuda_ddt_put_hdr_t* recv_msg = (cuda_ddt_put_hdr_t*)segments->seg_addr.pval; + int lindex = recv_msg->lindex; + void *remote_address = recv_msg->remote_address; + void *remote_base = recv_msg->remote_base; + struct opal_convertor_t *convertor = recv_msg->pack_convertor; + mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; + cuda_ddt_hdr_t send_msg; + + /* We can find the endoint back from the rank embedded in the header */ + endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; + + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); + mca_rcache_common_cuda_reg_t *rget_reg_ptr = NULL; + mca_rcache_common_cuda_reg_t rget_reg; + rget_reg_ptr= &rget_reg; + memset(&rget_reg, 0, sizeof(rget_reg)); + memcpy(rget_reg.data.memHandle, recv_msg->mem_handle, sizeof(recv_msg->mem_handle)); + cuda_openmemhandle(NULL, 0, (mca_rcache_base_registration_t *)&rget_reg, NULL); + size_t offset = (size_t) ((intptr_t)remote_address - (intptr_t)remote_base); + unsigned char *remote_memory_address = (unsigned char *)rget_reg_ptr->base.alloc_base + offset; + convertor->gpu_buffer_ptr = remote_memory_address; + OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output, + "smcuda start put, remote_memory_address %p, r_addr %p, r_base %p\n", + remote_memory_address, remote_address, remote_base)); + convertor->gpu_buffer_size = convertor->local_size; + + struct iovec iov; + uint32_t iov_count = 1; + int rv_dt = 0; + size_t max_data = 0; + iov.iov_len = convertor->local_size; + iov.iov_base = convertor->gpu_buffer_ptr; + convertor->stream = opal_cuda_get_cuda_stream_by_id(0); + rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + opal_cuda_sync_cuda_stream(0); + 
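Taken together, the pack and unpack callbacks implement a small request/acknowledge protocol driven by the msg_type field. The simulation below is one plausible reading of the tag sequence for a transfer that packs into three pipeline blocks; it assumes the receiving side kicks things off with CUDA_DDT_PACK_START, uses printf in place of mca_btl_smcuda_send_cuda_ddt_sig(), and leaves out the CUDA events, the pipeline depth limit, and the put path.

    #include <stdio.h>

    /* Values mirror the CUDA_DDT_* defines in btl_smcuda.h. */
    enum { UNPACK_FROM_BLOCK, COMPLETE, COMPLETE_ACK, CLEANUP, PACK_START, PACK_TO_BLOCK };

    static const char *tag_name(int t)
    {
        static const char *names[] = { "UNPACK_FROM_BLOCK", "COMPLETE", "COMPLETE_ACK",
                                       "CLEANUP", "PACK_START", "PACK_TO_BLOCK" };
        return names[t];
    }

    int main(void)
    {
        int blocks = 3;   /* pretend the datatype packs into three pipeline blocks */

        printf("receiver -> sender: %s\n", tag_name(PACK_START));
        for (int seq = 0; seq < blocks; seq++) {
            int last = (seq == blocks - 1);
            /* sender packed block seq; tells the receiver which block to unpack */
            printf("sender -> receiver: %s (seq=%d)\n",
                   tag_name(last ? COMPLETE : UNPACK_FROM_BLOCK), seq);
            /* receiver answers once its unpack of that block has completed */
            printf("receiver -> sender: %s (seq=%d)\n",
                   tag_name(last ? COMPLETE_ACK : PACK_TO_BLOCK), seq);
        }
        printf("sender -> receiver: %s\n", tag_name(CLEANUP));
        return 0;
    }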
assert(rv_dt == 1); + send_msg.lindex = lindex; + send_msg.packed_size = 0; + send_msg.seq = -2; + send_msg.msg_type = CUDA_DDT_CLEANUP; + mca_btl_smcuda_send_cuda_ddt_sig(btl, endpoint, &send_msg, sizeof(cuda_ddt_hdr_t), + MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); +} + #endif /* OPAL_CUDA_SUPPORT */ /* @@ -935,6 +1200,13 @@ mca_btl_smcuda_component_init(int *num_btls, /* Register a smcuda control function to help setup IPC support */ mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbfunc = btl_smcuda_control; mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbdata = NULL; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK].cbfunc = btl_smcuda_datatype_unpack; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK].cbdata = NULL; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbfunc = btl_smcuda_datatype_pack; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbdata = NULL; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PUT].cbfunc = btl_smcuda_datatype_put; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PUT].cbdata = NULL; + #endif /* OPAL_CUDA_SUPPORT */ return btls; @@ -1019,6 +1291,25 @@ int mca_btl_smcuda_component_progress(void) } } +#if OPAL_CUDA_SUPPORT + /* Check to see if there are any outstanding CUDA pack events that have + * completed. */ + btl_smcuda_ddt_callback_t *pack_callback_frag, *unpack_callback_frag; + while (1 == progress_one_cuda_pack_event((void **)&pack_callback_frag)) { + if (pack_callback_frag != NULL) { + btl_smcuda_datatype_pack_event_callback(pack_callback_frag); + free (pack_callback_frag); + } + } + + while (1 == progress_one_cuda_unpack_event((void **)&unpack_callback_frag)) { + if (unpack_callback_frag != NULL) { + btl_smcuda_datatype_unpack_event_callback(unpack_callback_frag); + free (unpack_callback_frag); + } + } +#endif /* OPAL_CUDA_SUPPORT */ + /* poll each fifo */ for(j = 0; j < FIFO_MAP_NUM(mca_btl_smcuda_component.num_smp_procs); j++) { fifo = &(mca_btl_smcuda_component.fifo[my_smp_rank][j]); diff --git a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h index 1dfb359e17f..8fbb901ac0e 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h +++ b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h @@ -52,6 +52,9 @@ struct mca_btl_base_endpoint_t { opal_proc_t *proc_opal; /**< Needed for adding CUDA IPC support dynamically */ enum ipcState ipcstate; /**< CUDA IPC connection status */ int ipctries; /**< Number of times CUDA IPC connect was sent */ + cuda_ddt_clone_t *smcuda_ddt_clone; + int smcuda_ddt_clone_size; + int smcuda_ddt_clone_avail; #endif /* OPAL_CUDA_SUPPORT */ }; diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 94886739fb7..4d7f4fa8525 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -111,12 +111,13 @@ struct cudaFunctionTable { typedef struct cudaFunctionTable cudaFunctionTable_t; static cudaFunctionTable_t cuFunc; + static int stage_one_init_ref_count = 0; static bool stage_three_init_complete = false; static bool common_cuda_initialized = false; static bool common_cuda_mca_parames_registered = false; static int mca_common_cuda_verbose; -static int mca_common_cuda_output = 0; +int mca_common_cuda_output = 0; bool mca_common_cuda_enabled = false; static bool mca_common_cuda_register_memory = true; static bool mca_common_cuda_warning = false; @@ -167,26 +168,43 @@ CUevent *cuda_event_ipc_array = 
NULL; CUevent *cuda_event_dtoh_array = NULL; CUevent *cuda_event_htod_array = NULL; +/* Array of CUDA events used for async packing/unpacking */ +CUevent *cuda_event_pack_array = NULL; +CUevent *cuda_event_unpack_array = NULL; + /* Array of fragments currently being moved by cuda async non-blocking * operations */ struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array = NULL; struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array = NULL; struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array = NULL; +/* Array of event callback used by cuda async pack/unpack */ +void **cuda_event_pack_callback_frag_array = NULL; +void **cuda_event_unpack_callback_frag_array = NULL; + +/* Array of convertors currently being used by cuda async non-blocking + * operations */ +opal_convertor_t **cuda_event_dtoh_convertor_array = NULL; + /* First free/available location in cuda_event_status_array */ static int cuda_event_ipc_first_avail, cuda_event_dtoh_first_avail, cuda_event_htod_first_avail; +static int cuda_event_pack_first_avail, cuda_event_unpack_first_avail; /* First currently-being used location in the cuda_event_status_array */ static int cuda_event_ipc_first_used, cuda_event_dtoh_first_used, cuda_event_htod_first_used; +static int cuda_event_pack_first_used, cuda_event_unpack_first_used; /* Number of status items currently in use */ static int cuda_event_ipc_num_used, cuda_event_dtoh_num_used, cuda_event_htod_num_used; +static int cuda_event_pack_num_used, cuda_event_unpack_num_used; /* Size of array holding events */ int cuda_event_max = 400; static int cuda_event_ipc_most = 0; static int cuda_event_dtoh_most = 0; static int cuda_event_htod_most = 0; +static int cuda_event_pack_most = 0; +static int cuda_event_unpack_most = 0; /* Handle to libcuda.so */ opal_dl_handle_t *libcuda_handle = NULL; @@ -622,6 +640,76 @@ static int mca_common_cuda_stage_three_init(void) rc = OPAL_ERROR; goto cleanup_and_error; } + + /* Set up an array to store outstanding async packing events */ + cuda_event_pack_num_used = 0; + cuda_event_pack_first_avail = 0; + cuda_event_pack_first_used = 0; + + cuda_event_pack_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *)); + if (NULL == cuda_event_pack_array) { + opal_show_help("help-mpi-common-cuda.txt", "No memory", + true, OPAL_PROC_MY_HOSTNAME); + rc = OPAL_ERROR; + goto cleanup_and_error; + } + + /* Create the events since they can be reused. */ + for (i = 0; i < cuda_event_max; i++) { + res = cuFunc.cuEventCreate(&cuda_event_pack_array[i], CU_EVENT_DISABLE_TIMING); + if (CUDA_SUCCESS != res) { + opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", + true, OPAL_PROC_MY_HOSTNAME, res); + rc = OPAL_ERROR; + goto cleanup_and_error; + } + } + + /* The first available status index is 0. Make an empty frag + array. 
*/ + cuda_event_pack_callback_frag_array = (void **) + malloc(sizeof(void *) * cuda_event_max); + if (NULL == cuda_event_pack_callback_frag_array) { + opal_show_help("help-mpi-common-cuda.txt", "No memory", + true, OPAL_PROC_MY_HOSTNAME); + rc = OPAL_ERROR; + goto cleanup_and_error; + } + + /* Set up an array to store outstanding async unpacking events */ + cuda_event_unpack_num_used = 0; + cuda_event_unpack_first_avail = 0; + cuda_event_unpack_first_used = 0; + + cuda_event_unpack_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *)); + if (NULL == cuda_event_unpack_array) { + opal_show_help("help-mpi-common-cuda.txt", "No memory", + true, OPAL_PROC_MY_HOSTNAME); + rc = OPAL_ERROR; + goto cleanup_and_error; + } + + /* Create the events since they can be reused. */ + for (i = 0; i < cuda_event_max; i++) { + res = cuFunc.cuEventCreate(&cuda_event_unpack_array[i], CU_EVENT_DISABLE_TIMING); + if (CUDA_SUCCESS != res) { + opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", + true, OPAL_PROC_MY_HOSTNAME, res); + rc = OPAL_ERROR; + goto cleanup_and_error; + } + } + + /* The first available status index is 0. Make an empty frag + array. */ + cuda_event_unpack_callback_frag_array = (void **) + malloc(sizeof(void *) * cuda_event_max); + if (NULL == cuda_event_unpack_callback_frag_array) { + opal_show_help("help-mpi-common-cuda.txt", "No memory", + true, OPAL_PROC_MY_HOSTNAME); + rc = OPAL_ERROR; + goto cleanup_and_error; + } } if (true == mca_common_cuda_enabled) { @@ -660,6 +748,15 @@ static int mca_common_cuda_stage_three_init(void) rc = OPAL_ERROR; goto cleanup_and_error; } + + cuda_event_dtoh_convertor_array = (opal_convertor_t **) + malloc(sizeof(opal_convertor_t *) * cuda_event_max); + if (NULL == cuda_event_dtoh_convertor_array) { + opal_show_help("help-mpi-common-cuda.txt", "No memory", + true, OPAL_PROC_MY_HOSTNAME); + rc = OPAL_ERROR; + goto cleanup_and_error; + } /* Set up an array to store outstanding async htod events. Used on the * receiving side for asynchronous copies. */ @@ -868,6 +965,28 @@ void mca_common_cuda_fini(void) } free(cuda_event_dtoh_array); } + + if (NULL != cuda_event_pack_array) { + if (ctx_ok) { + for (i = 0; i < cuda_event_max; i++) { + if (NULL != cuda_event_pack_array[i]) { + cuFunc.cuEventDestroy(cuda_event_pack_array[i]); + } + } + } + free(cuda_event_pack_array); + } + + if (NULL != cuda_event_unpack_array) { + if (ctx_ok) { + for (i = 0; i < cuda_event_max; i++) { + if (NULL != cuda_event_unpack_array[i]) { + cuFunc.cuEventDestroy(cuda_event_unpack_array[i]); + } + } + } + free(cuda_event_unpack_array); + } if (NULL != cuda_event_ipc_frag_array) { free(cuda_event_ipc_frag_array); @@ -878,6 +997,15 @@ void mca_common_cuda_fini(void) if (NULL != cuda_event_dtoh_frag_array) { free(cuda_event_dtoh_frag_array); } + if (NULL != cuda_event_dtoh_convertor_array) { + free(cuda_event_dtoh_convertor_array); + } + if (NULL != cuda_event_pack_callback_frag_array) { + free(cuda_event_pack_callback_frag_array); + } + if (NULL != cuda_event_unpack_callback_frag_array) { + free(cuda_event_unpack_callback_frag_array); + } if ((NULL != ipcStream) && ctx_ok) { cuFunc.cuStreamDestroy(ipcStream); } @@ -1390,7 +1518,9 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg, * Record an event and save the frag. This is called by the sending side and * is used to queue an event when a htod copy has been initiated. 
*/ -int mca_common_cuda_record_dtoh_event(char *msg, struct mca_btl_base_descriptor_t *frag) +int mca_common_cuda_record_dtoh_event(char *msg, struct mca_btl_base_descriptor_t *frag, + opal_convertor_t *convertor, + void *cuda_stream) { CUresult result; @@ -1413,7 +1543,11 @@ int mca_common_cuda_record_dtoh_event(char *msg, struct mca_btl_base_descriptor_ } } - result = cuFunc.cuEventRecord(cuda_event_dtoh_array[cuda_event_dtoh_first_avail], dtohStream); + if (cuda_stream == NULL) { + result = cuFunc.cuEventRecord(cuda_event_dtoh_array[cuda_event_dtoh_first_avail], dtohStream); + } else { + result = cuFunc.cuEventRecord(cuda_event_dtoh_array[cuda_event_dtoh_first_avail], (CUstream)cuda_stream); + } if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -1421,6 +1555,7 @@ int mca_common_cuda_record_dtoh_event(char *msg, struct mca_btl_base_descriptor_ return OPAL_ERROR; } cuda_event_dtoh_frag_array[cuda_event_dtoh_first_avail] = frag; + cuda_event_dtoh_convertor_array[cuda_event_dtoh_first_avail] = convertor; /* Bump up the first available slot and number used by 1 */ cuda_event_dtoh_first_avail++; @@ -1437,7 +1572,7 @@ int mca_common_cuda_record_dtoh_event(char *msg, struct mca_btl_base_descriptor_ * Record an event and save the frag. This is called by the receiving side and * is used to queue an event when a dtoh copy has been initiated. */ -int mca_common_cuda_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *frag) +int mca_common_cuda_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *frag, void *cuda_stream) { CUresult result; @@ -1461,7 +1596,11 @@ int mca_common_cuda_record_htod_event(char *msg, struct mca_btl_base_descriptor_ } } - result = cuFunc.cuEventRecord(cuda_event_htod_array[cuda_event_htod_first_avail], htodStream); + if (cuda_stream == NULL) { + result = cuFunc.cuEventRecord(cuda_event_htod_array[cuda_event_htod_first_avail], htodStream); + } else { + result = cuFunc.cuEventRecord(cuda_event_htod_array[cuda_event_htod_first_avail], (CUstream)cuda_stream); + } if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -1481,11 +1620,103 @@ int mca_common_cuda_record_htod_event(char *msg, struct mca_btl_base_descriptor_ return OPAL_SUCCESS; } +/* + * Record an event and save the frag. This is called by the sending side and + * is used to queue an event when a async pack has been initiated. + */ +int mca_common_cuda_record_pack_event(char *msg, void *callback_frag, void *pack_stream) +{ + CUresult result; + + /* First make sure there is room to store the event. If not, then + * return an error. The error message will tell the user to try and + * run again, but with a larger array for storing events. 
*/
+    if (cuda_event_pack_num_used == cuda_event_max) {
+        opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
+                       true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    if (cuda_event_pack_num_used > cuda_event_pack_most) {
+        cuda_event_pack_most = cuda_event_pack_num_used;
+        /* Just print multiples of 10 */
+        if (0 == (cuda_event_pack_most % 10)) {
+            opal_output_verbose(20, mca_common_cuda_output,
+                                "Maximum pack events used is now %d", cuda_event_pack_most);
+        }
+    }
+
+    result = cuFunc.cuEventRecord(cuda_event_pack_array[cuda_event_pack_first_avail], (CUstream)pack_stream);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
+                       true, OPAL_PROC_MY_HOSTNAME, result);
+        return OPAL_ERROR;
+    }
+    cuda_event_pack_callback_frag_array[cuda_event_pack_first_avail] = callback_frag;
+
+    /* Bump up the first available slot and number used by 1 */
+    cuda_event_pack_first_avail++;
+    if (cuda_event_pack_first_avail >= cuda_event_max) {
+        cuda_event_pack_first_avail = 0;
+    }
+    cuda_event_pack_num_used++;
+
+    return OPAL_SUCCESS;
+}
+
+/*
+ * Record an event and save the frag. This is called by the receiving side and
+ * is used to queue an event when an async unpack has been initiated.
+ */
+int mca_common_cuda_record_unpack_event(char *msg, void *callback_frag, void *unpack_stream)
+{
+    CUresult result;
+
+    /* First make sure there is room to store the event. If not, then
+     * return an error. The error message will tell the user to try and
+     * run again, but with a larger array for storing events. */
+    if (cuda_event_unpack_num_used == cuda_event_max) {
+        opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
+                       true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    if (cuda_event_unpack_num_used > cuda_event_unpack_most) {
+        cuda_event_unpack_most = cuda_event_unpack_num_used;
+        /* Just print multiples of 10 */
+        if (0 == (cuda_event_unpack_most % 10)) {
+            opal_output_verbose(20, mca_common_cuda_output,
+                                "Maximum unpack events used is now %d", cuda_event_unpack_most);
+        }
+    }
+
+    result = cuFunc.cuEventRecord(cuda_event_unpack_array[cuda_event_unpack_first_avail], (CUstream)unpack_stream);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
+                       true, OPAL_PROC_MY_HOSTNAME, result);
+        return OPAL_ERROR;
+    }
+    cuda_event_unpack_callback_frag_array[cuda_event_unpack_first_avail] = callback_frag;
+
+    /* Bump up the first available slot and number used by 1 */
+    cuda_event_unpack_first_avail++;
+    if (cuda_event_unpack_first_avail >= cuda_event_max) {
+        cuda_event_unpack_first_avail = 0;
+    }
+    cuda_event_unpack_num_used++;
+
+    return OPAL_SUCCESS;
+}
+
 /**
  * Used to get the dtoh stream for initiating asynchronous copies.
  */
 void *mca_common_cuda_get_dtoh_stream(void) {
-    return (void *)dtohStream;
+    if (dtohStream == NULL) {
+        return NULL;
+    } else {
+        return (void *)dtohStream;
+    }
 }
 
 /**
@@ -1547,7 +1778,7 @@ int progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t **frag) {
 /**
  * Progress any dtoh event completions.
  */
-int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t **frag) {
+int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t **frag, opal_convertor_t **convertor) {
     CUresult result;
 
     OPAL_THREAD_LOCK(&common_cuda_dtoh_lock);
@@ -1574,6 +1805,7 @@ int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t **frag) {
     }
 
     *frag = cuda_event_dtoh_frag_array[cuda_event_dtoh_first_used];
+    *convertor = cuda_event_dtoh_convertor_array[cuda_event_dtoh_first_used];
     opal_output_verbose(30, mca_common_cuda_output,
                         "CUDA: cuEventQuery returned %d", result);
 
@@ -1638,6 +1870,128 @@ int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag) {
     return 0;
 }
 
+/**
+ * Progress any pack event completions.
+ */
+int progress_one_cuda_pack_event(void **callback_frag) {
+    CUresult result;
+
+    if (cuda_event_pack_num_used > 0) {
+        opal_output_verbose(30, mca_common_cuda_output,
+                            "CUDA: progress_one_cuda_pack_event, outstanding_events=%d",
+                            cuda_event_pack_num_used);
+
+        result = cuFunc.cuEventQuery(cuda_event_pack_array[cuda_event_pack_first_used]);
+
+        /* We found an event that is not ready, so return. */
+        if (CUDA_ERROR_NOT_READY == result) {
+            opal_output_verbose(30, mca_common_cuda_output,
+                                "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY");
+            *callback_frag = NULL;
+            return 0;
+        } else if (CUDA_SUCCESS != result) {
+            opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
+                           true, result);
+            *callback_frag = NULL;
+            return OPAL_ERROR;
+        }
+
+        *callback_frag = cuda_event_pack_callback_frag_array[cuda_event_pack_first_used];
+        opal_output_verbose(30, mca_common_cuda_output,
+                            "CUDA: cuEventQuery returned %d", result);
+
+        /* Bump counters, loop around the circular buffer if necessary */
+        --cuda_event_pack_num_used;
+        ++cuda_event_pack_first_used;
+        if (cuda_event_pack_first_used >= cuda_event_max) {
+            cuda_event_pack_first_used = 0;
+        }
+        /* A return value of 1 indicates an event completed and a frag was returned */
+        return 1;
+    }
+    return 0;
+}
+
+/**
+ * Progress any unpack event completions.
+ */
+int progress_one_cuda_unpack_event(void **callback_frag) {
+    CUresult result;
+
+    if (cuda_event_unpack_num_used > 0) {
+        opal_output_verbose(30, mca_common_cuda_output,
+                            "CUDA: progress_one_cuda_unpack_event, outstanding_events=%d",
+                            cuda_event_unpack_num_used);
+
+        result = cuFunc.cuEventQuery(cuda_event_unpack_array[cuda_event_unpack_first_used]);
+
+        /* We found an event that is not ready, so return.
*/ + if (CUDA_ERROR_NOT_READY == result) { + opal_output_verbose(30, mca_common_cuda_output, + "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY"); + *callback_frag = NULL; + return 0; + } else if (CUDA_SUCCESS != result) { + opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed", + true, result); + *callback_frag = NULL; + return OPAL_ERROR; + } + + *callback_frag = cuda_event_unpack_callback_frag_array[cuda_event_unpack_first_used]; + opal_output_verbose(30, mca_common_cuda_output, + "CUDA: cuEventQuery returned %d", result); + + /* Bump counters, loop around the circular buffer if necessary */ + --cuda_event_unpack_num_used; + ++cuda_event_unpack_first_used; + if (cuda_event_unpack_first_used >= cuda_event_max) { + cuda_event_unpack_first_used = 0; + } + /* A return value of 1 indicates an event completed and a frag was returned */ + return 1; + } + return 0; +} + +int mca_common_cuda_create_event(uint64_t **event) +{ + CUresult result; + + result = cuFunc.cuEventCreate((CUevent *)event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", + true, OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + +int mca_common_cuda_record_event(uint64_t *event) +{ + CUresult result; + result = cuFunc.cuEventRecord((CUevent)event,0); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + printf("record event error %d\n", result); + return OPAL_ERROR; + } else { + return OPAL_SUCCESS; + } +} + +int mca_common_cuda_query_event(uint64_t *event) +{ + CUresult result; + result = cuFunc.cuEventQuery((CUevent)event); + if (OPAL_UNLIKELY(CUDA_SUCCESS == result)) { + return OPAL_SUCCESS; + } else if (OPAL_UNLIKELY(CUDA_ERROR_NOT_READY == result)) { + return OPAL_ERROR; + } else { + printf("query event error %d\n", result); + return OPAL_ERROR; + } +} /** * Need to make sure the handle we are retrieving from the cache is still @@ -1846,7 +2200,9 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t if (!stage_three_init_complete) { if (0 != mca_common_cuda_stage_three_init()) { opal_cuda_support = 0; - } + } else { + opal_cuda_kernel_support_init(); + } } return 1; @@ -2001,6 +2357,19 @@ int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base) return 0; } +int mca_common_cuda_memp2pcpy(void *dest, const void *src, size_t size) +{ + CUresult result; + + result = cuFunc.cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-mpi-common-cuda.txt", "cuMemcpy failed", + true, OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + #if OPAL_CUDA_GDR_SUPPORT /* Check to see if the memory was freed between the time it was stored in * the registration cache and now. 
Return true if the memory was previously diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index 3ff95405299..8752820e9e5 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -34,6 +34,9 @@ struct mca_rcache_common_cuda_reg_data_t { uint64_t event; opal_ptr_t memh_seg_addr; size_t memh_seg_len; + uint8_t pack_unpack_required; + int32_t gpu_device; + struct opal_convertor_t *convertor; }; typedef struct mca_rcache_common_cuda_reg_data_t mca_rcache_common_cuda_reg_data_t; @@ -43,6 +46,7 @@ struct mca_rcache_common_cuda_reg_t { }; typedef struct mca_rcache_common_cuda_reg_t mca_rcache_common_cuda_reg_t; extern bool mca_common_cuda_enabled; +extern int mca_common_cuda_output; OPAL_DECLSPEC void mca_common_cuda_register_mca_variables(void); @@ -58,16 +62,23 @@ OPAL_DECLSPEC int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, ch OPAL_DECLSPEC int mca_common_cuda_record_ipc_event(char *msg, struct mca_btl_base_descriptor_t *frag); OPAL_DECLSPEC int mca_common_cuda_record_dtoh_event(char *msg, - struct mca_btl_base_descriptor_t *frag); + struct mca_btl_base_descriptor_t *frag, + opal_convertor_t *convertor, + void *cuda_stream); OPAL_DECLSPEC int mca_common_cuda_record_htod_event(char *msg, - struct mca_btl_base_descriptor_t *frag); + struct mca_btl_base_descriptor_t *frag, + void *cuda_stream); +OPAL_DECLSPEC int mca_common_cuda_record_pack_event(char *msg, void *callback_frag, void *pack_stream); +OPAL_DECLSPEC int mca_common_cuda_record_unpack_event(char *msg, void *callback_frag, void *unpack_stream); OPAL_DECLSPEC void *mca_common_cuda_get_dtoh_stream(void); OPAL_DECLSPEC void *mca_common_cuda_get_htod_stream(void); OPAL_DECLSPEC int progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t **); -OPAL_DECLSPEC int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t **); +OPAL_DECLSPEC int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t **, opal_convertor_t **); OPAL_DECLSPEC int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **); +OPAL_DECLSPEC int progress_one_cuda_pack_event(void **callback_frag); +OPAL_DECLSPEC int progress_one_cuda_unpack_event(void **callback_frag); OPAL_DECLSPEC int mca_common_cuda_memhandle_matches(mca_rcache_common_cuda_reg_t *new_reg, mca_rcache_common_cuda_reg_t *old_reg); @@ -86,6 +97,10 @@ OPAL_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, OPAL_DECLSPEC int mca_common_cuda_stage_one_init(void); OPAL_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base); OPAL_DECLSPEC void mca_common_cuda_fini(void); +OPAL_DECLSPEC int mca_common_cuda_create_event(uint64_t **event); +OPAL_DECLSPEC int mca_common_cuda_record_event(uint64_t *event); +OPAL_DECLSPEC int mca_common_cuda_query_event(uint64_t *event); +OPAL_DECLSPEC int mca_common_cuda_memp2pcpy(void *dest, const void *src, size_t size); #if OPAL_CUDA_GDR_SUPPORT OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_rcache_base_registration_t *reg); OPAL_DECLSPEC void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg); diff --git a/opal/mca/rcache/gpusm/rcache_gpusm_module.c b/opal/mca/rcache/gpusm/rcache_gpusm_module.c index caf8913a938..bf7af87309f 100644 --- a/opal/mca/rcache/gpusm/rcache_gpusm_module.c +++ b/opal/mca/rcache/gpusm/rcache_gpusm_module.c @@ -49,7 +49,7 @@ static void mca_rcache_gpusm_registration_constructor( mca_rcache_gpusm_registration_t *item ) { 
mca_common_cuda_construct_event_and_handle(&item->event, - (void *)&item->evtHandle); + (void *)item->evtHandle); } /** diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index 9c9aaa4a1a0..bc9def66660 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -20,6 +20,10 @@ if PROJECT_OMPI endif TESTS = opal_datatype_test $(MPI_TESTS) +#if OPAL_cuda_support +#TESTS += ddt_test_cuda +#endif + check_PROGRAMS = $(TESTS) $(MPI_CHECKS) unpack_ooo_SOURCES = unpack_ooo.c ddt_lib.c ddt_lib.h @@ -29,10 +33,15 @@ unpack_ooo_LDADD = \ $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la ddt_test_SOURCES = ddt_test.c ddt_lib.c ddt_lib.h -ddt_test_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) -ddt_test_LDADD = \ - $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ - $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la +ddt_test_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) +ddt_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la + +if OPAL_cuda_support +ddt_test_cuda_SOURCES = ddt_test_cuda.c ddt_lib.c ddt_lib.h +ddt_test_cuda_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) +ddt_test_cuda_CFLAGS = @opal_datatype_cuda_CPPFLAGS@ -g -O0 +ddt_test_cuda_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la @opal_datatype_cuda_LDFLAGS@ -lcudart +endif ddt_raw_SOURCES = ddt_raw.c ddt_lib.c ddt_lib.h ddt_raw_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) diff --git a/test/datatype/ddt_lib.c b/test/datatype/ddt_lib.c index 9170da0914a..a96ec085ddd 100644 --- a/test/datatype/ddt_lib.c +++ b/test/datatype/ddt_lib.c @@ -358,14 +358,28 @@ ompi_datatype_t* upper_matrix( unsigned int mat_size ) disp = (int*)malloc( sizeof(int) * mat_size ); blocklen = (int*)malloc( sizeof(int) * mat_size ); - + for( i = 0; i < mat_size; i++ ) { disp[i] = i * mat_size + i; blocklen[i] = mat_size - i; } - + /*int ct = 0; + for (i = 0; i < mat_size; i++) { + blocklen[i] = mat_size - ct*160; + disp[i] = i*mat_size + ct*160; + if (i % 160 == 0 && i != 0) { + ct++; + } + }*/ +#if defined (TEST_DOUBLE) ompi_datatype_create_indexed( mat_size, blocklen, disp, &ompi_mpi_double.dt, &upper ); +#elif defined (TEST_FLOAT) + ompi_datatype_create_indexed( mat_size, blocklen, disp, &ompi_mpi_float.dt, &upper ); +#elif defined (TEST_CHAR) + ompi_datatype_create_indexed( mat_size, blocklen, disp, &ompi_mpi_char.dt, &upper ); +#else +#endif ompi_datatype_commit( &upper ); if( outputFlags & DUMP_DATA_AFTER_COMMIT ) { ompi_datatype_dump( upper ); @@ -686,3 +700,26 @@ ompi_datatype_t* create_vector_type( const ompi_datatype_t* data, int count, int return vector; } +ompi_datatype_t* create_struct_type(int count) +{ + ompi_datatype_t* dt_struct; + ompi_datatype_t* dt_struct_vector; + ompi_datatype_t* oldtypes[2]; + MPI_Aint offsets[2], extent, lb; + int blockcounts[2]; + + offsets[0] = 0; + oldtypes[0] = MPI_FLOAT; + blockcounts[0] = 4; + + ompi_datatype_get_extent(MPI_FLOAT, &lb, &extent); + offsets[1] = 4 * extent; + oldtypes[1] = MPI_DOUBLE; + blockcounts[1] = 2; + + ompi_datatype_create_struct( 2, blockcounts, offsets, oldtypes, &dt_struct ); + dt_struct_vector = create_vector_type( dt_struct, 10, 2, 4 ); + ompi_datatype_commit( &dt_struct_vector ); + return dt_struct_vector; +} + diff --git a/test/datatype/ddt_lib.h b/test/datatype/ddt_lib.h index d94690047a7..0f6bbc2cb37 100644 --- a/test/datatype/ddt_lib.h +++ b/test/datatype/ddt_lib.h @@ -34,6 +34,11 @@ #define DUMP_DATA_AFTER_COMMIT 0x00000001 #define CHECK_PACK_UNPACK 0x00000002 +#define TEST_DOUBLE +//#define 
TEST_FLOAT +//#define TEST_CHAR + + extern uint32_t outputFlags; /** @@ -92,4 +97,5 @@ extern ompi_datatype_t* create_contiguous_type( const ompi_datatype_t* data, int extern ompi_datatype_t* create_vector_type( const ompi_datatype_t* data, int count, int length, int stride ); extern ompi_datatype_t* create_struct_constant_gap_resized_ddt( ompi_datatype_t* type ); +extern ompi_datatype_t* create_struct_type(int count); diff --git a/test/datatype/ddt_test_cuda.c b/test/datatype/ddt_test_cuda.c new file mode 100644 index 00000000000..25e2c8db5bb --- /dev/null +++ b/test/datatype/ddt_test_cuda.c @@ -0,0 +1,621 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2006 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sun Microsystems Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "ddt_lib.h" +#include "opal/runtime/opal.h" +#include "opal/datatype/opal_convertor.h" +#include +#include +#ifdef HAVE_SYS_TIME_H +#include +#endif +#include +#include + +#include +#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/runtime/opal_params.h" + +#define CONVERTOR_CUDA 0x00400000 + +/* Compile with: +mpicc -DHAVE_CONFIG_H -I. -I../../include -I../../../ompi-trunk/include -I../.. -I../../include -I../../../ompi-trunk/opal -I../../../ompi-trunk/orte -I../../../ompi-trunk/ompi -g ddt_test.c -o ddt_test +*/ + +#define TIMER_DATA_TYPE struct timeval +#define GET_TIME(TV) gettimeofday( &(TV), NULL ) +#define ELAPSED_TIME(TSTART, TEND) (((TEND).tv_sec - (TSTART).tv_sec) * 1000000 + ((TEND).tv_usec - (TSTART).tv_usec)) + +#define DUMP_DATA_AFTER_COMMIT 0x00000001 +#define CHECK_PACK_UNPACK 0x00000002 + +uint32_t remote_arch = 0xffffffff; + +static int test_upper( unsigned int length ) +{ + double *mat1, *mat2, *inbuf, *mat1_cuda, *mat2_cuda; + ompi_datatype_t *pdt; + opal_convertor_t * pConv; + char *ptr; + int rc; + unsigned int i, j, iov_count, split_chunk, total_length; + size_t max_data; + struct iovec a; + TIMER_DATA_TYPE start, end; + long total_time; + + printf( "test upper matrix\n" ); + pdt = upper_matrix( length ); + /*dt_dump( pdt );*/ + + mat1 = malloc( length * length * sizeof(double) ); + init_random_upper_matrix( length, mat1 ); + mat2 = calloc( length * length, sizeof(double) ); + + cudaError_t error = cudaMalloc((void **)&mat1_cuda, length * length * sizeof(double)); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + error = cudaMalloc((void **)&mat2_cuda, length * length * sizeof(double)); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + + total_length = length * (length + 1) * ( sizeof(double) / 2); + error = cudaMallocHost((void **)&inbuf, total_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + ptr = (char*)inbuf; + /* copy upper matrix in the array simulating the input buffer */ + for( i = 0; i < length; i++ ) { + uint32_t pos = i * length + i; + 
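The staging size computed above, total_length = length * (length + 1) * (sizeof(double) / 2), is simply the n*(n+1)/2 element count of an upper-triangular matrix times sizeof(double), and pos = i * length + i is the flat index of the first kept element of row i. A tiny stand-alone check of that arithmetic in plain C, using a hypothetical n of 5 and no CUDA:

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
        unsigned int n = 5, i, j, packed = 0;

        for (i = 0; i < n; i++) {
            unsigned int pos = i * n + i;       /* flat index of element (i, i) */
            for (j = i; j < n; j++, pos++) {
                packed++;                       /* row i keeps columns i .. n-1 */
            }
        }
        printf("packed elements: %u (n*(n+1)/2 = %u), bytes: %zu\n",
               packed, n * (n + 1) / 2, (size_t)packed * sizeof(double));
        return 0;
    }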
for( j = i; j < length; j++, pos++ ) { + *inbuf = mat1[pos]; + inbuf++; + } + } + inbuf = (double*)ptr; + + cudaMemcpy(mat1_cuda, mat1, length * length * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(mat2_cuda, mat2, length * length * sizeof(double), cudaMemcpyHostToDevice); + + cudaDeviceSynchronize(); + + pConv = opal_convertor_create( remote_arch, 0 ); + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( pConv, &(pdt->super), 1, mat2_cuda ) ) { + printf( "Cannot attach the datatype to a convertor\n" ); + return OMPI_ERROR; + } + + GET_TIME( start ); + split_chunk = (length + 1) * sizeof(double); + /* split_chunk = (total_length + 1) * sizeof(double); */ + for( i = total_length; i > 0; ) { + if( i <= split_chunk ) { /* equal test just to be able to set a breakpoint */ + split_chunk = i; + } + a.iov_base = ptr; + a.iov_len = split_chunk; + iov_count = 1; + max_data = split_chunk; + opal_convertor_unpack( pConv, &a, &iov_count, &max_data ); + ptr += max_data; + i -= max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "complete unpacking in %ld microsec\n", total_time ); + cudaFreeHost( inbuf ); + cudaMemcpy(mat1, mat1_cuda, length * length * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(mat2, mat2_cuda, length * length * sizeof(double), cudaMemcpyDeviceToHost); + rc = check_diag_matrix( length, mat1, mat2 ); + cudaFree( mat1 ); + cudaFree( mat2 ); + + /* test the automatic destruction pf the data */ + ompi_datatype_destroy( &pdt ); assert( pdt == NULL ); + + OBJ_RELEASE( pConv ); + return rc; +} + +/** + * Computing the correct buffer length for moving a multiple of a datatype + * is not an easy task. Define a function to centralize the complexity in a + * single location. + */ +static size_t compute_buffer_length(ompi_datatype_t* pdt, int count) +{ + MPI_Aint extent, lb, true_extent, true_lb; + size_t length; + + ompi_datatype_get_extent(pdt, &lb, &extent); + ompi_datatype_get_true_extent(pdt, &true_lb, &true_extent); (void)true_lb; + length = true_lb + true_extent + (count - 1) * extent; + + return length; +} + +/** + * Conversion function. They deal with data-types in 3 ways, always making local copies. + * In order to allow performance testings, there are 3 functions: + * - one copying directly from one memory location to another one using the + * data-type copy function. + * - one which use a 2 convertors created with the same data-type + * - and one using 2 convertors created from different data-types. + * + */ +static int local_copy_ddt_count( ompi_datatype_t* pdt, int count ) +{ + void *pdst, *psrc; + TIMER_DATA_TYPE start, end; + long total_time; + size_t length; + + length = compute_buffer_length(pdt, count); + + cudaError_t error = cudaMalloc((void **)&pdst, length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, length); + + error = cudaMalloc((void **)&psrc, length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, length); + + GET_TIME( start ); + if( OMPI_SUCCESS != ompi_datatype_copy_content_same_ddt( pdt, count, pdst, psrc ) ) { + printf( "Unable to copy the datatype in the function local_copy_ddt_count." 
+ " Is the datatype committed ?\n" ); + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "direct local copy in %ld microsec\n", total_time ); + cudaFree(pdst); + cudaFree(psrc); + + return OMPI_SUCCESS; +} + +static int +local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count, + ompi_datatype_t* recv_type, int recv_count, + int chunk ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0; + size_t slength, rlength; + + rlength = compute_buffer_length(recv_type, recv_count); + slength = compute_buffer_length(send_type, send_count); + pdst = malloc( rlength ); + psrc = malloc( slength ); + ptemp = malloc( chunk ); + + cudaError_t error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + + error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + + send_convertor = opal_convertor_create( remote_arch, 0 ); + send_convertor->flags |= CONVERTOR_CUDA; + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + recv_convertor = opal_convertor_create( remote_arch, 0 ); + recv_convertor->flags |= CONVERTOR_CUDA; + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + cudaDeviceSynchronize(); + + GET_TIME( start ); + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor_2datatypes\n", + (done1 ? "finish" : "not finish"), + (done2 ? 
"finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "copying different data-types using convertors in %ld microsec\n", total_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, + total_time - unpack_time ); + clean_and_return: + if( send_convertor != NULL ) { + OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); + } + if( recv_convertor != NULL ) { + OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); + } + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + return OMPI_SUCCESS; +} + +static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0; + + max_data = compute_buffer_length(pdt, count); + + cudaError_t error = cudaMalloc((void **)&pdst, max_data); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, max_data); + + error = cudaMalloc((void **)&psrc, max_data); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, max_data); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + + send_convertor = opal_convertor_create( remote_arch, 0 ); + send_convertor->flags |= CONVERTOR_CUDA; + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(pdt->super), count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + recv_convertor = opal_convertor_create( remote_arch, 0 ); + recv_convertor->flags |= CONVERTOR_CUDA; + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(pdt->super), count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + cudaDeviceSynchronize(); + + GET_TIME( start ); + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor\n", + (done1 ? "finish" : "not finish"), + (done2 ? 
"finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "copying same data-type using convertors in %ld microsec\n", total_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, + total_time - unpack_time ); + clean_and_return: + if( NULL != send_convertor ) OBJ_RELEASE( send_convertor ); + if( NULL != recv_convertor ) OBJ_RELEASE( recv_convertor ); + + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + return OMPI_SUCCESS; +} + +/** + * Main function. Call several tests and print-out the results. It try to stress the convertor + * using difficult data-type constructions as well as strange segment sizes for the conversion. + * Usually, it is able to detect most of the data-type and convertor problems. Any modifications + * on the data-type engine should first pass all the tests from this file, before going into other + * tests. + */ +int main( int argc, char* argv[] ) +{ + ompi_datatype_t *pdt, *pdt1, *pdt2, *pdt3; + int rc, length = 500; + + opal_init_util(&argc, &argv); + ompi_datatype_init(); + + opal_cuda_support = 1; + mca_common_cuda_stage_one_init(); + + cudaSetDevice(0); + + /** + * By default simulate homogeneous architectures. + */ + remote_arch = opal_local_arch; + printf( "\n\n#\n * TEST INVERSED VECTOR\n #\n\n" ); + pdt = create_inversed_vector( &ompi_mpi_int.dt, 10 ); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 100); + local_copy_with_convertor(pdt, 100, 956); + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( "\n\n#\n * TEST STRANGE DATATYPE\n #\n\n" ); + pdt = create_strange_dt(); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 1); + local_copy_with_convertor(pdt, 1, 956); + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); + pdt = upper_matrix(100); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 1); + local_copy_with_convertor(pdt, 1, 48000); + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + mpich_typeub(); + mpich_typeub2(); + mpich_typeub3(); + + printf( "\n\n#\n * TEST UPPER MATRIX\n #\n\n" ); + rc = test_upper( length ); + if( rc == 0 ) + printf( "decode [PASSED]\n" ); + else + printf( "decode [NOT PASSED]\n" ); + + printf( "\n\n#\n * TEST MATRIX BORDERS\n #\n\n" ); + pdt = test_matrix_borders( length, 100 ); + if( outputFlags & DUMP_DATA_AFTER_COMMIT ) { + ompi_datatype_dump( pdt ); + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( "\n\n#\n * TEST CONTIGUOUS\n #\n\n" ); + pdt = test_contiguous(); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( "\n\n#\n * TEST STRUCT\n #\n\n" ); + pdt = test_struct(); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + ompi_datatype_create_contiguous(0, &ompi_mpi_datatype_null.dt, &pdt1); + ompi_datatype_create_contiguous(0, &ompi_mpi_datatype_null.dt, &pdt2); + ompi_datatype_create_contiguous(0, &ompi_mpi_datatype_null.dt, &pdt3); + + ompi_datatype_add( pdt3, &ompi_mpi_int.dt, 10, 0, -1 ); + 
ompi_datatype_add( pdt3, &ompi_mpi_float.dt, 5, 10 * sizeof(int), -1 ); + + ompi_datatype_add( pdt2, &ompi_mpi_float.dt, 1, 0, -1 ); + ompi_datatype_add( pdt2, pdt3, 3, sizeof(int) * 1, -1 ); + + ompi_datatype_add( pdt1, &ompi_mpi_long_long_int.dt, 5, 0, -1 ); + ompi_datatype_add( pdt1, &ompi_mpi_long_double.dt, 2, sizeof(long long) * 5, -1 ); + + printf( ">>--------------------------------------------<<\n" ); + if( outputFlags & DUMP_DATA_AFTER_COMMIT ) { + ompi_datatype_dump( pdt1 ); + } + printf( ">>--------------------------------------------<<\n" ); + if( outputFlags & DUMP_DATA_AFTER_COMMIT ) { + ompi_datatype_dump( pdt2 ); + } + printf( ">>--------------------------------------------<<\n" ); + if( outputFlags & DUMP_DATA_AFTER_COMMIT ) { + ompi_datatype_dump( pdt3 ); + } + + OBJ_RELEASE( pdt1 ); assert( pdt1 == NULL ); + OBJ_RELEASE( pdt2 ); assert( pdt2 == NULL ); + OBJ_RELEASE( pdt3 ); assert( pdt3 == NULL ); + + printf( ">>--------------------------------------------<<\n" ); + printf( " Contiguous data-type (MPI_DOUBLE)\n" ); + pdt = MPI_DOUBLE; + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 4500); + local_copy_with_convertor( pdt, 4500, 12 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 12 ); + } + printf( ">>--------------------------------------------<<\n" ); + + printf( ">>--------------------------------------------<<\n" ); + if( outputFlags & CHECK_PACK_UNPACK ) { + printf( "Contiguous multiple data-type (4500*1)\n" ); + pdt = create_contiguous_type( MPI_DOUBLE, 4500 ); + local_copy_ddt_count(pdt, 1); + local_copy_with_convertor( pdt, 1, 120 ); + local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 120 ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( "Contiguous multiple data-type (450*10)\n" ); + pdt = create_contiguous_type( MPI_DOUBLE, 450 ); + local_copy_ddt_count(pdt, 10); + local_copy_with_convertor( pdt, 10, 120 ); + local_copy_with_convertor_2datatypes( pdt, 10, pdt, 10, 120 ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( "Contiguous multiple data-type (45*100)\n" ); + pdt = create_contiguous_type( MPI_DOUBLE, 45 ); + local_copy_ddt_count(pdt, 100); + local_copy_with_convertor( pdt, 100, 120 ); + local_copy_with_convertor_2datatypes( pdt, 100, pdt, 100, 120 ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( "Contiguous multiple data-type (100*45)\n" ); + pdt = create_contiguous_type( MPI_DOUBLE, 100 ); + local_copy_ddt_count(pdt, 45); + local_copy_with_convertor( pdt, 45, 120 ); + local_copy_with_convertor_2datatypes( pdt, 45, pdt, 45, 120 ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( "Contiguous multiple data-type (10*450)\n" ); + pdt = create_contiguous_type( MPI_DOUBLE, 10 ); + local_copy_ddt_count(pdt, 450); + local_copy_with_convertor( pdt, 450, 120 ); + local_copy_with_convertor_2datatypes( pdt, 450, pdt, 450, 120 ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( "Contiguous multiple data-type (1*4500)\n" ); + pdt = create_contiguous_type( MPI_DOUBLE, 1 ); + local_copy_ddt_count(pdt, 4500); + local_copy_with_convertor( pdt, 4500, 120 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 120 ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + printf( ">>--------------------------------------------<<\n" ); + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (450 times 10 double stride 11)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 450, 10, 11 ); + ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + 
local_copy_ddt_count(pdt, 1); + local_copy_with_convertor( pdt, 1, 120 ); + local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 120 ); + local_copy_with_convertor( pdt, 1, 820 ); + local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 820 ); + local_copy_with_convertor( pdt, 1, 6000 ); + local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + local_copy_with_convertor( pdt, 1, 36000 ); + local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 36000 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( ">>--------------------------------------------<<\n" ); + pdt = test_struct_char_double(); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 4500); + local_copy_with_convertor( pdt, 4500, 120 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 120 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( ">>--------------------------------------------<<\n" ); + pdt = test_create_twice_two_doubles(); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 4500); + local_copy_with_convertor( pdt, 4500, 120 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 120 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( ">>--------------------------------------------<<\n" ); + pdt = test_create_blacs_type(); + if( outputFlags & CHECK_PACK_UNPACK ) { + ompi_datatype_dump( pdt ); + local_copy_ddt_count(pdt, 2); + local_copy_ddt_count(pdt, 4500); + local_copy_with_convertor( pdt, 4500, 956 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 956 ); + local_copy_with_convertor( pdt, 4500, 16*1024 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 16*1024 ); + local_copy_with_convertor( pdt, 4500, 64*1024 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 64*1024 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( ">>--------------------------------------------<<\n" ); + pdt1 = test_create_blacs_type1( &ompi_mpi_int.dt ); + pdt2 = test_create_blacs_type2( &ompi_mpi_int.dt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_with_convertor_2datatypes( pdt1, 1, pdt2, 1, 1000 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt1 ); assert( pdt1 == NULL ); + OBJ_RELEASE( pdt2 ); assert( pdt2 == NULL ); + + /* clean-ups all data allocations */ + ompi_datatype_finalize(); + + return OMPI_SUCCESS; +} \ No newline at end of file
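The pack/unpack event machinery added to common_cuda.c follows the same bookkeeping pattern as the existing dtoh/htod arrays: a fixed pool of pre-created events, a first_avail cursor advanced by the record functions, a first_used cursor advanced by the progress functions, and num_used guarding both. The sketch below models only that circular bookkeeping in plain C; the stored int stands in for a CUevent plus its callback_frag, the pool size is shrunk from cuda_event_max to 4 to keep the output short, and the cuEventQuery poll is reduced to an unconditional completion.

    #include <stdio.h>

    #define POOL_MAX 4          /* the real code sizes this with cuda_event_max (400) */

    static int pool[POOL_MAX];
    static int first_avail = 0, first_used = 0, num_used = 0;

    static int record(int payload)
    {
        if (num_used == POOL_MAX) {
            return -1;                          /* mirrors OPAL_ERR_OUT_OF_RESOURCE */
        }
        pool[first_avail] = payload;            /* real code: cuEventRecord + save the frag */
        first_avail = (first_avail + 1) % POOL_MAX;
        num_used++;
        return 0;
    }

    static int progress(int *payload)
    {
        if (num_used == 0) {
            return 0;                           /* nothing outstanding */
        }
        *payload = pool[first_used];            /* real code: cuEventQuery may report not-ready */
        first_used = (first_used + 1) % POOL_MAX;
        num_used--;
        return 1;                               /* one completion handed back to the BTL */
    }

    int main(void)
    {
        int out;

        for (int i = 0; i < 6; i++) {
            if (record(i) != 0) {
                printf("pool full at %d, draining\n", i);
                while (progress(&out) == 1) {
                    printf("retired %d\n", out);
                }
                record(i);
            }
        }
        while (progress(&out) == 1) {
            printf("retired %d\n", out);
        }
        return 0;
    }

One difference worth noting: the real progress_one_cuda_pack_event() and progress_one_cuda_unpack_event() leave the slot untouched when cuEventQuery() reports CUDA_ERROR_NOT_READY, so completions are always retired in the order they were recorded.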