diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake
index dd7347c254..5d61e82fa7 100644
--- a/cmake/modules/ExternalDependenciesVersions.cmake
+++ b/cmake/modules/ExternalDependenciesVersions.cmake
@@ -4,7 +4,7 @@
 set(TTG_TRACKED_VG_CMAKE_KIT_TAG 878654d0cb1904049fbd2c37b37d5385ae897658)  # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost
 set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
 set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058)
-set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)
+set(TTG_TRACKED_PARSEC_TAG c97e2fc54698d3d937d7847a12c7e9084b22a6c8)
 set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f)
 set(TTG_TRACKED_TILEDARRAY_TAG 5944bdba3266a3fa19f1809c8e2accf3dad4d815)
 
diff --git a/cmake/modules/FindOrFetchPARSEC.cmake b/cmake/modules/FindOrFetchPARSEC.cmake
index b3fd5faa3a..dac240402c 100644
--- a/cmake/modules/FindOrFetchPARSEC.cmake
+++ b/cmake/modules/FindOrFetchPARSEC.cmake
@@ -17,7 +17,7 @@ if (NOT TARGET PaRSEC::parsec)
 
   FetchContent_Declare(
           PARSEC
-          GIT_REPOSITORY      https://github.com/ICLDisco/parsec.git
+          GIT_REPOSITORY      https://github.com/bosilca/parsec.git
           GIT_TAG             ${TTG_TRACKED_PARSEC_TAG}
   )
   FetchContent_MakeAvailable(PARSEC)
diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h
index 38682ca9df..f1ef7c1e6f 100644
--- a/ttg/ttg/parsec/devicefunc.h
+++ b/ttg/ttg/parsec/devicefunc.h
@@ -16,7 +16,6 @@ namespace ttg_parsec {
       parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller;
       assert(nullptr != caller->dev_ptr);
       parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task;
-      parsec_flow_t *flows = caller->dev_ptr->flows;
 
       auto& view = std::get<I>(views);
       bool is_current = false;
@@ -38,15 +37,15 @@ namespace ttg_parsec {
         }
 
         /* build the flow */
-        /* TODO: reuse the flows of the task class? How can we control the sync direction then? */
-        flows[I] = parsec_flow_t{.name = nullptr,
+        *((parsec_flow_t*)gpu_task->flow_info[I].flow) =
+                  parsec_flow_t{.name = nullptr,
                                 .sym_type = PARSEC_SYM_INOUT,
                                 .flow_flags = static_cast<uint8_t>(access),
                                 .flow_index = I,
                                 .flow_datatype_mask = ~0 };
 
-        gpu_task->flow_nb_elts[I] = data->nb_elts; // size in bytes
-        gpu_task->flow[I] = &flows[I];
+        gpu_task->flow_info[I].flow_span = data->span; // size in bytes
+        gpu_task->flow_info[I].flow_dc = nullptr;
 
         /* set the input data copy, parsec will take care of the transfer
         * and the buffer will look at the parsec_data_t for the current pointer */
@@ -57,13 +56,13 @@ namespace ttg_parsec {
 
       } else {
         /* ignore the flow */
-        flows[I] = parsec_flow_t{.name = nullptr,
+        *((parsec_flow_t*)gpu_task->flow_info[I].flow) =
+                   parsec_flow_t{.name = nullptr,
                                  .sym_type = PARSEC_FLOW_ACCESS_NONE,
                                  .flow_flags = 0,
                                  .flow_index = I,
                                  .flow_datatype_mask = ~0 };
-        gpu_task->flow[I] = &flows[I];
-        gpu_task->flow_nb_elts[I] = 0; // size in bytes
+        gpu_task->flow_info[I].flow_span = 0; // size in bytes
         caller->parsec_task.data[I].data_in = nullptr;
       }
 
@@ -80,6 +79,7 @@ namespace ttg_parsec {
   template<typename... Views>
   bool register_device_memory(std::tuple<Views&...> &views) {
     bool is_current = true;
+    constexpr const std::size_t num_views = sizeof...(Views);
     if (nullptr == detail::parsec_ttg_caller) {
       throw std::runtime_error("register_device_memory may only be invoked from inside a task!");
     }
@@ -88,19 +88,13 @@ namespace ttg_parsec {
       throw std::runtime_error("register_device_memory called inside a non-gpu task!");
     }
 
+    auto task = detail::parsec_ttg_caller;
+    task->dev_ptr->gpu_task->allocate_flows(num_views);
+
     if constexpr (sizeof...(Views) > 0) {
       is_current = detail::register_device_memory(views, std::index_sequence_for<Views...>{});
     }
 
-    /* reset all entries in the current task */
-    for (int i = sizeof...(Views); i < MAX_PARAM_COUNT; ++i) {
-      detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr;
-      detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE;
-      detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i;
-      detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i];
-      detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0;
-    }
-
     return is_current;
   }
 
@@ -120,8 +114,8 @@ namespace ttg_parsec {
     uint8_t i; // only limited number of flows
     detail::parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller;
     assert(nullptr != caller->dev_ptr);
+    caller->dev_ptr->gpu_task->allocate_flows(span.size());
     parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task;
-    parsec_flow_t *flows = caller->dev_ptr->flows;
 
     bool is_current = false;
     for (i = 0; i < span.size(); ++i) {
@@ -146,14 +140,15 @@ namespace ttg_parsec {
 
         /* build the flow */
         /* TODO: reuse the flows of the task class? How can we control the sync direction then? */
-        flows[i] = parsec_flow_t{.name = nullptr,
+        *((parsec_flow_t*)gpu_task->flow_info[i].flow) =
+                  parsec_flow_t{.name = nullptr,
                                 .sym_type = PARSEC_SYM_INOUT,
                                 .flow_flags = static_cast<uint8_t>(access),
                                 .flow_index = i,
                                 .flow_datatype_mask = ~0 };
 
-        gpu_task->flow_nb_elts[i] = data->nb_elts; // size in bytes
-        gpu_task->flow[i] = &flows[i];
+        gpu_task->flow_info[i].flow_span = data->span; // size in bytes
+        gpu_task->flow_info[i].flow_dc = nullptr;
 
         /* set the input data copy, parsec will take care of the transfer
         * and the buffer will look at the parsec_data_t for the current pointer */
@@ -164,25 +159,17 @@ namespace ttg_parsec {
 
       } else {
         /* ignore the flow */
-        flows[i] = parsec_flow_t{.name = nullptr,
+        *((parsec_flow_t*)gpu_task->flow_info[i].flow) =
+                   parsec_flow_t{.name = nullptr,
                                  .sym_type = PARSEC_FLOW_ACCESS_NONE,
                                  .flow_flags = 0,
                                  .flow_index = i,
                                  .flow_datatype_mask = ~0 };
-        gpu_task->flow[i] = &flows[i];
-        gpu_task->flow_nb_elts[i] = 0; // size in bytes
+        gpu_task->flow_info[i].flow_span = 0; // size in bytes
         caller->parsec_task.data[i].data_in = nullptr;
       }
     }
 
-    /* reset all remaining entries in the current task */
-    for (; i < MAX_PARAM_COUNT; ++i) {
-      detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr;
-      detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE;
-      detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i;
-      detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i];
-      detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0;
-    }
     // we cannot allow the calling thread to submit kernels so say we're not ready
     return is_current;
   }
@@ -204,7 +191,7 @@ namespace ttg_parsec {
         int ret = device_module->memcpy_async(device_module, stream,
                                               data->device_copies[0]->device_private,
                                               data->device_copies[data->owner_device]->device_private,
-                                              data->nb_elts, parsec_device_gpu_transfer_direction_d2h);
+                                              data->span, parsec_device_gpu_transfer_direction_d2h);
         assert(ret == PARSEC_SUCCESS);
       }
       if constexpr (sizeof...(Is) > 0) {
diff --git a/ttg/ttg/parsec/devicescratch.h b/ttg/ttg/parsec/devicescratch.h
index e2c3743aa3..60a12687c9 100644
--- a/ttg/ttg/parsec/devicescratch.h
+++ b/ttg/ttg/parsec/devicescratch.h
@@ -50,19 +50,6 @@ struct devicescratch {
     return data;
   }
 
-  void remove_from_flow() {
-    /* remove the scratch from the gpu-task flow */
-    assert(nullptr != detail::parsec_ttg_caller);
-    parsec_task_t *parsec_task = &detail::parsec_ttg_caller->parsec_task;
-    parsec_flow_t *flows = detail::parsec_ttg_caller->dev_ptr->flows;
-    for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
-      if (nullptr != parsec_task->data[i].data_in && parsec_task->data[i].data_in->original == m_data) {
-        flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE; // disable this flow
-        break;
-      }
-    }
-  }
-
   friend parsec_data_t* detail::get_parsec_data<T>(const ttg_parsec::devicescratch<T>&);
 
 public:
@@ -93,8 +80,6 @@ struct devicescratch {
   devicescratch& operator=(const devicescratch& db) = delete;
 
   ~devicescratch() {
-    /* remove data from flow */
-    //remove_from_flow();
     if (nullptr != m_data) {
       //parsec_data_destroy(m_data);
       //parsec_data_copy_detach(m_data, parsec_data_get_copy(m_data, 0), 0);
@@ -128,7 +113,7 @@ struct devicescratch {
   }
 
   std::size_t size() const {
-    return (m_data->nb_elts / sizeof(element_type));
+    return (m_data->span / sizeof(element_type));
   }
 
 };
diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h
index f29ca8ecb5..2080aaf295 100644
--- a/ttg/ttg/parsec/task.h
+++ b/ttg/ttg/parsec/task.h
@@ -10,21 +10,41 @@ namespace ttg_parsec {
 
   namespace detail {
 
+    /* GPU task that owns a single buffer holding its parsec_flow_t array followed by its flow_info array */
+    struct parsec_ttg_gpu_task_t : public parsec_gpu_task_t {
+      static constexpr auto flow_align = std::align_val_t(std::max(alignof(parsec_flow_t), alignof(parsec_gpu_flow_info_t)));
+      std::byte *memory = nullptr; // backing buffer; nullptr while no flows are allocated
+      /* (re)allocate storage for `size` flows, point flow_info[i].flow at flows[i], and set nb_flows */
+      void allocate_flows(std::size_t size) {
+        free_flows(); // drop the buffer left by an earlier call, if any
+        this->memory = static_cast<std::byte*>(::operator new[](size * (sizeof(parsec_flow_t) + sizeof(parsec_gpu_flow_info_s)), flow_align));
+        parsec_flow_t *flows = (parsec_flow_t*)this->memory;
+        this->flow_info = (parsec_gpu_flow_info_t*)(this->memory + size * sizeof(parsec_flow_t));
+        for (std::size_t i = 0; i < size; ++i) {
+          this->flow_info[i].flow = &flows[i];
+          flows[i].flow_index = i;
+          flows[i].flow_flags = 0;
+          flows[i].flow_datatype_mask = ~0;
+        }
+        this->nb_flows  = size;
+      }
+      /* release the buffer through the aligned operator delete[] that matches the aligned operator new[] above; no-op if none is held */
+      void free_flows() {
+        ::operator delete[](this->memory, flow_align);
+        this->memory = nullptr;
+      }
+    };
+
     struct device_ptr_t {
-      parsec_gpu_task_t* gpu_task = nullptr;
-      parsec_flow_t* flows = nullptr;
+      parsec_ttg_gpu_task_t *gpu_task = nullptr;
       parsec_gpu_exec_stream_t* stream = nullptr;
       parsec_device_gpu_module_t* device = nullptr;
-      parsec_task_class_t task_class; // copy of the taskclass
     };
 
-    template<bool SupportDevice>
+    template<bool HasDeviceOp>
     struct device_state_t
     {
       static constexpr bool support_device = false;
-      static constexpr size_t num_flows = 0;
-      device_state_t()
-      { }
       static constexpr device_ptr_t* dev_ptr() {
         return nullptr;
       }
@@ -32,10 +52,10 @@ namespace ttg_parsec {
 
     template<>
     struct device_state_t<true> {
-      static constexpr bool support_device = false;
-      static constexpr size_t num_flows = MAX_PARAM_COUNT;
-      parsec_flow_t m_flows[num_flows];
-      device_ptr_t m_dev_ptr = {nullptr, &m_flows[0], nullptr, nullptr}; // gpu_task will be allocated in each task
+      static constexpr bool support_device = true;
+      parsec_ttg_gpu_task_t device_task;
+      device_ptr_t m_dev_ptr = {&device_task, nullptr, nullptr};
+
       device_ptr_t* dev_ptr() {
         return &m_dev_ptr;
       }
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index 2814cf3247..597d298007 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -1466,10 +1466,10 @@ namespace ttg_parsec {
       ttg::device::detail::reset_current();
 
       auto discard_tmp_flows = [&](){
-        for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
-          if (gpu_task->flow[i]->flow_flags & TTG_PARSEC_FLOW_ACCESS_TMP) {
+        for (int i = 0; i < gpu_task->nb_flows; ++i) {
+          if (gpu_task->flow_info[i].flow->flow_flags & TTG_PARSEC_FLOW_ACCESS_TMP) {
             /* temporary flow, discard by setting it to read-only to avoid evictions */
-            const_cast<parsec_flow_t*>(gpu_task->flow[i])->flow_flags = PARSEC_FLOW_ACCESS_READ;
+            const_cast<parsec_flow_t*>(gpu_task->flow_info[i].flow)->flow_flags = PARSEC_FLOW_ACCESS_READ;
             task->parsec_task.data[i].data_out->readers = 1;
           }
         }
@@ -1503,87 +1503,59 @@ namespace ttg_parsec {
       return rc;
     }
 
+    /* callback stored in the device task's release_device_task member:
+     * downcasts to the TTG GPU task type and frees its flow storage */
+    static void release_device_task(parsec_gpu_task_t *gpu_task) {
+      static_cast<detail::parsec_ttg_gpu_task_t *>(gpu_task)->free_flows();
+    }
+
     template <ttg::ExecutionSpace Space>
     static parsec_hook_return_t device_static_evaluate(parsec_task_t* parsec_task) {
 
       task_t *task = (task_t*)parsec_task;
-      if (task->dev_ptr->gpu_task == nullptr) {
-
-        /* set up a device task */
-        parsec_gpu_task_t *gpu_task;
-        /* PaRSEC wants to free the gpu_task, because F***K ownerships */
-        gpu_task = static_cast<parsec_gpu_task_t*>(std::calloc(1, sizeof(*gpu_task)));
-        PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);
-        gpu_task->ec = parsec_task;
-        gpu_task->task_type = 0; // user task
-        gpu_task->last_data_check_epoch = std::numeric_limits<uint64_t>::max(); // used internally
-        gpu_task->pushout = 0;
-        gpu_task->submit = &TT::device_static_submit<Space>;
-
-        // one way to force the task device
-        // currently this will probably break all of PaRSEC if this hint
-        // does not match where the data is located, not really useful for us
-        // instead we set a hint on the data if there is no hint set yet
-        //parsec_task->selected_device = ...;
-
-        /* set the gpu_task so it's available in register_device_memory */
-        task->dev_ptr->gpu_task = gpu_task;
 
-        /* TODO: is this the right place to set the mask? */
-        task->parsec_task.chore_mask = PARSEC_DEV_ALL;
+      /* set up the device task */
+      parsec_gpu_task_t *gpu_task = task->dev_ptr->gpu_task;
+      /* construct the GPU task */
+      PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_gpu_task_t);
+      gpu_task->ec = parsec_task;
+      gpu_task->submit = &TT::device_static_submit<Space>;
+      gpu_task->release_device_task = &release_device_task;
 
-        /* copy over the task class, because that's what we need */
-        task->dev_ptr->task_class = *task->parsec_task.task_class;
+      /* TODO: is this the right place to set the mask? */
+      task->parsec_task.chore_mask = PARSEC_DEV_ALL;
 
-        // first invocation of the coroutine to get the coroutine handle
-        static_op<Space>(parsec_task);
 
-        /* when we come back here, the flows in gpu_task are set (see register_device_memory) */
+      // first invocation of the coroutine to get the coroutine handle
+      static_op<Space>(parsec_task);
 
-        parsec_task_class_t& tc = task->dev_ptr->task_class;
+      /* when we come back here, the flows in gpu_task are set (see register_device_memory) */
 
-        // input flows are set up during register_device_memory as part of the first invocation above
-        for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
-          tc.in[i]  = gpu_task->flow[i];
-          tc.out[i] = gpu_task->flow[i];
+      /* set the device hint on the data */
+      TT *tt = task->tt;
+      if (tt->devicemap) {
+        int parsec_dev;
+        if constexpr (std::is_void_v<keyT>) {
+          parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap());
+        } else {
+          parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap(task->key));
         }
-        tc.nb_flows = MAX_PARAM_COUNT;
-
-        /* set the device hint on the data */
-        TT *tt = task->tt;
-        if (tt->devicemap) {
-          int parsec_dev;
-          if constexpr (std::is_void_v<keyT>) {
-            parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap());
-          } else {
-            parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap(task->key));
-          }
-          for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
-            /* only set on mutable data since we have exclusive access */
-            if (tc.in[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) {
-              parsec_data_t *data = parsec_task->data[i].data_in->original;
-              /* only set the preferred device if the host has the latest copy
-               * as otherwise we may end up with the wrong data if there is a newer
-               * version on a different device. Also, keep fingers crossed. */
-              if (data->owner_device == 0) {
-                parsec_advise_data_on_device(data, parsec_dev, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE);
-              }
+        for (int i = 0; i < gpu_task->nb_flows; ++i) {
+          /* only set on mutable data since we have exclusive access */
+          if (gpu_task->flow_info[i].flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) {
+            parsec_data_t *data = parsec_task->data[i].data_in->original;
+            /* only set the preferred device if the host has the latest copy
+              * as otherwise we may end up with the wrong data if there is a newer
+              * version on a different device. Also, keep fingers crossed. */
+            if (data->owner_device == 0) {
+              parsec_advise_data_on_device(data, parsec_dev, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE);
             }
           }
         }
-
-        /* set the new task class that contains the flows */
-        task->parsec_task.task_class = &task->dev_ptr->task_class;
-
-        /* select this one */
-        return PARSEC_HOOK_RETURN_DONE;
       }
 
-      std::cerr << "EVALUATE called on task with assigned GPU task!" << std::endl;
-
-      /* not sure if this might happen*/
-      return PARSEC_HOOK_RETURN_ERROR;
-
+      /* select this one */
+      return PARSEC_HOOK_RETURN_DONE;
     }
 
     template <ttg::ExecutionSpace Space>
@@ -3432,24 +3404,21 @@ namespace ttg_parsec {
         if (data->owner_device != 0) {
           /* find the flow */
           int flowidx = 0;
-          while (flowidx < MAX_PARAM_COUNT &&
-                gpu_task->flow[flowidx]->flow_flags != PARSEC_FLOW_ACCESS_NONE) {
+          while (flowidx < gpu_task->nb_flows) {
             if (detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in->original == data) {
               /* found the right data, set the corresponding flow as pushout */
               break;
             }
             ++flowidx;
           }
-          if (flowidx == MAX_PARAM_COUNT) {
-            throw std::runtime_error("Cannot add more than MAX_PARAM_COUNT flows to a task!");
-          }
-          if (gpu_task->flow[flowidx]->flow_flags == PARSEC_FLOW_ACCESS_NONE) {
+          assert(flowidx < gpu_task->nb_flows);
+          if (gpu_task->flow_info[flowidx].flow->flow_flags == PARSEC_FLOW_ACCESS_NONE) {
             /* no flow found, add one and mark it pushout */
             detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in = data->device_copies[0];
-            gpu_task->flow_nb_elts[flowidx] = data->nb_elts;
+            gpu_task->flow_info[flowidx].flow_span = data->span;
           }
-          /* need to mark the flow WRITE to convince PaRSEC that the data changed */
-          ((parsec_flow_t *)gpu_task->flow[flowidx])->flow_flags |= PARSEC_FLOW_ACCESS_WRITE;
+          /* need to mark the flow RW to make PaRSEC happy */
+          ((parsec_flow_t *)gpu_task->flow_info[flowidx].flow)->flow_flags |= PARSEC_FLOW_ACCESS_WRITE;
           gpu_task->pushout |= 1<<flowidx;
         }
       };