From f92bde3625289cb59d7e3542d83c1920daf1493c Mon Sep 17 00:00:00 2001
From: SeverinDiederichs <severin.diederichs@cern.ch>
Date: Fri, 3 Oct 2025 12:01:16 +0200
Subject: [PATCH 1/2] sync injection via HostCallBacks and cuda event

---
 include/AdePT/core/AsyncAdePTTransport.cuh | 64 +++++++++++++---------
 1 file changed, 38 insertions(+), 26 deletions(-)
diff --git a/include/AdePT/core/AsyncAdePTTransport.cuh b/include/AdePT/core/AsyncAdePTTransport.cuh
index 1c9e1dcc..dbd11e01 100644
--- a/include/AdePT/core/AsyncAdePTTransport.cuh
+++ b/include/AdePT/core/AsyncAdePTTransport.cuh
@@ -842,12 +842,15 @@ void TransportLoop(int trackCapacity, int leakCapacity, int scoringCapacity, int
   // Auxiliary struct used to keep track of the queues that need flushing
   AllLeaked allLeaked{nullptr, nullptr, nullptr};
 
-  cudaEvent_t cudaEvent, cudaStatsEvent;
+  cudaEvent_t cudaEvent, cudaStatsEvent, enqueueDoneEvent;
   cudaStream_t hitTransferStream, injectStream, extractStream, statsStream, interactionStream;
   COPCORE_CUDA_CHECK(cudaEventCreateWithFlags(&cudaEvent, cudaEventDisableTiming));
   COPCORE_CUDA_CHECK(cudaEventCreateWithFlags(&cudaStatsEvent, cudaEventDisableTiming));
+  COPCORE_CUDA_CHECK(cudaEventCreateWithFlags(&enqueueDoneEvent, cudaEventDisableTiming));
   unique_ptr_cuda<cudaEvent_t> cudaEventCleanup{&cudaEvent};
   unique_ptr_cuda<cudaEvent_t> cudaStatsEventCleanup{&cudaStatsEvent};
+  unique_ptr_cuda<cudaEvent_t> enqueueDoneEventCleanup{&enqueueDoneEvent};
+  cudaEventRecord(enqueueDoneEvent, gpuState.stream); // prime first event to avoid possible stall on first iteration
   COPCORE_CUDA_CHECK(cudaStreamCreate(&hitTransferStream));
   COPCORE_CUDA_CHECK(cudaStreamCreate(&injectStream));
   COPCORE_CUDA_CHECK(cudaStreamCreate(&extractStream));
@@ -979,15 +982,12 @@ void TransportLoop(int trackCapacity, int leakCapacity, int scoringCapacity, int
       // *** Particle injection ***
       // --------------------------
       if (gpuState.injectState == InjectState::Idle) {
-        for (auto &eventState : eventStates) {
-          if (const auto state = eventState.load(std::memory_order_acquire); state == EventState::G4RequestsFlush) {
-            eventState = EventState::Inject;
-          } else if (state == EventState::Inject) {
-            eventState = EventState::InjectionCompleted;
-          }
-        }
 
         if (auto &toDevice = trackBuffer.getActiveBuffer(); toDevice.nTrack > 0) {
+
+          // there are actually tracks that are injected, move state machine
+          AdvanceEventStates(EventState::G4RequestsFlush, EventState::Inject, eventStates);
+
           gpuState.injectState = InjectState::CreatingSlots;
 
           trackBuffer.swapToDeviceBuffers();
@@ -1016,20 +1016,36 @@ void TransportLoop(int trackCapacity, int leakCapacity, int scoringCapacity, int
           // Ensure that copy operation completed before releasing lock on to-device buffer
           COPCORE_CUDA_CHECK(cudaEventSynchronize(cudaEvent));
         } else {
-          // No tracks in to-device buffer
-          // Move tracks that requested a flush to InjectionCompleted
-          gpuState.injectState = InjectState::Idle;
+          // No tracks in to-device buffer, we can immediately mark the Injection as completed
+          AdvanceEventStates(EventState::G4RequestsFlush, EventState::InjectionCompleted, eventStates);
         }
       }
 
       // *** Enqueue particles that are ready on the device ***
       if (gpuState.injectState == InjectState::ReadyToEnqueue) {
-        gpuState.injectState = InjectState::Enqueueing;
+
+        // struct needed for hostCallBack function to change both event and injection state
+        struct EnqueueDoneCtx {
+          std::vector<std::atomic<EventState>> *eventStates;
+          std::atomic<InjectState> *injectState;
+        };
+
+        // Enqueue into per-particle queues
         EnqueueTracks<<<1, 256, 0, gpuState.stream>>>(allParticleQueues, gpuState.injectionQueue);
-        // New injection has to wait until particles are enqueued:
-        waitForOtherStream(injectStream, gpuState.stream);
-      } else if (gpuState.injectState == InjectState::Enqueueing) {
-        gpuState.injectState = InjectState::Idle;
+
+        // Event marks enqueue completion for this iteration
+        COPCORE_CUDA_CHECK(cudaEventRecord(enqueueDoneEvent, gpuState.stream));
+
+        auto *ctx = new EnqueueDoneCtx{&eventStates, &gpuState.injectState};
+        COPCORE_CUDA_CHECK(cudaLaunchHostFunc(
+            gpuState.stream,
+            [](void *arg) {
+              auto *ctx = static_cast<EnqueueDoneCtx *>(arg);
+              AdvanceEventStates(EventState::Inject, EventState::InjectionCompleted, *ctx->eventStates);
+              ctx->injectState->store(InjectState::Idle, std::memory_order_release);
+              delete ctx;
+            },
+            ctx));
       }
 
       // ------------------
@@ -1193,6 +1209,9 @@ void TransportLoop(int trackCapacity, int leakCapacity, int scoringCapacity, int
         AdvanceEventStates(EventState::Transporting, EventState::WaitingForTransportToFinish, eventStates);
         AdvanceEventStates(EventState::InjectionCompleted, EventState::Transporting, eventStates);
 
+        // ensure that enqueuing is finished. The population count should see the injected tracks
+        COPCORE_CUDA_CHECK(cudaStreamWaitEvent(statsStream, enqueueDoneEvent, 0));
+
         // Reset all counters count the currently flying population
         ZeroEventCounters<<<1, 256, 0, statsStream>>>(gpuState.stats_dev);
         CountCurrentPopulation<<<ParticleType::NumParticleTypes, 128, 0, statsStream>>>(
@@ -1489,22 +1508,15 @@ void TransportLoop(int trackCapacity, int leakCapacity, int scoringCapacity, int
       {
         inFlight  = 0;
         numLeaked = 0;
-        // FIXME: Synchronizing with the injectionStream here, along with the stats stream is not
-        // needed by the logic. It is a temporary fix to a long-standing bug where some particles may
-        // be injected after their events have already finished. This synchronization doesn't cause
-        // any measurable slowdown, but it should be removed once the underlying cause of the bug is
-        // understood and fixed
-        COPCORE_CUDA_CHECK(cudaEventRecord(cudaEvent, injectStream));
+
         // Synchronize with stats count before taking decisions
-        cudaError_t result, injectResult;
-        while ((result = cudaEventQuery(cudaStatsEvent)) == cudaErrorNotReady ||
-               (injectResult = cudaEventQuery(cudaEvent)) == cudaErrorNotReady) {
+        cudaError_t result;
+        while ((result = cudaEventQuery(cudaStatsEvent)) == cudaErrorNotReady) {
           // Cuda uses a busy wait. This reduces CPU consumption by 50%:
           using namespace std::chrono_literals;
           std::this_thread::sleep_for(50us);
         }
         COPCORE_CUDA_CHECK(result);
-        COPCORE_CUDA_CHECK(injectResult);
 
         for (int i = 0; i < ParticleType::NumParticleTypes; i++) {
           inFlight += gpuState.stats->inFlight[i];

From 583df62cada87e1ceef5a79d571ec1500d1ffc7d Mon Sep 17 00:00:00 2001
From: SeverinDiederichs <severin.diederichs@cern.ch>
Date: Fri, 3 Oct 2025 12:05:07 +0200
Subject: [PATCH 2/2] add missed line

---
 include/AdePT/core/AsyncAdePTTransport.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/AdePT/core/AsyncAdePTTransport.cuh b/include/AdePT/core/AsyncAdePTTransport.cuh
index dbd11e01..80f5b5e9 100644
--- a/include/AdePT/core/AsyncAdePTTransport.cuh
+++ b/include/AdePT/core/AsyncAdePTTransport.cuh
@@ -1023,6 +1023,7 @@ void TransportLoop(int trackCapacity, int leakCapacity, int scoringCapacity, int
 
       // *** Enqueue particles that are ready on the device ***
       if (gpuState.injectState == InjectState::ReadyToEnqueue) {
+        gpuState.injectState = InjectState::Enqueueing;
 
         // struct needed for hostCallBack function to change both event and injection state
         struct EnqueueDoneCtx {