From f92bde3625289cb59d7e3542d83c1920daf1493c Mon Sep 17 00:00:00 2001 From: SeverinDiederichs Date: Fri, 3 Oct 2025 12:01:16 +0200 Subject: [PATCH 1/2] sync injection via HostCallBacks and cuda event --- include/AdePT/core/AsyncAdePTTransport.cuh | 64 +++++++++++++--------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/include/AdePT/core/AsyncAdePTTransport.cuh b/include/AdePT/core/AsyncAdePTTransport.cuh index 1c9e1dcc..dbd11e01 100644 --- a/include/AdePT/core/AsyncAdePTTransport.cuh +++ b/include/AdePT/core/AsyncAdePTTransport.cuh @@ -842,12 +842,15 @@ void TransportLoop(int trackCapacity, int leakCapacity, int scoringCapacity, int // Auxiliary struct used to keep track of the queues that need flushing AllLeaked allLeaked{nullptr, nullptr, nullptr}; - cudaEvent_t cudaEvent, cudaStatsEvent; + cudaEvent_t cudaEvent, cudaStatsEvent, enqueueDoneEvent; cudaStream_t hitTransferStream, injectStream, extractStream, statsStream, interactionStream; COPCORE_CUDA_CHECK(cudaEventCreateWithFlags(&cudaEvent, cudaEventDisableTiming)); COPCORE_CUDA_CHECK(cudaEventCreateWithFlags(&cudaStatsEvent, cudaEventDisableTiming)); + COPCORE_CUDA_CHECK(cudaEventCreateWithFlags(&enqueueDoneEvent, cudaEventDisableTiming)); unique_ptr_cuda cudaEventCleanup{&cudaEvent}; unique_ptr_cuda cudaStatsEventCleanup{&cudaStatsEvent}; + unique_ptr_cuda enqueueDoneEventCleanup{&enqueueDoneEvent}; + cudaEventRecord(enqueueDoneEvent, gpuState.stream); // prime first event to avoid possible stall on first iteration COPCORE_CUDA_CHECK(cudaStreamCreate(&hitTransferStream)); COPCORE_CUDA_CHECK(cudaStreamCreate(&injectStream)); COPCORE_CUDA_CHECK(cudaStreamCreate(&extractStream)); @@ -979,15 +982,12 @@ void TransportLoop(int trackCapacity, int leakCapacity, int scoringCapacity, int // *** Particle injection *** // -------------------------- if (gpuState.injectState == InjectState::Idle) { - for (auto &eventState : eventStates) { - if (const auto state = eventState.load(std::memory_order_acquire); state == EventState::G4RequestsFlush) { - eventState = EventState::Inject; - } else if (state == EventState::Inject) { - eventState = EventState::InjectionCompleted; - } - } if (auto &toDevice = trackBuffer.getActiveBuffer(); toDevice.nTrack > 0) { + + // there are actually tracks that are injected, move state machine + AdvanceEventStates(EventState::G4RequestsFlush, EventState::Inject, eventStates); + gpuState.injectState = InjectState::CreatingSlots; trackBuffer.swapToDeviceBuffers(); @@ -1016,20 +1016,36 @@ void TransportLoop(int trackCapacity, int leakCapacity, int scoringCapacity, int // Ensure that copy operation completed before releasing lock on to-device buffer COPCORE_CUDA_CHECK(cudaEventSynchronize(cudaEvent)); } else { - // No tracks in to-device buffer - // Move tracks that requested a flush to InjectionCompleted - gpuState.injectState = InjectState::Idle; + // No tracks in to-device buffer, we can immediately mark the Injection as completed + AdvanceEventStates(EventState::G4RequestsFlush, EventState::InjectionCompleted, eventStates); } } // *** Enqueue particles that are ready on the device *** if (gpuState.injectState == InjectState::ReadyToEnqueue) { - gpuState.injectState = InjectState::Enqueueing; + + // struct needed for hostCallBack function to change both event and injection state + struct EnqueueDoneCtx { + std::vector> *eventStates; + std::atomic *injectState; + }; + + // Enqueue into per-particle queues EnqueueTracks<<<1, 256, 0, gpuState.stream>>>(allParticleQueues, gpuState.injectionQueue); - // New injection has to wait until particles are enqueued: - waitForOtherStream(injectStream, gpuState.stream); - } else if (gpuState.injectState == InjectState::Enqueueing) { - gpuState.injectState = InjectState::Idle; + + // Event marks enqueue completion for this iteration + COPCORE_CUDA_CHECK(cudaEventRecord(enqueueDoneEvent, gpuState.stream)); + + auto *ctx = new EnqueueDoneCtx{&eventStates, &gpuState.injectState}; + COPCORE_CUDA_CHECK(cudaLaunchHostFunc( + gpuState.stream, + [](void *arg) { + auto *ctx = static_cast(arg); + AdvanceEventStates(EventState::Inject, EventState::InjectionCompleted, *ctx->eventStates); + ctx->injectState->store(InjectState::Idle, std::memory_order_release); + delete ctx; + }, + ctx)); } // ------------------ @@ -1193,6 +1209,9 @@ void TransportLoop(int trackCapacity, int leakCapacity, int scoringCapacity, int AdvanceEventStates(EventState::Transporting, EventState::WaitingForTransportToFinish, eventStates); AdvanceEventStates(EventState::InjectionCompleted, EventState::Transporting, eventStates); + // ensure that enqueuing is finished. The population count should see the injected tracks + COPCORE_CUDA_CHECK(cudaStreamWaitEvent(statsStream, enqueueDoneEvent, 0)); + // Reset all counters count the currently flying population ZeroEventCounters<<<1, 256, 0, statsStream>>>(gpuState.stats_dev); CountCurrentPopulation<<>>( @@ -1489,22 +1508,15 @@ void TransportLoop(int trackCapacity, int leakCapacity, int scoringCapacity, int { inFlight = 0; numLeaked = 0; - // FIXME: Synchronizing with the injectionStream here, along with the stats stream is not - // needed by the logic. It is a temporary fix to a long-standing bug where some particles may - // be injected after their events have already finished. This synchronization doesn't cause - // any measurable slowdown, but it should be removed once the underlying cause of the bug is - // understood and fixed - COPCORE_CUDA_CHECK(cudaEventRecord(cudaEvent, injectStream)); + // Synchronize with stats count before taking decisions - cudaError_t result, injectResult; - while ((result = cudaEventQuery(cudaStatsEvent)) == cudaErrorNotReady || - (injectResult = cudaEventQuery(cudaEvent)) == cudaErrorNotReady) { + cudaError_t result; + while ((result = cudaEventQuery(cudaStatsEvent)) == cudaErrorNotReady) { // Cuda uses a busy wait. This reduces CPU consumption by 50%: using namespace std::chrono_literals; std::this_thread::sleep_for(50us); } COPCORE_CUDA_CHECK(result); - COPCORE_CUDA_CHECK(injectResult); for (int i = 0; i < ParticleType::NumParticleTypes; i++) { inFlight += gpuState.stats->inFlight[i]; From 583df62cada87e1ceef5a79d571ec1500d1ffc7d Mon Sep 17 00:00:00 2001 From: SeverinDiederichs Date: Fri, 3 Oct 2025 12:05:07 +0200 Subject: [PATCH 2/2] add missed line --- include/AdePT/core/AsyncAdePTTransport.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/include/AdePT/core/AsyncAdePTTransport.cuh b/include/AdePT/core/AsyncAdePTTransport.cuh index dbd11e01..80f5b5e9 100644 --- a/include/AdePT/core/AsyncAdePTTransport.cuh +++ b/include/AdePT/core/AsyncAdePTTransport.cuh @@ -1023,6 +1023,7 @@ void TransportLoop(int trackCapacity, int leakCapacity, int scoringCapacity, int // *** Enqueue particles that are ready on the device *** if (gpuState.injectState == InjectState::ReadyToEnqueue) { + gpuState.injectState = InjectState::Enqueueing; // struct needed for hostCallBack function to change both event and injection state struct EnqueueDoneCtx {