Commit e79e08e

[Backend] Make sure membar works on warpgroup partitions (triton-lang#6441)
Alias analysis was not propagating shared memory aliases through warp specialize captures into partition regions. As a result, Membar was not actually analyzing the shared memory accesses within partitions or inserting barriers for them. This can sometimes cause kernels to hang, for example when they do not otherwise synchronize on mbarrier waits.
1 parent a0cc214 commit e79e08e
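
For context, a minimal illustrative sketch of the IR shape at issue (simplified from the lit test added to test/Analysis/test-membar.mlir below; the #shared encoding, the %c constant, and the single-partition form are abbreviations, not the commit's code). The partition block argument %arg0 is bound to the captured allocation %0, so Membar must learn that stores through %arg0 touch %0 and separate them with a barrier:

%0 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>
ttg.warp_specialize(%0)
default {
  ttg.warp_yield
}
partition0(%arg0: !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>) num_warps(2) {
  %c = arith.constant dense<0.0> : tensor<16x16xf16>
  // Before this fix, Membar did not know %arg0 aliases %0 and would not place a
  // gpu.barrier between these two stores to the same shared-memory buffer.
  ttg.local_store %c, %arg0 : tensor<16x16xf16> -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>
  ttg.local_store %c, %arg0 : tensor<16x16xf16> -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>
  ttg.warp_return
} : (!ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>) -> ()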

File tree

5 files changed: +73, -2 lines changed

include/triton/Analysis/Alias.h

Lines changed: 5 additions & 0 deletions
@@ -89,6 +89,11 @@ class SharedMemoryAliasAnalysis
   visitOperation(Operation *op,
                  ArrayRef<const dataflow::Lattice<AliasInfo> *> operands,
                  ArrayRef<dataflow::Lattice<AliasInfo> *> results) override;
+
+  void visitNonControlFlowArguments(
+      Operation *op, const RegionSuccessor &successor,
+      ArrayRef<dataflow::Lattice<AliasInfo> *> argLattices,
+      unsigned firstIndex) override;
 };
 
 } // namespace mlir

lib/Analysis/Alias.cpp

Lines changed: 24 additions & 0 deletions
@@ -58,6 +58,30 @@ LogicalResult SharedMemoryAliasAnalysis::visitOperation(
   return success();
 }
 
+void SharedMemoryAliasAnalysis::visitNonControlFlowArguments(
+    Operation *op, const RegionSuccessor &successor,
+    ArrayRef<dataflow::Lattice<AliasInfo> *> argLattices, unsigned firstIndex) {
+  auto wsOp = dyn_cast<triton::gpu::WarpSpecializePartitionsOp>(op);
+  if (!wsOp) {
+    setAllToEntryStates(argLattices.take_front(firstIndex));
+    setAllToEntryStates(argLattices.drop_front(
+        firstIndex + successor.getSuccessorInputs().size()));
+    return;
+  }
+
+  // Propagate aliases from the parent operation's operands to the block
+  // arguments.
+  assert(!successor.isParent());
+  ProgramPoint *point = getProgramPointAfter(wsOp);
+
+  for (auto [capture, argLattice] :
+       llvm::zip(wsOp.getParentOp().getExplicitCaptures(), argLattices)) {
+    propagateIfChanged(
+        argLattice,
+        argLattice->join(getLatticeElementFor(point, capture)->getValue()));
+  }
+}
+
 AliasResult SharedMemoryAliasAnalysis::alias(Value lhs, Value rhs) {
   // TODO: implement
   return AliasResult::MayAlias;
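
As a usage note, the sketch below is my own and not part of the commit: it shows how a consumer such as AllocationAnalysis might query the propagated lattice after loading SharedMemoryAliasAnalysis into a DataFlowSolver, mirroring the setup visible in the Allocation.cpp hunk below. It assumes AliasInfo exposes its allocation set via getAllocs() and that any prerequisite analyses are loaded elsewhere; with this commit, a partition block argument bound to a captured shared-memory allocation should now report that allocation.

#include "mlir/Analysis/DataFlowFramework.h"
#include "triton/Analysis/Alias.h"

using namespace mlir;

// Hypothetical helper, for illustration only: after the solver runs, print the
// shared-memory allocations that the analysis considers aliased by value `v`
// (e.g. a warp-specialize partition block argument).
static void dumpSharedMemAliases(Operation *root, Value v) {
  DataFlowSolver solver;
  solver.load<SharedMemoryAliasAnalysis>();
  if (failed(solver.initializeAndRun(root)))
    return;
  if (const auto *lattice = solver.lookupState<dataflow::Lattice<AliasInfo>>(v))
    for (Value alloc : lattice->getValue().getAllocs())
      alloc.dump();
}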

lib/Analysis/Allocation.cpp

Lines changed: 1 addition & 1 deletion
@@ -332,7 +332,7 @@ class AllocationAnalysis {
     solver->load<SharedMemoryAliasAnalysis>();
     // Run the analysis rooted at every isolated from above operation, including
     // the top-level function but also any nested regions.
-    operation->walk([&](Operation *op) {
+    operation->walk<mlir::WalkOrder::PreOrder>([&](Operation *op) {
       if (op->hasTrait<OpTrait::IsIsolatedFromAbove>() &&
           failed(solver->initializeAndRun(op))) {
         // TODO: return error instead of bailing out..
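
One note on the walk-order change above: mlir::Operation::walk defaults to WalkOrder::PostOrder, which reaches nested isolated-from-above operations before the enclosing one, whereas a pre-order walk visits the enclosing function first, presumably so its lattices already exist in the shared solver when nested isolated regions are initialized. Below is a small sketch of the resulting visitation order; the printAnalysisRoots helper is hypothetical and not part of the commit.

#include "mlir/IR/Operation.h"
#include "llvm/Support/raw_ostream.h"

// Illustration only: with a pre-order walk, the outermost isolated-from-above op
// (e.g. the top-level function) is printed before any nested isolated ops,
// matching the order in which the analysis is now rooted at them.
static void printAnalysisRoots(mlir::Operation *operation) {
  operation->walk<mlir::WalkOrder::PreOrder>([](mlir::Operation *op) {
    if (op->hasTrait<mlir::OpTrait::IsIsolatedFromAbove>())
      llvm::errs() << "rooting analysis at: " << op->getName() << "\n";
  });
}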

test/Analysis/test-membar.mlir

Lines changed: 42 additions & 0 deletions
@@ -1000,3 +1000,45 @@ module attributes {"ttg.num-warps" = 4 : i32} {
   tt.return
 }
 }
+
+// -----
+
+#shared = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
+
+module attributes {"ttg.num-warps" = 4 : i32, "ttg.target" = "cuda:80"} {
+
+// CHECK-LABEL: @membar_alias_through_warp_specialize
+tt.func @membar_alias_through_warp_specialize() {
+  %0 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>
+  ttg.warp_specialize(%0)
+  default {
+    ttg.warp_yield
+  }
+  // CHECK: partition0
+  partition0(%arg0: !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>) num_warps(2) {
+    %c0 = arith.constant 0 : i32
+    %1 = ttg.memdesc_subview %arg0[%c0, %c0] : !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable> -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>
+    %c = arith.constant dense<0.0> : tensor<16x16xf16>
+    // CHECK: local_store
+    ttg.local_store %c, %1 : tensor<16x16xf16> -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>
+    // CHECK-NEXT: gpu.barrier
+    // CHECK-NEXT: local_store
+    ttg.local_store %c, %1 : tensor<16x16xf16> -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>
+    ttg.warp_return
+  }
+  // CHECK: partition1
+  partition1(%arg0: !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>) num_warps(2) {
+    %c0 = arith.constant 0 : i32
+    %1 = ttg.memdesc_subview %arg0[%c0, %c0] : !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable> -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>
+    %c = arith.constant dense<0.0> : tensor<16x16xf16>
+    // CHECK: local_store
+    ttg.local_store %c, %1 : tensor<16x16xf16> -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>
+    // CHECK-NEXT: gpu.barrier
+    // CHECK-NEXT: local_store
+    ttg.local_store %c, %1 : tensor<16x16xf16> -> !ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>
+    ttg.warp_return
+  } : (!ttg.memdesc<16x16xf16, #shared, #ttg.shared_memory, mutable>) -> ()
+  tt.return
+}
+
+}

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertWarpSpecializeToLLVM.cpp

Lines changed: 1 addition & 1 deletion
@@ -275,7 +275,7 @@ static void rewritePartitionRegions(WarpSpecializeOp ws, Block *switchLoop,
 
   // Rewrite all warp returns.
   partition->walk([&](WarpReturnOp op) {
-    b.setInsertionPoint(op);
+    TritonLLVMIRRewriter b(op.getLoc(), op);
     createBarrier(b, kSwitchLoopBarrierIdx, /*numThreads=*/std::nullopt,
                   /*aligned=*/false);
     b.replaceOpWithNewOp<LLVM::BrOp>(op, switchLoop);
