Skip to content

Commit a2dbe99

Browse files
pthamminigcbot
authored and committed
Replace Atomic Fence with GenISA_source_value
Replace Atomic Fence with GenISA_source_value
1 parent 5bffd05 commit a2dbe99

File tree

3 files changed

+156
-5
lines changed

3 files changed

+156
-5
lines changed

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8575,6 +8575,8 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst *inst) {
85758575
case GenISAIntrinsic::GenISA_source_value: {
85768576
m_encoder->Copy(m_currShader->GetNULL(), GetSymbol(inst->getOperand(0)));
85778577
m_encoder->Push();
8578+
m_encoder->Fence(false, false, false, false, false, false, false, true);
8579+
m_encoder->Push();
85788580
break;
85798581
}
85808582
case GenISAIntrinsic::GenISA_movcr: {

IGC/Compiler/Optimizer/SynchronizationObjectCoalescing.cpp

Lines changed: 151 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ SPDX-License-Identifier: MIT
1717
#include "Compiler/IGCPassSupport.h"
1818
#include "SynchronizationObjectCoalescing.hpp"
1919
#include "visa_igc_common_header.h"
20+
#include "llvm/IR/IRBuilder.h"
21+
#include "llvm/Analysis/CFG.h"
2022
#include <utility>
2123
#include <map>
2224

@@ -285,6 +287,9 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
285287
static_cast<SynchronizationCaseMask>(WriteSyncRead | WriteSyncWrite | AtomicSyncRead | AtomicSyncWrite |
286288
WriteSyncAtomic | ReadSyncAtomic | ReadSyncWrite | AtomicSyncAtomic);
287289

290+
////////////////////////////////////////////////////////////////////////
291+
void CreateSourceValueInst(std::vector<llvm::Instruction *> &pAtomicInstToBeSourced, llvm::Instruction *pFenceInst);
292+
288293
////////////////////////////////////////////////////////////////////////
289294
void EraseRedundantInst(llvm::Instruction *pInst);
290295

@@ -327,6 +332,7 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
327332

328333
////////////////////////////////////////////////////////////////////////
329334
bool IsRequiredForAtomicOperationsOrdering(const llvm::Instruction *pSourceInst,
335+
std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
330336
bool onlyGlobalAtomics = false) const;
331337

332338
////////////////////////////////////////////////////////////////////////
@@ -440,6 +446,7 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
440446
std::vector<llvm::Instruction *> m_LscMemoryFences;
441447
std::vector<llvm::Instruction *> m_UntypedMemoryFences;
442448
std::vector<llvm::Instruction *> m_ThreadGroupBarriers;
449+
std::unordered_set<llvm::Instruction *> m_SourcedAtomicInstructions;
443450

444451
// this variable holds a mapping from a basic block to its memory instructions ordered by their occurrences in it
445452
// (the initial index of line of this basic block - the number of instructions preceding an instruction in its basic
@@ -538,6 +545,107 @@ bool SynchronizationObjectCoalescing::ProcessFunction() {
538545
return FindRedundancies();
539546
}
540547

548+
// Referenced from MemoryModelPass
549+
inline PHINode *FindDominatingPhi(DominatorTree &DT, Instruction *def, BasicBlock *postDominator) {
550+
IGC_ASSERT(def->getParent() != postDominator);
551+
IGC_ASSERT(!DT.dominates(def, postDominator));
552+
SmallPtrSet<PHINode *, 8> seen;
553+
SmallVector<User *, 8> worklist(def->users());
554+
while (!worklist.empty()) {
555+
PHINode *phi = dyn_cast<PHINode>(worklist.pop_back_val());
556+
if (phi == nullptr || seen.count(phi) > 0) {
557+
continue;
558+
}
559+
if (phi->getParent() == postDominator || DT.dominates(phi, postDominator)) {
560+
return phi;
561+
}
562+
seen.insert(phi);
563+
}
564+
return nullptr;
565+
}
566+
567+
////////////////////////////////////////////////////////////////////////
568+
/// @brief Fence Instruction responsible for only ordering of atomic Instructions
569+
/// can be replaced with Source Value Intrinsic which will still maintain
570+
/// the order of Instructions
571+
void SynchronizationObjectCoalescing::CreateSourceValueInst(std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
572+
llvm::Instruction *pFenceInst) {
573+
IGC_ASSERT(pAtomicInstToBeSourced.size() > 0);
574+
// reversing the list to source the atomic instructions in the order
575+
reverse(pAtomicInstToBeSourced.begin(), pAtomicInstToBeSourced.end());
576+
Function *funcPtr = GenISAIntrinsic::getDeclaration(pFenceInst->getModule(), GenISAIntrinsic::GenISA_source_value);
577+
BasicBlock *fenceBB = pFenceInst->getParent();
578+
579+
Function *F = pAtomicInstToBeSourced[0]->getFunction();
580+
DominatorTree DT(*F);
581+
PostDominatorTree PDT(*F);
582+
LoopInfo LI(DT);
583+
584+
for (llvm::Instruction *atomicInst : pAtomicInstToBeSourced) {
585+
// Making sure that the Fence Inst is potentially reachable from the atomic Instruction.
586+
if (!isPotentiallyReachable(atomicInst, pFenceInst, nullptr, &DT, &LI)) {
587+
continue;
588+
}
589+
590+
BasicBlock *atomicBB = atomicInst->getParent();
591+
BasicBlock *fenceDominator = fenceBB;
592+
Instruction *insertPoint = atomicBB->getTerminator();
593+
Value *sourceVal = cast<GenIntrinsicInst>(atomicInst);
594+
595+
// TODO: Determining Insert point can be improved which can postpone the source value intrinsic as long as possible.
596+
// Similar analysis is done in FindOptimalInsertPoints() in ApplyCacheControls.cpp
597+
598+
// Check if fence Instruction BB post dominates atomic Instruction BB
599+
// Else find the BB that is a predecessor of fence BB and post dominates atomic BB.
600+
// If we don't find one, then the insert point is near the terminator of atomic BB
601+
while (fenceDominator && fenceDominator != atomicBB) {
602+
if (PDT.dominates(fenceDominator, atomicBB)) {
603+
// If fence instruction is in same BB, then use fence as insert point
604+
// Else use the terminator of fenceDominator as insert point
605+
insertPoint = fenceBB == fenceDominator ? pFenceInst : fenceDominator->getTerminator();
606+
// It's possible that the atomic instruction does not dominate
607+
// the post-dominator, find a PHI user of the atomic instruction
608+
// that dominates the post-dominator.
609+
if (!DT.dominates(atomicBB, fenceDominator)) {
610+
PHINode *phi = FindDominatingPhi(DT, atomicInst, fenceDominator);
611+
if (phi) {
612+
sourceVal = phi;
613+
} else {
614+
// Fallback to inserting the source value in the basic
615+
// block with the atomic instruction.
616+
insertPoint = atomicBB->getTerminator();
617+
}
618+
}
619+
break;
620+
}
621+
fenceDominator = fenceDominator->getSinglePredecessor();
622+
}
623+
// If Fence is present in same BB as atomic, then insert at Fence Instruction
624+
if (fenceBB == atomicBB) {
625+
insertPoint = pFenceInst;
626+
}
627+
628+
IRBuilder<> builder(insertPoint);
629+
Type *sourceValType = sourceVal->getType();
630+
631+
// Source value intrinsic accepts only i32.
632+
if (sourceValType->isIntegerTy()) {
633+
sourceVal = builder.CreateZExtOrTrunc(sourceVal, builder.getInt32Ty());
634+
} else if (sourceValType->isFloatingPointTy()) {
635+
if (sourceValType->isFloatTy()) {
636+
sourceVal = builder.CreateBitCast(sourceVal, builder.getInt32Ty());
637+
} else {
638+
sourceVal = builder.CreateFPToUI(sourceVal, builder.getInt32Ty());
639+
}
640+
} else {
641+
IGC_ASSERT_MESSAGE(0, "Unexpected type");
642+
}
643+
644+
builder.CreateCall(funcPtr, {sourceVal});
645+
m_SourcedAtomicInstructions.insert(atomicInst);
646+
}
647+
}
648+
541649
////////////////////////////////////////////////////////////////////////
542650
void SynchronizationObjectCoalescing::EraseRedundantInst(llvm::Instruction *pInst) {
543651
bool isFence = IsFenceOperation(pInst);
@@ -740,7 +848,13 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
740848
}
741849
SynchronizationCaseMask referenceSyncCaseMask = GetStrictSynchronizationMask(pInst);
742850
bool isObligatory = (syncCaseMask & referenceSyncCaseMask) != 0;
743-
isObligatory |= IsRequiredForAtomicOperationsOrdering(pInst, true /*onlyGlobalAtomics*/);
851+
852+
std::vector<llvm::Instruction *> atomicInstToBeSourced;
853+
if (!isObligatory) {
854+
isObligatory =
855+
IsRequiredForAtomicOperationsOrdering(pInst, atomicInstToBeSourced, true /*onlyGlobalAtomics*/);
856+
}
857+
744858
bool verifyUnsynchronizedInstructions = IsFenceOperation(pInst);
745859
verifyUnsynchronizedInstructions &= (!isObligatory || syncCaseMask == ReadSyncWrite);
746860

@@ -767,6 +881,9 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
767881
#if _DEBUG
768882
RegisterRedundancyExplanation(pInst, ExplanationEntry::GlobalMemoryRedundancy);
769883
#endif // _DEBUG
884+
if (IGC_IS_FLAG_ENABLED(ReplaceAtomicFenceWithSourceValue) && atomicInstToBeSourced.size() > 0) {
885+
CreateSourceValueInst(atomicInstToBeSourced, const_cast<Instruction *>(pInst));
886+
}
770887
EraseRedundantGlobalScope(pInst);
771888
isModified = true;
772889
SetLocalMemoryInstructionMask();
@@ -831,7 +948,12 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
831948
GetSynchronizationMaskForAllResources(localForwardMemoryInstructionMask, localBackwardMemoryInstructionMask);
832949
SynchronizationCaseMask referenceSyncCaseMask = GetStrictSynchronizationMask(pInst);
833950
bool isObligatory = (syncCaseMask & referenceSyncCaseMask) != 0;
834-
isObligatory |= IsRequiredForAtomicOperationsOrdering(pInst);
951+
952+
std::vector<llvm::Instruction *> atomicInstToBeSourced;
953+
if (!isObligatory) {
954+
isObligatory = IsRequiredForAtomicOperationsOrdering(pInst, atomicInstToBeSourced);
955+
}
956+
835957
bool verifyUnsynchronizedInstructions = IsFenceOperation(pInst);
836958
verifyUnsynchronizedInstructions &= (!isObligatory || syncCaseMask == ReadSyncWrite);
837959

@@ -847,6 +969,9 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
847969
#if _DEBUG
848970
RegisterRedundancyExplanation(pInst, ExplanationEntry::StrictRedundancy);
849971
#endif // _DEBUG
972+
if (IGC_IS_FLAG_ENABLED(ReplaceAtomicFenceWithSourceValue) && atomicInstToBeSourced.size() > 0) {
973+
CreateSourceValueInst(atomicInstToBeSourced, const_cast<Instruction *>(pInst));
974+
}
850975
EraseRedundantInst(pInst);
851976
isModified = true;
852977
}
@@ -1731,8 +1856,9 @@ SynchronizationObjectCoalescing::GetUnsynchronizedForwardInstructionMask(const l
17311856
/// operations present before the fence (in program order)
17321857
/// @param pSourceInst the source synchronization instruction
17331858
/// @param onlyGlobalAtomics check only TGM and UGM atomic operations
1734-
bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(const llvm::Instruction *pSourceInst,
1735-
bool onlyGlobalAtomics /*= false*/) const {
1859+
bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(
1860+
const llvm::Instruction *pSourceInst, std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
1861+
bool onlyGlobalAtomics /*= false*/) const {
17361862
if (!IsFenceOperation(pSourceInst)) {
17371863
// Not a fence, nothing to check
17381864
return false;
@@ -1782,6 +1908,10 @@ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(cons
17821908
{
17831909
isPotentiallyUnsynchronizedAtomic = false;
17841910
// Lambda that checks if a fence operation synchronizes the atomic operation.
1911+
// This can be improved to detect the users of atomic instruction and end the search for fences once we find the
1912+
// user. This user is essentially same as Source Value Intrinsic, however it can be reordered in visa affecting
1913+
// the execution order of atomic instructions. If we can find a way to treat this user as a special instruction
1914+
// and avoid reordering, we can skip creating new source value instruction.
17851915
std::function<bool(const llvm::Instruction *)> IsBoundaryInst = [this, &atomicPointerMemoryInstructionMask,
17861916
&isPotentiallyUnsynchronizedAtomic,
17871917
pSourceInst](const llvm::Instruction *pInst) {
@@ -1840,7 +1970,22 @@ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(cons
18401970
if (!substituteFenceFound) {
18411971
// Found an atomic operation that requires the source fence
18421972
// instruction for correct memory ordering.
1843-
return true;
1973+
1974+
// If ReplaceAtomicFenceWithSourceValue is true, we can replace this fence with GenISA_source_value.
1975+
// This will source the atomic instruction and still maintains the order of atomic instructions.
1976+
// Else return true marking the fence instruction as Obligatory.
1977+
1978+
if (IGC_IS_FLAG_ENABLED(ReplaceAtomicFenceWithSourceValue)) {
1979+
// If a previous fence was replaced with source value intrinsic, GetVisibleMemoryInstructions will add the
1980+
// same atomic instruction again for the next fence resulting in multiple source value intrinsics but we need
1981+
// it to be sourced only once. Hence we check if it was already sourced previously. Continues to check all
1982+
// valid atomic Instructions to be sourced.
1983+
if (m_SourcedAtomicInstructions.find(const_cast<Instruction *>(pInst)) == m_SourcedAtomicInstructions.end()) {
1984+
pAtomicInstToBeSourced.push_back(const_cast<Instruction *>(pInst));
1985+
}
1986+
} else {
1987+
return true;
1988+
}
18441989
}
18451990
}
18461991
}
@@ -2002,6 +2147,7 @@ void SynchronizationObjectCoalescing::InvalidateMembers() {
20022147
m_OrderedFenceInstructionsInBasicBlockCache.clear();
20032148
m_OrderedBarrierInstructionsInBasicBlockCache.clear();
20042149
m_BasicBlockMemoryInstructionMaskCache.clear();
2150+
m_SourcedAtomicInstructions.clear();
20052151
#if _DEBUG
20062152
m_ExplanationEntries.clear();
20072153
#endif // _DEBUG

IGC/common/igc_flags.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,9 @@ DECLARE_IGC_REGKEY(
433433
"The mask is casted to IGC::SyncInstMask and informs which synchronization objects should not be coalesced. Note "
434434
"that synchronization objects classified in multiple types are not disabled if any bit describing them is off.",
435435
true)
436+
DECLARE_IGC_REGKEY(bool, ReplaceAtomicFenceWithSourceValue, true,
437+
"Fences are required to maintain the order of atomic memory instructions. This flag will replace the fence with "
438+
"GenISA_source_value intrinsic which sources the result of atomic operation and still maintains the order.", true)
436439
DECLARE_IGC_REGKEY(bool, UnrollLoopForCodeSizeOnly, false,
437440
"Only unroll the loop if it can reduce program size/register pressure. Ignore all other threshold "
438441
"setting but still enable EnablePromoteLoopUnrollwithAlloca due to high likelyhood to reduce size.",

0 commit comments

Comments
 (0)