From 4550c9664a96d2cae8705d85992ea21b6a2424ce Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Thu, 10 Apr 2025 11:15:27 +0000
Subject: [PATCH 01/22] cpu: fix bac memory leak (by @dhschall)
---
src/mem/cache/prefetch/fdp.cc | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/src/mem/cache/prefetch/fdp.cc b/src/mem/cache/prefetch/fdp.cc
index 6eff00a450d..95176a65457 100644
--- a/src/mem/cache/prefetch/fdp.cc
+++ b/src/mem/cache/prefetch/fdp.cc
@@ -171,23 +171,25 @@ FetchDirectedPrefetcher::translationComplete(PrefetchRequest *pfr, bool failed)
assert(cache != nullptr);
if (failed) {
- DPRINTF(HWPrefetch, "Translation of %#x failed\n", pfr->addr);
+ DPRINTF(HWPrefetch, "Translation of %#x failed\n", it->addr);
stats.translationFail++;
} else {
- DPRINTF(HWPrefetch, "Translation of %#x succeeded\n", pfr->addr);
+ DPRINTF(HWPrefetch, "Translation of %#x succeeded\n", it->addr);
stats.translationSuccess++;
- it->createPkt(curTick() + latency);
- stats.pfPacketsCreated++;
-
- if (cacheSnoop && (cache->inCache(pfr->pkt->getAddr(), pfr->pkt->isSecure())
- || (cache->inMissQueue(pfr->pkt->getAddr(), pfr->pkt->isSecure())))) {
+ if (cacheSnoop && (cache->inCache(
+ it->req->getPaddr(), it->req->isSecure()
+ ) || (cache->inMissQueue(
+ it->req->getPaddr(), it->req->isSecure()
+ )))) {
stats.pfInCache++;
DPRINTF(HWPrefetch, "Drop Packet. In Cache / MSHR\n");
} else {
-
+ it->createPkt(curTick() + latency);
+ stats.pfPacketsCreated++;
+
DPRINTF(HWPrefetch, "Addr: %#x Add packet to PFQ. pkt PA:%#x, "
- "PFQ sz:%i\n", pfr->addr, pfr->pkt->getAddr(), pfq.size());
-
+ "PFQ sz:%i\n", it->addr, it->pkt->getAddr(), pfq.size());
+
stats.pfCandidatesAdded++;
pfq.push_back(*it);
}
From 39a8609f14749e129a8549462ad6f99433441174 Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Thu, 10 Apr 2025 11:24:42 +0000
Subject: [PATCH 02/22] cpu: add bac branch predictor delay
---
src/cpu/o3/BaseO3CPU.py | 1 +
src/cpu/o3/bac.cc | 15 +++++++++++++++
src/cpu/o3/bac.hh | 6 ++++++
3 files changed, 22 insertions(+)
diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py
index a6906c16437..e1cc8358854 100644
--- a/src/cpu/o3/BaseO3CPU.py
+++ b/src/cpu/o3/BaseO3CPU.py
@@ -106,6 +106,7 @@ def support_take_over(cls):
# Forward pipeline delays
bacToFetchDelay = Param.Cycles(1, "Branch address calc. to fetch delay")
+ bacBranchPredictDelay = Param.Cycles(0, "BAC Branch Predictor delay")
fetchToDecodeDelay = Param.Cycles(1, "Fetch to decode delay")
decodeWidth = Param.Unsigned(8, "Decode width")
diff --git a/src/cpu/o3/bac.cc b/src/cpu/o3/bac.cc
index cbb11762693..b7dce266ae1 100644
--- a/src/cpu/o3/bac.cc
+++ b/src/cpu/o3/bac.cc
@@ -72,6 +72,7 @@ BAC::BAC(CPU *_cpu, const BaseO3CPUParams ¶ms)
decodeToFetchDelay(params.decodeToFetchDelay),
commitToFetchDelay(params.commitToFetchDelay),
bacToFetchDelay(params.bacToFetchDelay),
+ bacBranchPredictDelay(params.bacBranchPredictDelay),
fetchTargetWidth(params.fetchTargetWidth),
minInstSize(params.minInstSize),
numThreads(params.numThreads),
@@ -83,6 +84,7 @@ BAC::BAC(CPU *_cpu, const BaseO3CPUParams ¶ms)
for (int i = 0; i < MaxThreads; i++) {
bacPC[i].reset(params.isa[0]->newPCState());
stalls[i] = {false, false, false};
+ branchPredictRemaining[i] = Cycles(0);
}
assert(bpu!=nullptr);
@@ -401,12 +403,24 @@ BAC::checkSignalsAndUpdate(ThreadID tid)
return true;
}
+ if (branchPredictRemaining[tid] > Cycles(0)) {
+ --branchPredictRemaining[tid];
+ DPRINTF(BAC,
+ "[global] Stalling for Branch Predictor for %i more cycles.\n",
+ branchPredictRemaining[tid]
+ );
+ stalls[tid].bpu = true;
+ } else {
+ stalls[tid].bpu = false;
+ }
+
if (checkStall(tid)) {
// return block(tid);
bacStatus[tid] = Blocked;
return false;
}
+
// If at this point the FTQ is still invalid we need to wait for
// A resteer/squash signal.
if (!ftq->isValid(tid) && bacStatus[tid] != Idle) {
@@ -679,6 +693,7 @@ BAC::generateFetchTargets(ThreadID tid, bool &status_change)
// Now make the actual prediction. Note the BPU will advance
// the PC to the next instruction.
predict_taken = predict(tid, staticInst, curFT, *next_pc);
+ branchPredictRemaining[tid] = Cycles(bacBranchPredictDelay);
DPRINTF(BAC, "[tid:%i, ftn:%llu] Branch found at PC %#x "
"taken?:%i, target:%#x\n",
diff --git a/src/cpu/o3/bac.hh b/src/cpu/o3/bac.hh
index 8d6e59072b5..7e1f899bdf9 100644
--- a/src/cpu/o3/bac.hh
+++ b/src/cpu/o3/bac.hh
@@ -373,6 +373,9 @@ class BAC
*/
bool wroteToTimeBuffer;
+ /** Tracks remaining cycles that the branch predictor stalls BAC */
+ Cycles branchPredictRemaining[MaxThreads];
+
/** Source of possible stalls. */
struct Stalls
{
@@ -399,6 +402,9 @@ class BAC
/** BAC to fetch delay. */
const Cycles bacToFetchDelay;
+ /** BAC branch predict delay. */
+ const Cycles bacBranchPredictDelay;
+
/** The maximum width of a fetch target. This also determines the
* maximum addresses searched in one cycle. (FT width / minInstSize) */
const unsigned fetchTargetWidth;
From b2c1442fd7e648ad32a092855b26ee43041445da Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Thu, 10 Apr 2025 15:23:50 +0000
Subject: [PATCH 03/22] cpu: move cond pred out of bpred unit
---
src/cpu/minor/BaseMinorCPU.py | 5 +-
src/cpu/o3/BaseO3CPU.py | 5 +-
src/cpu/pred/2bit_local.cc | 8 +-
src/cpu/pred/2bit_local.hh | 7 +-
src/cpu/pred/BranchPredictor.py | 28 +++-
src/cpu/pred/SConscript | 2 +
src/cpu/pred/bi_mode.cc | 2 +-
src/cpu/pred/bi_mode.hh | 4 +-
src/cpu/pred/bpred_unit.cc | 24 ++--
src/cpu/pred/bpred_unit.hh | 97 +++----------
src/cpu/pred/conditional.cc | 52 +++++++
src/cpu/pred/conditional.hh | 148 ++++++++++++++++++++
src/cpu/pred/multiperspective_perceptron.cc | 2 +-
src/cpu/pred/multiperspective_perceptron.hh | 4 +-
src/cpu/pred/tage.cc | 4 +-
src/cpu/pred/tage.hh | 4 +-
src/cpu/pred/tagescl_ref.cc | 2 +-
src/cpu/pred/tagescl_ref.hh | 4 +-
src/cpu/pred/tournament.cc | 2 +-
src/cpu/pred/tournament.hh | 4 +-
20 files changed, 295 insertions(+), 113 deletions(-)
create mode 100644 src/cpu/pred/conditional.cc
create mode 100644 src/cpu/pred/conditional.hh
diff --git a/src/cpu/minor/BaseMinorCPU.py b/src/cpu/minor/BaseMinorCPU.py
index 545dfeaee56..5484a11a882 100644
--- a/src/cpu/minor/BaseMinorCPU.py
+++ b/src/cpu/minor/BaseMinorCPU.py
@@ -426,7 +426,10 @@ def support_take_over(cls):
)
branchPred = Param.BranchPredictor(
- TournamentBP(numThreads=Parent.numThreads), "Branch Predictor"
+ BranchPredictor(
+ conditionalBranchPred=TournamentBP(numThreads=Parent.numThreads)
+ ),
+ "Branch Predictor",
)
def addCheckerCpu(self):
diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py
index e1cc8358854..d97c1f2c7dd 100644
--- a/src/cpu/o3/BaseO3CPU.py
+++ b/src/cpu/o3/BaseO3CPU.py
@@ -210,7 +210,10 @@ def support_take_over(cls):
smtCommitPolicy = Param.CommitPolicy("RoundRobin", "SMT Commit Policy")
branchPred = Param.BranchPredictor(
- TournamentBP(numThreads=Parent.numThreads), "Branch Predictor"
+ BranchPredictor(
+ conditionalBranchPred=TournamentBP(numThreads=Parent.numThreads)
+ ),
+ "Branch Predictor",
)
needsTSO = Param.Bool(False, "Enable TSO Memory model")
diff --git a/src/cpu/pred/2bit_local.cc b/src/cpu/pred/2bit_local.cc
index 7c27355b20f..34b495bc218 100644
--- a/src/cpu/pred/2bit_local.cc
+++ b/src/cpu/pred/2bit_local.cc
@@ -52,7 +52,7 @@ namespace branch_prediction
{
LocalBP::LocalBP(const LocalBPParams ¶ms)
- : BPredUnit(params),
+ : ConditionalPredictor(params),
localPredictorSize(params.localPredictorSize),
localCtrBits(params.localCtrBits),
localPredictorSets(localPredictorSize / localCtrBits),
@@ -78,6 +78,12 @@ LocalBP::LocalBP(const LocalBPParams ¶ms)
instShiftAmt);
}
+void LocalBP::branchPlaceholder(ThreadID tid, Addr pc,
+ bool uncond, void * &bpHistory)
+{
+// Placeholder for a function that only returns history items
+}
+
void
LocalBP::updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken,
Addr target, const StaticInstPtr &inst,
diff --git a/src/cpu/pred/2bit_local.hh b/src/cpu/pred/2bit_local.hh
index 8d77289f966..bb539d37a9c 100644
--- a/src/cpu/pred/2bit_local.hh
+++ b/src/cpu/pred/2bit_local.hh
@@ -46,7 +46,7 @@
#include "base/sat_counter.hh"
#include "base/types.hh"
-#include "cpu/pred/bpred_unit.hh"
+#include "cpu/pred/conditional.hh"
#include "params/LocalBP.hh"
namespace gem5
@@ -62,7 +62,7 @@ namespace branch_prediction
* predictor state that needs to be recorded or updated; the update can be
* determined solely by the branch being taken or not taken.
*/
-class LocalBP : public BPredUnit
+class LocalBP : public ConditionalPredictor
{
public:
/**
@@ -73,6 +73,9 @@ class LocalBP : public BPredUnit
// Overriding interface functions
bool lookup(ThreadID tid, Addr pc, void * &bp_history) override;
+ void branchPlaceholder(ThreadID tid, Addr pc, bool uncond,
+ void * &bpHistory) override;
+
void updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken,
Addr target, const StaticInstPtr &inst,
void * &bp_history) override;
diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
index 496b92a02d7..d23249899a5 100644
--- a/src/cpu/pred/BranchPredictor.py
+++ b/src/cpu/pred/BranchPredictor.py
@@ -142,6 +142,18 @@ class SimpleBTB(BranchTargetBuffer):
)
+class ConditionalPredictor(SimObject):
+ type = "ConditionalPredictor"
+ cxx_class = "gem5::branch_prediction::ConditionalPredictor"
+ cxx_header = "cpu/pred/conditional.hh"
+ abstract = True
+
+ numThreads = Param.Unsigned(Parent.numThreads, "Number of threads")
+ instShiftAmt = Param.Unsigned(
+ Parent.instShiftAmt, "Number of bits to shift instructions by"
+ )
+
+
class IndirectPredictor(SimObject):
type = "IndirectPredictor"
cxx_class = "gem5::branch_prediction::IndirectPredictor"
@@ -179,7 +191,6 @@ class BranchPredictor(SimObject):
type = "BranchPredictor"
cxx_class = "gem5::branch_prediction::BPredUnit"
cxx_header = "cpu/pred/bpred_unit.hh"
- abstract = True
numThreads = Param.Unsigned(Parent.numThreads, "Number of threads")
instShiftAmt = Param.Unsigned(2, "Number of bits to shift instructions by")
@@ -197,6 +208,9 @@ class BranchPredictor(SimObject):
ras = Param.ReturnAddrStack(
ReturnAddrStack(), "Return address stack, set to NULL to disable RAS."
)
+ conditionalBranchPred = Param.ConditionalPredictor(
+ "Conditional branch predictor"
+ )
indirectBranchPred = Param.IndirectPredictor(
SimpleIndirectPredictor(),
"Indirect branch predictor, set to NULL to disable "
@@ -212,7 +226,7 @@ class BranchPredictor(SimObject):
)
-class LocalBP(BranchPredictor):
+class LocalBP(ConditionalPredictor):
type = "LocalBP"
cxx_class = "gem5::branch_prediction::LocalBP"
cxx_header = "cpu/pred/2bit_local.hh"
@@ -221,7 +235,7 @@ class LocalBP(BranchPredictor):
localCtrBits = Param.Unsigned(2, "Bits per counter")
-class TournamentBP(BranchPredictor):
+class TournamentBP(ConditionalPredictor):
type = "TournamentBP"
cxx_class = "gem5::branch_prediction::TournamentBP"
cxx_header = "cpu/pred/tournament.hh"
@@ -235,7 +249,7 @@ class TournamentBP(BranchPredictor):
choiceCtrBits = Param.Unsigned(2, "Bits of choice counters")
-class BiModeBP(BranchPredictor):
+class BiModeBP(ConditionalPredictor):
type = "BiModeBP"
cxx_class = "gem5::branch_prediction::BiModeBP"
cxx_header = "cpu/pred/bi_mode.hh"
@@ -310,7 +324,7 @@ class TAGEBase(SimObject):
# TAGE branch predictor as described in https://www.jilp.org/vol8/v8paper1.pdf
# The default sizes below are for the 8C-TAGE configuration (63.5 Kbits)
-class TAGE(BranchPredictor):
+class TAGE(ConditionalPredictor):
type = "TAGE"
cxx_class = "gem5::branch_prediction::TAGE"
cxx_header = "cpu/pred/tage.hh"
@@ -776,7 +790,7 @@ class TAGE_SC_L_8KB(TAGE_SC_L):
statistical_corrector = TAGE_SC_L_8KB_StatisticalCorrector()
-class MultiperspectivePerceptron(BranchPredictor):
+class MultiperspectivePerceptron(ConditionalPredictor):
type = "MultiperspectivePerceptron"
cxx_class = "gem5::branch_prediction::MultiperspectivePerceptron"
cxx_header = "cpu/pred/multiperspective_perceptron.hh"
@@ -1117,7 +1131,7 @@ class MultiperspectivePerceptronTAGE8KB(MultiperspectivePerceptronTAGE):
statistical_corrector = MPP_StatisticalCorrector_8KB()
-class TageSCLRef(BranchPredictor):
+class TageSCLRef(ConditionalPredictor):
type = "TageSCLRef"
cxx_class = "gem5::branch_prediction::TageSCLRef"
cxx_header = "cpu/pred/tagescl_ref.hh"
diff --git a/src/cpu/pred/SConscript b/src/cpu/pred/SConscript
index f52299fecc0..e791f318d26 100644
--- a/src/cpu/pred/SConscript
+++ b/src/cpu/pred/SConscript
@@ -44,6 +44,7 @@ Import('*')
SimObject('BranchPredictor.py',
sim_objects=[
'BranchPredictor',
+ 'ConditionalPredictor',
'IndirectPredictor', 'SimpleIndirectPredictor',
'BranchTargetBuffer', 'SimpleBTB', 'BTBIndexingPolicy', 'BTBSetAssociative',
'ReturnAddrStack',
@@ -68,6 +69,7 @@ Source('bpred_unit.cc')
Source('2bit_local.cc')
Source('simple_indirect.cc')
Source('it_tage.cc')
+Source('conditional.cc')
Source('indirect.cc')
Source('ras.cc')
Source('tournament.cc')
diff --git a/src/cpu/pred/bi_mode.cc b/src/cpu/pred/bi_mode.cc
index f9f9330b883..ed39731ab28 100644
--- a/src/cpu/pred/bi_mode.cc
+++ b/src/cpu/pred/bi_mode.cc
@@ -54,7 +54,7 @@ namespace branch_prediction
{
BiModeBP::BiModeBP(const BiModeBPParams ¶ms)
- : BPredUnit(params),
+ : ConditionalPredictor(params),
globalHistoryReg(params.numThreads, 0),
globalHistoryBits(ceilLog2(params.globalPredictorSize)),
choicePredictorSize(params.choicePredictorSize),
diff --git a/src/cpu/pred/bi_mode.hh b/src/cpu/pred/bi_mode.hh
index c0513826730..2131677ae90 100644
--- a/src/cpu/pred/bi_mode.hh
+++ b/src/cpu/pred/bi_mode.hh
@@ -46,7 +46,7 @@
#define __CPU_PRED_BI_MODE_PRED_HH__
#include "base/sat_counter.hh"
-#include "cpu/pred/bpred_unit.hh"
+#include "cpu/pred/conditional.hh"
#include "params/BiModeBP.hh"
namespace gem5
@@ -69,7 +69,7 @@ namespace branch_prediction
* the branch's PC to choose between the two, destructive aliasing is reduced.
*/
-class BiModeBP : public BPredUnit
+class BiModeBP : public ConditionalPredictor
{
public:
BiModeBP(const BiModeBPParams ¶ms);
diff --git a/src/cpu/pred/bpred_unit.cc b/src/cpu/pred/bpred_unit.cc
index 2e0daec6760..95aa71026fc 100644
--- a/src/cpu/pred/bpred_unit.cc
+++ b/src/cpu/pred/bpred_unit.cc
@@ -63,6 +63,7 @@ BPredUnit::BPredUnit(const Params ¶ms)
predHist(numThreads),
btb(params.btb),
ras(params.ras),
+ cPred(params.conditionalBranchPred),
iPred(params.indirectBranchPred),
stats(this)
{
@@ -94,12 +95,6 @@ BPredUnit::drainSanityCheck() const
assert(ph.empty());
}
-void
-BPredUnit::branchPlaceholder(ThreadID tid, Addr pc,
- bool uncond, void * &bp_history)
-{
- panic("BPredUnit::branchPlaceholder() not implemented for this BP.\n");
-}
bool
BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
@@ -156,7 +151,7 @@ BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
} else {
// Conditional branches -------
++stats.condPredicted;
- hist->condPred = lookup(tid, pc.instAddr(), hist->bpHistory);
+ hist->condPred = cPred->lookup(tid, pc.instAddr(), hist->bpHistory);
if (hist->condPred) {
++stats.condPredictedTaken;
@@ -326,7 +321,7 @@ BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
* The actual prediction tables will updated once
* we know the correct direction.
**/
- updateHistories(tid, hist->pc, hist->uncond, hist->predTaken,
+ cPred->updateHistories(tid, hist->pc, hist->uncond, hist->predTaken,
hist->target->instAddr(), hist->inst, hist->bpHistory);
@@ -383,7 +378,7 @@ BPredUnit::commitBranch(ThreadID tid, PredictorHistory* &hist)
hist->target->instAddr());
// Update the branch predictor with the correct results.
- update(tid, hist->pc,
+ cPred->update(tid, hist->pc,
hist->actuallyTaken,
hist->bpHistory, false,
hist->inst,
@@ -469,7 +464,7 @@ BPredUnit::squashHistory(ThreadID tid, PredictorHistory* &history)
}
// This call will delete the bpHistory.
- squash(tid, history->bpHistory);
+ cPred->squash(tid, history->bpHistory);
delete history;
history = nullptr;
@@ -548,7 +543,7 @@ BPredUnit::squash(const InstSeqNum &squashed_sn,
set(hist->target, corr_target);
// Correct Direction predictor ------------------
- update(tid, hist->pc, actually_taken, hist->bpHistory,
+ cPred->update(tid, hist->pc, actually_taken, hist->bpHistory,
true, hist->inst, corr_target.instAddr());
@@ -633,6 +628,13 @@ BPredUnit::squash(const InstSeqNum &squashed_sn,
}
}
+void
+BPredUnit::branchPlaceholder(ThreadID tid, Addr pc,
+ bool uncond, void * &bp_history)
+{
+ // Delegate to conditional predictor
+ cPred->branchPlaceholder(tid, pc, uncond, bp_history);
+}
void
BPredUnit::dump()
diff --git a/src/cpu/pred/bpred_unit.hh b/src/cpu/pred/bpred_unit.hh
index c94e5231818..f3f4ceded38 100644
--- a/src/cpu/pred/bpred_unit.hh
+++ b/src/cpu/pred/bpred_unit.hh
@@ -49,6 +49,7 @@
#include "cpu/inst_seq.hh"
#include "cpu/pred/branch_type.hh"
#include "cpu/pred/btb.hh"
+#include "cpu/pred/conditional.hh"
#include "cpu/pred/indirect.hh"
#include "cpu/pred/ras.hh"
#include "cpu/static_inst.hh"
@@ -141,81 +142,6 @@ class BPredUnit : public SimObject
* Interface functions to the conditional branch predictor
*
*/
-
- /**
- * Looks up a given conditional branch PC of in the BP to see if it
- * is taken or not taken.
- * @param tid The thread id.
- * @param pc The PC to look up.
- * @param bp_history Pointer that will be set to an object that
- * has the branch predictor state associated with the lookup.
- * @return Whether the branch is taken or not taken.
- */
- virtual bool lookup(ThreadID tid, Addr pc, void * &bp_history) = 0;
-
- /**
- * Ones done with the prediction this function updates the
- * path and global history. All branches call this function
- * including unconditional once.
- * @param tid The thread id.
- * @param pc The branch's pc that will be updated.
- * @param uncond Wheather or not this branch is an unconditional branch.
- * @param taken Whether or not the branch was taken
- * @param target The final target of branch. Some modern
- * predictors use the target in their history.
- * @param inst Static instruction information
- * @param bp_history Pointer that will be set to an object that
- * has the branch predictor state associated with the lookup.
- *
- */
- virtual void updateHistories(ThreadID tid, Addr pc, bool uncond,
- bool taken, Addr target,
- const StaticInstPtr &inst, void * &bp_history) = 0;
-
- /**
- * @param tid The thread id.
- * @param bp_history Pointer to the history object. The predictor
- * will need to update any state and delete the object.
- */
- virtual void squash(ThreadID tid, void * &bp_history) = 0;
-
-
- /**
- * Updates the BP with taken/not taken information.
- * @param tid The thread id.
- * @param pc The branch's PC that will be updated.
- * @param taken Whether the branch was taken or not taken.
- * @param bp_history Pointer to the branch predictor state that is
- * associated with the branch lookup that is being updated.
- * @param squashed Set to true when this function is called during a
- * squash operation.
- * @param inst Static instruction information
- * @param target The resolved target of the branch (only needed
- * for squashed branches)
- * @todo Make this update flexible enough to handle a global predictor.
- */
- virtual void update(ThreadID tid, Addr pc, bool taken,
- void * &bp_history, bool squashed,
- const StaticInstPtr &inst, Addr target) = 0;
-
- /**
- * Special function for the decoupled front-end. In it there can be
- * branches which are not detected by the BPU in the first place as it
- * requires a BTB hit. This function will generate a placeholder for
- * such a branch once it is pre-decoded in the fetch stage. It will
- * only create the branch history object but not update any internal state
- * of the BPU.
- * If the branch turns to be wrong then decode or commit will
- * be able to use the normal squash functionality to correct the branch.
- * Note that not all branch predictors implement this functionality.
- * @param tid The thread id.
- * @param pc The branch's PC.
- * @param uncond Whether or not this branch is an unconditional branch.
- * @param bp_history Pointer that will be set to an branch history object.
- */
- virtual void branchPlaceholder(ThreadID tid, Addr pc,
- bool uncond, void * &bp_history);
-
/**
* Looks up a given PC in the BTB to see if a matching entry exists.
* @param tid The thread id.
@@ -270,6 +196,24 @@ class BPredUnit : public SimObject
return btb->update(tid, pc, target);
}
+ /**
+ * Special function for the decoupled front-end. In it there can be
+ * branches which are not detected by the BPU in the first place as it
+ * requires a BTB hit. This function will generate a placeholder for
+ * such a branch once it is pre-decoded in the fetch stage. It will
+ * only create the branch history object but not update any internal state
+ * of the BPU.
+ * If the branch turns out to be wrong then decode or commit will
+ * be able to use the normal squash functionality to correct the branch.
+ * Note that not all branch predictors implement this functionality.
+ * @param tid The thread id.
+ * @param pc The branch's PC.
+ * @param uncond Whether or not this branch is an unconditional branch.
+ * @param bp_history Pointer that will be set to a branch history object.
+ */
+ void branchPlaceholder(ThreadID tid, Addr pc,
+ bool uncond, void * &bp_history);
+
void dump();
@@ -492,6 +436,9 @@ class BPredUnit : public SimObject
/** The return address stack. */
ReturnAddrStack * ras;
+ /** The conditional branch predictor. */
+ ConditionalPredictor * cPred;
+
/** The indirect target predictor. */
IndirectPredictor * iPred;
diff --git a/src/cpu/pred/conditional.cc b/src/cpu/pred/conditional.cc
new file mode 100644
index 00000000000..a59ecd99120
--- /dev/null
+++ b/src/cpu/pred/conditional.cc
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2025 Technical University of Munich
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "cpu/pred/conditional.hh"
+
+namespace gem5
+{
+
+namespace branch_prediction
+{
+
+ConditionalPredictor::ConditionalPredictor(const Params ¶ms)
+ : SimObject(params),
+ instShiftAmt(params.instShiftAmt)
+{
+}
+
+
+void
+ConditionalPredictor::branchPlaceholder(ThreadID tid, Addr pc,
+ bool uncond, void * &bp_history)
+{
+ panic("ConditionalPredictor::branchPlaceholder() not implemented for this BP.\n");
+}
+
+} // namespace branch_prediction
+} // namespace gem5
diff --git a/src/cpu/pred/conditional.hh b/src/cpu/pred/conditional.hh
new file mode 100644
index 00000000000..46dbf2b3a34
--- /dev/null
+++ b/src/cpu/pred/conditional.hh
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2025 Technical University of Munich
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* @file
+ * Conditional branch predictor interface
+ */
+
+#ifndef __CPU_PRED_CONDITIONAL_BASE_HH__
+#define __CPU_PRED_CONDITIONAL_BASE_HH__
+
+#include "arch/generic/pcstate.hh"
+#include "cpu/inst_seq.hh"
+#include "cpu/pred/branch_type.hh"
+#include "params/ConditionalPredictor.hh"
+#include "sim/sim_object.hh"
+
+namespace gem5
+{
+
+namespace branch_prediction
+{
+
+class ConditionalPredictor : public SimObject
+{
+ public:
+
+ typedef ConditionalPredictorParams Params;
+
+ ConditionalPredictor(const Params ¶ms);
+
+
+ /**
+ * Looks up a given conditional branch PC in the BP to see if it
+ * is taken or not taken.
+ * @param tid The thread id.
+ * @param pc The PC to look up.
+ * @param bp_history Pointer that will be set to an object that
+ * has the branch predictor state associated with the lookup.
+ * @return Whether the branch is taken or not taken.
+ */
+ virtual bool lookup(ThreadID tid, Addr pc, void * &bp_history) = 0;
+
+ /**
+ * Once done with the prediction this function updates the
+ * path and global history. All branches call this function
+ * including unconditional ones.
+ * @param tid The thread id.
+ * @param pc The branch's pc that will be updated.
+ * @param uncond Whether or not this branch is an unconditional branch.
+ * @param taken Whether or not the branch was taken
+ * @param target The final target of branch. Some modern
+ * predictors use the target in their history.
+ * @param inst Static instruction information
+ * @param bp_history Pointer that will be set to an object that
+ * has the branch predictor state associated with the lookup.
+ *
+ */
+ virtual void updateHistories(ThreadID tid, Addr pc, bool uncond,
+ bool taken, Addr target,
+ const StaticInstPtr &inst, void * &bp_history) = 0;
+
+ /**
+ * @param tid The thread id.
+ * @param bp_history Pointer to the history object. The predictor
+ * will need to update any state and delete the object.
+ */
+ virtual void squash(ThreadID tid, void * &bp_history) = 0;
+
+
+ /**
+ * Updates the BP with taken/not taken information.
+ * @param tid The thread id.
+ * @param pc The branch's PC that will be updated.
+ * @param taken Whether the branch was taken or not taken.
+ * @param bp_history Pointer to the branch predictor state that is
+ * associated with the branch lookup that is being updated.
+ * @param squashed Set to true when this function is called during a
+ * squash operation.
+ * @param inst Static instruction information
+ * @param target The resolved target of the branch (only needed
+ * for squashed branches)
+ * @todo Make this update flexible enough to handle a global predictor.
+ */
+ virtual void update(ThreadID tid, Addr pc, bool taken,
+ void * &bp_history, bool squashed,
+ const StaticInstPtr &inst, Addr target) = 0;
+
+ /**
+ * Special function for the decoupled front-end. In it there can be
+ * branches which are not detected by the BPU in the first place as it
+ * requires a BTB hit. This function will generate a placeholder for
+ * such a branch once it is pre-decoded in the fetch stage. It will
+ * only create the branch history object but not update any internal state
+ * of the BPU.
+ * If the branch turns out to be wrong then decode or commit will
+ * be able to use the normal squash functionality to correct the branch.
+ * Note that not all branch predictors implement this functionality.
+ * @param tid The thread id.
+ * @param pc The branch's PC.
+ * @param uncond Whether or not this branch is an unconditional branch.
+ * @param bp_history Pointer that will be set to a branch history object.
+ */
+ virtual void branchPlaceholder(ThreadID tid, Addr pc,
+ bool uncond, void * &bp_history);
+ protected:
+
+ /** Number of bits to shift instructions by for predictor addresses. */
+ const unsigned instShiftAmt;
+};
+
+} // namespace branch_prediction
+} // namespace gem5
+
+#endif // __CPU_PRED_CONDITIONAL_BASE_HH__
diff --git a/src/cpu/pred/multiperspective_perceptron.cc b/src/cpu/pred/multiperspective_perceptron.cc
index c8284e49a1d..3f1bdec504b 100644
--- a/src/cpu/pred/multiperspective_perceptron.cc
+++ b/src/cpu/pred/multiperspective_perceptron.cc
@@ -128,7 +128,7 @@ MultiperspectivePerceptron::ThreadData::ThreadData(int num_filters,
}
MultiperspectivePerceptron::MultiperspectivePerceptron(
- const MultiperspectivePerceptronParams &p) : BPredUnit(p),
+ const MultiperspectivePerceptronParams &p) : ConditionalPredictor(p),
blockSize(p.block_size), pcshift(p.pcshift), threshold(p.threshold),
bias0(p.bias0), bias1(p.bias1), biasmostly0(p.biasmostly0),
biasmostly1(p.biasmostly1), nbest(p.nbest), tunebits(p.tunebits),
diff --git a/src/cpu/pred/multiperspective_perceptron.hh b/src/cpu/pred/multiperspective_perceptron.hh
index f1055d5fae3..af00715fdf1 100644
--- a/src/cpu/pred/multiperspective_perceptron.hh
+++ b/src/cpu/pred/multiperspective_perceptron.hh
@@ -55,7 +55,7 @@
#include
#include "base/random.hh"
-#include "cpu/pred/bpred_unit.hh"
+#include "cpu/pred/conditional.hh"
#include "params/MultiperspectivePerceptron.hh"
namespace gem5
@@ -64,7 +64,7 @@ namespace gem5
namespace branch_prediction
{
-class MultiperspectivePerceptron : public BPredUnit
+class MultiperspectivePerceptron : public ConditionalPredictor
{
protected:
/**
diff --git a/src/cpu/pred/tage.cc b/src/cpu/pred/tage.cc
index dd6ef5ddbe0..831d1834aaa 100644
--- a/src/cpu/pred/tage.cc
+++ b/src/cpu/pred/tage.cc
@@ -62,7 +62,9 @@ namespace gem5
namespace branch_prediction
{
-TAGE::TAGE(const TAGEParams ¶ms) : BPredUnit(params), tage(params.tage)
+TAGE::TAGE(const TAGEParams ¶ms) :
+ ConditionalPredictor(params),
+ tage(params.tage)
{
}
diff --git a/src/cpu/pred/tage.hh b/src/cpu/pred/tage.hh
index 329ba922ad4..482b546c391 100644
--- a/src/cpu/pred/tage.hh
+++ b/src/cpu/pred/tage.hh
@@ -64,7 +64,7 @@
#include "base/random.hh"
#include "base/types.hh"
-#include "cpu/pred/bpred_unit.hh"
+#include "cpu/pred/conditional.hh"
#include "cpu/pred/tage_base.hh"
#include "params/TAGE.hh"
@@ -74,7 +74,7 @@ namespace gem5
namespace branch_prediction
{
-class TAGE: public BPredUnit
+class TAGE: public ConditionalPredictor
{
protected:
TAGEBase *tage;
diff --git a/src/cpu/pred/tagescl_ref.cc b/src/cpu/pred/tagescl_ref.cc
index 032bded5285..8a13365b62a 100644
--- a/src/cpu/pred/tagescl_ref.cc
+++ b/src/cpu/pred/tagescl_ref.cc
@@ -54,7 +54,7 @@ namespace branch_prediction
{
TageSCLRef::TageSCLRef(const TageSCLRefParams &params)
- : BPredUnit(params)
+ : ConditionalPredictor(params)
{
predictor = new PREDICTOR();
}
diff --git a/src/cpu/pred/tagescl_ref.hh b/src/cpu/pred/tagescl_ref.hh
index 7ae2db85c0d..f1cb86e94b3 100644
--- a/src/cpu/pred/tagescl_ref.hh
+++ b/src/cpu/pred/tagescl_ref.hh
@@ -46,7 +46,7 @@
#include "base/sat_counter.hh"
#include "base/types.hh"
-#include "cpu/pred/bpred_unit.hh"
+#include "cpu/pred/conditional.hh"
#include "params/TageSCLRef.hh"
namespace gem5
@@ -63,7 +63,7 @@ namespace branch_prediction
* predictor state that needs to be recorded or updated; the update can be
* determined solely by the branch being taken or not taken.
*/
-class TageSCLRef : public BPredUnit
+class TageSCLRef : public ConditionalPredictor
{
public:
/**
diff --git a/src/cpu/pred/tournament.cc b/src/cpu/pred/tournament.cc
index a6428575f28..bbedaf15504 100644
--- a/src/cpu/pred/tournament.cc
+++ b/src/cpu/pred/tournament.cc
@@ -51,7 +51,7 @@ namespace branch_prediction
{
TournamentBP::TournamentBP(const TournamentBPParams &params)
- : BPredUnit(params),
+ : ConditionalPredictor(params),
localPredictorSize(params.localPredictorSize),
localCtrBits(params.localCtrBits),
localCtrs(localPredictorSize, SatCounter8(localCtrBits)),
diff --git a/src/cpu/pred/tournament.hh b/src/cpu/pred/tournament.hh
index 36b50c706a4..8de9faa695e 100644
--- a/src/cpu/pred/tournament.hh
+++ b/src/cpu/pred/tournament.hh
@@ -46,7 +46,7 @@
#include "base/sat_counter.hh"
#include "base/types.hh"
-#include "cpu/pred/bpred_unit.hh"
+#include "cpu/pred/conditional.hh"
#include "params/TournamentBP.hh"
namespace gem5
@@ -63,7 +63,7 @@ namespace branch_prediction
* predictor chooses between the two. Both the global history register
* and the selected local history are speculatively updated.
*/
-class TournamentBP : public BPredUnit
+class TournamentBP : public ConditionalPredictor
{
public:
/**
From 47fbba65ba694a00199a7ef3b2c069e09a9df827 Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Wed, 23 Apr 2025 14:37:19 +0000
Subject: [PATCH 04/22] cpu: add overriding predictor and latency support
---
src/cpu/minor/fetch2.cc | 2 +-
src/cpu/o3/BaseO3CPU.py | 1 -
src/cpu/o3/bac.cc | 31 ++++++++----
src/cpu/o3/bac.hh | 10 ++--
src/cpu/pred/BranchPredictor.py | 5 ++
src/cpu/pred/bpred_unit.cc | 85 +++++++++++++++++++++++++++++----
src/cpu/pred/bpred_unit.hh | 53 +++++++++++---------
src/cpu/pred/conditional.cc | 3 +-
src/cpu/pred/conditional.hh | 10 ++++
src/cpu/simple/base.cc | 2 +-
10 files changed, 153 insertions(+), 49 deletions(-)
diff --git a/src/cpu/minor/fetch2.cc b/src/cpu/minor/fetch2.cc
index b02294bfe6a..eb16b601d05 100644
--- a/src/cpu/minor/fetch2.cc
+++ b/src/cpu/minor/fetch2.cc
@@ -204,7 +204,7 @@ Fetch2::predictBranch(MinorDynInstPtr inst, BranchData &branch)
DPRINTF(Branch, "Trying to predict for inst: %s\n", *inst);
if (branchPredictor.predict(inst->staticInst,
- inst->id.fetchSeqNum, *inst_pc, inst->id.threadId)) {
+ inst->id.fetchSeqNum, *inst_pc, inst->id.threadId).taken) {
set(branch.target, *inst_pc);
inst->predictedTaken = true;
set(inst->predictedTarget, inst_pc);
diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py
index d97c1f2c7dd..4535be91485 100644
--- a/src/cpu/o3/BaseO3CPU.py
+++ b/src/cpu/o3/BaseO3CPU.py
@@ -106,7 +106,6 @@ def support_take_over(cls):
# Forward pipeline delays
bacToFetchDelay = Param.Cycles(1, "Branch address calc. to fetch delay")
- bacBranchPredictDelay = Param.Cycles(0, "BAC Branch Predictor delay")
fetchToDecodeDelay = Param.Cycles(1, "Fetch to decode delay")
decodeWidth = Param.Unsigned(8, "Decode width")
diff --git a/src/cpu/o3/bac.cc b/src/cpu/o3/bac.cc
index b7dce266ae1..a9c580013a0 100644
--- a/src/cpu/o3/bac.cc
+++ b/src/cpu/o3/bac.cc
@@ -72,11 +72,10 @@ BAC::BAC(CPU *_cpu, const BaseO3CPUParams ¶ms)
decodeToFetchDelay(params.decodeToFetchDelay),
commitToFetchDelay(params.commitToFetchDelay),
bacToFetchDelay(params.bacToFetchDelay),
- bacBranchPredictDelay(params.bacBranchPredictDelay),
fetchTargetWidth(params.fetchTargetWidth),
minInstSize(params.minInstSize),
numThreads(params.numThreads),
- stats(_cpu,this)
+ stats(_cpu, this)
{
fatal_if(decoupledFrontEnd && (fetchTargetWidth < params.fetchBufferSize),
"Fetch target width should be larger than fetch buffer size!");
@@ -591,14 +590,16 @@ BAC::newFetchTarget(ThreadID tid, const PCStateBase &start_pc)
return ft;
}
-bool
+BPredUnit::Prediction
BAC::predict(ThreadID tid, const StaticInstPtr &inst,
const FetchTargetPtr &ft, PCStateBase &pc)
{
/** Perform the prediction. */
BPredUnit::PredictorHistory* bpu_history = nullptr;
- bool taken = bpu->predict(inst, ft->ftNum(), pc, tid, bpu_history);
+ BPredUnit::Prediction pred = bpu->predict(
+ inst, ft->ftNum(), pc, tid, bpu_history
+ );
/** Push the prediction history to the fetch target.
* The postFetch() function will move the history from the FTQ to the
@@ -607,7 +608,7 @@ BAC::predict(ThreadID tid, const StaticInstPtr &inst,
ft->bpu_history = static_cast<void*>(bpu_history);
DPRINTF(Branch,"[tid:%i, ftn:%llu] History added.\n", tid, ft->ftNum());
- return taken;
+ return pred;
}
@@ -692,8 +693,9 @@ BAC::generateFetchTargets(ThreadID tid, bool &status_change)
// Now make the actual prediction. Note the BPU will advance
// the PC to the next instruction.
- predict_taken = predict(tid, staticInst, curFT, *next_pc);
- branchPredictRemaining[tid] = Cycles(bacBranchPredictDelay);
+ BPredUnit::Prediction pred = predict(tid, staticInst, curFT, *next_pc);
+ predict_taken = pred.taken;
+ branchPredictRemaining[tid] = Cycles(pred.latency);
DPRINTF(BAC, "[tid:%i, ftn:%llu] Branch found at PC %#x "
"taken?:%i, target:%#x\n",
@@ -884,7 +886,7 @@ BAC::updatePreDecode(ThreadID tid, const InstSeqNum seqNum,
hist = new BPredUnit::PredictorHistory(tid, seqNum,
pc.instAddr(), inst);
bpu->branchPlaceholder(tid, pc.instAddr(), inst->isUncondCtrl(),
- hist->bpHistory);
+ hist);
hist->predTaken = hist->condPred = false;
hist->targetProvider = BPredUnit::TargetProvider::NoTarget;
@@ -941,8 +943,12 @@ BAC::updatePC(const DynInstPtr &inst,
} else {
// With a coupled front-end we need to make the branch prediction
// here.
- predict_taken = bpu->predict(inst->staticInst, inst->seqNum,
- fetch_pc, tid);
+ //
+ // Latency is ignored in coupled mode
+ BPredUnit::Prediction pred = bpu->predict(
+ inst->staticInst, inst->seqNum, fetch_pc, tid
+ );
+ predict_taken = pred.taken;
}
DPRINTF(BAC, "[tid:%i] [sn:%llu] Branch at PC %#x "
@@ -1004,6 +1010,9 @@ BAC::profileCycle(ThreadID tid)
case Squashing:
stats.squashCycles++;
break;
+ case Blocked:
+ stats.blockedCycles++;
+ break;
case FTQFull:
stats.ftqFullCycles++;
break;
@@ -1023,6 +1032,8 @@ BAC::BACStats::BACStats(o3::CPU *cpu, BAC *bac)
"Number of cycles BAC is running"),
ADD_STAT(squashCycles, statistics::units::Cycle::get(),
"Number of cycles BAC is squashing"),
+ ADD_STAT(blockedCycles, statistics::units::Cycle::get(),
+ "Number of cycles BAC is blocked"),
ADD_STAT(ftqFullCycles, statistics::units::Cycle::get(),
"Number of cycles BAC has spent waiting for FTQ to become free"),
diff --git a/src/cpu/o3/bac.hh b/src/cpu/o3/bac.hh
index 7e1f899bdf9..87a31fda757 100644
--- a/src/cpu/o3/bac.hh
+++ b/src/cpu/o3/bac.hh
@@ -93,6 +93,7 @@ typedef std::shared_ptr<FetchTarget> FetchTargetPtr;
class BAC
{
typedef branch_prediction::BranchType BranchType;
+ typedef branch_prediction::BPredUnit BPredUnit;
public:
/** Overall decoupled BPU stage status. Used to determine if the CPU can
@@ -245,9 +246,9 @@ class BAC
* @param inst The branch instruction.
* @param ft The fetch target that is currently processed.
* @param PC The predicted PC is passed back through this parameter.
- * @return Returns if the branch is taken or not.
+ * @return Returns the prediction result from the BPU.
*/
- bool predict(ThreadID tid, const StaticInstPtr &inst,
+ BPredUnit::Prediction predict(ThreadID tid, const StaticInstPtr &inst,
const FetchTargetPtr &ft, PCStateBase &pc);
@@ -402,9 +403,6 @@ class BAC
/** BAC to fetch delay. */
const Cycles bacToFetchDelay;
- /** BAC branch predict delay. */
- const Cycles bacBranchPredictDelay;
-
/** The maximum width of a fetch target. This also determines the
* maximum addresses searched in one cycle. (FT width / minInstSize) */
const unsigned fetchTargetWidth;
@@ -434,6 +432,8 @@ class BAC
statistics::Scalar runCycles;
/** Stat for total number of squashing cycles. */
statistics::Scalar squashCycles;
+ /** Stat for total number of blocked cycles. */
+ statistics::Scalar blockedCycles;
/** Stat for total number of cycles the FTQ was full. */
statistics::Scalar ftqFullCycles;
diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
index d23249899a5..51b9383690c 100644
--- a/src/cpu/pred/BranchPredictor.py
+++ b/src/cpu/pred/BranchPredictor.py
@@ -152,6 +152,7 @@ class ConditionalPredictor(SimObject):
instShiftAmt = Param.Unsigned(
Parent.instShiftAmt, "Number of bits to shift instructions by"
)
+ latency = Param.Cycles(0, "Latency of the predictor (in cycles)")
class IndirectPredictor(SimObject):
@@ -211,6 +212,10 @@ class BranchPredictor(SimObject):
conditionalBranchPred = Param.ConditionalPredictor(
"Conditional branch predictor"
)
+ overridingBranchPred = Param.ConditionalPredictor(
+ NULL,
+ "Secondary, overriding predictor which corrects the primary predictor",
+ )
indirectBranchPred = Param.IndirectPredictor(
SimpleIndirectPredictor(),
"Indirect branch predictor, set to NULL to disable "
diff --git a/src/cpu/pred/bpred_unit.cc b/src/cpu/pred/bpred_unit.cc
index 95aa71026fc..02afbff76fc 100644
--- a/src/cpu/pred/bpred_unit.cc
+++ b/src/cpu/pred/bpred_unit.cc
@@ -64,6 +64,7 @@ BPredUnit::BPredUnit(const Params ¶ms)
btb(params.btb),
ras(params.ras),
cPred(params.conditionalBranchPred),
+ overridingCPred(params.overridingBranchPred),
iPred(params.indirectBranchPred),
stats(this)
{
@@ -96,13 +97,13 @@ BPredUnit::drainSanityCheck() const
}
-bool
+BPredUnit::Prediction
BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
PCStateBase &pc, ThreadID tid)
{
/** Perform the prediction. */
PredictorHistory* bpu_history = nullptr;
- bool taken = predict(inst, seqNum, pc, tid, bpu_history);
+ Prediction pred = predict(inst, seqNum, pc, tid, bpu_history);
assert(bpu_history!=nullptr);
@@ -112,18 +113,19 @@ BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
DPRINTF(Branch, "[tid:%i] [sn:%llu] History entry added. "
"predHist.size(): %i\n", tid, seqNum, predHist[tid].size());
- return taken;
+ return pred;
}
-bool
+BPredUnit::Prediction
BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
PCStateBase &pc, ThreadID tid, PredictorHistory* &hist)
{
assert(hist == nullptr);
+ Cycles totalLatency = Cycles(0);
// See if branch predictor predicts taken.
// If so, get its target addr either from the BTB or the RAS.
@@ -151,12 +153,36 @@ BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
} else {
// Conditional branches -------
++stats.condPredicted;
- hist->condPred = cPred->lookup(tid, pc.instAddr(), hist->bpHistory);
+ hist->condPred = cPred->lookup(
+ tid, pc.instAddr(), hist->bpHistory
+ );
+
+ if (overridingCPred) {
+ bool secondaryPred = overridingCPred->lookup(
+ tid, pc.instAddr(), hist->overridingBpHistory
+ );
+ assert(hist->overridingBpHistory);
+ if (secondaryPred != hist->condPred) {
+ // If the predictors disagree,
+ // use the result of the overriding predictor
+ // and incur its latency
+ totalLatency += overridingCPred->getLatency();
+ hist->condPred = secondaryPred;
+ } else {
+ // If the predictors agree,
+ // use the result of the primary predictor
+ totalLatency += cPred->getLatency();
+ }
+ } else {
+ totalLatency += cPred->getLatency();
+ }
+
if (hist->condPred) {
++stats.condPredictedTaken;
}
}
+
hist->predTaken = hist->condPred;
DPRINTF(Branch,
@@ -324,6 +350,13 @@ BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
cPred->updateHistories(tid, hist->pc, hist->uncond, hist->predTaken,
hist->target->instAddr(), hist->inst, hist->bpHistory);
+ if (overridingCPred) {
+ overridingCPred->updateHistories(
+ tid, hist->pc, hist->uncond, hist->predTaken,
+ hist->target->instAddr(), hist->inst, hist->overridingBpHistory
+ );
+ }
+
if (iPred) {
// Update the indirect predictor with the direction prediction
@@ -331,7 +364,10 @@ BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
*hist->target, brType, hist->indirectHistory);
}
- return hist->predTaken;
+ return Prediction {
+ .taken = hist->predTaken,
+ .latency = totalLatency,
+ };
}
@@ -384,6 +420,18 @@ BPredUnit::commitBranch(ThreadID tid, PredictorHistory* &hist)
hist->inst,
hist->target->instAddr());
+ // If the overriding predictor was used,
+ // also update it with the correct result
+ if (overridingCPred) {
+
+ assert(hist->overridingBpHistory);
+ overridingCPred->update(
+ tid, hist->pc, hist->actuallyTaken,
+ hist->overridingBpHistory, false,
+ hist->inst, hist->target->instAddr()
+ );
+ }
+
// Commit also Indirect predictor and RAS
if (iPred) {
iPred->commit(tid, hist->seqNum,
@@ -466,6 +514,14 @@ BPredUnit::squashHistory(ThreadID tid, PredictorHistory* &history)
// This call will delete the bpHistory.
cPred->squash(tid, history->bpHistory);
+ // If the overriding predictor was used, also squash it
+ // This call will delete the overridingBpHistory.
+ if (overridingCPred) {
+ assert(history->overridingBpHistory);
+ overridingCPred->squash(tid, history->overridingBpHistory);
+ assert(history->overridingBpHistory == nullptr);
+ }
+
delete history;
history = nullptr;
}
@@ -546,6 +602,14 @@ BPredUnit::squash(const InstSeqNum &squashed_sn,
cPred->update(tid, hist->pc, actually_taken, hist->bpHistory,
true, hist->inst, corr_target.instAddr());
+ // If the overriding predictor was used, also update it
+ if (overridingCPred) {
+ overridingCPred->update(tid, hist->pc, actually_taken,
+ hist->overridingBpHistory,
+ true, hist->inst,
+ corr_target.instAddr());
+ }
+
// Correct Indirect predictor -------------------
if (iPred) {
@@ -630,10 +694,15 @@ BPredUnit::squash(const InstSeqNum &squashed_sn,
void
BPredUnit::branchPlaceholder(ThreadID tid, Addr pc,
- bool uncond, void * &bp_history)
+ bool uncond, PredictorHistory* &hist)
{
// Delegate to conditional predictor
- cPred->branchPlaceholder(tid, pc, uncond, bp_history);
+ cPred->branchPlaceholder(tid, pc, uncond, hist->bpHistory);
+ // If the overriding predictor is used, also call it
+ if (overridingCPred) {
+ overridingCPred->branchPlaceholder(tid, pc, uncond,
+ hist->overridingBpHistory);
+ }
}
void
diff --git a/src/cpu/pred/bpred_unit.hh b/src/cpu/pred/bpred_unit.hh
index f3f4ceded38..0074216ee4b 100644
--- a/src/cpu/pred/bpred_unit.hh
+++ b/src/cpu/pred/bpred_unit.hh
@@ -82,7 +82,13 @@ class BPredUnit : public SimObject
/** Branch Predictor Unit (BPU) interface functions */
public:
-
+ struct Prediction
+ {
+ /** Whether the branch is predicted taken */
+ bool taken;
+ /** The latency that this prediction would normally take */
+ Cycles latency;
+ };
/**
* @param params The params object, that has the size of the BP and BTB.
@@ -103,7 +109,7 @@ class BPredUnit : public SimObject
* @param tid The thread id.
* @return Returns if the branch is taken or not.
*/
- bool predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
+ Prediction predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
PCStateBase &pc, ThreadID tid);
/**
@@ -196,24 +202,6 @@ class BPredUnit : public SimObject
return btb->update(tid, pc, target);
}
- /**
- * Special function for the decoupled front-end. In it there can be
- * branches which are not detected by the BPU in the first place as it
- * requires a BTB hit. This function will generate a placeholder for
- * such a branch once it is pre-decoded in the fetch stage. It will
- * only create the branch history object but not update any internal state
- * of the BPU.
- * If the branch turns to be wrong then decode or commit will
- * be able to use the normal squash functionality to correct the branch.
- * Note that not all branch predictors implement this functionality.
- * @param tid The thread id.
- * @param pc The branch's PC.
- * @param uncond Whether or not this branch is an unconditional branch.
- * @param bp_history Pointer that will be set to an branch history object.
- */
- void branchPlaceholder(ThreadID tid, Addr pc,
- bool uncond, void * &bp_history);
-
void dump();
@@ -375,6 +363,8 @@ class BPredUnit : public SimObject
*/
void *bpHistory = nullptr;
+ void *overridingBpHistory = nullptr;
+
void *indirectHistory = nullptr;
void *rasHistory = nullptr;
@@ -387,7 +377,7 @@ class BPredUnit : public SimObject
/**
* Internal prediction function.
*/
- bool predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
+ Prediction predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
PCStateBase &pc, ThreadID tid, PredictorHistory* &bpu_history);
/**
@@ -406,7 +396,23 @@ class BPredUnit : public SimObject
*/
void commitBranch(ThreadID tid, PredictorHistory* &bpu_history);
-
+ /**
+ * Special function for the decoupled front-end. In it there can be
+ * branches which are not detected by the BPU in the first place as it
+ * requires a BTB hit. This function will generate a placeholder for
+ * such a branch once it is pre-decoded in the fetch stage. It will
+ * only create the branch history object but not update any internal state
+ * of the BPU.
+ * If the branch turns out to be wrong then decode or commit will
+ * be able to use the normal squash functionality to correct the branch.
+ * Note that not all branch predictors implement this functionality.
+ * @param tid The thread id.
+ * @param pc The branch's PC.
+ * @param uncond Whether or not this branch is an unconditional branch.
+ * @param bp_history Pointer that will be set to a branch history object.
+ */
+ void branchPlaceholder(ThreadID tid, Addr pc,
+ bool uncond, PredictorHistory* &hist);
protected:
/** Number of the threads for which the branch history is maintained. */
@@ -439,6 +445,9 @@ class BPredUnit : public SimObject
/** The conditional branch predictor. */
ConditionalPredictor * cPred;
+ /** The overriding conditional branch predictor. */
+ ConditionalPredictor * overridingCPred;
+
/** The indirect target predictor. */
IndirectPredictor * iPred;
diff --git a/src/cpu/pred/conditional.cc b/src/cpu/pred/conditional.cc
index a59ecd99120..b8a000dffbe 100644
--- a/src/cpu/pred/conditional.cc
+++ b/src/cpu/pred/conditional.cc
@@ -36,7 +36,8 @@ namespace branch_prediction
ConditionalPredictor::ConditionalPredictor(const Params &params)
: SimObject(params),
- instShiftAmt(params.instShiftAmt)
+ instShiftAmt(params.instShiftAmt),
+ latency(params.latency)
{
}
diff --git a/src/cpu/pred/conditional.hh b/src/cpu/pred/conditional.hh
index 46dbf2b3a34..5c16fe7a71f 100644
--- a/src/cpu/pred/conditional.hh
+++ b/src/cpu/pred/conditional.hh
@@ -62,6 +62,13 @@ class ConditionalPredictor : public SimObject
ConditionalPredictor(const Params ¶ms);
+ /**
+ * Returns the configured prediction latency in cycles
+ * @return The prediction latency in cycles
+ */
+ Cycles getLatency() const {
+ return latency;
+ }
/**
* Looks up a given conditional branch PC of in the BP to see if it
@@ -140,6 +147,9 @@ class ConditionalPredictor : public SimObject
/** Number of bits to shift instructions by for predictor addresses. */
const unsigned instShiftAmt;
+
+ /** Latency of the predictor (for lookup / actual prediction) in cycles */
+ const Cycles latency;
};
} // namespace branch_prediction
diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc
index 4614115d997..2dc3a970f2b 100644
--- a/src/cpu/simple/base.cc
+++ b/src/cpu/simple/base.cc
@@ -396,7 +396,7 @@ BaseSimpleCPU::preExecute()
set(t_info.predPC, thread->pcState());
const bool predict_taken(
branchPred->predict(curStaticInst, cur_sn, *t_info.predPC,
- curThread));
+ curThread).taken);
if (predict_taken)
++t_info.execContextStats.numPredictedBranches;
From f9be615302bd3e6ebdb4e65b8c1d410b6f29d0bc Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Mon, 28 Apr 2025 13:34:17 +0000
Subject: [PATCH 05/22] cpu: pred add LLBP + dynamic latency
---
src/cpu/o3/bac.cc | 8 +-
src/cpu/o3/bac.hh | 3 +-
src/cpu/pred/2bit_local.cc | 4 +-
src/cpu/pred/2bit_local.hh | 3 +-
src/cpu/pred/BranchPredictor.py | 17 +-
src/cpu/pred/SConscript | 3 +
src/cpu/pred/bi_mode.cc | 4 +-
src/cpu/pred/bi_mode.hh | 3 +-
src/cpu/pred/bpred_unit.cc | 20 +-
src/cpu/pred/bpred_unit.hh | 8 -
src/cpu/pred/branch_type.hh | 10 +
src/cpu/pred/conditional.cc | 2 +-
src/cpu/pred/conditional.hh | 16 +-
src/cpu/pred/llbp.cc | 367 ++++++++++++++++++
src/cpu/pred/llbp.hh | 174 +++++++++
src/cpu/pred/ltage.cc | 4 +-
src/cpu/pred/ltage.hh | 2 +-
src/cpu/pred/multiperspective_perceptron.cc | 8 +-
src/cpu/pred/multiperspective_perceptron.hh | 4 +-
.../pred/multiperspective_perceptron_tage.cc | 4 +-
.../pred/multiperspective_perceptron_tage.hh | 2 +-
src/cpu/pred/tage.cc | 12 +-
src/cpu/pred/tage.hh | 4 +-
src/cpu/pred/tage_base.hh | 3 +-
src/cpu/pred/tage_sc_l.cc | 4 +-
src/cpu/pred/tage_sc_l.hh | 2 +-
src/cpu/pred/tagescl_ref.cc | 4 +-
src/cpu/pred/tagescl_ref.hh | 2 +-
src/cpu/pred/tournament.cc | 6 +-
src/cpu/pred/tournament.hh | 2 +-
30 files changed, 640 insertions(+), 65 deletions(-)
create mode 100644 src/cpu/pred/llbp.cc
create mode 100644 src/cpu/pred/llbp.hh
diff --git a/src/cpu/o3/bac.cc b/src/cpu/o3/bac.cc
index a9c580013a0..4445b26edd5 100644
--- a/src/cpu/o3/bac.cc
+++ b/src/cpu/o3/bac.cc
@@ -590,14 +590,14 @@ BAC::newFetchTarget(ThreadID tid, const PCStateBase &start_pc)
return ft;
}
-BPredUnit::Prediction
+Prediction
BAC::predict(ThreadID tid, const StaticInstPtr &inst,
const FetchTargetPtr &ft, PCStateBase &pc)
{
/** Perform the prediction. */
BPredUnit::PredictorHistory* bpu_history = nullptr;
- BPredUnit::Prediction pred = bpu->predict(
+ Prediction pred = bpu->predict(
inst, ft->ftNum(), pc, tid, bpu_history
);
@@ -693,7 +693,7 @@ BAC::generateFetchTargets(ThreadID tid, bool &status_change)
// Now make the actual prediction. Note the BPU will advance
// the PC to the next instruction.
- BPredUnit::Prediction pred = predict(tid, staticInst, curFT, *next_pc);
+ Prediction pred = predict(tid, staticInst, curFT, *next_pc);
predict_taken = pred.taken;
branchPredictRemaining[tid] = Cycles(pred.latency);
@@ -945,7 +945,7 @@ BAC::updatePC(const DynInstPtr &inst,
// here.
//
// Latency is ignored in coupled mode
- BPredUnit::Prediction pred = bpu->predict(
+ Prediction pred = bpu->predict(
inst->staticInst, inst->seqNum, fetch_pc, tid
);
predict_taken = pred.taken;
diff --git a/src/cpu/o3/bac.hh b/src/cpu/o3/bac.hh
index 87a31fda757..7ae8113f06f 100644
--- a/src/cpu/o3/bac.hh
+++ b/src/cpu/o3/bac.hh
@@ -53,6 +53,7 @@ namespace gem5
{
struct BaseO3CPUParams;
+typedef branch_prediction::Prediction Prediction;
namespace o3
{
@@ -248,7 +249,7 @@ class BAC
* @param PC The predicted PC is passed back through this parameter.
* @return Returns the prediction result from the BPU.
*/
- BPredUnit::Prediction predict(ThreadID tid, const StaticInstPtr &inst,
+ Prediction predict(ThreadID tid, const StaticInstPtr &inst,
const FetchTargetPtr &ft, PCStateBase &pc);
diff --git a/src/cpu/pred/2bit_local.cc b/src/cpu/pred/2bit_local.cc
index 34b495bc218..a14d077b6b5 100644
--- a/src/cpu/pred/2bit_local.cc
+++ b/src/cpu/pred/2bit_local.cc
@@ -93,7 +93,7 @@ LocalBP::updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken,
}
-bool
+Prediction
LocalBP::lookup(ThreadID tid, Addr branch_addr, void * &bp_history)
{
bool taken;
@@ -109,7 +109,7 @@ LocalBP::lookup(ThreadID tid, Addr branch_addr, void * &bp_history)
taken = getPrediction(counter_val);
- return taken;
+ return staticPrediction(taken);
}
void
diff --git a/src/cpu/pred/2bit_local.hh b/src/cpu/pred/2bit_local.hh
index bb539d37a9c..d117c922264 100644
--- a/src/cpu/pred/2bit_local.hh
+++ b/src/cpu/pred/2bit_local.hh
@@ -46,6 +46,7 @@
#include "base/sat_counter.hh"
#include "base/types.hh"
+#include "cpu/pred/branch_type.hh"
#include "cpu/pred/conditional.hh"
#include "params/LocalBP.hh"
@@ -71,7 +72,7 @@ class LocalBP : public ConditionalPredictor
LocalBP(const LocalBPParams ¶ms);
// Overriding interface functions
- bool lookup(ThreadID tid, Addr pc, void * &bp_history) override;
+ Prediction lookup(ThreadID tid, Addr pc, void * &bp_history) override;
void branchPlaceholder(ThreadID tid, Addr pc, bool uncond,
void * &bpHistory) override;
diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
index 51b9383690c..d8a3411cbac 100644
--- a/src/cpu/pred/BranchPredictor.py
+++ b/src/cpu/pred/BranchPredictor.py
@@ -152,7 +152,9 @@ class ConditionalPredictor(SimObject):
instShiftAmt = Param.Unsigned(
Parent.instShiftAmt, "Number of bits to shift instructions by"
)
- latency = Param.Cycles(0, "Latency of the predictor (in cycles)")
+ latency = Param.Cycles(
+ 0, "Static (flat) latency of the predictor (in cycles)"
+ )
class IndirectPredictor(SimObject):
@@ -795,6 +797,19 @@ class TAGE_SC_L_8KB(TAGE_SC_L):
statistical_corrector = TAGE_SC_L_8KB_StatisticalCorrector()
+class LLBP(LTAGE):
+ type = "LLBP"
+ cxx_class = "gem5::branch_prediction::LLBP"
+ cxx_header = "cpu/pred/llbp.hh"
+
+ CTWidth = Param.Int(14, "RCR CTWidth")
+ patternBufferCapacity = Param.Int(16, "Pattern Buffer Capacity")
+ contextCapacity = Param.Int(1 << 4, "Context Capacity")
+ storageCapacity = Param.Int(1 << 16, "Storage Capacity")
+ ptnCounterBits = Param.Int(3, "Bits in Pattern Counter")
+ ctxCounterBits = Param.Int(2, "Bits in Context Replacement Counter")
+
+
class MultiperspectivePerceptron(ConditionalPredictor):
type = "MultiperspectivePerceptron"
cxx_class = "gem5::branch_prediction::MultiperspectivePerceptron"
diff --git a/src/cpu/pred/SConscript b/src/cpu/pred/SConscript
index e791f318d26..30f4962b082 100644
--- a/src/cpu/pred/SConscript
+++ b/src/cpu/pred/SConscript
@@ -52,6 +52,7 @@ SimObject('BranchPredictor.py',
'LocalBP', 'TournamentBP', 'BiModeBP', 'TAGEBase', 'TAGE', 'LoopPredictor',
'TAGE_SC_L_TAGE', 'TAGE_SC_L_TAGE_64KB', 'TAGE_SC_L_TAGE_8KB',
'LTAGE', 'TAGE_SC_L_LoopPredictor', 'StatisticalCorrector', 'TAGE_SC_L',
+ 'LLBP',
'TAGE_SC_L_64KB_StatisticalCorrector',
'TAGE_SC_L_8KB_StatisticalCorrector',
'TAGE_SC_L_64KB', 'TAGE_SC_L_8KB',
@@ -79,6 +80,7 @@ Source('tage.cc')
Source('tagescl_ref.cc')
Source('loop_predictor.cc')
Source('ltage.cc')
+Source('llbp.cc')
Source('multiperspective_perceptron.cc')
Source('multiperspective_perceptron_8KB.cc')
Source('multiperspective_perceptron_64KB.cc')
@@ -99,3 +101,4 @@ DebugFlag('Branch')
DebugFlag('Tage')
DebugFlag('LTage')
DebugFlag('TageSCL')
+DebugFlag('LLBP')
diff --git a/src/cpu/pred/bi_mode.cc b/src/cpu/pred/bi_mode.cc
index ed39731ab28..5a6557c12c1 100644
--- a/src/cpu/pred/bi_mode.cc
+++ b/src/cpu/pred/bi_mode.cc
@@ -128,7 +128,7 @@ BiModeBP::squash(ThreadID tid, void * &bp_history)
* choice array's prediction is used to select between the two
* direction predictors for the final branch prediction.
*/
-bool
+Prediction
BiModeBP::lookup(ThreadID tid, Addr branchAddr, void * &bp_history)
{
unsigned choiceHistoryIdx = ((branchAddr >> instShiftAmt)
@@ -163,7 +163,7 @@ BiModeBP::lookup(ThreadID tid, Addr branchAddr, void * &bp_history)
history->finalPred = finalPrediction;
bp_history = static_cast<void*>(history);
- return finalPrediction;
+ return staticPrediction(finalPrediction);
}
diff --git a/src/cpu/pred/bi_mode.hh b/src/cpu/pred/bi_mode.hh
index 2131677ae90..0f230831f23 100644
--- a/src/cpu/pred/bi_mode.hh
+++ b/src/cpu/pred/bi_mode.hh
@@ -46,6 +46,7 @@
#define __CPU_PRED_BI_MODE_PRED_HH__
#include "base/sat_counter.hh"
+#include "cpu/pred/branch_type.hh"
#include "cpu/pred/conditional.hh"
#include "params/BiModeBP.hh"
@@ -73,7 +74,7 @@ class BiModeBP : public ConditionalPredictor
{
public:
BiModeBP(const BiModeBPParams ¶ms);
- bool lookup(ThreadID tid, Addr pc, void * &bp_history) override;
+ Prediction lookup(ThreadID tid, Addr pc, void * &bp_history) override;
void updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken,
Addr target, const StaticInstPtr &inst,
void * &bp_history) override;
diff --git a/src/cpu/pred/bpred_unit.cc b/src/cpu/pred/bpred_unit.cc
index 02afbff76fc..ab4b046bdb5 100644
--- a/src/cpu/pred/bpred_unit.cc
+++ b/src/cpu/pred/bpred_unit.cc
@@ -97,7 +97,7 @@ BPredUnit::drainSanityCheck() const
}
-BPredUnit::Prediction
+Prediction
BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
PCStateBase &pc, ThreadID tid)
{
@@ -119,7 +119,7 @@ BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
-BPredUnit::Prediction
+Prediction
BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
PCStateBase &pc, ThreadID tid, PredictorHistory* &hist)
{
@@ -153,28 +153,30 @@ BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum,
} else {
// Conditional branches -------
++stats.condPredicted;
- hist->condPred = cPred->lookup(
+ Prediction condPred = cPred->lookup(
tid, pc.instAddr(), hist->bpHistory
);
+ hist->condPred = condPred.taken;
if (overridingCPred) {
- bool secondaryPred = overridingCPred->lookup(
+
+ Prediction secondaryPred = overridingCPred->lookup(
tid, pc.instAddr(), hist->overridingBpHistory
);
assert(hist->overridingBpHistory);
- if (secondaryPred != hist->condPred) {
+ if (secondaryPred.taken != hist->condPred) {
// If the predictors disagree,
// use the result of the overriding predictor
// and incur its latency
- totalLatency += overridingCPred->getLatency();
- hist->condPred = secondaryPred;
+ totalLatency += secondaryPred.latency;
+ hist->condPred = secondaryPred.taken;
} else {
// If the predictors agree,
// use the result of the primary predictor
- totalLatency += cPred->getLatency();
+ totalLatency += condPred.latency;
}
} else {
- totalLatency += cPred->getLatency();
+ totalLatency += condPred.latency;
}
diff --git a/src/cpu/pred/bpred_unit.hh b/src/cpu/pred/bpred_unit.hh
index 0074216ee4b..4595effec4e 100644
--- a/src/cpu/pred/bpred_unit.hh
+++ b/src/cpu/pred/bpred_unit.hh
@@ -82,14 +82,6 @@ class BPredUnit : public SimObject
/** Branch Predictor Unit (BPU) interface functions */
public:
- struct Prediction
- {
- /** Whether the branch is predicted taken */
- bool taken;
- /** The latency that this prediction would normally take */
- Cycles latency;
- };
-
/**
* @param params The params object, that has the size of the BP and BTB.
*/
diff --git a/src/cpu/pred/branch_type.hh b/src/cpu/pred/branch_type.hh
index dcc6149a9b0..3a8defe54f1 100644
--- a/src/cpu/pred/branch_type.hh
+++ b/src/cpu/pred/branch_type.hh
@@ -85,6 +85,16 @@ inline std::string toString(BranchType type)
}
+struct Prediction
+{
+ /** Whether the branch is predicted taken */
+ bool taken;
+ /** The latency that this prediction would normally take */
+ Cycles latency;
+};
+
+
+
} // namespace branch_prediction
} // namespace gem5
diff --git a/src/cpu/pred/conditional.cc b/src/cpu/pred/conditional.cc
index b8a000dffbe..2ff5e885d4e 100644
--- a/src/cpu/pred/conditional.cc
+++ b/src/cpu/pred/conditional.cc
@@ -37,7 +37,7 @@ namespace branch_prediction
ConditionalPredictor::ConditionalPredictor(const Params &params)
: SimObject(params),
instShiftAmt(params.instShiftAmt),
- latency(params.latency)
+ staticLatency(params.latency)
{
}
diff --git a/src/cpu/pred/conditional.hh b/src/cpu/pred/conditional.hh
index 5c16fe7a71f..566ab74a6ed 100644
--- a/src/cpu/pred/conditional.hh
+++ b/src/cpu/pred/conditional.hh
@@ -66,8 +66,8 @@ class ConditionalPredictor : public SimObject
* Returns the configured prediction latency in cycles
* @return The prediction latency in cycles
*/
- Cycles getLatency() const {
- return latency;
+ Cycles getStaticLatency() const {
+ return staticLatency;
}
/**
@@ -79,7 +79,7 @@ class ConditionalPredictor : public SimObject
* has the branch predictor state associated with the lookup.
* @return Whether the branch is taken or not taken.
*/
- virtual bool lookup(ThreadID tid, Addr pc, void * &bp_history) = 0;
+ virtual Prediction lookup(ThreadID tid, Addr pc, void * &bp_history) = 0;
/**
* Ones done with the prediction this function updates the
@@ -148,8 +148,14 @@ class ConditionalPredictor : public SimObject
/** Number of bits to shift instructions by for predictor addresses. */
const unsigned instShiftAmt;
- /** Latency of the predictor (for lookup / actual prediction) in cycles */
- const Cycles latency;
+ /** Static latency of the predictor in cycles */
+ const Cycles staticLatency;
+
+ /** Return a prediction with only static latency */
+ Prediction staticPrediction(bool taken) const
+ {
+ return Prediction{taken, staticLatency};
+ }
};
} // namespace branch_prediction
diff --git a/src/cpu/pred/llbp.cc b/src/cpu/pred/llbp.cc
new file mode 100644
index 00000000000..891c130985e
--- /dev/null
+++ b/src/cpu/pred/llbp.cc
@@ -0,0 +1,367 @@
+/*
+
+# Copyright (c) 2025 Technical University of Munich
+# Copyright (c) 2024 The University of Edinburgh
+# All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+Implementation of the last-level branch predictor (LLBP).
+
+*/
+
+#include "cpu/pred/llbp.hh"
+
+#include "debug/LLBP.hh"
+
+namespace gem5
+{
+
+namespace branch_prediction
+{
+
+LLBP::LLBP(const LLBPParams ¶ms)
+ : LTAGE(params),
+ contextCapacity(params.contextCapacity),
+ patternBufferCapacity(params.patternBufferCapacity),
+ storageCapacity(params.storageCapacity),
+ ctxCounterBits(params.ctxCounterBits),
+ ptnCounterBits(params.ptnCounterBits),
+ rcr(3, 8, 8, 2, params.CTWidth)
+{
+ storagePriority.resize(1 << ctxCounterBits);
+ DPRINTF(LLBP, "Using experimental LLBP\n");
+ DPRINTF(LLBP, "RCR: T=%d, W=%d, D=%d, S=%d, CTWidth=%d\n",
+ rcr.T, rcr.W, rcr.D, rcr.S, params.CTWidth);
+ DPRINTF(LLBP, "Storage: cap=%d, bits=%d, PQ=%d\n",
+ storageCapacity, ctxCounterBits, storagePriority.size());
+ DPRINTF(LLBP, "Context: cap=%d, bits=%d\n",
+ contextCapacity, ptnCounterBits);
+}
+
+void
+LLBP::init()
+{
+ LTAGE::init();
+}
+
+void
+LLBP::squash(ThreadID tid, void *&bp_history)
+{
+ // TODO: No squash in LLBP?
+ LTAGE::squash(tid, bp_history);
+}
+
+void
+LLBP::update(ThreadID tid, Addr pc, bool taken,
+ void *&bp_history, bool squashed,
+ const StaticInstPtr &inst, Addr target)
+{
+    LLBPBranchInfo *bi = static_cast<LLBPBranchInfo *>(bp_history);
+ if (!inst->isUncondCtrl())
+ {
+ storageUpdate(tid, pc, rcr.getCCID(), taken, bi);
+ }
+
+ rcr.update(pc, inst, taken);
+ LTAGE::update(tid, pc, taken, bp_history, squashed, inst, target);
+
+ DPRINTF(LLBP, "[thread %d] Updated %s on %lld (%s): ccid=%llu, counter=%d,"
+ "uncond=%s, sz=%d\n",
+ tid,
+ taken ? "true" : "false",
+ pc,
+ inst->getName().c_str(),
+ rcr.getCCID(),
+ -1,
+ inst->isUncondCtrl() ? "true" : "false",
+ storage.size());
+}
+
+Prediction
+LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
+{
+ Addr pc = branch_pc;
+ bool tage_prediction = LTAGE::predict(
+ tid, branch_pc, cond_branch, b
+ ).taken;
+    LLBPBranchInfo *bi = static_cast<LLBPBranchInfo *>(b);
+ auto tage_bi = bi->tageBranchInfo;
+ bi->overridden = false;
+ bi->pred_taken = tage_prediction;
+
+ int8_t llbp_confidence = 127;
+
+ if (cond_branch)
+ {
+ int tage_bank = tage->nHistoryTables;
+ if (tage_bi->provider == TAGEBase::TAGE_LONGEST_MATCH)
+ tage_bank = tage_bi->hitBank;
+ if (tage_bi->provider == TAGEBase::TAGE_ALT_MATCH)
+ tage_bank = tage_bi->altBank;
+ auto ccid = rcr.getCCID();
+ if (storage.count(ccid))
+ {
+ int i = findBestPattern(tid, pc, storage[ccid]);
+ if (i >= 0)
+ {
+ llbp_hits++;
+ llbp_confidence = storage[ccid].patterns[i].counter;
+ bool llbp_prediction = llbp_confidence >= 0;
+ if (i > tage_bank)
+ {
+ bi->overridden = true;
+ bi->pred_taken = llbp_prediction;
+ overrides++;
+ }
+ }
+ }
+ DPRINTF(LLBP, "[thread %d] Predicted %s%s on %lld: overides=%d, "
+ "llbp_hits=%d, confidence=%d, hitBank=%d, ccid=%llu\n",
+ tid,
+ bi->pred_taken ? "true" : "false",
+ bi->overridden ? "'" : "",
+ branch_pc,
+ overrides,
+ llbp_hits,
+ llbp_confidence,
+ bi->tageBranchInfo->hitBank,
+ ccid);
+ }
+ return staticPrediction(bi->pred_taken);
+}
+
+int8_t LLBP::normalize(int8_t counter)
+{
+ return counter >= 0 ? counter : -counter - 1;
+}
+
+void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken,
+ LLBPBranchInfo *bi)
+{
+ if (storage.count(cid))
+ {
+ storagePriority[storage[cid].replace].erase(cid);
+ int i = findBestPattern(tid, pc, storage[cid]);
+ if (i >= 0)
+ {
+ TAGEBase::ctrUpdate(storage[cid].patterns[i].counter, taken,
+ ptnCounterBits);
+ if (storage[cid].patterns[i].counter == (taken ? 1 : -2))
+ {
+ // entry became medium confident
+ TAGEBase::unsignedCtrUpdate(storage[cid].replace, true,
+ ctxCounterBits);
+ }
+ else if (storage[cid].patterns[i].counter == (taken ? -1 : 0))
+ {
+ // entry became low confident
+ TAGEBase::unsignedCtrUpdate(storage[cid].replace, false,
+ ctxCounterBits);
+ }
+ }
+ i = findVictimPattern(i, storage[cid]);
+ if (i >= 0 && bi->overridden && bi->pred_taken != taken)
+ {
+ storage[cid].patterns[i].tag = tage->gtag(tid, pc, i);
+ storage[cid].patterns[i].counter = taken ? 0 : -1;
+ }
+ }
+ else
+ {
+ while (storage.size() >= storageCapacity)
+ {
+ uint64_t i = findVictimContext();
+ storagePriority[storage[i].replace].erase(i);
+ storage.erase(i);
+ }
+ storage[cid] = LLBP::Context();
+ storage[cid].patterns.resize(tage->nHistoryTables + 1);
+ for (int i = 1; i <= tage->nHistoryTables; i++)
+ {
+ storage[cid].patterns[i].tag = tage->gtag(tid, pc, i);
+ storage[cid].patterns[i].counter = taken ? 0 : -1;
+ }
+ }
+ if (std::find(patternBuffer.begin(), patternBuffer.end(), cid) ==
+ patternBuffer.end())
+ {
+ patternBuffer.push_back(cid);
+ if (patternBuffer.size() > patternBufferCapacity)
+ patternBuffer.pop_front();
+ }
+ storagePriority[storage[cid].replace].insert(cid);
+}
+
+int LLBP::findBestPattern(ThreadID tid, Addr pc, Context &ctx)
+{
+ for (int i = tage->nHistoryTables; i > 0; i--)
+ {
+ if (ctx.patterns[i].tag == tage->gtag(tid, pc, i))
+ {
+ return i;
+ }
+ }
+ return -1;
+}
+
+int LLBP::findVictimPattern(int min, Context &ctx)
+{
+ int min_conf = 1 << (ptnCounterBits - 2);
+ for (int i = (min >= 0 ? min + 1 : 1); i <= tage->nHistoryTables; i++)
+ {
+ if (normalize(ctx.patterns[i].counter) < min_conf)
+ {
+ return i;
+ }
+ }
+ return -1;
+}
+
+uint64_t LLBP::findVictimContext()
+{
+ for (int i = 0; i < storagePriority.size(); i++)
+ {
+        if (storagePriority[i].size())
+ {
+ return *storagePriority[i].begin();
+ }
+ }
+ return -1;
+}
+
+/* from LLBP source code: */
+
+LLBP::RCR::RCR(int _T, int _W, int _D, int _shift, int _CTWidth)
+ : CTWidth(_CTWidth), T(_T), W(_W), D(_D), S(_shift)
+{
+ bb[0].resize(maxwindow);
+ bb[1].resize(maxwindow);
+ ctxs = {0, 0};
+}
+
+/*
+ * Given the {n} number of branches starting from vec[end-start]
+ * to vec[end-start-n-1] we create the hash function by shifting
+ * each PC by {shift} number of bits i.e.
+ *
+ * 000000000000| PC | :vec[end-start]
+ * ^ 0000000000| PC |00 :vec[end-start-1]
+ * ^ 00000000| PC |0000 :vec[end-start-2]
+ * . .
+ * . .
+ * . .
+ * ^ | PC |000000000000 :vec[end-start-n-1]
+ * ----------------------
+ * final hash value
+ *
+*/
+uint64_t
+LLBP::RCR::calcHash(std::list<uint64_t> &vec, int n,
+ int start, int shift)
+{
+ uint64_t hash = 0;
+ if (vec.size() < (start + n))
+ {
+ return 0;
+ }
+ uint64_t sh = 0;
+ auto it = vec.begin();
+ std::advance(it, start);
+ for (; (it != vec.end()) && (n > 0); it++, n--)
+ {
+ uint64_t val = *it;
+
+ // Shift the value
+ hash ^= val << uint64_t(sh);
+
+ sh += shift;
+ if (sh >= CTWidth)
+ {
+ sh -= uint64_t(CTWidth);
+ }
+ }
+ return hash & ((1 << CTWidth) - 1);
+}
+
+uint64_t LLBP::RCR::getCCID()
+{
+ return ctxs.ccid & ((1ULL << CTWidth) - 1);
+}
+
+uint64_t LLBP::RCR::getPCID()
+{
+ return ctxs.pcid & ((1ULL << CTWidth) - 1);
+}
+
+bool LLBP::RCR::update(Addr pc, const StaticInstPtr &inst, bool taken)
+{
+ // Hash of all branches
+ branchCount++;
+
+ bool update = false;
+
+ switch (T)
+ {
+ case 0: // All branches
+ update = true;
+ break;
+
+ case 1: // Only calls
+ if (inst->isCall()) update = true;
+ break;
+
+ case 2: // Only calls and returns
+ if (inst->isCall() || inst->isReturn())
+ update = true;
+ break;
+
+ case 3: // Only unconditional branches
+ if (inst->isUncondCtrl()) update = true;
+ break;
+
+ case 4: // All taken branches
+ if (taken) update = true;
+ break;
+ }
+
+ if (update)
+ {
+ // Add the new branch to the history
+ bb[0].push_front(pc);
+ bb[1].push_front(pc);
+
+ // Remove the oldest branch
+ bb[0].pop_back();
+ bb[1].pop_back();
+
+ // The current context.
+ ctxs.ccid = calcHash(bb[0], W, D, S);
+ // The prefetch context.
+ ctxs.pcid = calcHash(bb[0], W, 0, S);
+
+ // DPRINTF(LLBP, "UH:%llx, %i\n", pc, taken);
+ return true;
+ }
+
+ return false;
+}
+
+} // namespace branch_prediction
+} // namespace gem5
diff --git a/src/cpu/pred/llbp.hh b/src/cpu/pred/llbp.hh
new file mode 100644
index 00000000000..f4f66eeb393
--- /dev/null
+++ b/src/cpu/pred/llbp.hh
@@ -0,0 +1,174 @@
+/*
+
+# Copyright (c) 2025 Technical University of Munich
+# Copyright (c) 2024 The University of Edinburgh
+# All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+Implementation of the last-level branch predictor (LLBP).
+
+*/
+
+#ifndef __CPU_PRED_LLBP_HH__
+#define __CPU_PRED_LLBP_HH__
+
+#include <deque>
+#include <list>
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+#include "base/cache/associative_cache.hh"
+#include "base/cache/cache_entry.hh"
+#include "base/types.hh"
+#include "cpu/pred/ltage.hh"
+#include "params/LLBP.hh"
+
+namespace gem5
+{
+
+namespace branch_prediction
+{
+
+class LLBP : public LTAGE
+{
+ public:
+ LLBP(const LLBPParams ¶ms);
+
+ void squash(ThreadID tid, void * &bp_history) override;
+ void update(ThreadID tid, Addr pc, bool taken,
+ void * &bp_history, bool squashed,
+ const StaticInstPtr & inst, Addr target) override;
+
+ void init() override;
+
+ protected:
+
+ struct LLBPBranchInfo : public LTageBranchInfo
+ {
+ bool overridden;
+ bool pred_taken;
+ LLBPBranchInfo(TAGEBase &tage, LoopPredictor &lp,
+ Addr pc, bool conditional)
+ : LTageBranchInfo(tage, lp, pc, conditional),
+ overridden(false)
+ {}
+
+ virtual ~LLBPBranchInfo()
+ {}
+ };
+
+ Prediction predict(
+ ThreadID tid, Addr branch_pc, bool cond_branch, void* &b) override;
+
+ struct Pattern
+ {
+ int tag;
+ // direction & whether to replace if low on space in a context
+ int8_t counter;
+ };
+
+ struct Context
+ {
+        std::vector<Pattern> patterns;
+ uint8_t replace; // whether to replace context in the storage
+ };
+
+    std::unordered_map<uint64_t, Context> storage;
+    std::vector<std::set<uint64_t>> storagePriority;
+    std::deque<uint64_t> patternBuffer;
+
+ int contextCapacity;
+ int patternBufferCapacity;
+ int storageCapacity;
+ int ctxCounterBits;
+ int ptnCounterBits;
+ int overrides = 0;
+ int llbp_hits = 0;
+
+ int8_t normalize(int8_t counter);
+ void storageUpdate(
+ ThreadID tid, Addr pc, uint64_t cid, bool taken, LLBPBranchInfo* bi);
+ void storageInvalidate();
+ int findBestPattern(ThreadID tid, Addr pc, Context& ctx);
+ int findVictimPattern(int min, Context& ctx);
+ uint64_t findVictimContext();
+
+
+ /* From LLBP Source Code */
+
+ class RCR
+ {
+ const int maxwindow = 120;
+
+        uint64_t calcHash(std::list<uint64_t> &vec, int n,
+ int start=0, int shift=0);
+
+ // The context tag width
+ const int CTWidth;
+
+        // A list of previously taken branches
+        std::list<uint64_t> bb[10];
+
+ // We compute the context ID and prefetch context ID
+ // only when the content of the RCR changes.
+ struct
+ {
+ uint64_t ccid = 0;
+ uint64_t pcid = 0;
+ } ctxs;
+
+ int branchCount = 0;
+
+ public:
+
+ // The hash constants
+ const int T, W, D, S;
+
+ RCR(int _T, int _W, int _D, int _shift, int _CTWidth);
+
+ // Push a new branch into the RCR.
+ bool update(Addr pc, const StaticInstPtr & inst, bool taken);
+
+
+ /**
+ * Computes the modulo of a number val with respect to 2^exp
+ * i.e. val % 2^exp
+ *
+ * @param val The value to be wrapped
+ * @param exp The size as exponent of 2
+ * @return The wrapped value
+ */
+ inline static uint64_t moduloTwoExp(uint64_t val, int exp) {
+ return val & ((1 << exp) - 1);
+ }
+
+ // Get the current context ID
+ uint64_t getCCID();
+
+ // Get the prefetch context ID
+ uint64_t getPCID();
+ } rcr;
+};
+
+} // namespace branch_prediction
+} // namespace gem5
+
+#endif // __CPU_PRED_LLBP_HH__
diff --git a/src/cpu/pred/ltage.cc b/src/cpu/pred/ltage.cc
index 64a749ee4b1..b9d94dc39ff 100644
--- a/src/cpu/pred/ltage.cc
+++ b/src/cpu/pred/ltage.cc
@@ -82,7 +82,7 @@ LTAGE::branchPlaceholder(ThreadID tid, Addr pc,
}
//prediction
-bool
+Prediction
LTAGE::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void* &b)
{
LTageBranchInfo *bi = new LTageBranchInfo(*tage, *loopPredictor,
@@ -110,7 +110,7 @@ LTAGE::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void* &b)
// record final prediction
bi->lpBranchInfo->predTaken = pred_taken;
- return pred_taken;
+ return staticPrediction(pred_taken);
}
// PREDICTOR UPDATE
diff --git a/src/cpu/pred/ltage.hh b/src/cpu/pred/ltage.hh
index 89bef7b7c92..070fa1f4001 100644
--- a/src/cpu/pred/ltage.hh
+++ b/src/cpu/pred/ltage.hh
@@ -128,7 +128,7 @@ class LTAGE : public TAGE
* @param b Reference to wrapping pointer to allow storing
* derived class prediction information in the base class.
*/
- bool predict(
+ Prediction predict(
ThreadID tid, Addr branch_pc, bool cond_branch, void* &b) override;
};
diff --git a/src/cpu/pred/multiperspective_perceptron.cc b/src/cpu/pred/multiperspective_perceptron.cc
index 3f1bdec504b..25355093376 100644
--- a/src/cpu/pred/multiperspective_perceptron.cc
+++ b/src/cpu/pred/multiperspective_perceptron.cc
@@ -591,7 +591,7 @@ MultiperspectivePerceptron::updateHistories(ThreadID tid, Addr pc,
threadData[tid]->path_history[0] = pc2;
}
-bool
+Prediction
MultiperspectivePerceptron::lookup(ThreadID tid, Addr instPC,
void * &bp_history)
{
@@ -608,11 +608,11 @@ MultiperspectivePerceptron::lookup(ThreadID tid, Addr instPC,
if (f.alwaysNotTakenSoFar()) {
bi->filtered = true;
bi->prediction = false;
- return false;
+ return staticPrediction(false);
} else if (f.alwaysTakenSoFar()) {
bi->filtered = true;
bi->prediction = true;
- return true;
+ return staticPrediction(true);
}
if (f.neverSeen()) {
use_static = true;
@@ -630,7 +630,7 @@ MultiperspectivePerceptron::lookup(ThreadID tid, Addr instPC,
}
}
- return bi->prediction;
+ return staticPrediction(bi->prediction);
}
void
diff --git a/src/cpu/pred/multiperspective_perceptron.hh b/src/cpu/pred/multiperspective_perceptron.hh
index af00715fdf1..d21681799bf 100644
--- a/src/cpu/pred/multiperspective_perceptron.hh
+++ b/src/cpu/pred/multiperspective_perceptron.hh
@@ -55,6 +55,7 @@
#include
#include "base/random.hh"
+#include "cpu/pred/branch_type.hh"
#include "cpu/pred/conditional.hh"
#include "params/MultiperspectivePerceptron.hh"
@@ -1064,7 +1065,8 @@ class MultiperspectivePerceptron : public ConditionalPredictor
void init() override;
// Base class methods.
- bool lookup(ThreadID tid, Addr branch_addr, void* &bp_history) override;
+ Prediction
+ lookup(ThreadID tid, Addr branch_addr, void* &bp_history) override;
void updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken,
Addr target, const StaticInstPtr &inst,
void * &bp_history) override;
diff --git a/src/cpu/pred/multiperspective_perceptron_tage.cc b/src/cpu/pred/multiperspective_perceptron_tage.cc
index 6d5f2da76e8..3b9b41028b4 100644
--- a/src/cpu/pred/multiperspective_perceptron_tage.cc
+++ b/src/cpu/pred/multiperspective_perceptron_tage.cc
@@ -515,7 +515,7 @@ MultiperspectivePerceptronTAGE::updateHistories(ThreadID tid,
}
}
-bool
+Prediction
MultiperspectivePerceptronTAGE::lookup(ThreadID tid, Addr instPC,
void * &bp_history)
{
@@ -543,7 +543,7 @@ MultiperspectivePerceptronTAGE::lookup(ThreadID tid, Addr instPC,
0 /* altBank: unused */, init_lsum);
bi->predictedTaken = pred_taken;
bi->lpBranchInfo->predTaken = pred_taken;
- return pred_taken;
+ return staticPrediction(pred_taken);
}
diff --git a/src/cpu/pred/multiperspective_perceptron_tage.hh b/src/cpu/pred/multiperspective_perceptron_tage.hh
index 81d8dcb2d9b..774d5cb2ab1 100644
--- a/src/cpu/pred/multiperspective_perceptron_tage.hh
+++ b/src/cpu/pred/multiperspective_perceptron_tage.hh
@@ -248,7 +248,7 @@ class MultiperspectivePerceptronTAGE : public MultiperspectivePerceptron
void init() override;
- bool lookup(ThreadID tid, Addr instPC, void * &bp_history) override;
+ Prediction lookup(ThreadID tid, Addr instPC, void * &bp_history) override;
void update(ThreadID tid, Addr pc, bool taken,
void * &bp_history, bool squashed,
diff --git a/src/cpu/pred/tage.cc b/src/cpu/pred/tage.cc
index 831d1834aaa..bc35d579de1 100644
--- a/src/cpu/pred/tage.cc
+++ b/src/cpu/pred/tage.cc
@@ -110,20 +110,22 @@ TAGE::squash(ThreadID tid, void * &bp_history)
bp_history = nullptr;
}
-bool
+Prediction
TAGE::predict(ThreadID tid, Addr pc, bool cond_branch, void* &b)
{
TageBranchInfo *bi = new TageBranchInfo(*tage, pc, cond_branch);
b = (void*)(bi);
- return tage->tagePredict(tid, pc, cond_branch, bi->tageBranchInfo);
+ return staticPrediction(
+ tage->tagePredict(tid, pc, cond_branch, bi->tageBranchInfo)
+ );
}
-bool
+Prediction
TAGE::lookup(ThreadID tid, Addr pc, void* &bp_history)
{
- bool retval = predict(tid, pc, true, bp_history);
+ Prediction retval = predict(tid, pc, true, bp_history);
- DPRINTF(Tage, "Lookup branch: %lx; predict:%d\n", pc, retval);
+ DPRINTF(Tage, "Lookup branch: %lx; predict:%d\n", pc, retval.taken);
return retval;
}
diff --git a/src/cpu/pred/tage.hh b/src/cpu/pred/tage.hh
index 482b546c391..28754f4bb84 100644
--- a/src/cpu/pred/tage.hh
+++ b/src/cpu/pred/tage.hh
@@ -95,7 +95,7 @@ class TAGE: public ConditionalPredictor
}
};
- virtual bool predict(ThreadID tid, Addr branch_pc, bool cond_branch,
+ virtual Prediction predict(ThreadID tid, Addr branch_pc, bool cond_branch,
void* &b);
public:
@@ -103,7 +103,7 @@ class TAGE: public ConditionalPredictor
TAGE(const TAGEParams ¶ms);
// Base class methods.
- bool lookup(ThreadID tid, Addr pc, void* &bp_history) override;
+ Prediction lookup(ThreadID tid, Addr pc, void* &bp_history) override;
virtual void updateHistories(ThreadID tid, Addr pc, bool uncond,
bool taken, Addr target, const StaticInstPtr &inst,
void * &bp_history) override;
diff --git a/src/cpu/pred/tage_base.hh b/src/cpu/pred/tage_base.hh
index 98e218c65f9..d2b04c8b9af 100644
--- a/src/cpu/pred/tage_base.hh
+++ b/src/cpu/pred/tage_base.hh
@@ -68,7 +68,6 @@ class TAGEBase : public SimObject
TAGEBase(const TAGEBaseParams &p);
void init() override;
- protected:
// Prediction Structures
// Tage Entry
@@ -485,7 +484,6 @@ class TAGEBase : public SimObject
bool isSpeculativeUpdateEnabled() const;
size_t getSizeInBits() const;
- protected:
const unsigned logRatioBiModalHystEntries;
const unsigned nHistoryTables;
const unsigned tagTableCounterBits;
@@ -495,6 +493,7 @@ class TAGEBase : public SimObject
const unsigned maxHist;
const unsigned pathHistBits;
+ protected:
std::vector tagTableTagWidths;
std::vector logTagTableSizes;
diff --git a/src/cpu/pred/tage_sc_l.cc b/src/cpu/pred/tage_sc_l.cc
index 2e24fc8786d..ef7af1e3af2 100644
--- a/src/cpu/pred/tage_sc_l.cc
+++ b/src/cpu/pred/tage_sc_l.cc
@@ -364,7 +364,7 @@ TAGE_SC_L::branchPlaceholder(ThreadID tid, Addr pc,
bp_history = (void*)(bi);
}
-bool
+Prediction
TAGE_SC_L::predict(ThreadID tid, Addr pc, bool cond_branch, void* &b)
{
TageSCLBranchInfo *bi = new TageSCLBranchInfo(*tage,
@@ -412,7 +412,7 @@ TAGE_SC_L::predict(ThreadID tid, Addr pc, bool cond_branch, void* &b)
// record final prediction
bi->lpBranchInfo->predTaken = pred_taken;
- return pred_taken;
+ return staticPrediction(pred_taken);
}
void
diff --git a/src/cpu/pred/tage_sc_l.hh b/src/cpu/pred/tage_sc_l.hh
index f00ae195ab8..b2aae2e95bd 100644
--- a/src/cpu/pred/tage_sc_l.hh
+++ b/src/cpu/pred/tage_sc_l.hh
@@ -170,7 +170,7 @@ class TAGE_SC_L: public LTAGE
public:
TAGE_SC_L(const TAGE_SC_LParams ¶ms);
- bool predict(
+ Prediction predict(
ThreadID tid, Addr branch_pc, bool cond_branch, void* &b) override;
void squash(ThreadID tid, void * &bp_history) override;
void update(ThreadID tid, Addr pc, bool taken, void * &bp_history,
diff --git a/src/cpu/pred/tagescl_ref.cc b/src/cpu/pred/tagescl_ref.cc
index 8a13365b62a..d61776d09cd 100644
--- a/src/cpu/pred/tagescl_ref.cc
+++ b/src/cpu/pred/tagescl_ref.cc
@@ -76,11 +76,11 @@ TageSCLRef::updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken,
}
-bool
+Prediction
TageSCLRef::lookup(ThreadID tid, Addr branch_addr, void * &bp_history)
{
auto pred = predictor->GetPrediction(branch_addr);
- return pred;
+ return staticPrediction(pred);
}
void
diff --git a/src/cpu/pred/tagescl_ref.hh b/src/cpu/pred/tagescl_ref.hh
index f1cb86e94b3..e9d89e8afaa 100644
--- a/src/cpu/pred/tagescl_ref.hh
+++ b/src/cpu/pred/tagescl_ref.hh
@@ -73,7 +73,7 @@ class TageSCLRef : public ConditionalPredictor
~TageSCLRef();
// Overriding interface functions
- bool lookup(ThreadID tid, Addr pc, void * &bp_history) override;
+ Prediction lookup(ThreadID tid, Addr pc, void * &bp_history) override;
// void updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken,
// Addr target, void * &bp_history) override;
diff --git a/src/cpu/pred/tournament.cc b/src/cpu/pred/tournament.cc
index bbedaf15504..96ba8a07ecb 100644
--- a/src/cpu/pred/tournament.cc
+++ b/src/cpu/pred/tournament.cc
@@ -149,7 +149,7 @@ TournamentBP::updateLocalHist(unsigned local_history_idx, bool taken)
(localHistoryTable[local_history_idx] << 1) | taken;
}
-bool
+Prediction
TournamentBP::lookup(ThreadID tid, Addr pc, void * &bp_history)
{
bool local_prediction;
@@ -188,9 +188,9 @@ TournamentBP::lookup(ThreadID tid, Addr pc, void * &bp_history)
// Select and return the prediction
// History update will be happen in the next function
if (choice_prediction) {
- return global_prediction;
+ return staticPrediction(global_prediction);
} else {
- return local_prediction;
+ return staticPrediction(local_prediction);
}
}
diff --git a/src/cpu/pred/tournament.hh b/src/cpu/pred/tournament.hh
index 8de9faa695e..5ccfb0f8898 100644
--- a/src/cpu/pred/tournament.hh
+++ b/src/cpu/pred/tournament.hh
@@ -72,7 +72,7 @@ class TournamentBP : public ConditionalPredictor
TournamentBP(const TournamentBPParams ¶ms);
// Base class methods.
- bool lookup(ThreadID tid, Addr pc, void* &bp_history) override;
+ Prediction lookup(ThreadID tid, Addr pc, void* &bp_history) override;
void updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken,
Addr target, const StaticInstPtr &inst,
void * &bp_history) override;
From a49a7ce2113b1ded537649ac4bbc7b067a6bd1c1 Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Fri, 9 May 2025 16:09:14 +0000
Subject: [PATCH 06/22] cpu: improve llbp, make condpred clockobject
---
src/cpu/pred/BranchPredictor.py | 10 +-
src/cpu/pred/conditional.cc | 2 +-
src/cpu/pred/conditional.hh | 6 +-
src/cpu/pred/llbp.cc | 487 ++++++++++++++++++++++++--------
src/cpu/pred/llbp.hh | 84 ++++--
5 files changed, 443 insertions(+), 146 deletions(-)
diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
index d8a3411cbac..f504d987c03 100644
--- a/src/cpu/pred/BranchPredictor.py
+++ b/src/cpu/pred/BranchPredictor.py
@@ -142,7 +142,7 @@ class SimpleBTB(BranchTargetBuffer):
)
-class ConditionalPredictor(SimObject):
+class ConditionalPredictor(ClockedObject):
type = "ConditionalPredictor"
cxx_class = "gem5::branch_prediction::ConditionalPredictor"
cxx_header = "cpu/pred/conditional.hh"
@@ -802,12 +802,12 @@ class LLBP(LTAGE):
cxx_class = "gem5::branch_prediction::LLBP"
cxx_header = "cpu/pred/llbp.hh"
- CTWidth = Param.Int(14, "RCR CTWidth")
- patternBufferCapacity = Param.Int(16, "Pattern Buffer Capacity")
- contextCapacity = Param.Int(1 << 4, "Context Capacity")
- storageCapacity = Param.Int(1 << 16, "Storage Capacity")
+ tagWidthBits = Param.Int(14, "RCR CTWidth")
+ patternBufferCapacity = Param.Int(64, "Pattern Buffer Capacity")
+ storageCapacity = Param.Int(14000, "Storage Capacity")
ptnCounterBits = Param.Int(3, "Bits in Pattern Counter")
ctxCounterBits = Param.Int(2, "Bits in Context Replacement Counter")
+ backingStorageLatency = Param.Cycles(6, "Backing Storage Latency")
class MultiperspectivePerceptron(ConditionalPredictor):
diff --git a/src/cpu/pred/conditional.cc b/src/cpu/pred/conditional.cc
index 2ff5e885d4e..75070b7db3c 100644
--- a/src/cpu/pred/conditional.cc
+++ b/src/cpu/pred/conditional.cc
@@ -35,7 +35,7 @@ namespace branch_prediction
{
ConditionalPredictor::ConditionalPredictor(const Params ¶ms)
- : SimObject(params),
+ : ClockedObject(params),
instShiftAmt(params.instShiftAmt),
staticLatency(params.latency)
{
diff --git a/src/cpu/pred/conditional.hh b/src/cpu/pred/conditional.hh
index 566ab74a6ed..c7d5a6d208a 100644
--- a/src/cpu/pred/conditional.hh
+++ b/src/cpu/pred/conditional.hh
@@ -46,7 +46,7 @@
#include "cpu/inst_seq.hh"
#include "cpu/pred/branch_type.hh"
#include "params/ConditionalPredictor.hh"
-#include "sim/sim_object.hh"
+#include "sim/clocked_object.hh"
namespace gem5
{
@@ -54,7 +54,7 @@ namespace gem5
namespace branch_prediction
{
-class ConditionalPredictor : public SimObject
+class ConditionalPredictor : public ClockedObject
{
public:
@@ -161,4 +161,4 @@ class ConditionalPredictor : public SimObject
} // namespace branch_prediction
} // namespace gem5
-#endif // __CPU_PRED_CONDITIONAL_BASE_HH__
+#endif //__CPU_PRED_CONDITIONAL_BASE_HH__
diff --git a/src/cpu/pred/llbp.cc b/src/cpu/pred/llbp.cc
index 891c130985e..5a1bfdbf90c 100644
--- a/src/cpu/pred/llbp.cc
+++ b/src/cpu/pred/llbp.cc
@@ -27,8 +27,8 @@ Implementation of the last-level branch predictor (LLBP).
*/
#include "cpu/pred/llbp.hh"
-
#include "debug/LLBP.hh"
+#include
namespace gem5
{
@@ -38,21 +38,19 @@ namespace branch_prediction
LLBP::LLBP(const LLBPParams ¶ms)
: LTAGE(params),
- contextCapacity(params.contextCapacity),
patternBufferCapacity(params.patternBufferCapacity),
storageCapacity(params.storageCapacity),
ctxCounterBits(params.ctxCounterBits),
ptnCounterBits(params.ptnCounterBits),
- rcr(3, 8, 8, 2, params.CTWidth)
+ backingStorageLatency(params.backingStorageLatency),
+ stats(this),
+ rcr(3, 8, 8, 2, params.tagWidthBits)
{
- storagePriority.resize(1 << ctxCounterBits);
DPRINTF(LLBP, "Using experimental LLBP\n");
- DPRINTF(LLBP, "RCR: T=%d, W=%d, D=%d, S=%d, CTWidth=%d\n",
- rcr.T, rcr.W, rcr.D, rcr.S, params.CTWidth);
- DPRINTF(LLBP, "Storage: cap=%d, bits=%d, PQ=%d\n",
- storageCapacity, ctxCounterBits, storagePriority.size());
- DPRINTF(LLBP, "Context: cap=%d, bits=%d\n",
- contextCapacity, ptnCounterBits);
+ DPRINTF(LLBP, "RCR: T=%d, W=%d, D=%d, S=%d, tagWidthBits=%d\n",
+ rcr.T, rcr.W, rcr.D, rcr.S, params.tagWidthBits);
+ DPRINTF(LLBP, "Storage: cap=%d, bits=%d\n",
+ storageCapacity, ctxCounterBits);
}
void
@@ -64,6 +62,12 @@ LLBP::init()
void
LLBP::squash(ThreadID tid, void *&bp_history)
{
+    LLBPBranchInfo *bi = static_cast<LLBPBranchInfo *>(bp_history);
+ if (bi->overridden) {
+ if (!bi->avenged) {
+ stats.squashedOverrides++;
+ }
+ }
// TODO: No squash in LLBP?
LTAGE::squash(tid, bp_history);
}
@@ -73,14 +77,37 @@ LLBP::update(ThreadID tid, Addr pc, bool taken,
void *&bp_history, bool squashed,
const StaticInstPtr &inst, Addr target)
{
+ assert(bp_history);
     LLBPBranchInfo *bi = static_cast<LLBPBranchInfo *>(bp_history);
- if (!inst->isUncondCtrl())
- {
+    LTageBranchInfo *ltage_bi = static_cast<LTageBranchInfo *>(bi);
+
+ if (inst->isUncondCtrl()) {
+ // Insert the next prefetch context into the pattern buffer
+ if (backingStorage.count(rcr.getPCID()))
+ {
+ uint64_t pcid = rcr.getPCID();
+ if (patternBuffer.count(rcr.getPCID()) == 0) {
+ ++stats.prefetchesIssued;
+ patternBuffer[pcid] = curCycle();
+
+ patternBufferQueue.push_back(pcid);
+ if (patternBufferQueue.size() >= patternBufferCapacity)
+ {
+ patternBuffer.erase(patternBufferQueue.front());
+ patternBufferQueue.pop_front();
+ ++stats.patternBufferEvictions;
+ }
+ }
+ }
+ } else {
+ if (squashed && !bi->avenged) {
+ rcr.restore(bi->rcrBackup);
+ }
storageUpdate(tid, pc, rcr.getCCID(), taken, bi);
}
rcr.update(pc, inst, taken);
- LTAGE::update(tid, pc, taken, bp_history, squashed, inst, target);
+ lTageUpdate(tid, pc, taken, ltage_bi, squashed, inst, target);
DPRINTF(LLBP, "[thread %d] Updated %s on %lld (%s): ccid=%llu, counter=%d,"
"uncond=%s, sz=%d\n",
@@ -91,129 +118,268 @@ LLBP::update(ThreadID tid, Addr pc, bool taken,
rcr.getCCID(),
-1,
inst->isUncondCtrl() ? "true" : "false",
- storage.size());
+ backingStorage.size());
+
+ if (!squashed) {
+ delete bi;
+ bp_history = nullptr;
+ }
+}
+
+void
+LLBP::branchPlaceholder(ThreadID tid, Addr pc,
+ bool uncond, void * &bpHistory)
+{
+ LLBPBranchInfo *bi = new LLBPBranchInfo(*tage, *loopPredictor,
+ pc, !uncond);
+ bpHistory = (void*)(bi);
+}
+
+void
+LLBP::lTageUpdate(ThreadID tid, Addr pc, bool taken, LTageBranchInfo * bi,
+ bool squashed, const StaticInstPtr & inst, Addr target)
+{
+ assert(bi);
+
+ if (squashed) {
+ if (tage->isSpeculativeUpdateEnabled()) {
+ // This restores the global history, then update it
+ // and recomputes the folded histories.
+ tage->squash(tid, taken, target, inst, bi->tageBranchInfo);
+
+ if (bi->tageBranchInfo->condBranch) {
+ loopPredictor->squashLoop(bi->lpBranchInfo);
+ }
+ }
+ return;
+ }
+
+ int nrand = rng->random() & 3;
+ if (bi->tageBranchInfo->condBranch) {
+ DPRINTF(LLBP, "Updating tables for branch:%lx; taken?:%d\n",
+ pc, taken);
+ tage->updateStats(taken, bi->tageBranchInfo);
+
+ loopPredictor->updateStats(taken, bi->lpBranchInfo);
+
+ loopPredictor->condBranchUpdate(tid, pc, taken,
+ bi->tageBranchInfo->tagePred, bi->lpBranchInfo, instShiftAmt);
+
+ tage->condBranchUpdate(tid, pc, taken, bi->tageBranchInfo,
+ nrand, target, bi->lpBranchInfo->predTaken);
+ }
+
+ tage->updateHistories(tid, pc, false, taken, target,
+ inst, bi->tageBranchInfo);
+
}
Prediction
LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
{
Addr pc = branch_pc;
- bool tage_prediction = LTAGE::predict(
- tid, branch_pc, cond_branch, b
- ).taken;
-    LLBPBranchInfo *bi = static_cast<LLBPBranchInfo *>(b);
+
+ LLBPBranchInfo *bi = new LLBPBranchInfo(*tage, *loopPredictor, pc, cond_branch);
+    LTageBranchInfo *ltage_bi = static_cast<LTageBranchInfo *>(bi);
+ b = (void*)(bi);
+
+ Prediction ltage_prediction = lTagePredict(
+ tid, branch_pc, cond_branch, ltage_bi
+ );
+
auto tage_bi = bi->tageBranchInfo;
bi->overridden = false;
- bi->pred_taken = tage_prediction;
+ bi->base_pred = ltage_prediction.taken;
- int8_t llbp_confidence = 127;
+ int8_t llbp_confidence = 0;
if (cond_branch)
{
+ rcr.backup(bi->rcrBackup);
int tage_bank = tage->nHistoryTables;
if (tage_bi->provider == TAGEBase::TAGE_LONGEST_MATCH)
tage_bank = tage_bi->hitBank;
if (tage_bi->provider == TAGEBase::TAGE_ALT_MATCH)
tage_bank = tage_bi->altBank;
auto ccid = rcr.getCCID();
- if (storage.count(ccid))
+ if (backingStorage.count(ccid))
{
- int i = findBestPattern(tid, pc, storage[ccid]);
- if (i >= 0)
- {
- llbp_hits++;
- llbp_confidence = storage[ccid].patterns[i].counter;
- bool llbp_prediction = llbp_confidence >= 0;
- if (i > tage_bank)
- {
- bi->overridden = true;
- bi->pred_taken = llbp_prediction;
- overrides++;
+ if (patternBuffer.count(ccid)) {
+ Cycles additionalLatency = calculateRemainingLatency(patternBuffer[ccid]);
+ if (additionalLatency == 0) {
+ int i = findBestPattern(tid, pc, backingStorage[ccid]);
+ if (i >= 0)
+ {
+ int key = tage->gtag(tid, pc, i);
+ ++stats.demandHitsTotal;
+ llbp_confidence = backingStorage[ccid].patterns[key].counter;
+ bool llbp_prediction = llbp_confidence >= 0;
+ if (i > tage_bank)
+ {
+ ++stats.demandHitsOverride;
+ bi->overridden = true;
+ bi->llbp_pred = llbp_prediction;
+ } else {
+ ++stats.demandHitsNoOverride;
+ }
+ } else {
+ ++stats.demandMissesNoPattern;
+ ++stats.demandMissesTotal;
+ }
+ } else {
+ ++stats.demandMissesTotal;
}
+ } else {
+ ++stats.demandMissesNoPrefetch;
+ ++stats.demandMissesTotal;
}
+ } else {
+ ++stats.demandMissesCold;
+ ++stats.demandMissesTotal;
+ }
+ }
+ Cycles latency = ltage_prediction.latency;
+ return Prediction {.taken = bi->getPrediction(), .latency = latency};
+}
+
+
+//prediction
+Prediction
+LLBP::lTagePredict(ThreadID tid, Addr branch_pc, bool cond_branch, LTageBranchInfo* bi)
+{
+ bool pred_taken = tage->tagePredict(tid, branch_pc, cond_branch,
+ bi->tageBranchInfo);
+
+ pred_taken = loopPredictor->loopPredict(tid, branch_pc, cond_branch,
+ bi->lpBranchInfo, pred_taken,
+ instShiftAmt);
+ if (cond_branch) {
+ if (bi->lpBranchInfo->loopPredUsed) {
+ bi->tageBranchInfo->provider = LOOP;
}
- DPRINTF(LLBP, "[thread %d] Predicted %s%s on %lld: overides=%d, "
- "llbp_hits=%d, confidence=%d, hitBank=%d, ccid=%llu\n",
- tid,
- bi->pred_taken ? "true" : "false",
- bi->overridden ? "'" : "",
- branch_pc,
- overrides,
- llbp_hits,
- llbp_confidence,
- bi->tageBranchInfo->hitBank,
- ccid);
+ DPRINTF(LLBP, "Predict for %lx: taken?:%d, loopTaken?:%d, "
+ "loopValid?:%d, loopUseCounter:%d, tagePred:%d, altPred:%d\n",
+ branch_pc, pred_taken, bi->lpBranchInfo->loopPred,
+ bi->lpBranchInfo->loopPredValid,
+ loopPredictor->getLoopUseCounter(),
+ bi->tageBranchInfo->tagePred, bi->tageBranchInfo->altTaken);
}
- return staticPrediction(bi->pred_taken);
+
+ // record final prediction
+ bi->lpBranchInfo->predTaken = pred_taken;
+
+ return staticPrediction(pred_taken);
}
-int8_t LLBP::normalize(int8_t counter)
+
+int8_t LLBP::absPredCounter(int8_t counter)
{
return counter >= 0 ? counter : -counter - 1;
}
+/**
+ * Update LLBP with the real outcome of a branch.
+ * It is currently assumed that the PB still contains the
+ * corresponding pattern set (no access latency applied).
+ * The context is created if it does not exist yet.
+ * The pattern is updated / a longer pattern is allocated.
+ *
+ * @param tid Thread ID
+ * @param pc Program counter
+ * @param cid Context ID
+ * @param taken Whether the branch was taken
+ * @param bi Branch info
+ */
void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken,
LLBPBranchInfo *bi)
{
- if (storage.count(cid))
+ // Check whether the current context is known
+ // If not, we create a new context
+ if (backingStorage.count(cid))
{
- storagePriority[storage[cid].replace].erase(cid);
- int i = findBestPattern(tid, pc, storage[cid]);
+ auto context = backingStorage[cid];
+ // The best pattern is always the one that was used to predict
+ // the branch. Both the pattern and the context are updated.
+ int i = findBestPattern(tid, pc, context);
if (i >= 0)
{
- TAGEBase::ctrUpdate(storage[cid].patterns[i].counter, taken,
+ int key = tage->gtag(tid, pc, i);
+ TAGEBase::ctrUpdate(context.patterns[key].counter, taken,
ptnCounterBits);
- if (storage[cid].patterns[i].counter == (taken ? 1 : -2))
+ if (context.patterns[key].counter == (taken ? 1 : -2))
{
- // entry became medium confident
- TAGEBase::unsignedCtrUpdate(storage[cid].replace, true,
+ // Context is now medium confidence
+ TAGEBase::unsignedCtrUpdate(backingStorage[cid].confidence, true,
ctxCounterBits);
}
- else if (storage[cid].patterns[i].counter == (taken ? -1 : 0))
+ else if (context.patterns[key].counter == (taken ? -1 : 0))
{
- // entry became low confident
- TAGEBase::unsignedCtrUpdate(storage[cid].replace, false,
+ // Context is now low confidence
+ TAGEBase::unsignedCtrUpdate(context.confidence, false,
ctxCounterBits);
}
}
- i = findVictimPattern(i, storage[cid]);
- if (i >= 0 && bi->overridden && bi->pred_taken != taken)
- {
- storage[cid].patterns[i].tag = tage->gtag(tid, pc, i);
- storage[cid].patterns[i].counter = taken ? 0 : -1;
+
+ // If a misprediction occurs, we allocate a new pattern with longer history
+ // in the context. The pattern with the weakest confidence is replaced.
+ // This only happens if the LLBP was the provider.
+ if (bi->overridden) {
+ if (bi->getPrediction() != taken) {
+ if (!bi->avenged) {
+ ++stats.wrongOverridesTotal;
+ if (bi->llbp_pred == bi->base_pred) {
+ ++stats.wrongOverridesIdentical;
+ }
+ bi->avenged = true;
+ }
+ if (i < tage->nHistoryTables - 1) {
+ int choosei = std::min(i + 1, (int) tage->nHistoryTables - 1);
+ context.patterns[tage->gtag(tid, pc, choosei)].counter = taken ? 0 : -1;
+ }
+ } else {
+ ++stats.correctOverridesTotal;
+ if (bi->llbp_pred == bi->base_pred) {
+ ++stats.correctOverridesIdentical;
+ }
+ }
}
}
else
{
- while (storage.size() >= storageCapacity)
+ while (backingStorage.size() >= storageCapacity)
{
+ ++stats.backingStorageEvictions;
uint64_t i = findVictimContext();
- storagePriority[storage[i].replace].erase(i);
- storage.erase(i);
+ backingStorage.erase(i);
}
- storage[cid] = LLBP::Context();
- storage[cid].patterns.resize(tage->nHistoryTables + 1);
+
+ ++stats.backingStorageInsertions;
+ backingStorage[cid] = LLBP::Context();
+ // Fill the context with initial patterns
+ // (only this branch, in different history lengths)
for (int i = 1; i <= tage->nHistoryTables; i++)
{
- storage[cid].patterns[i].tag = tage->gtag(tid, pc, i);
- storage[cid].patterns[i].counter = taken ? 0 : -1;
+ backingStorage[cid].patterns[tage->gtag(tid, pc, i)].counter = taken ? 0 : -1;
}
}
- if (std::find(patternBuffer.begin(), patternBuffer.end(), cid) ==
- patternBuffer.end())
- {
- patternBuffer.push_back(cid);
- if (patternBuffer.size() > patternBufferCapacity)
- patternBuffer.pop_front();
- }
- storagePriority[storage[cid].replace].insert(cid);
}
+/**
+ * Find the best (=longest) pattern in the context.
+ * The context is searched by comparing the pc tag with decreasing length
+ * (similar to the TAGE predictor).
+ *
+ * @param tid Thread ID
+ * @param pc Program counter
+ * @param ctx Context to search in
+ * @return Index of the best pattern, or -1 if not found
+ */
int LLBP::findBestPattern(ThreadID tid, Addr pc, Context &ctx)
{
for (int i = tage->nHistoryTables; i > 0; i--)
{
- if (ctx.patterns[i].tag == tage->gtag(tid, pc, i))
+ int key = tage->gtag(tid, pc, i);
+ if (ctx.patterns.count(key))
{
return i;
}
@@ -221,12 +387,20 @@ int LLBP::findBestPattern(ThreadID tid, Addr pc, Context &ctx)
return -1;
}
+/**
+ * Find the pattern with the lowest confidence in the context.
+ * Patterns at history lengths <= {min} or with high confidence are not considered.
+ *
+ * @param min Minimum history-length index; entries at or below it are skipped
+ * @param ctx Context to search in
+ * @return Index of the pattern with the lowest confidence, or -1 if not found
+ */
int LLBP::findVictimPattern(int min, Context &ctx)
{
int min_conf = 1 << (ptnCounterBits - 2);
for (int i = (min >= 0 ? min + 1 : 1); i <= tage->nHistoryTables; i++)
{
- if (normalize(ctx.patterns[i].counter) < min_conf)
+ if (absPredCounter(ctx.patterns[i].counter) < min_conf)
{
return i;
}
@@ -234,56 +408,72 @@ int LLBP::findVictimPattern(int min, Context &ctx)
return -1;
}
+/**
+ * Find the context with the lowest confidence.
+ * @return ID of the context with the lowest confidence, or (uint64_t)-1 if none exists
+ */
uint64_t LLBP::findVictimContext()
{
- for (int i = 0; i < storagePriority.size(); i++)
- {
- if (storagePriority.size())
+ auto elem = std::min_element(
+ backingStorage.begin(), backingStorage.end(),
+ [](const auto &a, const auto &b)
{
- return *storagePriority[i].begin();
- }
+ return a.second.confidence < b.second.confidence;
+ });
+ if (elem == backingStorage.end())
+ {
+ return -1;
}
- return -1;
+ return elem->first;
+}
+
+Cycles LLBP::calculateRemainingLatency(Cycles insertTime) {
+ Cycles passedTime = (curCycle() - insertTime);
+ if (passedTime >= backingStorageLatency)
+ return Cycles(0);
+ return backingStorageLatency - passedTime;
}
/* from LLBP source code: */
LLBP::RCR::RCR(int _T, int _W, int _D, int _shift, int _CTWidth)
- : CTWidth(_CTWidth), T(_T), W(_W), D(_D), S(_shift)
+ : tagWidthBits(_CTWidth), T(_T), W(_W), D(_D), S(_shift)
{
- bb[0].resize(maxwindow);
- bb[1].resize(maxwindow);
+ bb.resize(maxwindow);
ctxs = {0, 0};
}
-/*
- * Given the {n} number of branches starting from vec[end-start]
- * to vec[end-start-n-1] we create the hash function by shifting
- * each PC by {shift} number if bits i.e.
- *
- * 000000000000| PC | :vec[end-start]
- * ^ 0000000000| PC |00 :vec[end-start-1]
- * ^ 00000000| PC |0000 :vec[end-start-2]
- * . .
- * . .
- * . .
- * ^ | PC |000000000000 :vec[end-start-n-1]
- * ----------------------
- * final hash value
- *
+/**
+ * Given {n} number of branches starting from the end of the RCR (front of the vec)
+ * (minus {skip} # of branches) we create the hash function by shifting
+ * each PC by {shift} number of bits i.e.
+ *
+ * 000000000000| PC | :vec[end-skip]
+ * ^ 0000000000| PC |00 :vec[end-skip-1]
+ * ^ 00000000| PC |0000 :vec[end-skip-2]
+ * . .
+ * . .
+ * . .
+ * ^ | PC |000000000000 :vec[end-skip-n-1]
+ * ----------------------
+ * final hash value
+ * Then, the hash value is wrapped to the size of the context tag:
+ * @return final hash value % 2^tagWidthBits
*/
uint64_t
LLBP::RCR::calcHash(std::list<Addr> &vec, int n,
- int start, int shift)
+ int skip, int shift)
{
uint64_t hash = 0;
- if (vec.size() < (start + n))
+ if (vec.size() < (skip + n))
{
return 0;
}
+
+ // Compute the rolling hash in element order (newer branches at the front)
uint64_t sh = 0;
auto it = vec.begin();
- std::advance(it, start);
+ std::advance(it, skip);
for (; (it != vec.end()) && (n > 0); it++, n--)
{
uint64_t val = *it;
@@ -292,29 +482,26 @@ LLBP::RCR::calcHash(std::list &vec, int n,
hash ^= val << uint64_t(sh);
sh += shift;
- if (sh >= CTWidth)
+ if (sh >= tagWidthBits)
{
- sh -= uint64_t(CTWidth);
+ sh -= uint64_t(tagWidthBits);
}
}
- return hash & ((1 << CTWidth) - 1);
+ return moduloTwoExp(hash, tagWidthBits);
}
uint64_t LLBP::RCR::getCCID()
{
- return ctxs.ccid & ((1ULL << CTWidth) - 1);
-}
+ return moduloTwoExp(ctxs.ccid, tagWidthBits);
+} // Hash of all branches
uint64_t LLBP::RCR::getPCID()
{
- return ctxs.pcid & ((1ULL << CTWidth) - 1);
+ return moduloTwoExp(ctxs.pcid, tagWidthBits);
}
bool LLBP::RCR::update(Addr pc, const StaticInstPtr &inst, bool taken)
{
- // Hash of all branches
- branchCount++;
-
bool update = false;
switch (T)
@@ -344,24 +531,84 @@ bool LLBP::RCR::update(Addr pc, const StaticInstPtr &inst, bool taken)
if (update)
{
// Add the new branch to the history
- bb[0].push_front(pc);
- bb[1].push_front(pc);
+ bb.push_front(pc);
// Remove the oldest branch
- bb[0].pop_back();
- bb[1].pop_back();
+ bb.pop_back();
// The current context.
- ctxs.ccid = calcHash(bb[0], W, D, S);
+ ctxs.ccid = calcHash(bb, W, D, S);
// The prefetch context.
- ctxs.pcid = calcHash(bb[0], W, 0, S);
+ ctxs.pcid = calcHash(bb, W, 0, S);
- // DPRINTF(LLBP, "UH:%llx, %i\n", pc, taken);
return true;
}
return false;
}
+void LLBP::RCR::backup(std::list<Addr> &vec)
+{
+ vec.clear();
+ int count = 0;
+ for (auto it = bb.begin(); it != bb.end(); ++it)
+ {
+ if (count == D + W - 1) {
+ break;
+ }
+ vec.push_back(*it);
+ count++;
+ }
+}
+
+void LLBP::RCR::restore(std::list<Addr> &vec)
+{
+ bb.clear();
+ for (auto it = vec.begin(); it != vec.end(); ++it)
+ {
+ bb.push_back(*it);
+ }
+ // The current context.
+ ctxs.ccid = calcHash(bb, W, D, S);
+ // The prefetch context.
+ ctxs.pcid = calcHash(bb, W, 0, S);
+}
+
+LLBP::LLBPStats::LLBPStats(LLBP *llbp)
+ : statistics::Group(llbp),
+ ADD_STAT(prefetchesIssued, statistics::units::Count::get(),
+ "Number of prefetches issued to the backing storage"),
+ ADD_STAT(demandHitsTotal, statistics::units::Count::get(),
+ "Total on-demand hits to the pattern buffer"),
+ ADD_STAT(demandHitsOverride, statistics::units::Count::get(),
+ "On-demand hits to the pattern buffer with LLBP overriding the base predictor"),
+ ADD_STAT(demandHitsNoOverride, statistics::units::Count::get(),
+ "On-demand hits to the pattern buffer, using the base predictor (LLBP dropped)"),
+ ADD_STAT(demandMissesTotal, statistics::units::Count::get(),
+ "Total on-demand misses to the pattern buffer"),
+ ADD_STAT(demandMissesNoPattern, statistics::units::Count::get(),
+ "On-demand misses to the pattern buffer, the chosen pattern-set did not contain the needed pattern"),
+ ADD_STAT(demandMissesNoPrefetch, statistics::units::Count::get(),
+ "On-demand misses to the pattern buffer where the context was not scheduled for insertion"),
+ ADD_STAT(demandMissesCold, statistics::units::Count::get(),
+ "On-demand misses to the pattern buffer where the context was not in the backing storage"),
+ ADD_STAT(patternBufferEvictions, statistics::units::Count::get(),
+ "Number of pattern sets evicted from the pattern buffer due to capacity limits"),
+ ADD_STAT(backingStorageEvictions, statistics::units::Count::get(),
+ "Number of pattern sets evicted from the backing storage due to capacity limits"),
+ ADD_STAT(backingStorageInsertions, statistics::units::Count::get(),
+ "Number of pattern sets inserted into the backing storage (including replacements)"),
+ ADD_STAT(correctOverridesTotal, statistics::units::Count::get(),
+ "Number of branches predicted correctly by LLBP (LLBP was provider)"),
+ ADD_STAT(correctOverridesIdentical, statistics::units::Count::get(),
+ "Number of branches predicted correctly by LLBP, but the base predictor would also be correct"),
+ ADD_STAT(wrongOverridesTotal, statistics::units::Count::get(),
+ "Number of branches predicted wrong by LLBP (LLBP was provider)"),
+ ADD_STAT(wrongOverridesIdentical, statistics::units::Count::get(),
+ "Number of branches predicted wrong by LLBP, but the base predictor would also be wrong"),
+ ADD_STAT(squashedOverrides, statistics::units::Count::get(),
+ "Number of branches predicted by LLBP, but squashed before the outcome was known")
+ {}
+
} // namespace branch_prediction
} // namespace gem5
diff --git a/src/cpu/pred/llbp.hh b/src/cpu/pred/llbp.hh
index f4f66eeb393..43681d86b2d 100644
--- a/src/cpu/pred/llbp.hh
+++ b/src/cpu/pred/llbp.hh
@@ -37,6 +37,7 @@ Implementation of the last-level branch predictor (LLBP).
#include "base/cache/associative_cache.hh"
#include "base/cache/cache_entry.hh"
+#include "base/statistics.hh"
#include "base/types.hh"
#include "cpu/pred/ltage.hh"
#include "params/LLBP.hh"
@@ -58,19 +59,32 @@ class LLBP : public LTAGE
const StaticInstPtr & inst, Addr target) override;
void init() override;
+ void branchPlaceholder(ThreadID tid, Addr pc,
+ bool uncond, void * &bpHistory) override;
protected:
+ Cycles calculateRemainingLatency(Cycles insertTime);
+
struct LLBPBranchInfo : public LTageBranchInfo
{
bool overridden;
- bool pred_taken;
+ bool llbp_pred;
+ bool base_pred;
+ bool avenged;
+ std::list<Addr> rcrBackup;
+
LLBPBranchInfo(TAGEBase &tage, LoopPredictor &lp,
Addr pc, bool conditional)
: LTageBranchInfo(tage, lp, pc, conditional),
- overridden(false)
+ overridden(false),
+ avenged(false)
{}
+ bool getPrediction() {
+ return overridden ? llbp_pred : base_pred;
+ }
+
virtual ~LLBPBranchInfo()
{}
};
@@ -78,32 +92,43 @@ class LLBP : public LTAGE
Prediction predict(
ThreadID tid, Addr branch_pc, bool cond_branch, void* &b) override;
+
+ Prediction lTagePredict(
+ ThreadID tid, Addr branch_pc, bool cond_branch, LTageBranchInfo* b);
+
+ void lTageUpdate(ThreadID tid, Addr pc, bool taken, LTageBranchInfo * bi,
+ bool squashed, const StaticInstPtr & inst, Addr target);
struct Pattern
{
- int tag;
- // direction & whether to replace if low on space in a context
+ //* hysteresis counter: > 0 = taken, < 0 = not taken
int8_t counter;
};
struct Context
{
- std::vector<Pattern> patterns;
- uint8_t replace; // whether to replace context in the storage
+ std::unordered_map<int, Pattern> patterns;
+ /** Confidence counter of the context (guides replacement) */
+ uint8_t confidence;
};
- std::unordered_map<uint64_t, Context> storage;
- std::vector<std::set<uint64_t>> storagePriority;
- std::deque<uint64_t> patternBuffer;
+ struct PatternBufferEntry
+ {
+ uint64_t cid;
+ Cycles insertTime;
+ };
+
+ std::unordered_map<uint64_t, Context> backingStorage;
+ std::unordered_map<uint64_t, Cycles> patternBuffer;
+ std::deque<uint64_t> patternBufferQueue;
- int contextCapacity;
int patternBufferCapacity;
int storageCapacity;
int ctxCounterBits;
int ptnCounterBits;
- int overrides = 0;
- int llbp_hits = 0;
- int8_t normalize(int8_t counter);
+ Cycles backingStorageLatency;
+
+ int8_t absPredCounter(int8_t counter);
void storageUpdate(
ThreadID tid, Addr pc, uint64_t cid, bool taken, LLBPBranchInfo* bi);
void storageInvalidate();
@@ -111,6 +136,28 @@ class LLBP : public LTAGE
int findVictimPattern(int min, Context& ctx);
uint64_t findVictimContext();
+ struct LLBPStats : public statistics::Group
+ {
+ LLBPStats(LLBP *llbp);
+
+ statistics::Scalar prefetchesIssued;
+ statistics::Scalar demandHitsTotal;
+ statistics::Scalar demandHitsOverride;
+ statistics::Scalar demandHitsNoOverride;
+ statistics::Scalar demandMissesTotal;
+ statistics::Scalar demandMissesNoPattern;
+ statistics::Scalar demandMissesNoPrefetch;
+ statistics::Scalar demandMissesCold;
+ statistics::Scalar patternBufferEvictions;
+ statistics::Scalar backingStorageEvictions;
+ statistics::Scalar backingStorageInsertions;
+ statistics::Scalar correctOverridesTotal;
+ statistics::Scalar correctOverridesIdentical;
+ statistics::Scalar wrongOverridesTotal;
+ statistics::Scalar wrongOverridesIdentical;
+ statistics::Scalar squashedOverrides;
+ } stats;
+
/* From LLBP Source Code */
@@ -122,10 +169,10 @@ class LLBP : public LTAGE
int start=0, int shift=0);
// The context tag width
- const int CTWidth;
+ const int tagWidthBits;
// A list of previouly taken branches
- std::list<Addr> bb[10];
+ std::list<Addr> bb;
// We compute the context ID and prefetch context ID
// only when the content of the RCR changes.
@@ -135,8 +182,6 @@ class LLBP : public LTAGE
uint64_t pcid = 0;
} ctxs;
- int branchCount = 0;
-
public:
// The hash constants
@@ -147,6 +192,11 @@ class LLBP : public LTAGE
// Push a new branch into the RCR.
bool update(Addr pc, const StaticInstPtr & inst, bool taken);
+ // Save the RCR state into a list
+ void backup(std::list<Addr> &vec);
+
+ // Restore the RCR state from a list
+ void restore(std::list<Addr> &vec);
/**
* Computes the modulo of a number val with respect to 2^exp
From 2ef1f9a1aa5d46b315f0a74806e0f15b91aa5f27 Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Wed, 14 May 2025 11:45:37 +0000
Subject: [PATCH 07/22] cpu: llbp to composition, spec-hu
---
src/cpu/pred/BranchPredictor.py | 3 +-
src/cpu/pred/llbp.cc | 235 ++++++++++++++------------------
src/cpu/pred/llbp.hh | 45 +++---
src/cpu/pred/ltage.hh | 1 -
src/cpu/pred/tage.hh | 9 ++
5 files changed, 142 insertions(+), 151 deletions(-)
diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
index f504d987c03..37ec9117250 100644
--- a/src/cpu/pred/BranchPredictor.py
+++ b/src/cpu/pred/BranchPredictor.py
@@ -797,11 +797,12 @@ class TAGE_SC_L_8KB(TAGE_SC_L):
statistical_corrector = TAGE_SC_L_8KB_StatisticalCorrector()
-class LLBP(LTAGE):
+class LLBP(ConditionalPredictor):
type = "LLBP"
cxx_class = "gem5::branch_prediction::LLBP"
cxx_header = "cpu/pred/llbp.hh"
+ base = Param.LTAGE("Base predictor")
tagWidthBits = Param.Int(14, "RCR CTWidth")
patternBufferCapacity = Param.Int(64, "Pattern Buffer Capacity")
storageCapacity = Param.Int(14000, "Storage Capacity")
diff --git a/src/cpu/pred/llbp.cc b/src/cpu/pred/llbp.cc
index 5a1bfdbf90c..2450618f5ef 100644
--- a/src/cpu/pred/llbp.cc
+++ b/src/cpu/pred/llbp.cc
@@ -37,7 +37,8 @@ namespace branch_prediction
{
LLBP::LLBP(const LLBPParams ¶ms)
- : LTAGE(params),
+ : ConditionalPredictor(params),
+ base(params.base),
patternBufferCapacity(params.patternBufferCapacity),
storageCapacity(params.storageCapacity),
ctxCounterBits(params.ctxCounterBits),
@@ -56,7 +57,6 @@ LLBP::LLBP(const LLBPParams ¶ms)
void
LLBP::init()
{
- LTAGE::init();
}
void
@@ -69,7 +69,9 @@ LLBP::squash(ThreadID tid, void *&bp_history)
}
}
// TODO: No squash in LLBP?
- LTAGE::squash(tid, bp_history);
+ base->squash(tid, bi->ltage_bi);
+ delete bi;
+ bp_history = nullptr;
}
void
@@ -79,47 +81,35 @@ LLBP::update(ThreadID tid, Addr pc, bool taken,
{
assert(bp_history);
 LLBPBranchInfo *bi = static_cast<LLBPBranchInfo*>(bp_history);
- LTageBranchInfo *ltage_bi = static_cast<LTageBranchInfo*>(bi);
- if (inst->isUncondCtrl()) {
- // Insert the next prefetch context into the pattern buffer
- if (backingStorage.count(rcr.getPCID()))
- {
- uint64_t pcid = rcr.getPCID();
- if (patternBuffer.count(rcr.getPCID()) == 0) {
- ++stats.prefetchesIssued;
- patternBuffer[pcid] = curCycle();
+ if (squashed && !bi->avenged && bi->rcrBackup.size()) {
+ rcr.restore(bi->rcrBackup);
+ }
- patternBufferQueue.push_back(pcid);
- if (patternBufferQueue.size() >= patternBufferCapacity)
- {
- patternBuffer.erase(patternBufferQueue.front());
- patternBufferQueue.pop_front();
- ++stats.patternBufferEvictions;
- }
- }
- }
- } else {
- if (squashed && !bi->avenged) {
- rcr.restore(bi->rcrBackup);
- }
+ if (inst->isCondCtrl())
storageUpdate(tid, pc, rcr.getCCID(), taken, bi);
- }
- rcr.update(pc, inst, taken);
- lTageUpdate(tid, pc, taken, ltage_bi, squashed, inst, target);
+ base->update(tid, pc, taken, bi->ltage_bi, squashed, inst, target);
+
+ std::string rcr_cont = "";
+
+ for (auto v: rcr.bb)
+ rcr_cont.append(std::to_string(v) + " | ");
+
- DPRINTF(LLBP, "[thread %d] Updated %s on %lld (%s): ccid=%llu, counter=%d,"
- "uncond=%s, sz=%d\n",
+ DPRINTF(LLBP, "CUPDATE @ %lld [thread %d] Updated %s on %lld (%s): ccid=%llu, "
+ "uncond=%s, sz=%d, RCR: %s\n",
+ bi,
tid,
taken ? "true" : "false",
pc,
inst->getName().c_str(),
rcr.getCCID(),
- -1,
inst->isUncondCtrl() ? "true" : "false",
- backingStorage.size());
-
+ backingStorage.size(),
+ rcr_cont
+ );
+
if (!squashed) {
delete bi;
bp_history = nullptr;
@@ -130,48 +120,16 @@ void
LLBP::branchPlaceholder(ThreadID tid, Addr pc,
bool uncond, void * &bpHistory)
{
- LLBPBranchInfo *bi = new LLBPBranchInfo(*tage, *loopPredictor,
- pc, !uncond);
+ LLBPBranchInfo *bi = new LLBPBranchInfo(pc, !uncond);
+ base->branchPlaceholder(tid, pc, uncond, bi->ltage_bi);
bpHistory = (void*)(bi);
}
-void
-LLBP::lTageUpdate(ThreadID tid, Addr pc, bool taken, LTageBranchInfo * bi,
- bool squashed, const StaticInstPtr & inst, Addr target)
+Prediction
+LLBP::lookup(ThreadID tid, Addr pc, void *&bp_history)
{
- assert(bi);
-
- if (squashed) {
- if (tage->isSpeculativeUpdateEnabled()) {
- // This restores the global history, then update it
- // and recomputes the folded histories.
- tage->squash(tid, taken, target, inst, bi->tageBranchInfo);
-
- if (bi->tageBranchInfo->condBranch) {
- loopPredictor->squashLoop(bi->lpBranchInfo);
- }
- }
- return;
- }
-
- int nrand = rng->random() & 3;
- if (bi->tageBranchInfo->condBranch) {
- DPRINTF(LLBP, "Updating tables for branch:%lx; taken?:%d\n",
- pc, taken);
- tage->updateStats(taken, bi->tageBranchInfo);
-
- loopPredictor->updateStats(taken, bi->lpBranchInfo);
-
- loopPredictor->condBranchUpdate(tid, pc, taken,
- bi->tageBranchInfo->tagePred, bi->lpBranchInfo, instShiftAmt);
-
- tage->condBranchUpdate(tid, pc, taken, bi->tageBranchInfo,
- nrand, target, bi->lpBranchInfo->predTaken);
- }
-
- tage->updateHistories(tid, pc, false, taken, target,
- inst, bi->tageBranchInfo);
-
+ Prediction retval = predict(tid, pc, true, bp_history);
+ return retval;
}
Prediction
@@ -179,15 +137,17 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
{
Addr pc = branch_pc;
- LLBPBranchInfo *bi = new LLBPBranchInfo(*tage, *loopPredictor, pc, cond_branch);
- LTageBranchInfo *ltage_bi = static_cast<LTageBranchInfo*>(bi);
- b = (void*)(bi);
+ LLBPBranchInfo *bi = new LLBPBranchInfo(pc, cond_branch);
- Prediction ltage_prediction = lTagePredict(
- tid, branch_pc, cond_branch, ltage_bi
+ Prediction ltage_prediction = base->predict(
+ tid, branch_pc, cond_branch, bi->ltage_bi
);
- auto tage_bi = bi->tageBranchInfo;
+ b = (void*)(bi);
+
+ LTAGE::LTageBranchInfo *ltage_bi = static_cast<LTAGE::LTageBranchInfo*>(bi->ltage_bi);
+
+ auto tage_bi = ltage_bi->tageBranchInfo;
bi->overridden = false;
bi->base_pred = ltage_prediction.taken;
@@ -195,8 +155,7 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
if (cond_branch)
{
- rcr.backup(bi->rcrBackup);
- int tage_bank = tage->nHistoryTables;
+ int tage_bank = base->getNumHistoryTables();
if (tage_bi->provider == TAGEBase::TAGE_LONGEST_MATCH)
tage_bank = tage_bi->hitBank;
if (tage_bi->provider == TAGEBase::TAGE_ALT_MATCH)
@@ -210,7 +169,7 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
int i = findBestPattern(tid, pc, backingStorage[ccid]);
if (i >= 0)
{
- int key = tage->gtag(tid, pc, i);
+ int key = base->gtag(tid, pc, i);
++stats.demandHitsTotal;
llbp_confidence = backingStorage[ccid].patterns[key].counter;
bool llbp_prediction = llbp_confidence >= 0;
@@ -238,37 +197,57 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
++stats.demandMissesTotal;
}
}
+ DPRINTF(LLBP, "LLBP: Final Prediction @ %lld for %lx is %d, "
+ "confidence=%d, overridden=%s\n",
+ bi,
+ branch_pc, bi->getPrediction(), llbp_confidence,
+ bi->overridden ? "true" : "false");
Cycles latency = ltage_prediction.latency;
return Prediction {.taken = bi->getPrediction(), .latency = latency};
}
-
-//prediction
-Prediction
-LLBP::lTagePredict(ThreadID tid, Addr branch_pc, bool cond_branch, LTageBranchInfo* bi)
+void
+LLBP::updateHistories(
+ ThreadID tid, Addr pc, bool uncond,
+ bool taken, Addr target,
+ const StaticInstPtr &inst,
+ void *&bp_history)
{
- bool pred_taken = tage->tagePredict(tid, branch_pc, cond_branch,
- bi->tageBranchInfo);
-
- pred_taken = loopPredictor->loopPredict(tid, branch_pc, cond_branch,
- bi->lpBranchInfo, pred_taken,
- instShiftAmt);
- if (cond_branch) {
- if (bi->lpBranchInfo->loopPredUsed) {
- bi->tageBranchInfo->provider = LOOP;
+ LLBPBranchInfo *bi;
+
+ rcr.update(pc, inst, taken);
+
+ if (bp_history == nullptr) {
+ assert(uncond);
+ bi = new LLBPBranchInfo(pc, !uncond);
+ bp_history = (void*)(bi);
+
+ // Insert the next prefetch context into the pattern buffer
+ if (backingStorage.count(rcr.getPCID()))
+ {
+ uint64_t pcid = rcr.getPCID();
+ if (patternBuffer.count(rcr.getPCID()) == 0) {
+ ++stats.prefetchesIssued;
+ patternBuffer[pcid] = curCycle();
+
+ patternBufferQueue.push_back(pcid);
+ if (patternBufferQueue.size() >= patternBufferCapacity)
+ {
+ patternBuffer.erase(patternBufferQueue.front());
+ patternBufferQueue.pop_front();
+ ++stats.patternBufferEvictions;
+ }
+ }
}
- DPRINTF(LLBP, "Predict for %lx: taken?:%d, loopTaken?:%d, "
- "loopValid?:%d, loopUseCounter:%d, tagePred:%d, altPred:%d\n",
- branch_pc, pred_taken, bi->lpBranchInfo->loopPred,
- bi->lpBranchInfo->loopPredValid,
- loopPredictor->getLoopUseCounter(),
- bi->tageBranchInfo->tagePred, bi->tageBranchInfo->altTaken);
+ } else {
+ bi = static_cast<LLBPBranchInfo*>(bp_history);
+ storageUpdate(
+ tid, pc, rcr.getCCID(), taken, bi
+ );
}
- // record final prediction
- bi->lpBranchInfo->predTaken = pred_taken;
-
- return staticPrediction(pred_taken);
+ rcr.backup(bi->rcrBackup);
+ base->updateHistories(tid, pc, uncond, taken, target, inst, bi->ltage_bi);
}
@@ -303,7 +282,7 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken,
int i = findBestPattern(tid, pc, context);
if (i >= 0)
{
- int key = tage->gtag(tid, pc, i);
+ int key = base->gtag(tid, pc, i);
TAGEBase::ctrUpdate(context.patterns[key].counter, taken,
ptnCounterBits);
if (context.patterns[key].counter == (taken ? 1 : -2))
@@ -332,9 +311,9 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken,
}
bi->avenged = true;
}
- if (i < tage->nHistoryTables - 1) {
- int choosei = std::min(i + 1, (int) tage->nHistoryTables - 1);
- context.patterns[tage->gtag(tid, pc, choosei)].counter = taken ? 0 : -1;
+ if (i < base->getNumHistoryTables() - 1) {
+ int choosei = std::min(i + 1, (int) base->getNumHistoryTables() - 1);
+ context.patterns[base->gtag(tid, pc, choosei)].counter = taken ? 0 : -1;
}
} else {
++stats.correctOverridesTotal;
@@ -357,9 +336,9 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken,
backingStorage[cid] = LLBP::Context();
// Fill the context with initial patterns
// (only this branch, in different history lengths)
- for (int i = 1; i <= tage->nHistoryTables; i++)
+ for (int i = 1; i <= base->getNumHistoryTables(); i++)
{
- backingStorage[cid].patterns[tage->gtag(tid, pc, i)].counter = taken ? 0 : -1;
+ backingStorage[cid].patterns[base->gtag(tid, pc, i)].counter = taken ? 0 : -1;
}
}
}
@@ -376,9 +355,9 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken,
*/
int LLBP::findBestPattern(ThreadID tid, Addr pc, Context &ctx)
{
- for (int i = tage->nHistoryTables; i > 0; i--)
+ for (int i = base->getNumHistoryTables(); i > 0; i--)
{
- int key = tage->gtag(tid, pc, i);
+ int key = base->gtag(tid, pc, i);
if (ctx.patterns.count(key))
{
return i;
@@ -398,7 +377,7 @@ int LLBP::findBestPattern(ThreadID tid, Addr pc, Context &ctx)
int LLBP::findVictimPattern(int min, Context &ctx)
{
int min_conf = 1 << (ptnCounterBits - 2);
- for (int i = (min >= 0 ? min + 1 : 1); i <= tage->nHistoryTables; i++)
+ for (int i = (min >= 0 ? min + 1 : 1); i <= base->getNumHistoryTables(); i++)
{
if (absPredCounter(ctx.patterns[i].counter) < min_conf)
{
@@ -461,20 +440,19 @@ LLBP::RCR::RCR(int _T, int _W, int _D, int _shift, int _CTWidth)
* @return final hash value % 2^tagWidthBits
*/
uint64_t
-LLBP::RCR::calcHash(std::list<Addr> &vec, int n,
- int skip, int shift)
+LLBP::RCR::calcHash(int n, int skip, int shift)
{
uint64_t hash = 0;
- if (vec.size() < (skip + n))
+ if (bb.size() < (skip + n))
{
return 0;
}
// Compute the rolling hash in element order (newer branches at the front)
uint64_t sh = 0;
- auto it = vec.begin();
+ auto it = bb.begin();
std::advance(it, skip);
- for (; (it != vec.end()) && (n > 0); it++, n--)
+ for (; (it != bb.end()) && (n > 0); it++, n--)
{
uint64_t val = *it;
@@ -537,9 +515,10 @@ bool LLBP::RCR::update(Addr pc, const StaticInstPtr &inst, bool taken)
bb.pop_back();
// The current context.
- ctxs.ccid = calcHash(bb, W, D, S);
+ ctxs.ccid = calcHash(W, D, S);
// The prefetch context.
- ctxs.pcid = calcHash(bb, W, 0, S);
+ ctxs.pcid = calcHash(W, 0, S);
+
return true;
}
@@ -547,21 +526,19 @@ bool LLBP::RCR::update(Addr pc, const StaticInstPtr &inst, bool taken)
return false;
}
-void LLBP::RCR::backup(std::list<Addr> &vec)
+void LLBP::RCR::backup(std::list<Addr>& vec)
{
vec.clear();
- int count = 0;
- for (auto it = bb.begin(); it != bb.end(); ++it)
+ int count = D + W;
+ for (auto it = bb.begin();
+ (it != bb.end()) && (count > 0);
+ ++it, --count)
{
- if (count == D + W - 1) {
- break;
- }
vec.push_back(*it);
- count++;
}
}
-void LLBP::RCR::restore(std::list<Addr> &vec)
+void LLBP::RCR::restore(std::list<Addr>& vec)
{
bb.clear();
for (auto it = vec.begin(); it != vec.end(); ++it)
@@ -569,9 +546,9 @@ void LLBP::RCR::restore(std::list &vec)
bb.push_back(*it);
}
// The current context.
- ctxs.ccid = calcHash(bb, W, D, S);
+ ctxs.ccid = calcHash(W, D, S);
// The prefetch context.
- ctxs.pcid = calcHash(bb, W, 0, S);
+ ctxs.pcid = calcHash(W, 0, S);
}
LLBP::LLBPStats::LLBPStats(LLBP *llbp)
diff --git a/src/cpu/pred/llbp.hh b/src/cpu/pred/llbp.hh
index 43681d86b2d..31622dbc4f4 100644
--- a/src/cpu/pred/llbp.hh
+++ b/src/cpu/pred/llbp.hh
@@ -48,11 +48,13 @@ namespace gem5
namespace branch_prediction
{
-class LLBP : public LTAGE
+class LLBP : public ConditionalPredictor
{
public:
LLBP(const LLBPParams ¶ms);
+ Prediction lookup(ThreadID tid, Addr pc, void * &bp_history) override;
+
void squash(ThreadID tid, void * &bp_history) override;
void update(ThreadID tid, Addr pc, bool taken,
void * &bp_history, bool squashed,
@@ -62,42 +64,45 @@ class LLBP : public LTAGE
void branchPlaceholder(ThreadID tid, Addr pc,
bool uncond, void * &bpHistory) override;
- protected:
+ void updateHistories(ThreadID tid, Addr pc, bool uncond,
+ bool taken, Addr target,
+ const StaticInstPtr &inst,
+ void * &bp_history) override;
+ protected:
+ LTAGE* base;
+
Cycles calculateRemainingLatency(Cycles insertTime);
- struct LLBPBranchInfo : public LTageBranchInfo
+ Prediction predict(ThreadID tid, Addr pc,
+ bool cond_branch, void * &bp_history);
+
+ struct LLBPBranchInfo
{
bool overridden;
bool llbp_pred;
bool base_pred;
bool avenged;
+ Addr pc;
+ bool conditional;
std::list rcrBackup;
-
- LLBPBranchInfo(TAGEBase &tage, LoopPredictor &lp,
- Addr pc, bool conditional)
- : LTageBranchInfo(tage, lp, pc, conditional),
- overridden(false),
- avenged(false)
+ void* ltage_bi;
+
+ LLBPBranchInfo(Addr pc, bool conditional)
+ : pc(pc),
+ conditional(conditional),
+ ltage_bi(nullptr)
{}
bool getPrediction() {
return overridden ? llbp_pred : base_pred;
}
- virtual ~LLBPBranchInfo()
+ ~LLBPBranchInfo()
{}
};
- Prediction predict(
- ThreadID tid, Addr branch_pc, bool cond_branch, void* &b) override;
-
- Prediction lTagePredict(
- ThreadID tid, Addr branch_pc, bool cond_branch, LTageBranchInfo* b);
-
- void lTageUpdate(ThreadID tid, Addr pc, bool taken, LTageBranchInfo * bi,
- bool squashed, const StaticInstPtr & inst, Addr target);
struct Pattern
{
//* hysteresis counter: > 0 = taken, < 0 = not taken
@@ -163,9 +168,10 @@ class LLBP : public LTAGE
class RCR
{
+ public:
const int maxwindow = 120;
- uint64_t calcHash(std::list &vec, int n,
+ uint64_t calcHash(int n,
int start=0, int shift=0);
// The context tag width
@@ -182,7 +188,6 @@ class LLBP : public LTAGE
uint64_t pcid = 0;
} ctxs;
- public:
// The hash constants
const int T, W, D, S;
diff --git a/src/cpu/pred/ltage.hh b/src/cpu/pred/ltage.hh
index 070fa1f4001..2da30905b15 100644
--- a/src/cpu/pred/ltage.hh
+++ b/src/cpu/pred/ltage.hh
@@ -90,7 +90,6 @@ class LTAGE : public TAGE
void init() override;
- protected:
/** The loop predictor object */
LoopPredictor *loopPredictor;
diff --git a/src/cpu/pred/tage.hh b/src/cpu/pred/tage.hh
index 28754f4bb84..cf0b5dfd6d4 100644
--- a/src/cpu/pred/tage.hh
+++ b/src/cpu/pred/tage.hh
@@ -113,6 +113,15 @@ class TAGE: public ConditionalPredictor
virtual void squash(ThreadID tid, void * &bp_history) override;
virtual void branchPlaceholder(ThreadID tid, Addr pc,
bool uncond, void * &bp_history) override;
+
+ unsigned int getNumHistoryTables() const
+ {
+ return tage->nHistoryTables;
+ }
+
+ uint16_t gtag(ThreadID tid, Addr pc, int bank) const {
+ return tage->gtag(tid, pc, bank);
+ }
};
} // namespace branch_prediction
From 4380c679475000ce5f72030b5775644c7ac4c7df Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Wed, 14 May 2025 16:11:26 +0000
Subject: [PATCH 08/22] cpu: llbp correct gtags
---
src/cpu/pred/llbp.cc | 74 +++++++++++++++++++++++---------------------
src/cpu/pred/llbp.hh | 9 ++++--
2 files changed, 45 insertions(+), 38 deletions(-)
diff --git a/src/cpu/pred/llbp.cc b/src/cpu/pred/llbp.cc
index 2450618f5ef..b3ae963685c 100644
--- a/src/cpu/pred/llbp.cc
+++ b/src/cpu/pred/llbp.cc
@@ -57,6 +57,7 @@ LLBP::LLBP(const LLBPParams ¶ms)
void
LLBP::init()
{
+ base->init();
}
void
@@ -68,7 +69,6 @@ LLBP::squash(ThreadID tid, void *&bp_history)
stats.squashedOverrides++;
}
}
- // TODO: No squash in LLBP?
base->squash(tid, bi->ltage_bi);
delete bi;
bp_history = nullptr;
@@ -82,14 +82,18 @@ LLBP::update(ThreadID tid, Addr pc, bool taken,
assert(bp_history);
LLBPBranchInfo *bi = static_cast(bp_history);
- if (squashed && !bi->avenged && bi->rcrBackup.size()) {
- rcr.restore(bi->rcrBackup);
+ if (squashed) {
+ assert(!bi->avenged);
+ if (bi->rcrBackup.size()) {
+ rcr.restore(bi->rcrBackup);
+ }
+ base->update(tid, pc, taken, bi->ltage_bi, squashed, inst, target);
+ return;
}
if (inst->isCondCtrl())
storageUpdate(tid, pc, rcr.getCCID(), taken, bi);
- base->update(tid, pc, taken, bi->ltage_bi, squashed, inst, target);
std::string rcr_cont = "";
@@ -110,10 +114,10 @@ LLBP::update(ThreadID tid, Addr pc, bool taken,
rcr_cont
);
- if (!squashed) {
- delete bi;
- bp_history = nullptr;
- }
+ base->update(tid, pc, taken, bi->ltage_bi, squashed, inst, target);
+
+ delete bi;
+ bp_history = nullptr;
}
void
@@ -167,9 +171,10 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
Cycles additionalLatency = calculateRemainingLatency(patternBuffer[ccid]);
if (additionalLatency == 0) {
int i = findBestPattern(tid, pc, backingStorage[ccid]);
+ bi->index = i;
if (i >= 0)
{
- int key = base->gtag(tid, pc, i);
+ int key = tage_bi->tableTags[i];
++stats.demandHitsTotal;
llbp_confidence = backingStorage[ccid].patterns[key].counter;
bool llbp_prediction = llbp_confidence >= 0;
@@ -241,9 +246,6 @@ LLBP::updateHistories(
}
} else {
bi = static_cast(bp_history);
- storageUpdate(
- tid, pc, rcr.getCCID(), taken, bi
- );
}
rcr.backup(bi->rcrBackup);
@@ -267,22 +269,26 @@ int8_t LLBP::absPredCounter(int8_t counter)
* @param pc Program counter
* @param cid Context ID
* @param taken Whether the branch was taken
+ * @param speculative Whether the update is speculative
* @param bi Branch info
*/
-void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken,
- LLBPBranchInfo *bi)
+void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken, LLBPBranchInfo *bi)
{
+ LTAGE::LTageBranchInfo *ltage_bi =
+ static_cast(bi->ltage_bi);
+
+ auto tage_bi = ltage_bi->tageBranchInfo;
+
// Check whether the current context is known
// If not, we create a new context
if (backingStorage.count(cid))
{
auto context = backingStorage[cid];
- // The best pattern is always the one that was used to predict
- // the branch. Both the pattern and the context are updated.
- int i = findBestPattern(tid, pc, context);
+
+ int i = bi->index;
if (i >= 0)
{
- int key = base->gtag(tid, pc, i);
+ int key = tage_bi->tableTags[i];
TAGEBase::ctrUpdate(context.patterns[key].counter, taken,
ptnCounterBits);
if (context.patterns[key].counter == (taken ? 1 : -2))
@@ -302,24 +308,22 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken,
// If a misprediction occurs, we allocate a new pattern with longer history
// in the context. The pattern with the weakest confidence is replaced.
// This only happens if the LLBP was the provider.
- if (bi->overridden) {
- if (bi->getPrediction() != taken) {
- if (!bi->avenged) {
- ++stats.wrongOverridesTotal;
- if (bi->llbp_pred == bi->base_pred) {
- ++stats.wrongOverridesIdentical;
- }
- bi->avenged = true;
- }
- if (i < base->getNumHistoryTables() - 1) {
- int choosei = std::min(i + 1, (int) base->getNumHistoryTables() - 1);
- context.patterns[base->gtag(tid, pc, choosei)].counter = taken ? 0 : -1;
- }
- } else {
- ++stats.correctOverridesTotal;
+ if (bi->getPrediction() != taken) {
+ if (!bi->avenged && bi->overridden) {
+ ++stats.wrongOverridesTotal;
if (bi->llbp_pred == bi->base_pred) {
- ++stats.correctOverridesIdentical;
+ ++stats.wrongOverridesIdentical;
}
+ bi->avenged = true;
+ }
+ if (i < base->getNumHistoryTables() - 1) {
+ int choosei = std::min(i + 1, (int) base->getNumHistoryTables() - 1);
+ context.patterns[tage_bi->tableTags[choosei]].counter = taken ? 0 : -1;
+ }
+ } else if (bi->overridden) {
+ ++stats.correctOverridesTotal;
+ if (bi->llbp_pred == bi->base_pred) {
+ ++stats.correctOverridesIdentical;
}
}
}
@@ -338,7 +342,7 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken,
// (only this branch, in different history lengths)
for (int i = 1; i <= base->getNumHistoryTables(); i++)
{
- backingStorage[cid].patterns[base->gtag(tid, pc, i)].counter = taken ? 0 : -1;
+ backingStorage[cid].patterns[tage_bi->tableTags[i]].counter = taken ? 0 : -1;
}
}
}
diff --git a/src/cpu/pred/llbp.hh b/src/cpu/pred/llbp.hh
index 31622dbc4f4..59aca776a3b 100644
--- a/src/cpu/pred/llbp.hh
+++ b/src/cpu/pred/llbp.hh
@@ -84,12 +84,16 @@ class LLBP : public ConditionalPredictor
bool base_pred;
bool avenged;
Addr pc;
+ int index;
bool conditional;
std::list rcrBackup;
void* ltage_bi;
LLBPBranchInfo(Addr pc, bool conditional)
- : pc(pc),
+ : overridden(false),
+ avenged(false),
+ pc(pc),
+ index(-1),
conditional(conditional),
ltage_bi(nullptr)
{}
@@ -134,8 +138,7 @@ class LLBP : public ConditionalPredictor
Cycles backingStorageLatency;
int8_t absPredCounter(int8_t counter);
- void storageUpdate(
- ThreadID tid, Addr pc, uint64_t cid, bool taken, LLBPBranchInfo* bi);
+ void storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken, LLBPBranchInfo* bi);
void storageInvalidate();
int findBestPattern(ThreadID tid, Addr pc, Context& ctx);
int findVictimPattern(int min, Context& ctx);
From 2ab97bfd27882a50affb8d952ced2d2986ae4b21 Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Wed, 21 May 2025 10:39:21 +0000
Subject: [PATCH 09/22] cpu: llbp fix update + tage base stats
---
src/cpu/pred/llbp.cc | 100 ++++++++++++++++++++++++--------------
src/cpu/pred/llbp.hh | 15 +++++-
src/cpu/pred/tage_base.cc | 4 ++
src/cpu/pred/tage_base.hh | 1 +
4 files changed, 82 insertions(+), 38 deletions(-)
diff --git a/src/cpu/pred/llbp.cc b/src/cpu/pred/llbp.cc
index b3ae963685c..902ded75221 100644
--- a/src/cpu/pred/llbp.cc
+++ b/src/cpu/pred/llbp.cc
@@ -76,23 +76,23 @@ LLBP::squash(ThreadID tid, void *&bp_history)
void
LLBP::update(ThreadID tid, Addr pc, bool taken,
- void *&bp_history, bool squashed,
+ void *&bp_history, bool resteer,
const StaticInstPtr &inst, Addr target)
{
assert(bp_history);
LLBPBranchInfo *bi = static_cast(bp_history);
- if (squashed) {
+ if (resteer) {
assert(!bi->avenged);
if (bi->rcrBackup.size()) {
rcr.restore(bi->rcrBackup);
}
- base->update(tid, pc, taken, bi->ltage_bi, squashed, inst, target);
+ base->update(tid, pc, taken, bi->ltage_bi, resteer, inst, target);
return;
}
if (inst->isCondCtrl())
- storageUpdate(tid, pc, rcr.getCCID(), taken, bi);
+ storageUpdate(tid, pc, taken, bi);
std::string rcr_cont = "";
@@ -114,7 +114,7 @@ LLBP::update(ThreadID tid, Addr pc, bool taken,
rcr_cont
);
- base->update(tid, pc, taken, bi->ltage_bi, squashed, inst, target);
+ base->update(tid, pc, taken, bi->ltage_bi, resteer, inst, target);
delete bi;
bp_history = nullptr;
@@ -159,28 +159,42 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
if (cond_branch)
{
- int tage_bank = base->getNumHistoryTables();
+ int tage_bank = 0;
if (tage_bi->provider == TAGEBase::TAGE_LONGEST_MATCH)
tage_bank = tage_bi->hitBank;
if (tage_bi->provider == TAGEBase::TAGE_ALT_MATCH)
tage_bank = tage_bi->altBank;
+ if (tage_bank)
+ ++stats.baseHitsTotal;
+
auto ccid = rcr.getCCID();
+ bi->index = tage_bank;
+ bi->cid = ccid;
if (backingStorage.count(ccid))
{
if (patternBuffer.count(ccid)) {
Cycles additionalLatency = calculateRemainingLatency(patternBuffer[ccid]);
if (additionalLatency == 0) {
- int i = findBestPattern(tid, pc, backingStorage[ccid]);
- bi->index = i;
- if (i >= 0)
+ int i = findBestPattern(backingStorage[ccid], tage_bi);
+ if (i > 0)
{
- int key = tage_bi->tableTags[i];
+ int key = calculateTag(tage_bi->tableTags[i], i);
+ auto &pattern = backingStorage[ccid].patterns[key];
+
+ int revisits = pattern.visited;
+ if (revisits < stats.revisits.size() - 1) {
+ --stats.revisits[revisits];
+ ++stats.revisits[revisits + 1];
+ ++pattern.visited;
+ }
+
++stats.demandHitsTotal;
- llbp_confidence = backingStorage[ccid].patterns[key].counter;
+ llbp_confidence = pattern.counter;
bool llbp_prediction = llbp_confidence >= 0;
- if (i > tage_bank)
+ if (i >= tage_bank)
{
++stats.demandHitsOverride;
+ bi->index = i;
bi->overridden = true;
bi->llbp_pred = llbp_prediction;
} else {
@@ -267,34 +281,38 @@ int8_t LLBP::absPredCounter(int8_t counter)
*
* @param tid Thread ID
* @param pc Program counter
- * @param cid Context ID
* @param taken Whether the branch was taken
- * @param speculative Whether the update is speculative
* @param bi Branch info
*/
-void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken, LLBPBranchInfo *bi)
+void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
{
LTAGE::LTageBranchInfo *ltage_bi =
static_cast(bi->ltage_bi);
-
+
+ uint64_t cid = bi->cid;
+
auto tage_bi = ltage_bi->tageBranchInfo;
- // Check whether the current context is known
+ // Check whether the branch context is known
// If not, we create a new context
if (backingStorage.count(cid))
{
- auto context = backingStorage[cid];
+ LLBP::Context& context = backingStorage[cid];
int i = bi->index;
- if (i >= 0)
+ if (i > 0 && bi->overridden)
{
- int key = tage_bi->tableTags[i];
+ int key = calculateTag(tage_bi->tableTags[i], i);
+ int8_t conf_before = context.patterns[key].counter;
TAGEBase::ctrUpdate(context.patterns[key].counter, taken,
ptnCounterBits);
+ int8_t conf_after = context.patterns[key].counter;
+ DPRINTF(LLBP, "LLBP: Storage C %llu T %lld: %d -> %d (%s)\n",
+ cid, key, conf_before, conf_after, taken ? "taken" : "not taken");
if (context.patterns[key].counter == (taken ? 1 : -2))
{
// Context is now medium confidence
- TAGEBase::unsignedCtrUpdate(backingStorage[cid].confidence, true,
+ TAGEBase::unsignedCtrUpdate(context.confidence, true,
ctxCounterBits);
}
else if (context.patterns[key].counter == (taken ? -1 : 0))
@@ -307,7 +325,6 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken, LLBPBr
// If a misprediction occurs, we allocate a new pattern with longer history
// in the context. The pattern with the weakest confidence is replaced.
- // This only happens if the LLBP was the provider.
if (bi->getPrediction() != taken) {
if (!bi->avenged && bi->overridden) {
++stats.wrongOverridesTotal;
@@ -316,9 +333,11 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken, LLBPBr
}
bi->avenged = true;
}
- if (i < base->getNumHistoryTables() - 1) {
- int choosei = std::min(i + 1, (int) base->getNumHistoryTables() - 1);
- context.patterns[tage_bi->tableTags[choosei]].counter = taken ? 0 : -1;
+ if (i < base->getNumHistoryTables()) {
+ ++stats.allocationsTotal;
+ ++stats.revisits[0];
+ int key = calculateTag(tage_bi->tableTags[i+1], i+1);
+ context.patterns[key].counter = taken ? 0 : -1;
}
} else if (bi->overridden) {
++stats.correctOverridesTotal;
@@ -337,13 +356,14 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken, LLBPBr
}
++stats.backingStorageInsertions;
- backingStorage[cid] = LLBP::Context();
- // Fill the context with initial patterns
- // (only this branch, in different history lengths)
- for (int i = 1; i <= base->getNumHistoryTables(); i++)
- {
- backingStorage[cid].patterns[tage_bi->tableTags[i]].counter = taken ? 0 : -1;
- }
+ int tage_bank = 1;
+ if (tage_bi->provider == TAGEBase::TAGE_LONGEST_MATCH)
+ tage_bank = tage_bi->hitBank;
+ if (tage_bi->provider == TAGEBase::TAGE_ALT_MATCH)
+ tage_bank = tage_bi->altBank;
+ int key = calculateTag(tage_bi->tableTags[tage_bank], tage_bank);
+ backingStorage[cid].patterns[key].counter = taken ? 0 : -1;
+ ++stats.revisits[0];
}
}
@@ -357,11 +377,11 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken, LLBPBr
* @param ctx Context to search in
* @return Index of the best pattern, or -1 if not found
*/
-int LLBP::findBestPattern(ThreadID tid, Addr pc, Context &ctx)
+int LLBP::findBestPattern(Context &ctx, TAGEBase::BranchInfo *bi)
{
for (int i = base->getNumHistoryTables(); i > 0; i--)
{
- int key = base->gtag(tid, pc, i);
+ int key = calculateTag(bi->tableTags[i], i);
if (ctx.patterns.count(key))
{
return i;
@@ -473,7 +493,7 @@ LLBP::RCR::calcHash(int n, int skip, int shift)
}
uint64_t LLBP::RCR::getCCID()
-{
+{
return moduloTwoExp(ctxs.ccid, tagWidthBits);
} // Hash of all branches
@@ -559,6 +579,8 @@ LLBP::LLBPStats::LLBPStats(LLBP *llbp)
: statistics::Group(llbp),
ADD_STAT(prefetchesIssued, statistics::units::Count::get(),
"Number of prefetches issued to the backing storage"),
+ ADD_STAT(baseHitsTotal, statistics::units::Count::get(),
+ "Total on-demand hits of the base predictor"),
ADD_STAT(demandHitsTotal, statistics::units::Count::get(),
"Total on-demand hits to the pattern buffer"),
ADD_STAT(demandHitsOverride, statistics::units::Count::get(),
@@ -573,6 +595,10 @@ LLBP::LLBPStats::LLBPStats(LLBP *llbp)
"On-demand misses to the pattern buffer where the context was not scheduled for insertion"),
ADD_STAT(demandMissesCold, statistics::units::Count::get(),
"On-demand misses to the pattern buffer where the context was not in the backing storage"),
+ ADD_STAT(allocationsTotal, statistics::units::Count::get(),
+ "Total number of new patterns allocated in any pattern set"),
+ ADD_STAT(revisits, statistics::units::Count::get(),
+ "Number of times a pattern was revisited"),
ADD_STAT(patternBufferEvictions, statistics::units::Count::get(),
"Number of pattern sets evicted from the pattern buffer due to capacity limits"),
ADD_STAT(backingStorageEvictions, statistics::units::Count::get(),
@@ -589,7 +615,9 @@ LLBP::LLBPStats::LLBPStats(LLBP *llbp)
"Number of branches predicted wrong by LLBP, but the base predictor would also be wrong"),
ADD_STAT(squashedOverrides, statistics::units::Count::get(),
"Number of branches predicted by LLBP, but squashed before the outcome was known")
- {}
+ {
+ revisits.init(10).flags(statistics::total);
+ }
} // namespace branch_prediction
} // namespace gem5
diff --git a/src/cpu/pred/llbp.hh b/src/cpu/pred/llbp.hh
index 59aca776a3b..dfad450b309 100644
--- a/src/cpu/pred/llbp.hh
+++ b/src/cpu/pred/llbp.hh
@@ -87,6 +87,7 @@ class LLBP : public ConditionalPredictor
int index;
bool conditional;
std::list rcrBackup;
+ uint64_t cid;
void* ltage_bi;
LLBPBranchInfo(Addr pc, bool conditional)
@@ -95,6 +96,7 @@ class LLBP : public ConditionalPredictor
pc(pc),
index(-1),
conditional(conditional),
+ cid(-1),
ltage_bi(nullptr)
{}
@@ -111,6 +113,7 @@ class LLBP : public ConditionalPredictor
{
//* hysteresis counter: > 0 = taken, < 0 = not taken
int8_t counter;
+ int visited = 0;
};
struct Context
@@ -135,12 +138,17 @@ class LLBP : public ConditionalPredictor
int ctxCounterBits;
int ptnCounterBits;
+ int calculateTag(int tageTag, int tageBank) const
+ {
+ return (tageTag << 4) + tageBank;
+ }
+
Cycles backingStorageLatency;
int8_t absPredCounter(int8_t counter);
- void storageUpdate(ThreadID tid, Addr pc, uint64_t cid, bool taken, LLBPBranchInfo* bi);
+ void storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo* bi);
void storageInvalidate();
- int findBestPattern(ThreadID tid, Addr pc, Context& ctx);
+ int findBestPattern(Context &ctx, TAGEBase::BranchInfo *bi);
int findVictimPattern(int min, Context& ctx);
uint64_t findVictimContext();
@@ -149,6 +157,7 @@ class LLBP : public ConditionalPredictor
LLBPStats(LLBP *llbp);
statistics::Scalar prefetchesIssued;
+ statistics::Scalar baseHitsTotal;
statistics::Scalar demandHitsTotal;
statistics::Scalar demandHitsOverride;
statistics::Scalar demandHitsNoOverride;
@@ -156,6 +165,8 @@ class LLBP : public ConditionalPredictor
statistics::Scalar demandMissesNoPattern;
statistics::Scalar demandMissesNoPrefetch;
statistics::Scalar demandMissesCold;
+ statistics::Scalar allocationsTotal;
+ statistics::Vector revisits;
statistics::Scalar patternBufferEvictions;
statistics::Scalar backingStorageEvictions;
statistics::Scalar backingStorageInsertions;
diff --git a/src/cpu/pred/tage_base.cc b/src/cpu/pred/tage_base.cc
index ed83a48fb31..e9da9997d8e 100644
--- a/src/cpu/pred/tage_base.cc
+++ b/src/cpu/pred/tage_base.cc
@@ -497,6 +497,8 @@ TAGEBase::allocateEntry(int idx, BranchInfo* bi, bool taken)
{
if (gtable[idx][bi->tableIndices[idx]].u != 0)
return false;
+
+ ++stats.allocationsTotal;
gtable[idx][bi->tableIndices[idx]].tag = bi->tableTags[idx];
gtable[idx][bi->tableIndices[idx]].ctr = (taken) ? 0 : -1;
@@ -825,6 +827,8 @@ TAGEBase::getGHR(ThreadID tid) const
TAGEBase::TAGEBaseStats::TAGEBaseStats(
statistics::Group *parent, unsigned nHistoryTables)
: statistics::Group(parent),
+ ADD_STAT(allocationsTotal, statistics::units::Count::get(),
+ "Number of times TAGE inserted a new pattern"),
ADD_STAT(longestMatchProviderCorrect, statistics::units::Count::get(),
"Number of times TAGE Longest Match is the provider and the "
"prediction is correct"),
diff --git a/src/cpu/pred/tage_base.hh b/src/cpu/pred/tage_base.hh
index d2b04c8b9af..26bb22f4185 100644
--- a/src/cpu/pred/tage_base.hh
+++ b/src/cpu/pred/tage_base.hh
@@ -563,6 +563,7 @@ class TAGEBase : public SimObject
{
TAGEBaseStats(statistics::Group *parent, unsigned nHistoryTables);
// stats
+ statistics::Scalar allocationsTotal;
statistics::Scalar longestMatchProviderCorrect;
statistics::Scalar altMatchProviderCorrect;
statistics::Scalar bimodalAltMatchProviderCorrect;
From a547f2efbdd487b49db3fbb1e7fea3b6ed1b6e2f Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Thu, 22 May 2025 10:38:43 +0000
Subject: [PATCH 10/22] cpu: llbp fix keys and tags
---
src/cpu/pred/llbp.cc | 93 ++++++++++++++++++++-------------------
src/cpu/pred/llbp.hh | 19 ++++----
src/cpu/pred/tage_base.cc | 2 +-
3 files changed, 59 insertions(+), 55 deletions(-)
diff --git a/src/cpu/pred/llbp.cc b/src/cpu/pred/llbp.cc
index 902ded75221..12e90a36a3c 100644
--- a/src/cpu/pred/llbp.cc
+++ b/src/cpu/pred/llbp.cc
@@ -65,9 +65,7 @@ LLBP::squash(ThreadID tid, void *&bp_history)
{
LLBPBranchInfo *bi = static_cast(bp_history);
if (bi->overridden) {
- if (!bi->avenged) {
- stats.squashedOverrides++;
- }
+ stats.squashedOverrides++;
}
base->squash(tid, bi->ltage_bi);
delete bi;
@@ -83,15 +81,14 @@ LLBP::update(ThreadID tid, Addr pc, bool taken,
LLBPBranchInfo *bi = static_cast(bp_history);
if (resteer) {
- assert(!bi->avenged);
if (bi->rcrBackup.size()) {
rcr.restore(bi->rcrBackup);
}
- base->update(tid, pc, taken, bi->ltage_bi, resteer, inst, target);
+ base->update(tid, pc, taken, bi->ltage_bi, resteer, inst, target);
return;
}
- if (inst->isCondCtrl())
+ if (inst->isCondCtrl())
storageUpdate(tid, pc, taken, bi);
@@ -99,7 +96,7 @@ LLBP::update(ThreadID tid, Addr pc, bool taken,
for (auto v: rcr.bb)
rcr_cont.append(std::to_string(v) + " | ");
-
+
DPRINTF(LLBP, "CUPDATE @ %lld [thread %d] Updated %s on %lld (%s): ccid=%llu, "
"uncond=%s, sz=%d, RCR: %s\n",
@@ -140,21 +137,23 @@ Prediction
LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
{
Addr pc = branch_pc;
-
+
LLBPBranchInfo *bi = new LLBPBranchInfo(pc, cond_branch);
Prediction ltage_prediction = base->predict(
tid, branch_pc, cond_branch, bi->ltage_bi
);
-
+
b = (void*)(bi);
LTAGE::LTageBranchInfo *ltage_bi = static_cast(bi->ltage_bi);
-
+
auto tage_bi = ltage_bi->tageBranchInfo;
bi->overridden = false;
bi->base_pred = ltage_prediction.taken;
+
+
int8_t llbp_confidence = 0;
if (cond_branch)
@@ -165,7 +164,7 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
if (tage_bi->provider == TAGEBase::TAGE_ALT_MATCH)
tage_bank = tage_bi->altBank;
if (tage_bank)
- ++stats.baseHitsTotal;
+ ++stats.baseHitsTotal;
auto ccid = rcr.getCCID();
bi->index = tage_bank;
@@ -175,10 +174,10 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
if (patternBuffer.count(ccid)) {
Cycles additionalLatency = calculateRemainingLatency(patternBuffer[ccid]);
if (additionalLatency == 0) {
- int i = findBestPattern(backingStorage[ccid], tage_bi);
+ int i = findBestPattern(backingStorage[ccid], tage_bi, branch_pc);
if (i > 0)
{
- int key = calculateTag(tage_bi->tableTags[i], i);
+ uint64_t key = calculateTag(tage_bi->tableTags, tage_bi->tableIndices, i, branch_pc);
auto &pattern = backingStorage[ccid].patterns[key];
int revisits = pattern.visited;
@@ -225,7 +224,7 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
return Prediction {.taken = bi->getPrediction(), .latency = latency};
}
-void
+void
LLBP::updateHistories(
ThreadID tid, Addr pc, bool uncond,
bool taken, Addr target,
@@ -235,10 +234,10 @@ LLBP::updateHistories(
LLBPBranchInfo *bi;
rcr.update(pc, inst, taken);
-
+
if (bp_history == nullptr) {
assert(uncond);
- bi = new LLBPBranchInfo(pc, !uncond);
+ bi = new LLBPBranchInfo(pc, !uncond);
bp_history = (void*)(bi);
// Insert the next prefetch context into the pattern buffer
@@ -292,30 +291,32 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
uint64_t cid = bi->cid;
auto tage_bi = ltage_bi->tageBranchInfo;
-
+
// Check whether the branch context is known
// If not, we create a new context
if (backingStorage.count(cid))
{
LLBP::Context& context = backingStorage[cid];
-
- int i = bi->index;
+
+ int i = bi->index;
if (i > 0 && bi->overridden)
{
- int key = calculateTag(tage_bi->tableTags[i], i);
- int8_t conf_before = context.patterns[key].counter;
- TAGEBase::ctrUpdate(context.patterns[key].counter, taken,
- ptnCounterBits);
- int8_t conf_after = context.patterns[key].counter;
+ uint64_t key = calculateTag(tage_bi->tableTags, tage_bi->tableIndices, i, pc);
+ LLBP::Pattern& pattern = context.patterns[key];
+
+ int8_t conf_before = pattern.counter;
+ TAGEBase::ctrUpdate(pattern.counter, taken, ptnCounterBits);
+ int8_t conf_after = pattern.counter;
+
DPRINTF(LLBP, "LLBP: Storage C %llu T %lld: %d -> %d (%s)\n",
- cid, key, conf_before, conf_after, taken ? "taken" : "not taken");
- if (context.patterns[key].counter == (taken ? 1 : -2))
+ cid, key, conf_before, conf_after, taken ? "taken" : "not taken");
+ if (pattern.counter == (taken ? 1 : -2))
{
// Context is now medium confidence
TAGEBase::unsignedCtrUpdate(context.confidence, true,
ctxCounterBits);
}
- else if (context.patterns[key].counter == (taken ? -1 : 0))
+ else if (pattern.counter == (taken ? -1 : 0))
{
// Context is now low confidence
TAGEBase::unsignedCtrUpdate(context.confidence, false,
@@ -323,28 +324,30 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
}
}
- // If a misprediction occurs, we allocate a new pattern with longer history
- // in the context. The pattern with the weakest confidence is replaced.
- if (bi->getPrediction() != taken) {
- if (!bi->avenged && bi->overridden) {
+ if (bi->overridden) {
+ if (bi->getPrediction() != taken) {
++stats.wrongOverridesTotal;
if (bi->llbp_pred == bi->base_pred) {
++stats.wrongOverridesIdentical;
}
- bi->avenged = true;
+ } else {
+ ++stats.correctOverridesTotal;
+ if (bi->llbp_pred == bi->base_pred) {
+ ++stats.correctOverridesIdentical;
+ }
}
+ }
+
+ // If a misprediction occurs, we allocate a new pattern with longer history
+ // in the context. The pattern with the weakest confidence is replaced.
+ if (bi->getPrediction() != taken) {
if (i < base->getNumHistoryTables()) {
++stats.allocationsTotal;
++stats.revisits[0];
- int key = calculateTag(tage_bi->tableTags[i+1], i+1);
+ uint64_t key = calculateTag(tage_bi->tableTags, tage_bi->tableIndices, i+1, pc);
context.patterns[key].counter = taken ? 0 : -1;
}
- } else if (bi->overridden) {
- ++stats.correctOverridesTotal;
- if (bi->llbp_pred == bi->base_pred) {
- ++stats.correctOverridesIdentical;
- }
- }
+ }
}
else
{
@@ -361,7 +364,7 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
tage_bank = tage_bi->hitBank;
if (tage_bi->provider == TAGEBase::TAGE_ALT_MATCH)
tage_bank = tage_bi->altBank;
- int key = calculateTag(tage_bi->tableTags[tage_bank], tage_bank);
+ uint64_t key = calculateTag(tage_bi->tableTags, tage_bi->tableIndices, tage_bank, pc);
backingStorage[cid].patterns[key].counter = taken ? 0 : -1;
++stats.revisits[0];
}
@@ -377,11 +380,11 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
* @param ctx Context to search in
* @return Index of the best pattern, or -1 if not found
*/
-int LLBP::findBestPattern(Context &ctx, TAGEBase::BranchInfo *bi)
+int LLBP::findBestPattern(Context &ctx, TAGEBase::BranchInfo *bi, Addr pc)
{
for (int i = base->getNumHistoryTables(); i > 0; i--)
{
- int key = calculateTag(bi->tableTags[i], i);
+ uint64_t key = calculateTag(bi->tableTags, bi->tableIndices, i, pc);
if (ctx.patterns.count(key))
{
return i;
@@ -493,7 +496,7 @@ LLBP::RCR::calcHash(int n, int skip, int shift)
}
uint64_t LLBP::RCR::getCCID()
-{
+{
return moduloTwoExp(ctxs.ccid, tagWidthBits);
} // Hash of all branches
@@ -554,7 +557,7 @@ void LLBP::RCR::backup(std::list& vec)
{
vec.clear();
int count = D + W;
- for (auto it = bb.begin();
+ for (auto it = bb.begin();
(it != bb.end()) && (count > 0);
++it, --count)
{
@@ -616,7 +619,7 @@ LLBP::LLBPStats::LLBPStats(LLBP *llbp)
ADD_STAT(squashedOverrides, statistics::units::Count::get(),
"Number of branches predicted by LLBP, but squashed before the outcome was known")
{
- revisits.init(10).flags(statistics::total);
+ revisits.init(10).flags(statistics::dist);
}
} // namespace branch_prediction
diff --git a/src/cpu/pred/llbp.hh b/src/cpu/pred/llbp.hh
index dfad450b309..c27ded34634 100644
--- a/src/cpu/pred/llbp.hh
+++ b/src/cpu/pred/llbp.hh
@@ -71,7 +71,7 @@ class LLBP : public ConditionalPredictor
protected:
LTAGE* base;
-
+
Cycles calculateRemainingLatency(Cycles insertTime);
Prediction predict(ThreadID tid, Addr pc,
@@ -82,17 +82,15 @@ class LLBP : public ConditionalPredictor
bool overridden;
bool llbp_pred;
bool base_pred;
- bool avenged;
Addr pc;
int index;
bool conditional;
std::list rcrBackup;
uint64_t cid;
void* ltage_bi;
-
+
LLBPBranchInfo(Addr pc, bool conditional)
: overridden(false),
- avenged(false),
pc(pc),
index(-1),
conditional(conditional),
@@ -118,7 +116,7 @@ class LLBP : public ConditionalPredictor
struct Context
{
- std::unordered_map patterns;
+ std::unordered_map patterns;
/** Confidence counter of the context (guides replacement) */
uint8_t confidence;
};
@@ -138,9 +136,12 @@ class LLBP : public ConditionalPredictor
int ctxCounterBits;
int ptnCounterBits;
- int calculateTag(int tageTag, int tageBank) const
+ uint64_t calculateTag(int* tageTags, int* tageIndices, int tageBank, Addr pc) const
{
- return (tageTag << 4) + tageBank;
+ uint64_t tag = tageTags[tageBank];
+ uint64_t index = tageIndices[tageBank];
+ uint64_t bank = tageBank;
+ return ((tag << 49) | (index << 6) | bank);
}
Cycles backingStorageLatency;
@@ -148,7 +149,7 @@ class LLBP : public ConditionalPredictor
int8_t absPredCounter(int8_t counter);
void storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo* bi);
void storageInvalidate();
- int findBestPattern(Context &ctx, TAGEBase::BranchInfo *bi);
+ int findBestPattern(Context &ctx, TAGEBase::BranchInfo *bi, Addr pc);
int findVictimPattern(int min, Context& ctx);
uint64_t findVictimContext();
@@ -226,7 +227,7 @@ class LLBP : public ConditionalPredictor
* @return The wrapped value
*/
inline static uint64_t moduloTwoExp(uint64_t val, int exp) {
- return val & ((1 << exp) - 1);
+        return val & (((uint64_t)1 << exp) - 1);
}
// Get the current context ID
diff --git a/src/cpu/pred/tage_base.cc b/src/cpu/pred/tage_base.cc
index e9da9997d8e..64dcda4555f 100644
--- a/src/cpu/pred/tage_base.cc
+++ b/src/cpu/pred/tage_base.cc
@@ -497,7 +497,7 @@ TAGEBase::allocateEntry(int idx, BranchInfo* bi, bool taken)
{
if (gtable[idx][bi->tableIndices[idx]].u != 0)
return false;
-
+
++stats.allocationsTotal;
gtable[idx][bi->tableIndices[idx]].tag = bi->tableTags[idx];
From d9b7064813dc4d67d96d6db571ef4640e3465f9b Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Thu, 22 May 2025 14:55:45 +0000
Subject: [PATCH 11/22] cpu: llbp add assoc patterns
---
src/cpu/pred/llbp.cc | 121 ++++++++++++++++------------------
src/cpu/pred/llbp.hh | 154 ++++++++++++++++++++++++++++++++++++++-----
2 files changed, 194 insertions(+), 81 deletions(-)
diff --git a/src/cpu/pred/llbp.cc b/src/cpu/pred/llbp.cc
index 12e90a36a3c..0866626bb55 100644
--- a/src/cpu/pred/llbp.cc
+++ b/src/cpu/pred/llbp.cc
@@ -171,21 +171,17 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
bi->cid = ccid;
if (backingStorage.count(ccid))
{
+ auto& context = backingStorage.at(ccid);
if (patternBuffer.count(ccid)) {
Cycles additionalLatency = calculateRemainingLatency(patternBuffer[ccid]);
if (additionalLatency == 0) {
- int i = findBestPattern(backingStorage[ccid], tage_bi, branch_pc);
+ int i = findBestPattern(context, tage_bi, branch_pc);
if (i > 0)
{
- uint64_t key = calculateTag(tage_bi->tableTags, tage_bi->tableIndices, i, branch_pc);
- auto &pattern = backingStorage[ccid].patterns[key];
-
- int revisits = pattern.visited;
- if (revisits < stats.revisits.size() - 1) {
- --stats.revisits[revisits];
- ++stats.revisits[revisits + 1];
- ++pattern.visited;
- }
+ uint64_t key = context.patterns.calculateKey(tage_bi->tableTags, tage_bi->tableIndices, i);
+ auto &pattern = *context.patterns.getEntry(key);
+
+ context.patterns.wasHit(key, stats.patternHits);
++stats.demandHitsTotal;
llbp_confidence = pattern.counter;
@@ -210,6 +206,8 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
++stats.demandMissesNoPrefetch;
++stats.demandMissesTotal;
}
+
+ context.patterns.tickAge();
} else {
++stats.demandMissesCold;
++stats.demandMissesTotal;
@@ -220,6 +218,7 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
bi,
branch_pc, bi->getPrediction(), llbp_confidence,
bi->overridden ? "true" : "false");
+
Cycles latency = ltage_prediction.latency;
return Prediction {.taken = bi->getPrediction(), .latency = latency};
}
@@ -296,31 +295,39 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
// If not, we create a new context
if (backingStorage.count(cid))
{
- LLBP::Context& context = backingStorage[cid];
+ LLBP::Context& context = backingStorage.at(cid);
int i = bi->index;
if (i > 0 && bi->overridden)
{
- uint64_t key = calculateTag(tage_bi->tableTags, tage_bi->tableIndices, i, pc);
- LLBP::Pattern& pattern = context.patterns[key];
+ uint64_t key = context.patterns.calculateKey(tage_bi->tableTags, tage_bi->tableIndices, i);
+ LLBP::Pattern* p = context.patterns.getEntry(key);
+
+ if (p) {
+ LLBP::Pattern& pattern = *p;
- int8_t conf_before = pattern.counter;
- TAGEBase::ctrUpdate(pattern.counter, taken, ptnCounterBits);
- int8_t conf_after = pattern.counter;
-
- DPRINTF(LLBP, "LLBP: Storage C %llu T %lld: %d -> %d (%s)\n",
- cid, key, conf_before, conf_after, taken ? "taken" : "not taken");
- if (pattern.counter == (taken ? 1 : -2))
- {
- // Context is now medium confidence
- TAGEBase::unsignedCtrUpdate(context.confidence, true,
- ctxCounterBits);
- }
- else if (pattern.counter == (taken ? -1 : 0))
- {
- // Context is now low confidence
- TAGEBase::unsignedCtrUpdate(context.confidence, false,
- ctxCounterBits);
+ int8_t conf_before = pattern.counter;
+ TAGEBase::ctrUpdate(pattern.counter, taken, ptnCounterBits);
+ int8_t conf_after = pattern.counter;
+
+ DPRINTF(LLBP, "LLBP: Storage C %llu T %lld: %d -> %d (%s)\n",
+ cid, key, conf_before, conf_after, taken ? "taken" : "not taken");
+ if (pattern.counter == (taken ? 1 : -2))
+ {
+ // Context is now medium confidence
+ TAGEBase::unsignedCtrUpdate(context.confidence, true,
+ ctxCounterBits);
+ }
+ else if (pattern.counter == (taken ? -1 : 0))
+ {
+ // Context is now low confidence
+ TAGEBase::unsignedCtrUpdate(context.confidence, false,
+ ctxCounterBits);
+ }
+
+ if (bi->getPrediction() == taken) {
+ context.patterns.wasUseful(key, stats.patternUseful);
+ }
}
}
@@ -343,9 +350,10 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
if (bi->getPrediction() != taken) {
if (i < base->getNumHistoryTables()) {
++stats.allocationsTotal;
- ++stats.revisits[0];
- uint64_t key = calculateTag(tage_bi->tableTags, tage_bi->tableIndices, i+1, pc);
- context.patterns[key].counter = taken ? 0 : -1;
+ ++stats.patternHits[0];
+ ++stats.patternUseful[0];
+ uint64_t key = context.patterns.calculateKey(tage_bi->tableTags, tage_bi->tableIndices, i+1);
+ context.patterns.insertEntry(key, taken);
}
}
}
@@ -358,15 +366,20 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
backingStorage.erase(i);
}
- ++stats.backingStorageInsertions;
int tage_bank = 1;
if (tage_bi->provider == TAGEBase::TAGE_LONGEST_MATCH)
tage_bank = tage_bi->hitBank;
if (tage_bi->provider == TAGEBase::TAGE_ALT_MATCH)
tage_bank = tage_bi->altBank;
- uint64_t key = calculateTag(tage_bi->tableTags, tage_bi->tableIndices, tage_bank, pc);
- backingStorage[cid].patterns[key].counter = taken ? 0 : -1;
- ++stats.revisits[0];
+
+ backingStorage.emplace(cid, Context(PatternSet(16, 4, 8)));
+ Context& context = backingStorage.at(cid);
+ ++stats.backingStorageInsertions;
+
+ uint64_t key = context.patterns.calculateKey(tage_bi->tableTags, tage_bi->tableIndices, tage_bank);
+ context.patterns.insertEntry(key, taken);
+
+ ++stats.patternHits[0];
}
}
@@ -384,29 +397,8 @@ int LLBP::findBestPattern(Context &ctx, TAGEBase::BranchInfo *bi, Addr pc)
{
for (int i = base->getNumHistoryTables(); i > 0; i--)
{
- uint64_t key = calculateTag(bi->tableTags, bi->tableIndices, i, pc);
- if (ctx.patterns.count(key))
- {
- return i;
- }
- }
- return -1;
-}
-
-/**
- * Find the pattern with the lowest confidence in the context.
- * Lengths and confidence vals smaller than {min} are not considered.
- *
- * @param min Minimum confidence value
- * @param ctx Context to search in
- * @return Index of the pattern with the lowest confidence, or -1 if not found
- */
-int LLBP::findVictimPattern(int min, Context &ctx)
-{
- int min_conf = 1 << (ptnCounterBits - 2);
- for (int i = (min >= 0 ? min + 1 : 1); i <= base->getNumHistoryTables(); i++)
- {
- if (absPredCounter(ctx.patterns[i].counter) < min_conf)
+ uint64_t key = ctx.patterns.calculateKey(bi->tableTags, bi->tableIndices, i);
+ if (ctx.patterns.getEntry(key))
{
return i;
}
@@ -600,8 +592,10 @@ LLBP::LLBPStats::LLBPStats(LLBP *llbp)
"On-demand misses to the pattern buffer where the context was not in the backing storage"),
ADD_STAT(allocationsTotal, statistics::units::Count::get(),
"Total number of new patterns allocated in any pattern set"),
- ADD_STAT(revisits, statistics::units::Count::get(),
- "Number of times a pattern was revisited"),
+ ADD_STAT(patternHits, statistics::units::Count::get(),
+ "Number of times any pattern was hit"),
+ ADD_STAT(patternUseful, statistics::units::Count::get(),
+ "Number of times any pattern was useful"),
ADD_STAT(patternBufferEvictions, statistics::units::Count::get(),
"Number of pattern sets evicted from the pattern buffer due to capacity limits"),
ADD_STAT(backingStorageEvictions, statistics::units::Count::get(),
@@ -619,7 +613,8 @@ LLBP::LLBPStats::LLBPStats(LLBP *llbp)
ADD_STAT(squashedOverrides, statistics::units::Count::get(),
"Number of branches predicted by LLBP, but squashed before the outcome was known")
{
- revisits.init(10).flags(statistics::dist);
+ patternHits.init(10).flags(statistics::dist);
+ patternUseful.init(10).flags(statistics::dist);
}
} // namespace branch_prediction
diff --git a/src/cpu/pred/llbp.hh b/src/cpu/pred/llbp.hh
index c27ded34634..8e162909e7c 100644
--- a/src/cpu/pred/llbp.hh
+++ b/src/cpu/pred/llbp.hh
@@ -107,26 +107,151 @@ class LLBP : public ConditionalPredictor
};
+
struct Pattern
{
- //* hysteresis counter: > 0 = taken, < 0 = not taken
+ uint64_t tag;
int8_t counter;
- int visited = 0;
+ int hit = 0;
+ int useful = 0;
+ int keep = 7;
};
- struct Context
+ class PatternSet {
+ public:
+ PatternSet(int numEntries, int setSize, int bankBits) {
+ assert(numEntries % setSize == 0);
+ this->bankBits = bankBits;
+ this->setSize = setSize;
+ this->numSets = numEntries / setSize;
+ sets.resize(numSets);
+ for (auto& set : sets) {
+ set.resize(setSize);
+ }
+ }
+
+ int getID(uint64_t key) {
+ uint64_t bank = getBank(key);
+ return bank / setSize;
+ }
+
+ Pattern* getEntry(uint64_t key) {
+ auto& set = getSet(key);
+ Pattern* result = findPatternInSet(key, set);
+ return result;
+ }
+
+ void insertEntry(uint64_t key, bool taken) {
+ auto& set = getSet(key);
+ Pattern& victim = findVictimPattern(set);
+ victim.tag = key;
+ victim.counter = taken ? 0 : -1;
+ victim.hit = 0;
+ victim.useful = 0;
+ victim.keep = 7;
+ }
+
+ void wasUseful(uint64_t key, statistics::Vector& usefulTotal) {
+ Pattern* p = getEntry(key);
+ if (p) {
+ int useful = p->useful++;
+ if (useful < usefulTotal.size() - 1) {
+ --usefulTotal[useful];
+ ++usefulTotal[useful + 1];
+ }
+ saturatingAdd(p->keep, 8);
+ saturatingAdd(p->keep, 8);
+ }
+ }
+
+ void wasHit(uint64_t key, statistics::Vector& hitsTotal) {
+ Pattern* p = getEntry(key);
+ if (p) {
+ int hits = p->hit++;
+ if (hits < hitsTotal.size() - 1) {
+ --hitsTotal[hits];
+ ++hitsTotal[hits + 1];
+ }
+ }
+ }
+
+ void tickAge() {
+ for (auto& set : sets) {
+ for (auto& pattern : set) {
+ saturatingSub(pattern.keep);
+ }
+ }
+ }
+
+ uint64_t calculateKey(int* tageTags, int* tageIndices, int tageBank) {
+ uint64_t tag = tageTags[tageBank];
+ uint64_t index = tageIndices[tageBank];
+ uint64_t bank = tageBank;
+ return ((tag << 49) | (index << bankBits) | bank);
+ }
+
+ int getBank(uint64_t key) {
+ return bitmaskLowerN(bankBits) & key;
+ }
+
+ static uint64_t bitmaskLowerN(int n) {
+ return (1 << n) - 1;
+ }
+
+ static void saturatingSub(int& n) {
+ if (n > 0) {
+ --n;
+ }
+ }
+
+ static void saturatingAdd(int& n, int max) {
+ if (n < max) {
+ ++n;
+ }
+ }
+ private:
+ std::vector<Pattern>& getSet(uint64_t key) {
+ int id = getID(key);
+ assert(id < sets.size());
+ return sets[id];
+ }
+
+ Pattern* findPatternInSet(uint64_t key, std::vector<Pattern>& set) {
+ auto result = std::find_if(set.begin(), set.end(), [key](Pattern& pat) {
+ return pat.tag == key;
+ });
+
+ if (result == set.end())
+ return nullptr;
+
+ return &*result;
+ }
+
+ Pattern& findVictimPattern(std::vector<Pattern>& set) {
+ auto result = std::min_element(set.begin(), set.end(), [&](const Pattern& a, const Pattern& b) {
+ return a.keep < b.keep;
+ });
+
+ return *result;
+ }
+
+ int bankBits;
+ int numSets;
+ int setSize;
+ std::vector<std::vector<Pattern>> sets;
+ };
+
+ class Context
{
- std::unordered_map<uint64_t, Pattern> patterns;
+ public:
+ PatternSet patterns;
/** Confidence counter of the context (guides replacement) */
uint8_t confidence;
- };
- struct PatternBufferEntry
- {
- uint64_t cid;
- Cycles insertTime;
+ Context(PatternSet patterns): patterns(patterns), confidence(0) {}
};
+
std::unordered_map<uint64_t, Context> backingStorage;
std::unordered_map<uint64_t, Cycles> patternBuffer;
std::deque<uint64_t> patternBufferQueue;
@@ -136,14 +261,6 @@ class LLBP : public ConditionalPredictor
int ctxCounterBits;
int ptnCounterBits;
- uint64_t calculateTag(int* tageTags, int* tageIndices, int tageBank, Addr pc) const
- {
- uint64_t tag = tageTags[tageBank];
- uint64_t index = tageIndices[tageBank];
- uint64_t bank = tageBank;
- return ((tag << 49) | (index << 6) | bank);
- }
-
Cycles backingStorageLatency;
int8_t absPredCounter(int8_t counter);
@@ -167,7 +284,8 @@ class LLBP : public ConditionalPredictor
statistics::Scalar demandMissesNoPrefetch;
statistics::Scalar demandMissesCold;
statistics::Scalar allocationsTotal;
- statistics::Vector revisits;
+ statistics::Vector patternHits;
+ statistics::Vector patternUseful;
statistics::Scalar patternBufferEvictions;
statistics::Scalar backingStorageEvictions;
statistics::Scalar backingStorageInsertions;
From e4cb435d91c09f211df6630eb008423664202b23 Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Thu, 22 May 2025 16:27:12 +0000
Subject: [PATCH 12/22] cpu: llbp switch to tage lfu
---
src/cpu/pred/llbp.cc | 5 ++---
src/cpu/pred/llbp.hh | 21 ++++++++-------------
2 files changed, 10 insertions(+), 16 deletions(-)
diff --git a/src/cpu/pred/llbp.cc b/src/cpu/pred/llbp.cc
index 0866626bb55..914808ee531 100644
--- a/src/cpu/pred/llbp.cc
+++ b/src/cpu/pred/llbp.cc
@@ -207,7 +207,6 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
++stats.demandMissesTotal;
}
- context.patterns.tickAge();
} else {
++stats.demandMissesCold;
++stats.demandMissesTotal;
@@ -489,12 +488,12 @@ LLBP::RCR::calcHash(int n, int skip, int shift)
uint64_t LLBP::RCR::getCCID()
{
- return moduloTwoExp(ctxs.ccid, tagWidthBits);
+ return ctxs.ccid;
} // Hash of all branches
uint64_t LLBP::RCR::getPCID()
{
- return moduloTwoExp(ctxs.pcid, tagWidthBits);
+ return ctxs.pcid;
}
bool LLBP::RCR::update(Addr pc, const StaticInstPtr &inst, bool taken)
diff --git a/src/cpu/pred/llbp.hh b/src/cpu/pred/llbp.hh
index 8e162909e7c..0395c3b6094 100644
--- a/src/cpu/pred/llbp.hh
+++ b/src/cpu/pred/llbp.hh
@@ -114,7 +114,6 @@ class LLBP : public ConditionalPredictor
int8_t counter;
int hit = 0;
int useful = 0;
- int keep = 7;
};
class PatternSet {
@@ -148,7 +147,6 @@ class LLBP : public ConditionalPredictor
victim.counter = taken ? 0 : -1;
victim.hit = 0;
victim.useful = 0;
- victim.keep = 7;
}
void wasUseful(uint64_t key, statistics::Vector& usefulTotal) {
@@ -159,8 +157,6 @@ class LLBP : public ConditionalPredictor
--usefulTotal[useful];
++usefulTotal[useful + 1];
}
- saturatingAdd(p->keep, 8);
- saturatingAdd(p->keep, 8);
}
}
@@ -174,14 +170,6 @@ class LLBP : public ConditionalPredictor
}
}
}
-
- void tickAge() {
- for (auto& set : sets) {
- for (auto& pattern : set) {
- saturatingSub(pattern.keep);
- }
- }
- }
uint64_t calculateKey(int* tageTags, int* tageIndices, int tageBank) {
uint64_t tag = tageTags[tageBank];
@@ -193,6 +181,13 @@ class LLBP : public ConditionalPredictor
int getBank(uint64_t key) {
return bitmaskLowerN(bankBits) & key;
}
+
+ static int absConfidence(int8_t ctr) {
+ if (ctr < 0) {
+ return abs(ctr) - 1;
+ }
+ return ctr;
+ }
static uint64_t bitmaskLowerN(int n) {
return (1 << n) - 1;
@@ -229,7 +224,7 @@ class LLBP : public ConditionalPredictor
Pattern& findVictimPattern(std::vector<Pattern>& set) {
auto result = std::min_element(set.begin(), set.end(), [&](const Pattern& a, const Pattern& b) {
- return a.keep < b.keep;
+ return absConfidence(a.counter) < absConfidence(b.counter);
});
return *result;
From 18a58fb89ff2b75a7b56ac17f1b5bbec8bfa76a7 Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Fri, 23 May 2025 14:39:58 +0000
Subject: [PATCH 13/22] cpu: llbp add pb assoc, fix useful, add occ
---
src/cpu/pred/llbp.cc | 54 +++++++++++---------
src/cpu/pred/llbp.hh | 117 ++++++++++++++++++++++++++++++++++++++++---
2 files changed, 139 insertions(+), 32 deletions(-)
diff --git a/src/cpu/pred/llbp.cc b/src/cpu/pred/llbp.cc
index 914808ee531..694428416df 100644
--- a/src/cpu/pred/llbp.cc
+++ b/src/cpu/pred/llbp.cc
@@ -39,13 +39,14 @@ namespace branch_prediction
LLBP::LLBP(const LLBPParams ¶ms)
: ConditionalPredictor(params),
base(params.base),
- patternBufferCapacity(params.patternBufferCapacity),
+ backingStorage(),
+ patternBuffer(params.patternBufferCapacity, 64, this->backingStorage),
storageCapacity(params.storageCapacity),
ctxCounterBits(params.ctxCounterBits),
ptnCounterBits(params.ptnCounterBits),
backingStorageLatency(params.backingStorageLatency),
stats(this),
- rcr(3, 8, 8, 2, params.tagWidthBits)
+ rcr(3, 64, 8, 2, params.tagWidthBits)
{
DPRINTF(LLBP, "Using experimental LLBP\n");
DPRINTF(LLBP, "RCR: T=%d, W=%d, D=%d, S=%d, tagWidthBits=%d\n",
@@ -84,6 +85,9 @@ LLBP::update(ThreadID tid, Addr pc, bool taken,
if (bi->rcrBackup.size()) {
rcr.restore(bi->rcrBackup);
}
+
+ patternBuffer.clearInFlight(curCycle(), backingStorageLatency);
+
base->update(tid, pc, taken, bi->ltage_bi, resteer, inst, target);
return;
}
@@ -159,6 +163,7 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
if (cond_branch)
{
int tage_bank = 0;
+
if (tage_bi->provider == TAGEBase::TAGE_LONGEST_MATCH)
tage_bank = tage_bi->hitBank;
if (tage_bi->provider == TAGEBase::TAGE_ALT_MATCH)
@@ -169,11 +174,12 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
auto ccid = rcr.getCCID();
bi->index = tage_bank;
bi->cid = ccid;
- if (backingStorage.count(ccid))
- {
+ if (backingStorage.count(ccid)) {
auto& context = backingStorage.at(ccid);
- if (patternBuffer.count(ccid)) {
- Cycles additionalLatency = calculateRemainingLatency(patternBuffer[ccid]);
+ PatternBufferEntry* pbe = patternBuffer.get(ccid);
+ if (pbe) {
+ auto& entry = *pbe;
+ Cycles additionalLatency = calculateRemainingLatency(entry.insertTime);
if (additionalLatency == 0) {
int i = findBestPattern(context, tage_bi, branch_pc);
if (i > 0)
@@ -181,7 +187,12 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
uint64_t key = context.patterns.calculateKey(tage_bi->tableTags, tage_bi->tableIndices, i);
auto &pattern = *context.patterns.getEntry(key);
+ if (pattern.hit == 0) {
+ ++stats.patternUseful[0];
+ }
+
context.patterns.wasHit(key, stats.patternHits);
+ entry.lastUsed = curCycle();
++stats.demandHitsTotal;
llbp_confidence = pattern.counter;
@@ -239,20 +250,13 @@ LLBP::updateHistories(
bp_history = (void*)(bi);
// Insert the next prefetch context into the pattern buffer
- if (backingStorage.count(rcr.getPCID()))
+ uint64_t pcid = rcr.getPCID();
+ if (backingStorage.count(pcid))
{
- uint64_t pcid = rcr.getPCID();
- if (patternBuffer.count(rcr.getPCID()) == 0) {
+ if (patternBuffer.get(pcid) == nullptr)
+ {
++stats.prefetchesIssued;
- patternBuffer[pcid] = curCycle();
-
- patternBufferQueue.push_back(pcid);
- if (patternBufferQueue.size() >= patternBufferCapacity)
- {
- patternBuffer.erase(patternBufferQueue.front());
- patternBufferQueue.pop_front();
- ++stats.patternBufferEvictions;
- }
+ patternBuffer.insert(pcid, curCycle());
}
}
} else {
@@ -350,7 +354,6 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
if (i < base->getNumHistoryTables()) {
++stats.allocationsTotal;
++stats.patternHits[0];
- ++stats.patternUseful[0];
uint64_t key = context.patterns.calculateKey(tage_bi->tableTags, tage_bi->tableIndices, i+1);
context.patterns.insertEntry(key, taken);
}
@@ -371,7 +374,7 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
if (tage_bi->provider == TAGEBase::TAGE_ALT_MATCH)
tage_bank = tage_bi->altBank;
- backingStorage.emplace(cid, Context(PatternSet(16, 4, 8)));
+ backingStorage.emplace(cid, Context(PatternSet(16, 4, 8, stats.patternSetOccupancy)));
Context& context = backingStorage.at(cid);
++stats.backingStorageInsertions;
@@ -592,9 +595,11 @@ LLBP::LLBPStats::LLBPStats(LLBP *llbp)
ADD_STAT(allocationsTotal, statistics::units::Count::get(),
"Total number of new patterns allocated in any pattern set"),
ADD_STAT(patternHits, statistics::units::Count::get(),
- "Number of times any pattern was hit"),
+ "Number of times any pattern was hit (distribution)"),
ADD_STAT(patternUseful, statistics::units::Count::get(),
- "Number of times any pattern was useful"),
+ "Number of times any pattern was useful (distribution)"),
+ ADD_STAT(patternSetOccupancy, statistics::units::Count::get(),
+ "Number of patterns used in the pattern sets (distribution)"),
ADD_STAT(patternBufferEvictions, statistics::units::Count::get(),
"Number of pattern sets evicted from the pattern buffer due to capacity limits"),
ADD_STAT(backingStorageEvictions, statistics::units::Count::get(),
@@ -612,8 +617,9 @@ LLBP::LLBPStats::LLBPStats(LLBP *llbp)
ADD_STAT(squashedOverrides, statistics::units::Count::get(),
"Number of branches predicted by LLBP, but squashed before the outcome was known")
{
- patternHits.init(10).flags(statistics::dist);
- patternUseful.init(10).flags(statistics::dist);
+ patternHits.init(10).flags(statistics::pdf);
+ patternUseful.init(10).flags(statistics::pdf);
+ patternSetOccupancy.init(17).flags(statistics::pdf);
}
} // namespace branch_prediction
diff --git a/src/cpu/pred/llbp.hh b/src/cpu/pred/llbp.hh
index 0395c3b6094..7313e2a4913 100644
--- a/src/cpu/pred/llbp.hh
+++ b/src/cpu/pred/llbp.hh
@@ -30,7 +30,6 @@ Implementation of the last-level branch predictor (LLBP).
#define __CPU_PRED_LLBP_HH__
#include
-#include <deque>
#include
#include
#include
@@ -118,11 +117,19 @@ class LLBP : public ConditionalPredictor
class PatternSet {
public:
- PatternSet(int numEntries, int setSize, int bankBits) {
+ PatternSet(
+ int numEntries,
+ int setSize,
+ int bankBits,
+ statistics::Vector& totalOccupancy
+ ): bankBits(bankBits),
+ setSize(setSize),
+ occupancy(0),
+ totalOccupancy(totalOccupancy)
+ {
assert(numEntries % setSize == 0);
- this->bankBits = bankBits;
- this->setSize = setSize;
this->numSets = numEntries / setSize;
+ ++totalOccupancy[0];
sets.resize(numSets);
for (auto& set : sets) {
set.resize(setSize);
@@ -147,6 +154,12 @@ class LLBP : public ConditionalPredictor
victim.counter = taken ? 0 : -1;
victim.hit = 0;
victim.useful = 0;
+
+ if (occupancy < totalOccupancy.size() - 1) {
+ --totalOccupancy[occupancy];
+ ++totalOccupancy[occupancy + 1];
+ }
+ saturatingAdd(occupancy, numSets*setSize);
}
void wasUseful(uint64_t key, statistics::Vector& usefulTotal) {
@@ -233,6 +246,8 @@ class LLBP : public ConditionalPredictor
int bankBits;
int numSets;
int setSize;
+ int occupancy;
+ statistics::Vector& totalOccupancy;
std::vector> sets;
};
@@ -244,14 +259,99 @@ class LLBP : public ConditionalPredictor
uint8_t confidence;
Context(PatternSet patterns): patterns(patterns), confidence(0) {}
+
};
+
+ typedef std::unordered_map<uint64_t, Context> BackingStorage;
+
+ BackingStorage backingStorage;
+
+ struct PatternBufferEntry {
+ uint64_t cid;
+ Cycles insertTime;
+ Cycles lastUsed;
+ bool valid = false;
+ };
+
+
+ class PatternBuffer {
+ public:
+ PatternBuffer(int numEntries, int setSize, BackingStorage& backingStorage)
+ : setSize(setSize),
+ backingStorage(backingStorage) {
+ assert(numEntries % setSize == 0);
+ this->numSets = numEntries / setSize;
+
+ sets.resize(numSets);
+ for (auto& set : sets) {
+ set.resize(setSize);
+ }
+ }
+
+ void insert(uint64_t cid, Cycles now) {
+ auto& set = getSet(cid);
+ if (!backingStorage.count(cid))
+ return;
+ PatternBufferEntry& victim = findVictim(set);
+ victim.cid = cid;
+ victim.insertTime = now;
+ victim.lastUsed = now;
+ victim.valid = true;
+ }
+
+ PatternBufferEntry* get(uint64_t cid) {
+ auto& set = getSet(cid);
+ return findEntry(cid, set);
+ }
+
+ void clearInFlight(Cycles now, Cycles latency) {
+ for (auto& set: sets) {
+ for (auto& e: set) {
+ Cycles passedTime = (now - e.insertTime);
+ if (passedTime < latency) {
+ e.valid = false;
+ }
+ }
+ }
+ }
+
+ std::vector<PatternBufferEntry>& getSet(uint64_t cid) {
+ return sets[cid % numSets];
+ }
+
+ private:
+ PatternBufferEntry* findEntry(uint64_t cid, std::vector<PatternBufferEntry>& set) {
+ auto result = std::find_if(set.begin(), set.end(), [cid](PatternBufferEntry& e) {
+ return e.cid == cid && e.valid;
+ });
+
+ if (result == set.end())
+ return nullptr;
+
+ return &*result;
+ }
+
+ PatternBufferEntry& findVictim(std::vector<PatternBufferEntry>& set) {
+ auto firstInvalid = std::find_if(set.begin(), set.end(), [](PatternBufferEntry& e) {
+ return !e.valid;
+ });
- std::unordered_map<uint64_t, Context> backingStorage;
- std::unordered_map<uint64_t, Cycles> patternBuffer;
- std::deque<uint64_t> patternBufferQueue;
+ if (firstInvalid != set.end())
+ return *firstInvalid;
+
+ auto worst = std::min_element(set.begin(), set.end(), [&](const PatternBufferEntry& a, const PatternBufferEntry& b) {
+ return a.insertTime < b.insertTime;
+ });
+
+ return *worst;
+ }
+ int numSets;
+ int setSize;
+ BackingStorage& backingStorage;
+ std::vector<std::vector<PatternBufferEntry>> sets;
+ } patternBuffer;
- int patternBufferCapacity;
int storageCapacity;
int ctxCounterBits;
int ptnCounterBits;
@@ -281,6 +381,7 @@ class LLBP : public ConditionalPredictor
statistics::Scalar allocationsTotal;
statistics::Vector patternHits;
statistics::Vector patternUseful;
+ statistics::Vector patternSetOccupancy;
statistics::Scalar patternBufferEvictions;
statistics::Scalar backingStorageEvictions;
statistics::Scalar backingStorageInsertions;
From 2bfff37d1a52abb4bc39aec96511d46b3eaf185a Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Mon, 26 May 2025 16:27:23 +0000
Subject: [PATCH 14/22] cpu: llbp switch to tsl
---
src/cpu/pred/BranchPredictor.py | 4 +-
src/cpu/pred/llbp.cc | 37 ++++-----
src/cpu/pred/llbp.hh | 141 ++++++++++++++++++++------------
src/cpu/pred/tage.hh | 4 +-
src/cpu/pred/tage_base.hh | 3 +
src/cpu/pred/tage_sc_l.hh | 3 +-
6 files changed, 116 insertions(+), 76 deletions(-)
diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
index 37ec9117250..1dc8433ad5b 100644
--- a/src/cpu/pred/BranchPredictor.py
+++ b/src/cpu/pred/BranchPredictor.py
@@ -802,8 +802,8 @@ class LLBP(ConditionalPredictor):
cxx_class = "gem5::branch_prediction::LLBP"
cxx_header = "cpu/pred/llbp.hh"
- base = Param.LTAGE("Base predictor")
- tagWidthBits = Param.Int(14, "RCR CTWidth")
+ base = Param.TAGE_SC_L("Base predictor")
+ tagWidthBits = Param.Int(63, "RCR CTWidth")
patternBufferCapacity = Param.Int(64, "Pattern Buffer Capacity")
storageCapacity = Param.Int(14000, "Storage Capacity")
ptnCounterBits = Param.Int(3, "Bits in Pattern Counter")
diff --git a/src/cpu/pred/llbp.cc b/src/cpu/pred/llbp.cc
index 694428416df..60148a2d437 100644
--- a/src/cpu/pred/llbp.cc
+++ b/src/cpu/pred/llbp.cc
@@ -39,13 +39,13 @@ namespace branch_prediction
LLBP::LLBP(const LLBPParams ¶ms)
: ConditionalPredictor(params),
base(params.base),
+ stats(this),
backingStorage(),
- patternBuffer(params.patternBufferCapacity, 64, this->backingStorage),
+ patternBuffer(params.patternBufferCapacity, 4, this->backingStorage, stats.patternBufferEvictions),
storageCapacity(params.storageCapacity),
ctxCounterBits(params.ctxCounterBits),
ptnCounterBits(params.ptnCounterBits),
backingStorageLatency(params.backingStorageLatency),
- stats(this),
rcr(3, 64, 8, 2, params.tagWidthBits)
{
DPRINTF(LLBP, "Using experimental LLBP\n");
@@ -150,9 +150,9 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
b = (void*)(bi);
- LTAGE::LTageBranchInfo *ltage_bi = static_cast<LTAGE::LTageBranchInfo*>(bi->ltage_bi);
+ TAGE_SC_L::TageSCLBranchInfo *scltage_bi = static_cast<TAGE_SC_L::TageSCLBranchInfo*>(bi->ltage_bi);
- auto tage_bi = ltage_bi->tageBranchInfo;
+ auto tage_bi = scltage_bi->tageBranchInfo;
bi->overridden = false;
bi->base_pred = ltage_prediction.taken;
@@ -187,11 +187,7 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
uint64_t key = context.patterns.calculateKey(tage_bi->tableTags, tage_bi->tableIndices, i);
auto &pattern = *context.patterns.getEntry(key);
- if (pattern.hit == 0) {
- ++stats.patternUseful[0];
- }
-
- context.patterns.wasHit(key, stats.patternHits);
+ context.patterns.wasHit(key);
entry.lastUsed = curCycle();
++stats.demandHitsTotal;
@@ -287,8 +283,8 @@ int8_t LLBP::absPredCounter(int8_t counter)
*/
void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
{
- LTAGE::LTageBranchInfo *ltage_bi =
- static_cast<LTAGE::LTageBranchInfo *>(bi->ltage_bi);
+ TAGE_SC_L::TageSCLBranchInfo *ltage_bi =
+ static_cast<TAGE_SC_L::TageSCLBranchInfo *>(bi->ltage_bi);
uint64_t cid = bi->cid;
@@ -329,7 +325,7 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
}
if (bi->getPrediction() == taken) {
- context.patterns.wasUseful(key, stats.patternUseful);
+ context.patterns.wasUseful(key);
}
}
}
@@ -353,8 +349,9 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
if (bi->getPrediction() != taken) {
if (i < base->getNumHistoryTables()) {
++stats.allocationsTotal;
- ++stats.patternHits[0];
- uint64_t key = context.patterns.calculateKey(tage_bi->tableTags, tage_bi->tableIndices, i+1);
+ while (!base->tage->noSkip[i] && i < base->getNumHistoryTables())
+ i = i+1;
+ uint64_t key = context.patterns.calculateKey(tage_bi->tableTags, tage_bi->tableIndices, i);
context.patterns.insertEntry(key, taken);
}
}
@@ -368,20 +365,19 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
backingStorage.erase(i);
}
+ // TODO: Check if this is a skip table
int tage_bank = 1;
if (tage_bi->provider == TAGEBase::TAGE_LONGEST_MATCH)
tage_bank = tage_bi->hitBank;
if (tage_bi->provider == TAGEBase::TAGE_ALT_MATCH)
tage_bank = tage_bi->altBank;
- backingStorage.emplace(cid, Context(PatternSet(16, 4, 8, stats.patternSetOccupancy)));
+ backingStorage.emplace(cid, Context(PatternSet(64, 4, 8, stats)));
Context& context = backingStorage.at(cid);
++stats.backingStorageInsertions;
uint64_t key = context.patterns.calculateKey(tage_bi->tableTags, tage_bi->tableIndices, tage_bank);
context.patterns.insertEntry(key, taken);
-
- ++stats.patternHits[0];
}
}
@@ -574,6 +570,7 @@ void LLBP::RCR::restore(std::list<uint64_t>& vec)
LLBP::LLBPStats::LLBPStats(LLBP *llbp)
: statistics::Group(llbp),
+ parent(llbp),
ADD_STAT(prefetchesIssued, statistics::units::Count::get(),
"Number of prefetches issued to the backing storage"),
ADD_STAT(baseHitsTotal, statistics::units::Count::get(),
@@ -617,9 +614,9 @@ LLBP::LLBPStats::LLBPStats(LLBP *llbp)
ADD_STAT(squashedOverrides, statistics::units::Count::get(),
"Number of branches predicted by LLBP, but squashed before the outcome was known")
{
- patternHits.init(10).flags(statistics::pdf);
- patternUseful.init(10).flags(statistics::pdf);
- patternSetOccupancy.init(17).flags(statistics::pdf);
+ patternHits.init(11).flags(statistics::pdf);
+ patternUseful.init(11).flags(statistics::pdf);
+ patternSetOccupancy.init(llbp->base->getNumHistoryTables() + 1).flags(statistics::pdf);
}
} // namespace branch_prediction
diff --git a/src/cpu/pred/llbp.hh b/src/cpu/pred/llbp.hh
index 7313e2a4913..7a1232600f9 100644
--- a/src/cpu/pred/llbp.hh
+++ b/src/cpu/pred/llbp.hh
@@ -38,7 +38,7 @@ Implementation of the last-level branch predictor (LLBP).
#include "base/cache/cache_entry.hh"
#include "base/statistics.hh"
#include "base/types.hh"
-#include "cpu/pred/ltage.hh"
+#include "cpu/pred/tage_sc_l.hh"
#include "params/LLBP.hh"
namespace gem5
@@ -69,7 +69,42 @@ class LLBP : public ConditionalPredictor
void * &bp_history) override;
protected:
- LTAGE* base;
+ TAGE_SC_L* base;
+
+ struct LLBPStats : public statistics::Group
+ {
+ LLBPStats(LLBP *llbp);
+
+ void preDumpStats() override {
+ for(auto& ctx : parent->backingStorage) {
+ ctx.second.patterns.commitStats();
+ }
+ }
+
+ LLBP* parent;
+
+ statistics::Scalar prefetchesIssued;
+ statistics::Scalar baseHitsTotal;
+ statistics::Scalar demandHitsTotal;
+ statistics::Scalar demandHitsOverride;
+ statistics::Scalar demandHitsNoOverride;
+ statistics::Scalar demandMissesTotal;
+ statistics::Scalar demandMissesNoPattern;
+ statistics::Scalar demandMissesNoPrefetch;
+ statistics::Scalar demandMissesCold;
+ statistics::Scalar allocationsTotal;
+ statistics::SparseHistogram patternHits;
+ statistics::SparseHistogram patternUseful;
+ statistics::SparseHistogram patternSetOccupancy;
+ statistics::Scalar patternBufferEvictions;
+ statistics::Scalar backingStorageEvictions;
+ statistics::Scalar backingStorageInsertions;
+ statistics::Scalar correctOverridesTotal;
+ statistics::Scalar correctOverridesIdentical;
+ statistics::Scalar wrongOverridesTotal;
+ statistics::Scalar wrongOverridesIdentical;
+ statistics::Scalar squashedOverrides;
+ } stats;
Cycles calculateRemainingLatency(Cycles insertTime);
@@ -113,6 +148,7 @@ class LLBP : public ConditionalPredictor
int8_t counter;
int hit = 0;
int useful = 0;
+ bool valid = false;
};
class PatternSet {
@@ -121,21 +157,24 @@ class LLBP : public ConditionalPredictor
int numEntries,
int setSize,
int bankBits,
- statistics::Vector& totalOccupancy
+ LLBPStats& stats
): bankBits(bankBits),
setSize(setSize),
occupancy(0),
- totalOccupancy(totalOccupancy)
+ stats(stats)
{
assert(numEntries % setSize == 0);
this->numSets = numEntries / setSize;
- ++totalOccupancy[0];
sets.resize(numSets);
for (auto& set : sets) {
set.resize(setSize);
}
}
+ ~PatternSet() {
+ commitStats();
+ }
+
int getID(uint64_t key) {
uint64_t bank = getBank(key);
return bank / setSize;
@@ -150,37 +189,30 @@ class LLBP : public ConditionalPredictor
void insertEntry(uint64_t key, bool taken) {
auto& set = getSet(key);
Pattern& victim = findVictimPattern(set);
+ if (victim.valid) {
+ stats.patternUseful.sample(victim.useful);
+ stats.patternHits.sample(victim.hit);
+ }
victim.tag = key;
victim.counter = taken ? 0 : -1;
victim.hit = 0;
victim.useful = 0;
+ victim.valid = true;
- if (occupancy < totalOccupancy.size() - 1) {
- --totalOccupancy[occupancy];
- ++totalOccupancy[occupancy + 1];
- }
saturatingAdd(occupancy, numSets*setSize);
}
- void wasUseful(uint64_t key, statistics::Vector& usefulTotal) {
+ void wasUseful(uint64_t key) {
Pattern* p = getEntry(key);
if (p) {
- int useful = p->useful++;
- if (useful < usefulTotal.size() - 1) {
- --usefulTotal[useful];
- ++usefulTotal[useful + 1];
- }
+ p->useful++;
}
}
- void wasHit(uint64_t key, statistics::Vector& hitsTotal) {
+ void wasHit(uint64_t key) {
Pattern* p = getEntry(key);
if (p) {
- int hits = p->hit++;
- if (hits < hitsTotal.size() - 1) {
- --hitsTotal[hits];
- ++hitsTotal[hits + 1];
- }
+ p->hit++;
}
}
@@ -195,6 +227,18 @@ class LLBP : public ConditionalPredictor
return bitmaskLowerN(bankBits) & key;
}
+ void commitStats() {
+ stats.patternSetOccupancy.sample(occupancy);
+ for (auto& set: sets) {
+ for (auto& pat: set) {
+ if (pat.valid) {
+ stats.patternUseful.sample(pat.useful);
+ stats.patternHits.sample(pat.hit);
+ }
+ }
+ }
+ }
+
static int absConfidence(int8_t ctr) {
if (ctr < 0) {
return abs(ctr) - 1;
@@ -217,6 +261,7 @@ class LLBP : public ConditionalPredictor
++n;
}
}
+
private:
std::vector<Pattern>& getSet(uint64_t key) {
int id = getID(key);
@@ -226,7 +271,7 @@ class LLBP : public ConditionalPredictor
Pattern* findPatternInSet(uint64_t key, std::vector<Pattern>& set) {
auto result = std::find_if(set.begin(), set.end(), [key](Pattern& pat) {
- return pat.tag == key;
+ return pat.tag == key && pat.valid;
});
if (result == set.end())
@@ -236,6 +281,13 @@ class LLBP : public ConditionalPredictor
}
Pattern& findVictimPattern(std::vector<Pattern>& set) {
+ auto firstInvalid = std::find_if(set.begin(), set.end(), [](Pattern& e) {
+ return !e.valid;
+ });
+
+ if (firstInvalid != set.end())
+ return *firstInvalid;
+
auto result = std::min_element(set.begin(), set.end(), [&](const Pattern& a, const Pattern& b) {
return absConfidence(a.counter) < absConfidence(b.counter);
});
@@ -247,7 +299,8 @@ class LLBP : public ConditionalPredictor
int numSets;
int setSize;
int occupancy;
- statistics::Vector& totalOccupancy;
+
+ LLBPStats& stats;
std::vector<std::vector<Pattern>> sets;
};
@@ -259,7 +312,6 @@ class LLBP : public ConditionalPredictor
uint8_t confidence;
Context(PatternSet patterns): patterns(patterns), confidence(0) {}
-
};
@@ -277,9 +329,16 @@ class LLBP : public ConditionalPredictor
class PatternBuffer {
public:
- PatternBuffer(int numEntries, int setSize, BackingStorage& backingStorage)
+ PatternBuffer(
+ int numEntries,
+ int setSize,
+ BackingStorage& backingStorage,
+ statistics::Scalar& patternBufferEvictions
+ )
: setSize(setSize),
- backingStorage(backingStorage) {
+ backingStorage(backingStorage),
+ patternBufferEvictions(patternBufferEvictions)
+ {
assert(numEntries % setSize == 0);
this->numSets = numEntries / setSize;
@@ -294,6 +353,9 @@ class LLBP : public ConditionalPredictor
if (!backingStorage.count(cid))
return;
PatternBufferEntry& victim = findVictim(set);
+ if (victim.valid) {
+ ++patternBufferEvictions;
+ }
victim.cid = cid;
victim.insertTime = now;
victim.lastUsed = now;
@@ -349,6 +411,7 @@ class LLBP : public ConditionalPredictor
int numSets;
int setSize;
BackingStorage& backingStorage;
+ statistics::Scalar& patternBufferEvictions;
std::vector<std::vector<PatternBufferEntry>> sets;
} patternBuffer;
@@ -365,32 +428,6 @@ class LLBP : public ConditionalPredictor
int findVictimPattern(int min, Context& ctx);
uint64_t findVictimContext();
- struct LLBPStats : public statistics::Group
- {
- LLBPStats(LLBP *llbp);
-
- statistics::Scalar prefetchesIssued;
- statistics::Scalar baseHitsTotal;
- statistics::Scalar demandHitsTotal;
- statistics::Scalar demandHitsOverride;
- statistics::Scalar demandHitsNoOverride;
- statistics::Scalar demandMissesTotal;
- statistics::Scalar demandMissesNoPattern;
- statistics::Scalar demandMissesNoPrefetch;
- statistics::Scalar demandMissesCold;
- statistics::Scalar allocationsTotal;
- statistics::Vector patternHits;
- statistics::Vector patternUseful;
- statistics::Vector patternSetOccupancy;
- statistics::Scalar patternBufferEvictions;
- statistics::Scalar backingStorageEvictions;
- statistics::Scalar backingStorageInsertions;
- statistics::Scalar correctOverridesTotal;
- statistics::Scalar correctOverridesIdentical;
- statistics::Scalar wrongOverridesTotal;
- statistics::Scalar wrongOverridesIdentical;
- statistics::Scalar squashedOverrides;
- } stats;
/* From LLBP Source Code */
diff --git a/src/cpu/pred/tage.hh b/src/cpu/pred/tage.hh
index cf0b5dfd6d4..8fc087c143d 100644
--- a/src/cpu/pred/tage.hh
+++ b/src/cpu/pred/tage.hh
@@ -76,8 +76,10 @@ namespace branch_prediction
class TAGE: public ConditionalPredictor
{
- protected:
+ public:
TAGEBase *tage;
+
+ protected:
Random::RandomPtr rng = Random::genRandom();
diff --git a/src/cpu/pred/tage_base.hh b/src/cpu/pred/tage_base.hh
index 26bb22f4185..9cef5382dcb 100644
--- a/src/cpu/pred/tage_base.hh
+++ b/src/cpu/pred/tage_base.hh
@@ -548,11 +548,14 @@ class TAGEBase : public SimObject
/** Use taken only history. */
const bool takenOnlyHistory;
+ public:
// Tells which tables are active
// (for the base TAGE implementation all are active)
// Some other classes use this for handling associativity
std::vector<bool> noSkip;
+ protected:
+
const bool speculativeHistUpdate;
const unsigned instShiftAmt;
diff --git a/src/cpu/pred/tage_sc_l.hh b/src/cpu/pred/tage_sc_l.hh
index b2aae2e95bd..2b845b85bc6 100644
--- a/src/cpu/pred/tage_sc_l.hh
+++ b/src/cpu/pred/tage_sc_l.hh
@@ -182,7 +182,6 @@ class TAGE_SC_L: public LTAGE
void branchPlaceholder(ThreadID tid, Addr pc,
bool uncond, void * &bp_history) override;
- protected:
struct TageSCLBranchInfo : public LTageBranchInfo
{
@@ -205,6 +204,8 @@ class TAGE_SC_L: public LTAGE
{
SC = LAST_LTAGE_PROVIDER_TYPE + 1
};
+
+ protected:
const bool useSC;
};
From af8b21d789354321f837193945bee8f53a4e757b Mon Sep 17 00:00:00 2001
From: Phillip Assmann
Date: Wed, 28 May 2025 12:56:21 +0000
Subject: [PATCH 15/22] cpu: llbp do not steal from sc/lp
---
src/cpu/pred/llbp.cc | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/src/cpu/pred/llbp.cc b/src/cpu/pred/llbp.cc
index 60148a2d437..7a5db62c138 100644
--- a/src/cpu/pred/llbp.cc
+++ b/src/cpu/pred/llbp.cc
@@ -156,18 +156,20 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
bi->overridden = false;
bi->base_pred = ltage_prediction.taken;
-
+
int8_t llbp_confidence = 0;
if (cond_branch)
{
int tage_bank = 0;
-
+
if (tage_bi->provider == TAGEBase::TAGE_LONGEST_MATCH)
tage_bank = tage_bi->hitBank;
if (tage_bi->provider == TAGEBase::TAGE_ALT_MATCH)
tage_bank = tage_bi->altBank;
+ if (tage_bi->provider == TAGE_SC_L::LOOP || tage_bi->provider == TAGE_SC_L::SC)
+ tage_bank = base->getNumHistoryTables();
if (tage_bank)
++stats.baseHitsTotal;
@@ -224,7 +226,7 @@ LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b)
bi,
branch_pc, bi->getPrediction(), llbp_confidence,
bi->overridden ? "true" : "false");
-
+
Cycles latency = ltage_prediction.latency;
return Prediction {.taken = bi->getPrediction(), .latency = latency};
}
@@ -247,9 +249,9 @@ LLBP::updateHistories(
// Insert the next prefetch context into the pattern buffer
uint64_t pcid = rcr.getPCID();
- if (backingStorage.count(pcid))
+ if (backingStorage.count(pcid))
{
- if (patternBuffer.get(pcid) == nullptr)
+ if (patternBuffer.get(pcid) == nullptr)
{
++stats.prefetchesIssued;
patternBuffer.insert(pcid, curCycle());
@@ -303,8 +305,8 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
LLBP::Pattern* p = context.patterns.getEntry(key);
if (p) {
- LLBP::Pattern& pattern = *p;
-
+ LLBP::Pattern& pattern = *p;
+
int8_t conf_before = pattern.counter;
TAGEBase::ctrUpdate(pattern.counter, taken, ptnCounterBits);
int8_t conf_after = pattern.counter;
@@ -354,7 +356,7 @@ void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi)
uint64_t key = context.patterns.calculateKey(tage_bi->tableTags, tage_bi->tableIndices, i);
context.patterns.insertEntry(key, taken);
}
- }
+ }
}
else
{
@@ -596,7 +598,7 @@ LLBP::LLBPStats::LLBPStats(LLBP *llbp)
ADD_STAT(patternUseful, statistics::units::Count::get(),
"Number of times any pattern was useful (distribution)"),
ADD_STAT(patternSetOccupancy, statistics::units::Count::get(),
- "Number of patterns used in the pattern sets (distribution)"),
+ "Number of patterns used in the pattern sets (distribution)"),
ADD_STAT(patternBufferEvictions, statistics::units::Count::get(),
"Number of pattern sets evicted from the pattern buffer due to capacity limits"),
ADD_STAT(backingStorageEvictions, statistics::units::Count::get(),
From 180c2361480396f781d035d2df404c6af3397a10 Mon Sep 17 00:00:00 2001
From: David Schall
Date: Thu, 29 May 2025 11:05:48 +0000
Subject: [PATCH 16/22] cpu: Add LLBP reference code
---
src/cpu/pred/BranchPredictor.py | 7 +
src/cpu/pred/SConscript | 6 +-
src/cpu/pred/llbp_ref.cc | 143 +++
src/cpu/pred/llbp_ref.hh | 107 +++
src/cpu/pred/llbpref/base_predictor.h | 45 +
src/cpu/pred/llbpref/cache.h | 179 ++++
src/cpu/pred/llbpref/common.h | 96 ++
src/cpu/pred/llbpref/counters.h | 104 +++
src/cpu/pred/llbpref/hist_registers.h | 508 ++++++++++
src/cpu/pred/llbpref/histogram.h | 183 ++++
src/cpu/pred/llbpref/intmath.hh | 304 ++++++
src/cpu/pred/llbpref/llbp.cc | 1248 +++++++++++++++++++++++++
src/cpu/pred/llbpref/llbp.h | 641 +++++++++++++
src/cpu/pred/llbpref/tage.cc | 1068 +++++++++++++++++++++
src/cpu/pred/llbpref/tage.h | 368 ++++++++
src/cpu/pred/llbpref/tage_scl.cc | 932 ++++++++++++++++++
src/cpu/pred/llbpref/tage_scl.h | 472 ++++++++++
17 files changed, 6410 insertions(+), 1 deletion(-)
create mode 100644 src/cpu/pred/llbp_ref.cc
create mode 100644 src/cpu/pred/llbp_ref.hh
create mode 100644 src/cpu/pred/llbpref/base_predictor.h
create mode 100644 src/cpu/pred/llbpref/cache.h
create mode 100755 src/cpu/pred/llbpref/common.h
create mode 100644 src/cpu/pred/llbpref/counters.h
create mode 100644 src/cpu/pred/llbpref/hist_registers.h
create mode 100644 src/cpu/pred/llbpref/histogram.h
create mode 100644 src/cpu/pred/llbpref/intmath.hh
create mode 100755 src/cpu/pred/llbpref/llbp.cc
create mode 100644 src/cpu/pred/llbpref/llbp.h
create mode 100755 src/cpu/pred/llbpref/tage.cc
create mode 100644 src/cpu/pred/llbpref/tage.h
create mode 100755 src/cpu/pred/llbpref/tage_scl.cc
create mode 100644 src/cpu/pred/llbpref/tage_scl.h
diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
index 1dc8433ad5b..7fc02d38e3c 100644
--- a/src/cpu/pred/BranchPredictor.py
+++ b/src/cpu/pred/BranchPredictor.py
@@ -810,6 +810,13 @@ class LLBP(ConditionalPredictor):
ctxCounterBits = Param.Int(2, "Bits in Context Replacement Counter")
backingStorageLatency = Param.Cycles(6, "Backing Storage Latency")
+class LLBPRef(ConditionalPredictor):
+ type = "LLBPRef"
+ cxx_class = "gem5::branch_prediction::LLBPRef"
+ cxx_header = "cpu/pred/llbp_ref.hh"
+ inf = Param.Bool(
+ False, "Use infinite storage capacity for the backing storage"
+ )
class MultiperspectivePerceptron(ConditionalPredictor):
type = "MultiperspectivePerceptron"
diff --git a/src/cpu/pred/SConscript b/src/cpu/pred/SConscript
index 30f4962b082..46583c9e643 100644
--- a/src/cpu/pred/SConscript
+++ b/src/cpu/pred/SConscript
@@ -52,7 +52,7 @@ SimObject('BranchPredictor.py',
'LocalBP', 'TournamentBP', 'BiModeBP', 'TAGEBase', 'TAGE', 'LoopPredictor',
'TAGE_SC_L_TAGE', 'TAGE_SC_L_TAGE_64KB', 'TAGE_SC_L_TAGE_8KB',
'LTAGE', 'TAGE_SC_L_LoopPredictor', 'StatisticalCorrector', 'TAGE_SC_L',
- 'LLBP',
+ 'LLBP', 'LLBPRef',
'TAGE_SC_L_64KB_StatisticalCorrector',
'TAGE_SC_L_8KB_StatisticalCorrector',
'TAGE_SC_L_64KB', 'TAGE_SC_L_8KB',
@@ -81,6 +81,10 @@ Source('tagescl_ref.cc')
Source('loop_predictor.cc')
Source('ltage.cc')
Source('llbp.cc')
+Source('llbp_ref.cc')
+Source('llbpref/tage.cc')
+Source('llbpref/tage_scl.cc')
+Source('llbpref/llbp.cc')
Source('multiperspective_perceptron.cc')
Source('multiperspective_perceptron_8KB.cc')
Source('multiperspective_perceptron_64KB.cc')
diff --git a/src/cpu/pred/llbp_ref.cc b/src/cpu/pred/llbp_ref.cc
new file mode 100644
index 00000000000..576b551ed54
--- /dev/null
+++ b/src/cpu/pred/llbp_ref.cc
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2025 Technical University of Munich
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "cpu/pred/llbp_ref.hh"
+
+#include "base/intmath.hh"
+#include "base/logging.hh"
+#include "base/trace.hh"
+#include "debug/Fetch.hh"
+#include "cpu/pred/llbpref/llbp.h"
+
+namespace gem5
+{
+
+namespace branch_prediction
+{
+
+LLBPRef::LLBPRef(const LLBPRefParams ¶ms)
+ : ConditionalPredictor(params),
+ predictor(nullptr)
+{
+ if (params.inf) {
+ predictor = new LLBP::LLBPInfTageSCL64k();
+ } else {
+ predictor = new LLBP::LLBPTageSCL64k();
+ }
+}
+
+LLBPRef::~LLBPRef()
+{
+ predictor->PrintStat(1.0);
+ delete predictor;
+}
+
+void
+LLBPRef::updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken,
+ Addr target, const StaticInstPtr &inst,
+ void * &bp_history)
+{
+    // Placeholder for a function that is called to update predictor history
+}
+
+
+Prediction
+LLBPRef::lookup(ThreadID tid, Addr branch_addr, void * &bp_history)
+{
+ auto pred = predictor->GetPrediction(branch_addr);
+ return staticPrediction(pred);
+}
+
+void
+LLBPRef::update(ThreadID tid, Addr branch_addr, bool taken, void *&bp_history,
+ bool squashed, const StaticInstPtr & inst, Addr target)
+{
+ if (squashed) {
+ return;
+ }
+
+ auto brtype = getBranchType(inst);
+ OpType opType = OPTYPE_OP;
+ switch (brtype) {
+ case BranchType::DirectUncond:
+ opType = OPTYPE_JMP_DIRECT_UNCOND;
+ break;
+ case BranchType::DirectCond:
+ opType = OPTYPE_JMP_DIRECT_COND;
+ break;
+ case BranchType::IndirectUncond:
+ opType = OPTYPE_JMP_INDIRECT_UNCOND;
+ break;
+ case BranchType::IndirectCond:
+ opType = OPTYPE_JMP_INDIRECT_COND;
+ break;
+ case BranchType::CallDirect:
+ opType = OPTYPE_CALL_DIRECT_UNCOND;
+ break;
+ case BranchType::CallIndirect:
+ opType = OPTYPE_CALL_INDIRECT_UNCOND;
+ break;
+ case BranchType::Return:
+ opType = OPTYPE_RET_UNCOND;
+ break;
+ default:
+ opType = OPTYPE_OP;
+ break;
+ }
+
+ if (opType == OPTYPE_OP) {
+ return;
+ }
+
+ if (brtype == BranchType::DirectCond) {
+ predictor->UpdatePredictor(branch_addr, taken, false, target);
+ } else {
+ predictor->TrackOtherInst(branch_addr, opType, taken, target);
+ }
+
+
+
+
+}
+
+
+
+} // namespace branch_prediction
+} // namespace gem5
diff --git a/src/cpu/pred/llbp_ref.hh b/src/cpu/pred/llbp_ref.hh
new file mode 100644
index 00000000000..91f8326a24d
--- /dev/null
+++ b/src/cpu/pred/llbp_ref.hh
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2011, 2014 ARM Limited
+ * Copyright (c) 2022-2023 The University of Edinburgh
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CPU_PRED_LLBP_REF_PRED_HH__
+#define __CPU_PRED_LLBP_REF_PRED_HH__
+
+#include <vector>
+
+#include "base/sat_counter.hh"
+#include "base/types.hh"
+#include "cpu/pred/branch_type.hh"
+#include "cpu/pred/conditional.hh"
+#include "params/LLBPRef.hh"
+
+// namespace LLBP
+// {
+// class LLBPTageSCL64k;
+// }
+class BasePredictor;
+
+namespace gem5
+{
+
+namespace branch_prediction
+{
+
+
+/**
+ * Thin gem5 wrapper around the standalone LLBP reference predictor code
+ * in cpu/pred/llbpref. Lookups and updates are forwarded to the owned
+ * LLBP::LLBPTageSCL64k instance (LLBPInfTageSCL64k when params.inf is
+ * set); no gem5-side branch-history state is kept, so bp_history is
+ * expected to remain NULL.
+ */
+class LLBPRef : public ConditionalPredictor
+{
+ public:
+ /**
+ * Default branch predictor constructor.
+ */
+ LLBPRef(const LLBPRefParams ¶ms);
+ ~LLBPRef();
+
+ // Overriding interface functions
+ Prediction lookup(ThreadID tid, Addr pc, void * &bp_history) override;
+
+ // void branchPlaceholder(ThreadID tid, Addr pc, bool uncond,
+ // void * &bpHistory) override;
+
+ void updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken,
+ Addr target, const StaticInstPtr &inst,
+ void * &bp_history) override;
+
+ void update(ThreadID tid, Addr pc, bool taken,
+ void * &bp_history, bool squashed,
+ const StaticInstPtr & inst, Addr target) override;
+
+ void squash(ThreadID tid, void * &bp_history) override
+ { assert(bp_history == NULL); }
+
+ private:
+ // LLBP::LLBPTageSCL64k *predictor;
+ BasePredictor *predictor; // Pointer to the LLBP predictor
+};
+
+} // namespace branch_prediction
+} // namespace gem5
+
+#endif // __CPU_PRED_LLBP_REF_PRED_HH__
diff --git a/src/cpu/pred/llbpref/base_predictor.h b/src/cpu/pred/llbpref/base_predictor.h
new file mode 100644
index 00000000000..3155b88c69b
--- /dev/null
+++ b/src/cpu/pred/llbpref/base_predictor.h
@@ -0,0 +1,45 @@
+
+#pragma once
+// #ifndef __BASE_PREDICTOR__
+// #define __BASE_PREDICTOR__
+
+#include "common.h"
+
+class BasePredictor {
+ static inline UINT32 SatIncrement(UINT32 x, UINT32 max) {
+ if (x < max) return x + 1;
+ return x;
+ }
+
+ static inline UINT32 SatDecrement(UINT32 x) {
+ if (x > 0) return x - 1;
+ return x;
+ }
+
+ public:
+ BasePredictor() {};
+ virtual ~BasePredictor() = default;
+
+ virtual bool GetPrediction(uint64_t PC) = 0;
+ virtual void FirstTimeUpdate(uint64_t PC, bool taken,
+ uint64_t branchTarget) {};
+ virtual void UpdatePredictor(uint64_t PC, bool resolveDir,
+ bool predDir, uint64_t branchTarget) = 0;
+
+ virtual void TrackOtherInst(uint64_t PC, OpType opType, bool taken,
+ uint64_t branchTarget) = 0;
+
+ virtual void PrintStat(double NUMINST) {};
+ virtual void DumpTables(std::string filename) {};
+ virtual void LoadTables(std::string filename) {};
+ virtual void StartTracer(std::string filename) {};
+ virtual void tick() {};
+ virtual void resetStats() {};
+ virtual void btbMiss() {};
+ virtual void setState(bool warmup=false) {};
+};
+
+
+BasePredictor* CreateBP(std::string bp_name);
+
+// #endif //__BASE_PREDICTOR__
diff --git a/src/cpu/pred/llbpref/cache.h b/src/cpu/pred/llbpref/cache.h
new file mode 100644
index 00000000000..39595c9edfb
--- /dev/null
+++ b/src/cpu/pred/llbpref/cache.h
@@ -0,0 +1,179 @@
+/* MIT License
+ *
+ * Copyright (c) 2024 David Schall and EASE lab
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#pragma once
+
+template <typename key_t, typename value_t>
+class BaseCache {
+   protected:
+    typedef typename std::pair<key_t, value_t> key_value_pair_t;
+    typedef typename std::list<key_value_pair_t>::iterator list_iterator_t;
+    typedef typename std::list<key_value_pair_t> set_t;
+
+    std::unordered_map<key_t, list_iterator_t> _index;
+    std::vector<set_t> _cache;
+ const size_t _max_size;
+ const size_t _assoc;
+ const uint64_t _sets;
+ const uint64_t _set_mask;
+
+ public:
+ BaseCache(size_t max_size, size_t assoc)
+ : _max_size(max_size),
+ _assoc(assoc),
+ _sets(max_size / assoc),
+ _set_mask(_sets - 1) {
+ // Check if number of sets is a power of 2
+ assert((_sets & (_sets - 1)) == 0);
+ assert(_assoc * _sets == _max_size);
+ _cache.resize(_sets);
+ // for (auto& set : _cache) {
+ // set.resize(assoc);
+ // }
+ }
+
+ void printCfg() {
+ printf("Max size: %lu, Assoc: %lu, Sets: %lu\n", _max_size, _assoc,
+ _sets);
+ }
+
+ size_t size() const { return _index.size(); }
+
+ key_t index(const key_t& key) { return key & _set_mask; }
+
+ set_t& getSet(const key_t& key) {
+ return _cache[index(key)];
+ }
+
+    const std::unordered_map<key_t, list_iterator_t> getMap() { return _index; }
+
+ value_t* get(const key_t& key) {
+ auto it = _index.find(key);
+ if (it == _index.end()) {
+ return nullptr;
+ }
+ return &it->second->second;
+ }
+
+ void erase(const key_t& key) {
+ auto it = _index.find(key);
+ if (it == _index.end()) {
+ return;
+ }
+ auto& set = getSet(key);
+ set.erase(it->second);
+ _index.erase(key);
+ }
+
+ value_t* getVictim(const key_t& key) {
+ auto& set = getSet(key);
+ if (set.size() < _assoc) {
+ return nullptr;
+ }
+ return &set.back().second;
+ }
+
+ void touch(const key_t& key) {
+ auto it = _index.find(key);
+ if (it == _index.end()) {
+ return;
+ }
+ auto& set = getSet(key);
+ set.splice(set.begin(), set, it->second);
+ }
+
+ void bump(const key_t& key, bool front=true) {
+ auto it = _index.find(key);
+ if (it == _index.end()) {
+ return;
+ }
+ auto& set = getSet(key);
+ if (front) {
+ set.splice(set.begin(), set, it->second);
+ } else {
+ set.splice(set.end(), set, it->second);
+ }
+ }
+
+ bool exists(const key_t& key) const {
+ return _index.find(key) != _index.end();
+ }
+
+ int distance(const key_t& key) {
+ auto it = _index.find(key);
+ if (it == _index.end()) {
+ return -1;
+ }
+ auto& set = getSet(key);
+ return std::distance(set.begin(), it->second);
+ }
+
+ set_t& getResizedSet(const key_t& key) {
+ auto& set = getSet(key);
+
+ // If this element will exceed the max size, remove the last element
+ if (set.size() >= _assoc) {
+ auto last = set.end();
+ last--;
+ _index.erase(last->first);
+ set.pop_back();
+ }
+ return set;
+ }
+
+ value_t* insertAt(const key_t& key, int at = 0) {
+ auto v = get(key);
+ if (v != nullptr) {
+ return v;
+ }
+
+ // Get the set with a free item
+ auto& set = getResizedSet(key);
+
+ // Move to the insert position
+ auto it2 = set.begin();
+ at = std::min(at, (int)set.size());
+ std::advance(it2, at);
+
+ it2 = set.emplace(it2, key_value_pair_t(key, value_t()));
+ _index[key] = it2;
+ return &(it2->second);
+ }
+
+ value_t* insert(const key_t& key) {
+ auto v = get(key);
+ if (v != nullptr) {
+ return v;
+ }
+
+ // Get the set with a free item
+ auto& set = getResizedSet(key);
+
+ // Move to the insert position
+ auto it = set.begin();
+
+ it = set.emplace(it, key_value_pair_t(key, value_t()));
+ _index[key] = it;
+ return &(it->second);
+ }
+};
diff --git a/src/cpu/pred/llbpref/common.h b/src/cpu/pred/llbpref/common.h
new file mode 100755
index 00000000000..9b83ba9770c
--- /dev/null
+++ b/src/cpu/pred/llbpref/common.h
@@ -0,0 +1,96 @@
+///////////////////////////////////////////////////////////////////////
+// Copyright 2015 Samsung Austin Semiconductor, LLC. //
+///////////////////////////////////////////////////////////////////////
+
+
+#ifndef UTILS_H
+#define UTILS_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <assert.h>
+#include <string>
+#include <sstream>
+
+using namespace std;
+
+#define UINT32 unsigned int
+#define INT32 int
+#define UINT64 unsigned long long
+// #define COUNTER unsigned long long
+
+
+// #define NOT_TAKEN 0
+// #define TAKEN 1
+
+#define FAILURE 0
+#define SUCCESS 1
+
+//JD2_2_2016
+//typedef enum {
+// OPTYPE_OP =2,
+// OPTYPE_BRANCH_COND =3,
+// OPTYPE_RET =4,
+// OPTYPE_BRANCH =6,
+// OPTYPE_INDIRECT =7,
+// OPTYPE_MAX =8
+//}OpType;
+
+//JD2_17_2016 break down types into COND/UNCOND
+typedef enum {
+ OPTYPE_OP =2,
+
+ OPTYPE_RET_UNCOND,
+ OPTYPE_JMP_DIRECT_UNCOND,
+ OPTYPE_JMP_INDIRECT_UNCOND,
+ OPTYPE_CALL_DIRECT_UNCOND,
+ OPTYPE_CALL_INDIRECT_UNCOND,
+
+ OPTYPE_RET_COND,
+ OPTYPE_JMP_DIRECT_COND,
+ OPTYPE_JMP_INDIRECT_COND,
+ OPTYPE_CALL_DIRECT_COND,
+ OPTYPE_CALL_INDIRECT_COND,
+
+ OPTYPE_ERROR,
+
+ OPTYPE_MAX
+}OpType;
+
+
+
+typedef enum {
+ NoBranch,
+ Return,
+ CallDirect,
+ CallIndirect,
+ DirectCond,
+ DirectUncond,
+ IndirectCond,
+ IndirectUncond,
+ MAX
+} BrType;
+
+
+/** Some helper functions */
+OpType convertBrType(BrType type);
+
+#define PRINTDEBUG 0
+
+#define DPRINTFF(...) \
+ if (PRINTDEBUG) [[unlikely]] { \
+ printf(__VA_ARGS__); \
+ }
+
+#define DPRINTIF(cond, ...) \
+ if (PRINTDEBUG && (cond)) [[unlikely]] { \
+ printf(__VA_ARGS__); \
+ }
+
+#define PRINTIF(cond, ...) \
+ if (cond) [[unlikely]] { \
+ printf(__VA_ARGS__); \
+ }
+
+#endif
+
diff --git a/src/cpu/pred/llbpref/counters.h b/src/cpu/pred/llbpref/counters.h
new file mode 100644
index 00000000000..55c42f50666
--- /dev/null
+++ b/src/cpu/pred/llbpref/counters.h
@@ -0,0 +1,104 @@
+#pragma once
+
+#include <cstdint>
+
+
+// up-down saturating counter
+inline void ctrupdate(int8_t& ctr, bool taken, int nbits) {
+ if (taken) {
+ if (ctr < ((1 << (nbits - 1)) - 1)) ctr++;
+ } else {
+ if (ctr > -(1 << (nbits - 1))) ctr--;
+ }
+}
+
+// up-down saturating counter
+template <typename T>
+inline void ctrupdate(T& ctr, bool up, int nbits) {
+ if (up) {
+ if (ctr < ((1 << (T)nbits) - 1)) ctr++;
+ } else {
+ if (ctr > 0) ctr--;
+ }
+}
+
+enum {LowConf = 0, MedConf = 1, HighConf = 2};
+inline unsigned compConf(int8_t ctr, const int cwidth) {
+ if (cwidth < 2)
+ return HighConf;
+ // Two bit counters saturate at +1 and -2
+ if (cwidth < 3)
+ return ((ctr == -2) || (ctr == 1)) ? HighConf : LowConf;
+
+ if (abs (2 * ctr + 1) >= (1 << cwidth) - 1)
+ return HighConf;
+ if (abs (2 * ctr + 1) >= (1 << (cwidth - 1)) - 1)
+ return MedConf;
+ return LowConf;
+}
+
+inline int8_t saturate(int8_t ctr, int nbits) {
+ if (ctr > ((1 << (nbits - 1)) - 1)) return ((1 << (nbits - 1)) - 1);
+ if (ctr < -(1 << (nbits - 1))) return -(1 << (nbits - 1));
+ return ctr;
+}
+
+inline int center(int8_t ctr) {
+ return 2 * ctr + 1;
+}
+
+#define CUMAX(x) ((1 << (x)) - 1)
+
+//counter
+//MAX : max value
+//MIN : min value
+template <typename T, int MAX, int MIN>
+class COUNTER {
+private:
+ T ctr;
+public:
+ T read() {
+ return ctr;
+ }
+
+ bool pred() {
+ return ctr >= 0;
+ }
+ bool satmax(){
+ return ctr == MAX;
+ }
+ bool satmin(){
+ return ctr == MIN;
+ }
+ void write(T v) {
+ assert(v <= MAX);
+ assert(v >= MIN);
+ ctr = v;
+ }
+ void add(T d) {
+ ctr = ctr + d;
+ if (ctr > MAX){
+ ctr = MAX;
+ }else if (ctr < MIN){
+ ctr = MIN;
+ }
+ }
+ void update(bool incr) {
+ if (incr) {
+ if (ctr < MAX)
+ ctr = ctr + 1;
+ } else {
+ if (ctr > MIN)
+ ctr = ctr - 1;
+ }
+ }
+};
+//signed integer counter
+template <typename T, int MAX, int MIN>
+class SCOUNTER : public COUNTER<T, MAX, MIN>{
+};
+//unsigned integer counter
+template <typename T, int MAX, int MIN>
+class UCOUNTER : public COUNTER<T, MAX, MIN>{
+};
+
diff --git a/src/cpu/pred/llbpref/hist_registers.h b/src/cpu/pred/llbpref/hist_registers.h
new file mode 100644
index 00000000000..1c4e19b94a3
--- /dev/null
+++ b/src/cpu/pred/llbpref/hist_registers.h
@@ -0,0 +1,508 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+
+
+
+//==========================================================
+// History Code:
+// The code is based on the "Dynamically Sizing the TAGE Branch Predictor"
+// paper by Stephen Pruett submitted to the CBP-5 workshop.
+
+
+
// [PPM, page 4] Discusses how to build a low latency FoldedHistory register
//
// Circular buffer of recent branch directions (newest at `head`).
// NOTE(review): the vector element type was lost in extraction;
// reconstructed as bool, matching push(bool) and operator[].
struct HistoryRegister {
  public:
    uint32_t size;              // capacity of the circular buffer
    uint32_t head;              // index of the most recent entry
    std::vector<bool> history;  // circular direction history
    long long history_l;        // flat copy of the newest 64 directions

    void init(uint32_t s) {
        size = s;
        history.assign(s, false);
        history_l = 0;
        head = 0;
    }

    HistoryRegister() {}

    HistoryRegister(uint32_t s) { init(s); }

    // Record the direction of the newest branch.
    void push(bool p) {
        head = (head + 1) % size;
        history[head] = p;

        history_l <<= 1;
        history_l += (p & 0x1);
    }

    // i-th most recent direction (0 == newest).
    bool operator[](const uint32_t i) {
        uint32_t index = (head + size - i) % size;
        assert(index < size);
        return history[index];
    }

    void print() {
        printf("History");
        for (uint32_t i = 0; i < size; ++i) {
            // BUGFIX: (head - i) underflows for i > head, producing a
            // wrong index unless size divides 2^32. Use the wrap-safe
            // computation from operator[].
            printf("%d, ", (bool)history[(head + size - i) % size]);
        }
        printf("\n");
    }

    long long getHistory() { return history_l; }

    uint32_t getSize() { return size; }
};
+
// Circular buffer of (pc, taken) pairs for recently seen branches.
struct PCHistoryRegister {
  public:
    uint32_t size;
    uint32_t head;
    // NOTE(review): the pair's template arguments were lost in
    // extraction; reconstructed from push(uint64_t, bool).
    typedef std::pair<uint64_t, bool> entry_t;
    std::vector<entry_t> history;
    long long history_l;  // NOTE(review): never pushed to; kept for layout

    void init(uint32_t s) {
        size = s;
        history.assign(s, entry_t(0, false));
        history_l = 0;
        head = 0;
    }

    PCHistoryRegister() {}

    PCHistoryRegister(uint32_t s) { init(s); }

    // Record the newest branch (pc, direction).
    void push(uint64_t pc, bool t) {
        head = (head + 1) % size;
        history[head] = entry_t(pc, t);
    }

    // i-th most recent entry (0 == newest).
    entry_t operator[](const uint32_t i) {
        uint32_t index = (head + size - i) % size;
        assert(index < size);
        return history[index];
    }

    void print() {
        printf("History");
        for (uint32_t i = 0; i < size; ++i) {
            // BUGFIX: wrap-safe index; (head - i) underflows for i > head.
            auto e = history[(head + size - i) % size];
            printf("%lu:%d ", (unsigned long)e.first, (bool)e.second);
        }
        printf("\n");
    }

    // Comma-separated PCs, newest first.
    std::string toStr() {
        std::string s = "";
        for (uint32_t i = 0; i < size; ++i) {
            // BUGFIX: wrap-safe index, matching operator[].
            auto e = history[(head + size - i) % size];
            s += std::to_string(e.first) + ",";
        }
        return s;
    }


    uint32_t getSize() { return size; }
};
+
+
+class FoldedHistory {
+ private:
+ uint32_t inputWidth; // size of emulated history register
+ uint32_t outputWidth; // size of folded register
+ uint32_t
+ maxOutputWidth; // first width register is set to. Used to calc size.
+ int32_t remainder;
+ int32_t value;
+ HistoryRegister* ghr;
+
+ FoldedHistory() {}
+
+ public:
+ FoldedHistory(HistoryRegister* g, uint32_t iw, uint32_t ow) {
+ inputWidth = iw;
+ outputWidth = ow;
+ maxOutputWidth = outputWidth;
+ ghr = g;
+
+ // using a 32-bit integer as register
+ // -need an extra bit, so max is 31 bits...
+ assert(outputWidth < 32);
+ assert(outputWidth != 0);
+ remainder = inputWidth % outputWidth;
+ value = 0;
+ }
+
+ // Expectation is that FoldedHistory push is called
+ // after HistoryRegister push
+ void update() {
+ // input bit most recent shifted into ghr
+ bool inBit = (*ghr)[0];
+
+ // Shift in bit
+ value = (value << 1) | (inBit ? 0x01 : 0x00);
+
+ // Fold shifted-out bit in
+ value = value ^ (value >> outputWidth);
+ value = value & ((1 << outputWidth) - 1);
+
+ // Get bit to shift out from Global History
+ bool outputBit = (*ghr)[inputWidth];
+ int32_t outputValue = (outputBit) ? (0x01 << (remainder)) : 0x0;
+
+ // Shift out bit
+ value = value ^ outputValue;
+ }
+
+ inline int32_t get() { return value; }
+
+ void reset() { value = 0; }
+
+ uint32_t getSize() { return maxOutputWidth; }
+};
+
+
+
+
// [PPM, page 4] Discusses how to build a low latency FoldedHistory register
//
// Power-of-two sized circular direction history indexed by masking.
struct HistoryRegisterFast {
  public:
    const uint32_t size;
    uint32_t head;

    uint8_t* history;    // owned buffer of `size` direction bits
    long long history_l; // flat copy of the newest 64 directions

    HistoryRegisterFast(uint32_t s)
      : size(s)
    {
        // Indexing relies on `& (size - 1)`, so size must be a
        // non-zero power of two.
        assert(size != 0 && (size & (size - 1)) == 0);
        // BUGFIX: value-initialization already zeroes the buffer; the
        // old explicit clear loop was redundant and has been dropped.
        history = new uint8_t[size]();
        history_l = 0;

        head = 0;
    }

    // BUGFIX: the buffer was never released (memory leak) and the
    // default copy operations would have double-freed it once a
    // destructor exists, so copying is disallowed.
    ~HistoryRegisterFast() { delete[] history; }
    HistoryRegisterFast(const HistoryRegisterFast&) = delete;
    HistoryRegisterFast& operator=(const HistoryRegisterFast&) = delete;

    // Record the newest direction (head moves downwards and wraps).
    void push(bool p) {
        head--;
        history[head & (size - 1)] = p;
        history_l = (history_l << 1) + (p & 0x1);
    }

    // i-th most recent direction (0 == newest).
    inline bool operator[](const uint32_t i) {
        return history[(head + i) & (size - 1)];
    }

    void print() {
        printf("History");
        for (uint32_t i = 0; i < size; ++i) {
            // BUGFIX: was (head - i) % size, which both walked the wrong
            // direction and bypassed the power-of-two mask; use the same
            // indexing as operator[].
            printf("%d, ", (bool)history[(head + i) & (size - 1)]);
        }
        printf("\n");
    }

    long long getHistory() { return history_l; }

    uint32_t getSize() { return size; }
};
+
+
+class FoldedHistoryFast {
+ private:
+ const uint32_t inputWidth; // size of emulated history register
+ const uint32_t outputWidth; // size of folded register
+ const uint32_t
+ maxOutputWidth; // first width register is set to. Used to calc size.
+ const int32_t remainder;
+ HistoryRegisterFast& ghr; // Reference to global history register
+
+
+ public:
+ int32_t value;
+
+ FoldedHistoryFast(HistoryRegisterFast& g, uint32_t iw, uint32_t ow)
+ : inputWidth(iw),
+ outputWidth(ow),
+ maxOutputWidth(ow),
+ remainder(iw % ow),
+ ghr(g)
+ {
+ // using a 32-bit integer as register
+ // -need an extra bit, so max is 31 bits...
+ assert(outputWidth < 32);
+ assert(outputWidth != 0);
+ value = 0;
+ }
+
+ // Expectation is that FoldedHistory push is called
+ // after HistoryRegister push
+ void update() {
+
+
+
+ // // Shift in new bit
+ // value = (value << 1) | ghr[0];
+
+ // // Get bit to shift out from Global History
+ // value = value ^ ghr[inputWidth] << outputWidth;
+
+ // // Fold and mask
+ // value = value ^ (value >> outputWidth);
+ // value = value & ((1 << outputWidth) - 1);
+
+
+
+
+ // input bit most recent shifted into ghr
+ bool inBit = ghr[0];
+
+ // Shift in bit
+ value = (value << 1) | (inBit ? 0x01 : 0x00);
+
+
+
+
+
+
+
+ // Fold shifted-out bit in
+ value = value ^ (value >> outputWidth);
+ value = value & ((1 << outputWidth) - 1);
+
+ // Get bit to shift out from Global History
+ bool outputBit = ghr[inputWidth];
+ int32_t outputValue = (outputBit) ? (0x01 << (remainder)) : 0x0;
+
+ // Shift out bit
+ value = value ^ outputValue;
+ }
+
+ inline int32_t get() { return value; }
+
+ void reset() { value = 0; }
+
+ uint32_t getSize() { return maxOutputWidth; }
+};
+
+
+
+
+
+
+
+
// utility class for index computation
// this is the cyclic shift register for folding
// a long global history into a smaller number of bits; see P. Michaud's
// PPM-like predictor at CBP-1
class folded_history {
  public:
    unsigned comp;      // folded (compressed) history value
    int CLENGTH;        // compressed length in bits
    int OLENGTH;        // original history length
    int OUTPOINT;       // position where the oldest bit folds out
    int histbufferlen;  // power-of-two size of the history buffer

    folded_history() {}

    void init(int original_length, int compressed_length, int _histbufferlen) {
        comp = 0;
        OLENGTH = original_length;
        CLENGTH = compressed_length;
        OUTPOINT = OLENGTH % CLENGTH;
        histbufferlen = _histbufferlen;
    }

    // Advance the fold by one branch: h is the circular history buffer,
    // PT the index of the newest entry.
    void update(uint8_t *h, int PT) {
        const int mask = histbufferlen - 1;
        unsigned folded = (comp << 1) ^ h[PT & mask];
        folded ^= h[(PT + OLENGTH) & mask] << OUTPOINT;
        folded ^= folded >> CLENGTH;
        comp = folded & ((1 << CLENGTH) - 1);
    }
};
+
class Bentry // TAGE bimodal table entry
{
  public:
    int8_t ctr;   // direction counter
    int8_t hyst;  // hysteresis bit
    int8_t pred;  // prediction bit
    uint64_t pc;  // branch address this entry was last used for
    uint64_t id;

    // Same initial state as before: weakly not-taken with hysteresis set.
    Bentry()
        : ctr(-1), hyst(1), pred(0), pc(0), id(0) {}
};
+
class Gentry // TAGE global table entry
{
  public:
    int8_t ctr;        // signed direction counter
    // BUGFIX: `uint` is not a standard C++ type (POSIX extension);
    // use the equivalent portable spelling.
    unsigned int tag;  // partial tag
    int8_t u;          // usefulness counter
    int correct;       // NOTE(review): starts at -1 while `incorrect`
                       // starts at 0 — looks intentional; confirm upstream
    int incorrect;
    int useful;
    uint64_t pc;
    int hlen;          // history length of the providing table
    int idx;
    uint64_t key;
    uint64_t ctx;

    Gentry() {
        ctr = 0;
        u = 0;
        tag = 0;
        correct = -1;
        incorrect = 0;
        useful = 0;
        pc = 0;
        // BUGFIX: hlen and idx were left uninitialized.
        hlen = 0;
        idx = 0;
        key = 0;
        ctx = 0;
    }
};
+
+
+
+
+// FTL++
+
+
//history register
// Branch history kept as a plain shift "register" of bools
// (index 0 == newest direction).
class HISTORY {
private:
    int MAXHIST; // constant value

    bool *bhr; // branch history
               // NOTE(review): allocated in init() and never freed; the
               // predictor lives for the whole run, but confirm upstream.

public:
    void init(int HLENGTH) {
        MAXHIST = HLENGTH+1; // set constant value
        bhr = new bool[MAXHIST];
        for (int i = 0; i < MAXHIST; i++){
            bhr[i] = false;
        }
    }

    // n-th most recent direction (0 == newest).
    bool read(int n) {
        assert(n < MAXHIST);
        return bhr[n];
    }

    // XOR-fold history bits [from, n) into a 32-bit value.
    uint32_t read(int from, int n) {
        assert(n < MAXHIST);
        int r = 0;
        for (int i = from; i < n; ++i) {
            r ^= bhr[i] << ((i - from) % 32);
        }
        return r;
    }
    //push
    void push(bool t) {
        // BUGFIX: the shift loop previously ran down to i == 0 and read
        // bhr[-1] — an out-of-bounds access. Stop at i == 1; bhr[0] is
        // written with the new direction below.
        for(int i=MAXHIST-1; i>=1; i--) {
            bhr[i] = bhr[i-1];
        }
        bhr[0] = t;
    }
};
+
//folded history
// Folds two history windows ([start0,end0) of the global history and
// [start1,end1) of the path history) into a WIDTH-bit value.
// NOTE(review): the template header was lost in extraction; reconstructed
// as a single non-type WIDTH parameter (the folded width in bits), which
// matches every use of WIDTH below — TODO confirm against upstream.
template <int WIDTH>
class FOLDHIST {
private:
    //path history length
    static const int PathHistoryLength = 32;

private:
    uint32_t start[2]; // constant value
    uint32_t end[2]; // constant value
    uint32_t pos[3]; // constant value

    uint32_t comp; // folded history

public:

    void init(int s, int e,int s2,int e2) {
        comp = 0;
        start[0] = s;
        start[1] = s2;
        end[0] = e;
        end[1] = e2;
        pos[0] = 0;
        pos[1] = (pos[0] + end[0] - start[0]) % WIDTH;
        pos[2] = (pos[1] + end[1] - start[1]) % WIDTH;
        assert(pos[0] < WIDTH);
        assert(pos[1] < WIDTH);
        assert(pos[2] < WIDTH);
    }
    void init(int s, int e) {
        comp = 0;
        start[0] = s;
        start[1] = s;
        end[0] = e;
        end[1] = e;
        pos[0] = 0;
        pos[1] = (pos[0] + end[0] - start[0]) % WIDTH;
        pos[2] = (pos[1] + end[1] - start[1]) % WIDTH;
        assert(pos[0] < WIDTH);
        assert(pos[1] < WIDTH);
        assert(pos[2] < WIDTH);
    }

    // Global window [0, l); path window capped at PathHistoryLength.
    void init(int l) {
        init(0, l,0,(l>=PathHistoryLength)?PathHistoryLength:l);
    }

    // Index hash: low WIDTH bits of pc XORed with the folded history.
    uint32_t read(uint32_t pc) {
        assert(comp >= 0);
        assert(comp < (1 << WIDTH));

        pc &= (1 << WIDTH) - 1;
        return pc ^ comp;
    }
    // Tag hash variant: rotate pc by `rot` before XORing.
    uint32_t read(uint32_t pc, int rot) {
        assert(comp >= 0);
        assert(comp < (1 << WIDTH));
        uint32_t r = rot % WIDTH;
        uint32_t p = pc & ((1 << WIDTH) - 1);
        p = (p << r);
        p = p ^ (p >> WIDTH);
        p = (p & ((1 << WIDTH) - 1));
        return p ^ comp;
    }
    // Advance the fold by one branch using the newest/oldest bits of
    // each window (ghr = direction history, phr = path history).
    void update(class HISTORY &ghr, class HISTORY &phr) {
        comp = (comp << 1);
        comp |= (comp >> WIDTH);
        comp &= ((1 << WIDTH) - 1);
        comp ^= (ghr.read(start[0]) ? 1 : 0) << pos[0];
        comp ^= (ghr.read(end[0]) ? 1 : 0) << pos[1];
        comp ^= (phr.read(start[1]) ? 1 : 0) << pos[1];
        comp ^= (phr.read(end[1]) ? 1 : 0) << pos[2];
    }
};
diff --git a/src/cpu/pred/llbpref/histogram.h b/src/cpu/pred/llbpref/histogram.h
new file mode 100644
index 00000000000..f00d8125d50
--- /dev/null
+++ b/src/cpu/pred/llbpref/histogram.h
@@ -0,0 +1,183 @@
+
+
+// S
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "error.h"
+
+
// Simple fixed-bin histogram with one overflow bucket on each side.
// NOTE(review): the template header, the buckets' element type and all
// static_cast target types were lost in extraction; reconstructed from
// the surrounding usage — TODO confirm against upstream.
template <typename T>
class Histogram {

    // Define upper and lower bounds
    const T lower;
    const T upper;
    const int bins;



    // Bucket struct
    struct Bucket {
        std::string mark;  // printable range label
        T count;
        double frequency;
    };

    // Containers to hold the buckets and counts
    // The number of buckets is bins + 2 because there
    // is one extra bucket on each side to hold values
    // that are outside the range
    std::vector<Bucket> buckets;

    // Bucket size
    const T bs;

    // total number of elements inserted
    int samples;
    T max;
    T min;
    T sum;


  public:

    Histogram(T lower, T upper, int bins)
        : lower(lower), upper(upper), bins(bins),
          bs((upper - lower) / static_cast<T>(bins)),
          samples(0), max(0), min(0), sum(0)
    {

        // panic_if(bs <= 0, "Bucket size must be positive");

        buckets.resize(bins + 2);
        {
            std::ostringstream oss;
            oss << "< " << std::fixed << std::setprecision(1) << lower;
            buckets[0] = Bucket{oss.str(), 0, 0};
        }
        {
            std::ostringstream oss;
            oss << "> " << std::fixed << std::setprecision(1) << upper;
            buckets[bins + 1] = Bucket{oss.str(), 0, 0};
        }

        for (int i = 1; i <= bins; ++i) {
            std::ostringstream oss;
            if (bs == 1) {
                oss << (lower + bs * static_cast<T>(i-1));
                buckets[i] = Bucket{oss.str(), 0, 0};
            } else {
                oss << std::fixed << std::setprecision(1)
                    << (lower + bs * static_cast<T>(i-1)) << "-"
                    << (lower + bs * static_cast<T>(i));
                buckets[i] = Bucket{oss.str(), 0, 0};
            }
        }
    }

    // Add `count` occurrences of `value`; out-of-range values land in
    // the overflow buckets.
    void insert(T value, int count=1) {
        samples += count;
        sum += value * count;
        if (value > max) {
            max = value;
        }
        if (value < min) {
            min = value;
        }

        if (value < lower) {
            buckets[0].count += count;
            return;
        } else if (value > upper) {
            buckets[bins + 1].count += count;
            return;
        }

        int idx = static_cast<int>((value - lower) / bs) + 1;
        buckets[idx].count += count;
    }

    void insert(const std::vector<T>& values) {
        for (auto v : values) {
            insert(v);
        }
    }

    void reset() {
        samples = 0;
        max = 0;
        min = 0;
        sum = 0;
        for (auto& b : buckets) {
            b.count = 0;
        }
    }


    // Render an ASCII bar chart; optionally append per-bucket percentage
    // and/or cumulative (CDF) columns.
    std::string print(bool perc=false, bool cdf=false, int width = 40) {
        T _max = 0;
        for (const auto& b : buckets) {
            if (b.count > _max) {
                _max = b.count;
            }
        }

        std::ostringstream res;

        res << "N:" << samples
            << " Min:" << min
            << " Max:" << max
            << " Sum:" << sum
            << " Avg:" << std::fixed << std::setprecision(2) << getAvg()
            << "\n";


        int cum = 0;
        for (const auto& b : buckets) {
            int barLen = _max > 0 ? static_cast<int>((b.count * width + _max / 2) / _max) : 0;

            res << std::left << std::setw(10) << b.mark
                << " [" << std::setw(4) << b.count << "]\t";

            if (perc) {
                res << std::fixed << std::setprecision(1) << (100.0 * b.count / samples) << "%\t";
            }

            if (cdf) {
                cum += b.count;
                res << std::fixed << std::setprecision(1) << (100.0 * cum / samples) << "%\t";
            }

            res << "|" << std::string(barLen, '*') << "\n";

        }
        return res.str();
    }

    std::string printCDF() {
        return print(false, true);
    }

    // BUGFIX: guard against division by zero on an empty histogram
    // (previously produced NaN).
    double getAvg() {
        if (samples == 0) return 0.0;
        return static_cast<double>(sum) / static_cast<double>(samples);
    }

    T getMax() {
        return max;
    }

    T getMin() {
        return min;
    }

};
+
diff --git a/src/cpu/pred/llbpref/intmath.hh b/src/cpu/pred/llbpref/intmath.hh
new file mode 100644
index 00000000000..73ce2d7b13c
--- /dev/null
+++ b/src/cpu/pred/llbpref/intmath.hh
@@ -0,0 +1,304 @@
+/*
+ * COPY FROM GEM5
+ *
+ * Copyright (c) 2021 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Copyright (c) 2001, 2003-2005 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __BASE_INTMATH_HH__
+#define __BASE_INTMATH_HH__
+
+#include
+#include
+#include
+#include
+
+// #include "base/bitfield.hh"
+
+
+
/**
 * @ingroup api_base_utils
 */
// NOTE(review): the template headers and enable_if contents in this
// gem5-copied section were lost in extraction; reconstructed to match
// upstream gem5 base/intmath.hh — TODO confirm against that header.
template <typename T>
static constexpr std::enable_if_t<std::is_integral_v<T>, int>
floorLog2(T x)
{
    assert(x > 0);

    // A guaranteed unsigned version of x.
    uint64_t ux = (typename std::make_unsigned<T>::type)x;

    int y = 0;
    constexpr auto ts = sizeof(T);

    if (ts >= 8 && (ux & 0xffffffff00000000ULL)) { y += 32; ux >>= 32; }
    if (ts >= 4 && (ux & 0x00000000ffff0000ULL)) { y += 16; ux >>= 16; }
    if (ts >= 2 && (ux & 0x000000000000ff00ULL)) { y += 8; ux >>= 8; }
    if (ux & 0x00000000000000f0ULL) { y += 4; ux >>= 4; }
    if (ux & 0x000000000000000cULL) { y += 2; ux >>= 2; }
    if (ux & 0x0000000000000002ULL) { y += 1; }

    return y;
}

/**
 * @ingroup api_base_utils
 */
template <typename T>
static constexpr int
ceilLog2(const T& n)
{
    assert(n > 0);
    if (n == 1)
        return 0;

    return floorLog2(n - (T)1) + 1;
}

/**
 * @ingroup api_base_utils
 */
template <typename T>
static constexpr bool
isPowerOf2(const T& n)
{
    // If n is non-zero, and subtracting one borrows all the way to the MSB
    // and flips all bits, then this is a power of 2.
    return n && !(n & (n - 1));
}

/**
 * @ingroup api_base_utils
 */
template <typename T, typename U>
static constexpr T
divCeil(const T& a, const U& b)
{
    return (a + b - 1) / b;
}
+
+/**
+ * @ingroup api_base_utils
+ */
+// NOTE(review): throughout this gem5-copied section the template headers
+// and the angle-bracket contents of std::enable_if_t / make_unsigned_t /
+// make_signed_t were stripped during extraction. The code is kept
+// byte-identical here; restore the parameter lists from upstream gem5
+// base/intmath.hh before building.
+// Narrow-type overload: computes the 2N-bit product of two N-bit
+// unsigned values via a 64-bit multiply (selected via enable_if for
+// types smaller than 64 bits upstream).
+template
+static constexpr std::enable_if_t
+mulUnsigned(std::make_unsigned_t &high, std::make_unsigned_t &low,
+    std::make_unsigned_t val_a, std::make_unsigned_t val_b)
+{
+    uint64_t product = (uint64_t)val_a * (uint64_t)val_b;
+    low = product;
+    high = (product >> (sizeof(low) * 8));
+};
+
+/**
+ * @ingroup api_base_utils
+ */
+// Narrow-type signed counterpart of the overload above.
+template
+static constexpr std::enable_if_t
+mulSigned(std::make_signed_t &high, std::make_signed_t &low,
+    std::make_signed_t val_a, std::make_signed_t val_b)
+{
+    uint64_t product = (int64_t)val_a * (int64_t)val_b;
+    low = product;
+    high = (product >> (sizeof(low) * 8));
+};
+
+/**
+ * Multiply two values with place value p.
+ *
+ * (A * p + a) * (B * p + b) =
+ * (A * B) * p^2 + (a * B + A * b) * p + (a * b)
+ *
+ * low result = (a * B + A * b) * p + (a * b)
+ * high result = (A * B) + carry out from low result.
+ *
+ * As long as p is at most half the capacity of the underlying type, no
+ * individual multiplication will overflow. We just have to carefully manage
+ * carries to avoid losing any during the addition steps.
+ */
+// 64x64 -> 128-bit schoolbook multiply used when __int128 is unavailable.
+template
+static constexpr std::enable_if_t
+mulUnsignedManual(std::make_unsigned_t &high, std::make_unsigned_t &low,
+    std::make_unsigned_t val_a, std::make_unsigned_t val_b)
+{
+    low = val_a * val_b;
+
+    uint64_t A = (uint32_t)(val_a >> 32);
+    uint64_t a = (uint32_t)val_a;
+    uint64_t B = (uint32_t)(val_b >> 32);
+    uint64_t b = (uint32_t)val_b;
+
+    uint64_t c1 = 0, c2 = 0; // Carry between place values.
+    uint64_t ab = a * b, Ab = A * b, aB = a * B, AB = A * B;
+
+    c1 = (uint32_t)(ab >> 32);
+
+    // Be careful to avoid overflow.
+    c2 = (c1 >> 1) + (Ab >> 1) + (aB >> 1);
+    c2 += ((c1 & 0x1) + (Ab & 0x1) + (aB & 0x1)) >> 1;
+    c2 >>= 31;
+
+    high = AB + c2;
+}
+
+/**
+ * @ingroup api_base_utils
+ */
+// 64-bit overload: uses the compiler's 128-bit integer when available,
+// otherwise falls back to the manual schoolbook version above.
+template
+static constexpr std::enable_if_t
+mulUnsigned(std::make_unsigned_t &high, std::make_unsigned_t &low,
+    std::make_unsigned_t val_a, std::make_unsigned_t val_b)
+{
+#ifdef __SIZEOF_INT128__
+    __uint128_t val = (__uint128_t)val_a * (__uint128_t)val_b;
+    low = val;
+    high = (val >> 64);
+#else
+    mulUnsignedManual(high, low, val_a, val_b);
+#endif
+}
+
+// Signed 128-bit product from the unsigned one: for each negative
+// operand, subtract the other operand from the high word (two's
+// complement correction). NOTE(review): relies on wraparound of the
+// unsigned intermediate — confirm against upstream before touching.
+template
+static constexpr std::enable_if_t
+mulSignedManual(std::make_signed_t &high, std::make_signed_t &low,
+    std::make_signed_t val_a, std::make_signed_t val_b)
+{
+    uint64_t u_high = 0, u_low = 0;
+    mulUnsigned(u_high, u_low, val_a, val_b);
+
+    if (val_a < 0)
+        u_high -= val_b;
+    if (val_b < 0)
+        u_high -= val_a;
+
+    high = u_high;
+    low = u_low;
+}
+
+/**
+ * @ingroup api_base_utils
+ */
+// Signed 64-bit overload, mirroring mulUnsigned above.
+template
+static constexpr std::enable_if_t
+mulSigned(std::make_signed_t &high, std::make_signed_t &low,
+    std::make_signed_t val_a, std::make_signed_t val_b)
+{
+#ifdef __SIZEOF_INT128__
+    __int128_t val = (__int128_t)val_a * (__int128_t)val_b;
+    low = val;
+    high = (val >> 64);
+#else
+    mulSignedManual(high, low, val_a, val_b);
+#endif
+}
+
+// Convenience wrapper returning {high, low} as a pair.
+template
+static constexpr std::pair, std::make_unsigned_t>
+mulUnsigned(std::make_unsigned_t val_a, std::make_unsigned_t val_b)
+{
+    std::make_unsigned_t hi{}, low{};
+    mulUnsigned(hi, low, val_a, val_b);
+    return {hi, low};
+};
+
+// Convenience wrapper returning {high, low} as a pair.
+template
+static constexpr std::pair, std::make_signed_t>
+mulSigned(std::make_signed_t val_a, std::make_signed_t val_b)
+{
+    std::make_signed_t hi{}, low{};
+    mulSigned(hi, low, val_a, val_b);
+    return {hi, low};
+};
+
/**
 * This function is used to align addresses in memory.
 *
 * @param val is the address to be aligned.
 * @param align is the alignment. Can only be a power of 2.
 * @return The aligned address. The smallest number divisible
 * by @param align which is greater than or equal to @param val.
 *
 * @ingroup api_base_utils
 */
// NOTE(review): template header lost in extraction; reconstructed to
// match upstream gem5 base/intmath.hh.
template <typename T, typename U>
static constexpr T
roundUp(const T& val, const U& align)
{
    assert(isPowerOf2(align));
    T mask = (T)align - 1;
    return (val + mask) & ~mask;
}
+
/**
 * This function is used to align addresses in memory.
 *
 * @param val is the address to be aligned.
 * @param align is the alignment. Can only be a power of 2.
 * @return The aligned address. The biggest number divisible
 * by @param align which is less than or equal to @param val.
 *
 * @ingroup api_base_utils
 */
// NOTE(review): template header lost in extraction; reconstructed to
// match upstream gem5 base/intmath.hh.
template <typename T, typename U>
static constexpr T
roundDown(const T& val, const U& align)
{
    assert(isPowerOf2(align));
    T mask = (T)align - 1;
    return val & ~mask;
}
+
+// /**
+// * Calculate the log2 of a power of 2 integer
+// *
+// * @param An input value
+// * @return The base 2 log of value
+// *
+// * @ingroup api_base_utils
+// */
+// static constexpr int
+// log2i(int value)
+// {
+// assert(isPowerOf2(value) && value > 0);
+// return ctz32(value);
+// }
+
+// } // namespace gem5
+
+#endif // __BASE_INTMATH_HH__
diff --git a/src/cpu/pred/llbpref/llbp.cc b/src/cpu/pred/llbpref/llbp.cc
new file mode 100755
index 00000000000..23bfaa1c268
--- /dev/null
+++ b/src/cpu/pred/llbpref/llbp.cc
@@ -0,0 +1,1248 @@
+/* MIT License
+ *
+ * Copyright (c) 2024 David Schall and EASE lab
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "llbp.h"
+
+#include
+#include "intmath.hh"
+
+
+namespace LLBP {
+
+
+#define PRINTDEBUG 0
+// #define COND (stats.total > 3000000)
+#define COND (false)
+#define COND2 (false)
+
+
+#ifdef LLBP_CONSTRAINED
+// Only sort if the constrained version is used
+#define L2_SORT_CTX
+#define L2_SORT_PTRN
+#define FILTER_TABLES
+
+#endif
+
+#define OVERWRITE_SCL
+
+/* The hash function is defined by 4 paramenters
+ *
+ * T: Type of history (T). Which branches should be hased
+ * 0: All branches, 1: Only calls, 2: Calls and returns
+ * 3: All unconditional branches, 4: All taken branches
+ *
+ * W: Number of branches that should be hashed (W in the paper).
+ * D: Number of most recent branches skipped for CCID. Adds delay which
+ * is used to prefetch. (D in the paper.)
+ * S: Number of bits to shift the PC's. Is useful to avoid ping-pong context
+ * due to the XOR function in case a loop is executed
+ *
+ * ********************************************************************* *
+ * EXAMPLE *
+ * *
+ * pb-index (2.) (3.) *
+ * v v v *
+ * history buffer : |l|k|j|i|h|g|f|e|d|c|b|a| *
+ * ^prefetch (2.)^ *
+ * *
+ * a is the newest branch PC added to the buffer, l the oldest. *
+ * (2.) = W = 7; (3.) = D = 3 *
+ * branches used to obtain PB index hash: j to d *
+ * branches used to obtain hash to prefetch into PB: g to a *
+ * ********************************************************************* *
+ */
+
+
+
+LLBP::LLBP(LLBPConfig cfg)
+ : TageSCL(cfg.tsclConfig),
+ llbpStorage(cfg.numContexts, cfg.numPatterns,
+ cfg.ctxAssoc, cfg.ptrnAssoc),
+ rcr(cfg.T, cfg.W, cfg.D, cfg.S, cfg.CTWidth),
+ patternBuffer(cfg.pbSize, cfg.pbAssoc),
+
+ numContexts(cfg.numContexts),
+ numPatterns(cfg.numPatterns),
+ TTWidth(cfg.TTWidth),
+ CtrWidth(cfg.CtrWidth),
+ ReplCtrWidth(cfg.ReplCtrWidth),
+ CtxReplCtrWidth(cfg.CtxReplCtrWidth),
+
+ simulateTiming(cfg.simulateTiming),
+ constrained(cfg.constrained),
+ warmup(false),
+ accessDelay(cfg.accessDelay),
+ primMispLength(0,36,36),
+ llbpMispLength(0,36,36),
+ primProvLength(0,36,36),
+ llbpProvLength(0,36,36),
+ numHistPerContext(1,17,16),
+ numUsefulHistPerContext(1,17,16)
+{
+ printf("LLBP branch predictor configs -------\n");
+ cfg.print();
+
+ printf("CD: ");
+ llbpStorage.printCfg();
+ printf("PS: ");
+ llbpStorage.allocate(0,0)->patterns.printCfg();
+ llbpStorage.erase(0);
+ printf("PB: ");
+ patternBuffer.printCfg();
+
+ assert((!simulateTiming || (cfg.pbSize >= rcr.D))
+ || "Pattern buffer hold at least all prefetches.");
+
+ int mllbp[MAXNHIST];
+ for (int i = 1; i <= nhist; i++) {
+ mllbp[i] = (i%2) ? m[i] : m[i]+2;
+
+ fghrT1[i] = new FoldedHistoryFast(ghr, mllbp[i], TTWidth);
+ fghrT2[i] = new FoldedHistoryFast(ghr, mllbp[i], TTWidth - 1);
+ }
+
+
+#ifdef FILTER_TABLES
+ // LLBP does not provide for all different history lenghts in
+ // TAGE a prediction only for the following once which where
+ // empirically determined. Note this
+ // are not the actual length but the table indices in TAGE.
+ auto l = {6,10,13,14,15,16,17,18, 19,20,22,24,26,28,32,36};
+
+ int n = 0;
+ for (auto i : l) {
+ // To reduce the complexity of the multiplexer LLBP groups
+ // always four consecutive history lenght in one bucket.
+ // As the pattern sets are implemented a set associative
+ // structure the lower bits determine the set=bucket.
+ // The `fltTable`-map not only filters the history lengths
+ // but also maps each length the correct pattern set index.
+ // E.e. for the four way associativity the following function
+ // ensures that history length 6,10,13,14 gets assign
+ // 0,4,8,12 with the lowest two bits 0b00. Thus, the set will
+ // be the same.
+ auto bucket = n / cfg.ptrnAssoc;
+ fltTables[i] = ((n%cfg.ptrnAssoc) << ceilLog2(cfg.ptrnAssoc) ) | bucket;
+ printf("%i=>%i:%i:%i:%i ", i, n, bucket, fltTables[i], mllbp[i]);
+ n++;
+ }
+
+
+ printf("\n");
+
+#endif //FILTER_TABLES
+}
+
+LLBP::~LLBP() {
+
+ for (int i = 1; i <= nhist; i++) {
+ delete[] fghrT1[i];
+ delete[] fghrT2[i];
+ }
+}
+
+
+
+
+/////////////////////////////////////////////////////////////////////////////////
+// Main TAGE and chooser overwrites
+/////////////////////////////////////////////////////////////////////////////////
+
+
+// Top-level prediction for `pc`: combine the base (bimodal) prediction,
+// the second-level LLBP lookup, TAGE and the statistical corrector, and
+// return the final predicted direction.
+bool LLBP::predict(uint64_t pc) {
+
+    // 1. The base prediction
+    TageSCL::basePredict(pc);
+    tage_provider = BASE;
+    bim_pred = base_pred;
+    bimConf = baseConf;
+
+    // 2. Make the LLBP prediction
+    llbpPredict(pc);
+
+    // If there was a hit in level 2 mark it.
+    // Also override the base prediction. The TAGE arbiter
+    // will decide which prediction to use.
+    // Note the TAGE chooser will use altConf in its index
+    baseConf = llbp.hit ? llbp.conf : bimConf;
+    base_pred = llbp.hit ? llbp.pred : bim_pred;
+    llbp.isProvider = llbp.hit;
+
+    // 3. The TAGE prediction
+    // Tage will call the `chooseProvider` function
+    // Which arbitrates between the TAGE and L2 predictions.
+    tagePredict(pc);
+
+    DPRINTFF("Prov: [TAGE:%i, L2:%i]\n", tage_pred, llbp.isProvider);
+
+    tage_scl_pred = tage_pred;
+    scl_provider = tage_provider;
+
+    // 4. SCL prediction
+    SCLPredict(pc);
+
+    // 5. Choose the correct prediction
+    provider = scl_provider;
+
+#ifdef OVERWRITE_SCL
+    // If LLBP provided the prediction it overrides even SCL.
+    if (llbp.isProvider) {
+        provider = BASE;
+        return llbp.pred;
+    }
+#endif
+
+    return tage_scl_pred;
+}
+
+
+// Arbitrate between the TAGE banks and the LLBP (level 2) hit. Called
+// from tagePredict(); returns which component provides the prediction.
+unsigned LLBP::chooseProvider() {
+
+    bool chooseL2 = llbp.hit;
+
+    // Don't use LLBP if its matching history is *shorter* than the
+    // TAGE hit bank. (The old comment said "longer", which inverted
+    // what the condition below actually checks.)
+    if (chooseL2 && (llbp.histLength < HitBank)) {
+        chooseL2 = false;
+        llbp.shorter = true;
+    }
+
+    // Don't override if the prefetch is too late.
+    if (simulateTiming && !warmup && !llbp.prefetched) {
+        chooseL2 = false;
+    }
+
+    // LLBP wins: route it through the BASE slot so the TAGE machinery
+    // treats it as the alternate/base prediction.
+    if (chooseL2) {
+        AltBank = 0;
+        altConf = baseConf = llbp.conf;
+        alttaken = base_pred = llbp.pred;
+        llbp.isProvider = true;
+        return BASE;
+    }
+
+    // Clear the provider bit if instead the main TAGE is used.
+    llbp.isProvider = false;
+
+
+    // If the longest is somehow certain use its prediction.
+    if (tageConf != LowConf)
+        return LONGEST;
+
+    // Use on low confidence if the USE_ALT_ON_NA is negative.
+    if (use_alt_on_na[idxChooser()] < 0) {
+        return LONGEST;
+    }
+
+    return (AltBank > 0) ? ALT : BASE;
+}
+
+// u-bit decrement arbitration: when LLBP hit, usefulness bookkeeping is
+// handled by updateL2Usefulness() instead of the TAGE default.
+bool LLBP::isNotUseful(bool taken) {
+
+    if (llbp.hit) {
+        return false;
+    }
+    // If there was no hit in level 2 use the default algorithm.
+    return TageSCL::isNotUseful(taken);
+}
+
+// u-bit increment arbitration: suppressed on an LLBP hit (see
+// updateL2Usefulness() for the level-2 aware bookkeeping).
+bool LLBP::isUseful(bool taken) {
+
+    if (llbp.hit) {
+        return false;
+    }
+    // If there was no hit in LLBP use the default algorithm.
+    return TageSCL::isUseful(taken);
+}
+
+// True if LLBP hit and predicted the resolved direction.
+bool LLBP::llbpCorrect(bool taken) {
+    return llbp.hit && (taken == llbp.pred);
+}
+
+// True if the primary predictor (SC, else TAGE hit, else alt, else
+// bimodal) predicted the resolved direction.
+bool LLBP::primCorrect(bool taken) {
+    return (scl_provider == STC) ? (sc_pred == taken) :
+            HitBank ? (LongestMatchPred == taken) :
+            AltBank ? (alttaken == taken) : (bim_pred == taken);
+}
+
+// Same as primCorrect() but ignoring the statistical corrector.
+bool LLBP::tageCorrect(bool taken) {
+    return HitBank ? (LongestMatchPred == taken) :
+            AltBank ? (alttaken == taken) : (bim_pred == taken);
+}
+
+// LLBP was useful if it provided the correct direction where the
+// primary predictor would have been wrong.
+bool LLBP::llbpUseful(bool taken) {
+    return llbp.isProvider && (taken == llbp.pred) && !primCorrect(taken);
+}
+
+// Credit the level-1 TAGE entry's u-bit when it beat an incorrect
+// level-2 (LLBP) prediction. Only relevant on an LLBP hit.
+void LLBP::updateL2Usefulness(bool taken) {
+
+    if (!llbp.hit) return;
+
+    auto llbp_correct = llbpCorrect(taken);
+    bool prim_correct = tageCorrect(taken);
+
+    // If Level 1 was provider, it was correct and
+    // level 2 was incorrect this prediction was useful.
+    if (!llbp.isProvider && prim_correct && !llbp_correct) {
+        if (HitBank) {
+            // Saturating increment of the useful counter.
+            if (HitEntry->u < (1 << uwidth) - 1)
+                HitEntry->u++;
+        }
+    }
+}
+
+
+// Resolve-time update: decide on new-entry allocation, update either the
+// TAGE or the LLBP tables, then the statistical corrector.
+void LLBP::updateTables(uint64_t pc, bool resolveDir, bool predDir) {
+
+
+    DPRINTIF(COND,"%s nM:%i, TAGE:[d:%i, conf:%i, prov:%d HitBank:%d], BASE:[d:%i, conf:%i]\n",
+        (resolveDir != tage_pred) ? "Misp" : "Corr",
+        stats.tageMispred,
+        tage_pred, tageConf, tage_provider, HitBank, base_pred, baseConf);
+
+
+    // 1. Table allocation --------------------
+    bool ALLOC = false;
+    if (llbp.isProvider) {
+        // If LLBP was provider we allocate if the prediction was wrong
+        // and the history length is shorter than the maximum.
+        ALLOC = (tage_pred != resolveDir) & (llbp.histLength < nhist);
+
+    } else {
+
+        // If the prediction came from TAGE, it was wrong and the history
+        // length is shorter than the maximum we allocate.
+        ALLOC = (tage_pred != resolveDir) & (HitBank < nhist);
+
+        // If LLBP was actually correct, it was longer than TAGE,
+        // but it was not chosen as provider, then we don't allocate.
+        if (llbp.hit && (llbp.pred == resolveDir) && !llbp.shorter) {
+            ALLOC = false;
+        }
+
+        // This comes from the TAGE update function (alternative prediction)
+        if (HitBank > 0) {
+            if ((tageConf == LowConf) && (LongestMatchPred == resolveDir)) {
+                ALLOC = false;
+            }
+
+            updateChooser(resolveDir);
+        }
+
+    }
+
+    // Do the actual allocation
+    // In case LLBP was the provider we overwrite the history length
+    // of the TAGE prediction. This forces the TAGE allocation
+    // to start allocating with the history length of the LLBP.
+    auto tmp2 = HitBank;
+    if (llbp.isProvider) {
+        HitBank = llbp.histLength;
+    }
+    int nalloc = 0;
+    if (ALLOC) {
+        nalloc = 1+nnn;
+        DPRINTFF("Alloc:%i,%i, HL:%i, L2:[H:%i,S:%i,P:%i,D:%i]\n",
+            stats.totalAllocInit, stats.total, HitBank, llbp.hit, llbp.shorter, llbp.isProvider, llbp.pred);
+    }
+
+    allocateTables(nalloc, pc, resolveDir);
+    // Restore the real TAGE hit bank after allocation.
+    HitBank = tmp2;
+
+    // 2. The LLBP + TAGE table updates
+    // We only update either the TAGE or the LLBP tables.
+    llbpUpdate(pc, resolveDir, predDir);
+
+    // 3. Finally, the statistical corrector.
+    SCLUpdate(pc, resolveDir, predDir);
+}
+
+
+
+/////////////////////////////////////////////////////////////////////////////////
+// LLBP PREDICTOR
+/////////////////////////////////////////////////////////////////////////////////
+
// LLBP lookup for the current branch.
// Builds the per-history-length keys (tag hash + table index), looks up
// the pattern set for the current context (CCID) in the LLBP storage,
// and records the longest-history matching pattern in `llbp`/`llbpEntry`.
// In timing mode it also checks whether the context's pattern set has
// already been prefetched into the pattern buffer.
void LLBP::llbpPredict(uint64_t pc) {

    // Calculate indices and tags
    // We need to do this explicity because we perform the prediction
    // before the TAGE prediction.
    calcIndicesAndTags(pc);

    llbpEntry = nullptr;
    llbp = {};

    for (int i = 1; i <= nhist; i++) {

        if (!NOSKIP[i]) continue;

        // We don't use all history lengths. Only 16
        // By using the lower bits for the table number we can
        // use it to manage the assocativity of the different history lengths.
        auto _i = i;
        auto it = fltTables.find(i);
        if (it != fltTables.end()) {
            _i = it->second;
        }
        // auto _i = fltTables.contains(i) ? fltTables[i] : i;
        // Table index (10 bits)


        // Tag hash: PC xor-folded with the two folded history registers
        // for this history length.
        auto _key = pc;
        _key ^= fghrT1[i]->value ^ (fghrT2[i]->value << 1);
        // Mask the patterns bits
        _key &= ((1 << TTWidth) - 1);

        // key = (tag << 10) | table-index (see Pattern comment in header).
        KEY[i] = uint64_t(_key) << 10ULL | uint64_t(_i);

    }



    // Get the current context (CCID)
    auto ctx_key = rcr.getCCID();
    // PRINTIF(COND2,"%llu L2Predict: %lx\n", branchCount, ctx_key);
    HitContext = llbpStorage.get(ctx_key);


    if (HitContext) {
        // Search from the longest history length downwards; the first
        // match wins (longest-match-first, like TAGE).
        for (int i = nhist; i > 0; i--) {
            if (NOSKIP[i]) {
                llbpEntry = HitContext->patterns.get(KEY[i]);


                if (llbpEntry) {
                    llbp.hit = i;
                    llbp.pVal = llbpEntry->ctr;
                    llbp.pred = llbp.pVal >= 0;
                    llbp.conf = compConf(llbp.pVal, CtrWidth);
                    llbp.histLength = i;
                    break;
                }
            }
        }
    }


    if (llbp.hit) {
        PRINTIF(COND,"S1Hit:%i,GI:%i,GT:%i,c:%i\n",
            llbpEntry->length, GI[llbpEntry->length], GTAG[llbpEntry->length],
            llbpEntry->ctr);
    }

    // In case of timing simulation we check if the entry was already
    // prefetched.
    if (simulateTiming) {
        pbEntry = patternBuffer.get(ctx_key);
        if (pbEntry) {
            // Lookup completes any in-flight prefetch for this entry.
            pbEntry->locked = false;
        }
        llbp.prefetched = llbp.hit && pbEntry;
    } else {
        // Without timing, prefetches are assumed to always be on time.
        llbp.prefetched = true;
    }

}
+
+
+
+// PREDICTOR UPDATE
+void LLBP::llbpUpdate(uint64_t pc, bool resolveDir, bool predDir) {
+
+ // Update ----------------------------------------------------
+ bool updateBim = false;
+ bool updateLLBP = llbp.isProvider;
+ bool updateTAGE = !llbp.isProvider;
+
+
+ // Only the providing component is updated.
+ // If the prediction came from LLBP its pattern gets updated.
+ if (updateLLBP) {
+ ctrupdate(llbpEntry->ctr, resolveDir, CtrWidth);
+
+ // This function updates the context replacement counter
+ // - If a pattern becomes confident (correct prediction)
+ // the replacement counter is increased
+ // - If a pattern becomes low confident (incorrect prediction)
+ // the replacement counter is decreased
+ if (llbpEntry->ctr == (resolveDir ? 1 : -2)) {
+ // entry became medium confident
+ ctrupdate(HitContext->replace, true, CtxReplCtrWidth);
+ }
+ else if (llbpEntry->ctr == (resolveDir ? -1 : 0)) {
+ // entry became low confident
+ ctrupdate(HitContext->replace, false, CtxReplCtrWidth);
+ }
+
+ // If the prediction wrong update also the BIM
+ if (!llbpCorrect(resolveDir) && (llbp.conf == LowConf)) {
+ updateBim = true;
+ }
+ }
+
+ // If the prediction was from the TAGE predictor update it.
+ if (updateTAGE) {
+ updateBim = tageUpdate(pc, resolveDir);
+ }
+
+ // The base predictor is sometimes updated if the confidence of the
+ // prediction is low.
+ if (updateBim) {
+
+ // If the prediction was from the base predictor, update it.
+ TageSCL::baseUpdate(pc, resolveDir, predDir);
+
+ DPRINTIF(COND,"BIMUp: ctr:%i\n", BIM);
+ }
+
+ // Usefulness ------------------------------------------------
+ if (llbp.hit) {
+ updateL2Usefulness(resolveDir);
+ }
+
+
+ // Update the pattern buffers statistics
+ // and dirty bits.
+ if (simulateTiming && pbEntry) {
+ if (updateLLBP) {
+ pbEntry->dirty = true;
+ }
+ if (llbp.hit) {
+ pbEntry->used = true;
+ }
+ if (llbpUseful(resolveDir)) {
+ pbEntry->useful = true;
+ }
+ }
+}
+
// Tries to allocate a pattern for history length {histLen} in the LLBP
// storage (allocating the context first if needed).
// Returns true if the pattern now exists (already present or newly
// allocated), false if this history length is filtered out.
bool LLBP::llbpAllocate(int histLen, uint64_t pc, bool taken) {

#ifdef FILTER_TABLES
    auto it = fltTables.find(histLen);
    if (it == fltTables.end()) {
        // If the history length is not in the filter table
        // we don't allocate.
        return false;
    }
    // if (!fltTables.contains(histLen)) {
    //     return false;
    // }
#endif //FILTER_TABLES


    // Create context key and pattern key
    auto ctx_key = rcr.getCCID();
    auto k = KEY[histLen];

    // Get (or allocate) the context this pattern belongs to.
    auto ctx = allocateNewContext(pc, ctx_key);

    // Check if the pattern already exists in LLBP.
    auto ptrn = ctx->patterns.get(k);
    if (ptrn) {
        return true;
    }

    // No pattern found. Allocate a new one.
    // Sorting before allocation to find the victim

    if (constrained) {
#ifdef L2_SORT_PTRN
        ctx->sortPatters(k);
#endif

#ifdef LLBP_CONSTRAINED
        ptrn = ctx->patterns.getVictim(k);
#endif
    }

    ptrn = ctx->patterns.insert(k);

    // Initialize the new pattern's metadata.
    ptrn->length = histLen;
    ptrn->useful = 0;
    ptrn->correct = 0;
    ptrn->key = k;
    ptrn->pc = pc;

    // Weakly taken (0) or weakly not-taken (-1).
    ptrn->ctr = taken ? 0 : -1;
    ptrn->dir = taken;

    return true;
}
+
+
+
// Returns the context for {ctx_key}, allocating (and victimizing) one
// in the LLBP storage if it does not exist yet. In timing mode the
// victim is also removed from the pattern buffer and prefetch queue,
// and the new context is installed in the pattern buffer right away.
LLBP::Context* LLBP::allocateNewContext(uint64_t pc, uint64_t ctx_key) {

    Context* ctx = llbpStorage.get(ctx_key);

    // If the context does not exist we allocate a new one.
    if (!ctx) {

        if (constrained) {
#ifdef L2_SORT_CTX
            // Sort the set by replacement priority before victim selection.
            llbpStorage.sortContexts(ctx_key);
#endif
        }

        // Ensure the victim is not in the L2 predictor
        ctx = llbpStorage.getVictim(ctx_key);

        // If the victim context is still in pattern buffer
        // we need to remove it.
        if (simulateTiming && ctx) {

            if (patternBuffer.exists(ctx->key)) {
                patternBuffer.erase(ctx->key);
            }

            // Also invalidate all entries in the prefetch queue
            // with this key.
            // auto n = prefetchQueue.size();
            prefetchQueue.erase(std::remove_if(
                    prefetchQueue.begin(), prefetchQueue.end(),
                    [ctx_key](auto& e) { return e.key == ctx_key; }),
                prefetchQueue.end());
        }

        // Allocate a new context in the LLBP storage.
        ctx = llbpStorage.allocate(ctx_key, pc);

        if (simulateTiming && ctx) {
            // Put the newly allocated entry into the PB.
            PBEntry entry(ctx_key);
            entry.valid = true;
            entry.dirty = true;
            entry.newlyAllocated = true;
            installInPB(entry, true);
        }

    }

    return ctx;
}
+
+
+
+int LLBP::allocate(int Tidx, uint64_t pc, bool taken) {
+
+ int alloc_res = TageBase::allocate(Tidx, pc, taken);
+
+ // Get the newly allocated entry and mark with the context and key
+ if (alloc_res > 0) {
+ auto& entry = gtable[Tidx][GI[Tidx]];
+ entry.ctx = rcr.getCCID();
+ entry.key = KEY[Tidx];
+ // DPRINTIF(COND,"L1Alloc:%i,GI:%i,T:%i,K:%#llx,C:%#llx\n", Tidx, GI[Tidx], entry.tag, entry.key, entry.ctx);
+ }
+
+ if (alloc_res <= 0) {
+ // Allocation not successful -> we also don't allocate in the LLBP
+ return alloc_res;
+ }
+
+ // Try allocating in the LLBP.
+ if (llbpAllocate(Tidx, pc, taken)) {
+ stats.allocations[Tidx]++;
+ stats.totalAllocations++;
+ return 1;
+ }
+ return alloc_res;
+}
+
+// Prefetch functionality
+
// Issues a prefetch for the upcoming context (PCID).
// Checks, in order: the prefetch queue, the pattern buffer, and finally
// the LLBP storage; only the last case enqueues an actual prefetch.
// No-op during warmup.
void LLBP::prefetch() {

    if (warmup) return;

    // Perform the prefetching -----
    // Calculate the hash from the head of the history.
    auto ctx_key = rcr.getPCID();
    // PRINTIF(COND2,"%i/%i Prefetch: %lx -> ", ticks, branchCount, ctx_key);



    // First check the preload queue if this entry is already enqueued.
    auto it = std::find_if(
        prefetchQueue.begin(), prefetchQueue.end(),
        [ctx_key](auto& e)
            { return (e.key == ctx_key) && e.valid;});

    if (it != prefetchQueue.end()) {
        PRINTIF(COND2," Hit in prefetchQueue %lx", it->key);
        llbpstats.l2PFHitInQueue++;
    }

    // Second check if its already cached.
    else if (patternBuffer.exists(ctx_key)) {
        // Copy the entry from the cache to the preload queue.
        PRINTIF(COND2," Hit in pattern cache");
        llbpstats.l2PFHitInCache++;
        // Refresh the entry's recency in the pattern buffer.
        patternBuffer.touch(ctx_key);
    }

    // Finally check if the context is available in the LLBP
    // and needs to be prefetched.
    else if (llbpStorage.exists(ctx_key)) {
        PRINTIF(COND2," Hit in CI -> prefetch");
        llbpstats.l2PFHitInCI++;
        // Enqueue; it becomes installable after accessDelay ticks.
        auto& pf_entry = prefetchQueue.emplace_back(ctx_key);
        pf_entry.valid = true;
        pf_entry.prefetchtime = ticks;
    } else {
        PRINTIF(COND2," Miss");
    }
    PRINTIF(COND2,"\n");

}
+
+void LLBP::squashPrefetchQueue(bool btbMiss) {
+ if (btbMiss)
+ llbpstats.pfDroppedBTBMiss += prefetchQueue.size();
+ else
+ llbpstats.pfDroppedMispredict += prefetchQueue.size();
+ prefetchQueue.clear();
+
+ // Once all prefetches are squashed we trigger prefetches
+ // for an upcoming context.
+ if (btbMiss)
+ prefetch();
+}
+
+void LLBP::tickPrefetchQueue() {
+
+ // Tick should be called before the prediction is made.
+
+ // Install prefetches if the prefetch delay has passed.
+ if (!prefetchQueue.empty()) {
+ auto& pf_entry = prefetchQueue.front();
+
+ // If the prefetch delay has passed
+ if (ticks - pf_entry.prefetchtime >= (accessDelay)) {
+
+ PRINTIF(COND2," Install in Cache: %lx\n", pf_entry.key);
+ pf_entry.locked = true;
+ installInPB(pf_entry);
+ prefetchQueue.pop_front();
+ }
+ }
+}
+
// Installs a (prefetched or newly allocated) pattern-set entry in the
// pattern buffer, evicting a victim if needed.
// If {bypass} is set and the victim has an in-flight prefetch (locked),
// the install is dropped and only eviction stats are counted.
void LLBP::installInPB(PBEntry &entry, bool bypass) {


    // First get the victim from the LLBP predictor
    auto victim = patternBuffer.getVictim(entry.key);

    if (victim) {

        // If the entry is locked due to ongoing prefetch. Don't install in
        // PB but in LLBP right away.
        if (victim->locked && bypass) {
            llbpstats.pfDroppedLocked++;
            llbpstats.l2cacheDirtyEvict++;
            return;
        }

        // Dirty entries would need a write-back to LLBP storage.
        if (victim->dirty) llbpstats.l2cacheDirtyEvict++;
        else llbpstats.l2cacheCleanEvict++;
        PRINTIF(COND2," Evict: %lx\n", victim->key);
    }

    // Copy the prefetched pattern set into the PB
    patternBuffer.insert(entry);
}
+
+
// Statistics bookkeeping per resolved branch: override quality of LLBP
// vs. the primary predictor, context/pattern hit counters, per-provider
// correct/wrong counters, and histogram updates.
// Does not modify any predictor state, only counters and histograms.
void LLBP::updateStats(bool resolveDir, bool predDir, uint64_t pc) {

    TageSCL::updateStats(resolveDir, predDir, pc);


    // Check if storing the last history would have been useful.
    auto correct = predDir == resolveDir;

    auto llbp_correct = llbp.isProvider && (resolveDir == llbp.pred);

    bool prim_correct = (scl_provider == STC) ? (sc_pred == resolveDir) :
                HitBank ? (LongestMatchPred == resolveDir) :
                AltBank ? (alttaken == resolveDir) : (bim_pred == resolveDir);


    bool llbp_useful = llbp_correct && !prim_correct;


    // Classify the LLBP override outcome relative to the primary
    // predictor (good/bad/same) and detect PC aliasing of the pattern.
    if (llbp.hit) {
        if (llbp.isProvider) {
            if (llbp_correct) {
                if (prim_correct)
                    llbpstats.l2OverrideSameCorr++;
                else
                    llbpstats.l2OverrideGood++;
            } else {
                if (prim_correct)
                    llbpstats.l2OverrideBad++;
                else
                    llbpstats.l2OverrideSameWrong++;
            }
            // Pattern allocated by a different PC -> aliasing.
            if (llbpEntry->pc != pc) {
                if (llbp_correct)
                    llbpstats.ovrPosAlias++;
                else {
                    llbpstats.ovrNegAlias++;
                }
            }
        } else {
            llbpstats.l2NoOverride++;
        }
    }




    // Hits for contexts and patterns
    if (HitContext) {
        llbpstats.l2CtxHit++;
        if (llbpEntry) {
            llbpstats.l2PtrnHit++;
        }
    }


    // Attribute correct/wrong to whichever component provided the
    // final prediction.
    if (llbp.isProvider) {
        llbpstats.l2Prov++;
        llbpProvLength.insert(llbp.histLength);
        if (llbp_correct) {
            llbpstats.l2Correct++;
            llbpEntry->correct++;
            HitContext->correct++;
            if (llbp_useful) {

                llbpEntry->useful++;
                HitContext->useful++;
                // First time this pattern was useful -> one more useful
                // pattern in this context.
                if (llbpEntry->useful == 1) {
                    HitContext->usefulPtrns++;
                }
            }


        } else {
            llbpstats.l2Wrong++;
            llbpMispLength.insert(llbp.histLength);
            llbpEntry->incorrect++;
            HitContext->incorrect++;
        }
    } else {
        switch (provider) {
        case LONGEST:
        case ALT:
            {
                llbpstats.tageProv++;
                auto l = (provider == LONGEST) ? HitBank : AltBank;
                primProvLength.insert(l);
                if (correct) llbpstats.tageCorrect++;
                else {
                    llbpstats.tageWrong++;
                    primMispLength.insert(l);
                }
            }
            break;

        case LOOP:
        case STC:
            llbpstats.sclProv++;
            if (correct) llbpstats.sclCorrect++;
            else llbpstats.sclWrong++;
            break;

        case BASE:
            llbpstats.baseProv++;
            if (correct) {
                llbpstats.baseCorrect++;
            } else {
                llbpstats.baseWrong++;
            }
            break;
        default:
            break;
        }
    }

    // Why LLBP hit but was not chosen as provider.
    if (llbp.hit && !llbp.isProvider) {
        if (llbp.shorter) {
            llbpstats.l2notBecauseShorter++;
        }

        if (!llbp.prefetched) {
            llbpstats.l2notBecauseNotPrefetched++;
        }
    }
}
+
+
+
+// Predictor update ----------------------------------------
+void LLBP::UpdatePredictor(uint64_t PC, bool resolveDir,
+ bool predDir, uint64_t branchTarget) {
+
+ // Update the TAGE and LLBP predictors via the
+ // base class. This Will also update the histories and statistics.
+ Tage::UpdatePredictor(PC, resolveDir, predDir, branchTarget);
+
+ // Only thing left is the update of the prefetch queue and
+ // the context hash.
+ bool do_prefetch = false;
+ if (simulateTiming && (resolveDir != predDir)) {
+ squashPrefetchQueue();
+ do_prefetch = true;
+ }
+
+ // In the default LLBP predictor there will be no update of the
+ // runtime hash for conditional branches. However, this model
+ // supports different types of histories.
+ do_prefetch |= rcr.update(PC, OPTYPE_JMP_DIRECT_COND, resolveDir);
+ if (simulateTiming && do_prefetch) {
+ prefetch();
+ }
+}
+
+
+void LLBP::TrackOtherInst(uint64_t PC, OpType opType, bool taken,
+ uint64_t branchTarget) {
+
+ TageSCL::TrackOtherInst(PC, opType, taken, branchTarget);
+
+ auto do_prefetch = rcr.update(PC, opType, taken);
+ if (simulateTiming && do_prefetch) {
+ // PRINTIF(COND2,"%i/%i Prefetch: %lx/t:%i from UpdateOther -> ", ticks, branchCount, PC, opType);
+ prefetch();
+ }
+}
+
+
+
+void LLBP::updateGHist(const bool bit) {
+ TageSCL::updateGHist(bit);
+ for (uint32_t i = 1; i <= nhist; ++i) {
+ fghrT1[i]->update();
+ fghrT2[i]->update();
+ }
+}
+
+
+/************************************************************
+ * RCR Functionality
+ */
+
// Runtime Context Register (RCR) constructor.
// @param _T       branch-type filter selecting which branches enter the
//                 history (see RCR::update for the encoding)
// @param _W       number of branches hashed into a context id
// @param _D       depth/offset into the history for the current context
// @param _shift   per-element shift used by calcHash
// @param _CTWidth width (bits) of the produced context ids
LLBP::RCR::RCR(int _T, int _W, int _D, int _shift, int _CTWidth)
    : CTWidth(_CTWidth), T(_T), W(_W), D(_D), S(_shift)
{
    // bb[0]: branch PCs, bb[1]: branch sequence numbers.
    bb[0].resize(maxwindow);
    bb[1].resize(maxwindow);
    ctxs = {0, 0};
    printf("\n\nRCR: context hash config: [T:%i, W:%i, D:%i, S:%i, CTWidth:%i]\n",
            T, W, D, S, CTWidth);
}
+
+
+/*
+ * Given the {n} number of branches staring from vec[end-start]
+ * to vec[end-start-n-1] we create the hash function by shifting
+ * each PC by {shift} number if bits i.e.
+ *
+ * 000000000000| PC | :vec[end-start]
+ * ^ 0000000000| PC |00 :vec[end-start-1]
+ * ^ 00000000| PC |0000 :vec[end-start-2]
+ * . .
+ * . .
+ * . .
+ * ^ | PC |000000000000 :vec[end-start-n-1]
+ * ----------------------
+ * final hash value
+ * */
+uint64_t LLBP::RCR::calcHash(std::list &vec, int n, int start, int shift) {
+ uint64_t hash = 0;
+ if (vec.size() < (start + n)) {
+ return 0;
+ }
+ uint64_t sh = 0;
+ auto it = vec.begin();
+ std::advance(it, start);
+ for (; (it != vec.end()) && (n > 0); it++, n--) {
+ uint64_t val = *it;
+
+ // Shift the value
+ hash ^= val << uint64_t(sh);
+
+ sh += shift;
+ if (sh >= CTWidth) {
+ sh -= uint64_t(CTWidth);
+ }
+ }
+ return hash & ((1 << CTWidth) - 1);
+}
+
+uint64_t LLBP::RCR::getCCID() {
+ return ctxs.ccid & ((1 << CTWidth) - 1);
+}
+
+uint64_t LLBP::RCR::getPCID() {
+ return ctxs.pcid & ((1 << CTWidth) - 1);
+}
+
+
// Feeds one branch into the runtime context history.
// The filter T selects which branch types are recorded:
//   0: all branches, 1: calls, 2: calls+returns,
//   3: unconditional branches, 4: taken branches.
// Once the window is full, both context ids (ccid/pcid) are recomputed.
// Returns true iff the context hashes changed.
bool LLBP::RCR::update(uint64_t pc, OpType opType, bool taken) {

    branchCount++;
    // Hash of all branches
    auto isCall = (opType == OPTYPE_CALL_DIRECT_UNCOND)
               || (opType == OPTYPE_CALL_INDIRECT_UNCOND)
               || (opType == OPTYPE_CALL_DIRECT_COND);


    switch (T) {
    case 0: // All branches
        bb[0].push_front(pc);
        bb[1].push_front(branchCount);
        break;

    case 1: // Only calls
        if (isCall) {
            bb[0].push_front(pc);
            bb[1].push_front(branchCount);
        }
        break;

    case 2: // Only calls and returns
        if (isCall || (opType == OPTYPE_RET_UNCOND)) {
            bb[0].push_front(pc);
            bb[1].push_front(branchCount);
        }
        break;

    case 3: // Only unconditional branches
        if (opType != OPTYPE_JMP_DIRECT_COND) {
            bb[0].push_front(pc);
            bb[1].push_front(branchCount);
        }
        break;

    case 4: // All taken branches
        if (taken) {
            bb[0].push_front(pc);
            bb[1].push_front(branchCount);
        }
        break;
    }



    // PRINTIF(COND,"UH:%llx, %i, %i\n", pc, opType, taken);
    // If the size has changed the hash has changed
    bool changed = false;
    if (bb[0].size() > maxwindow) {
        changed = true;

        // Resize the history
        bb[0].pop_back();
        bb[1].pop_back();


        // The current context: skips the D most recent branches.
        ctxs.ccid = calcHash(bb[0], W, D, S);
        // The prefetch context: starts at the head of the history.
        ctxs.pcid = calcHash(bb[0], W, 0, S);
    }
    return changed;
}
+
+
+void LLBP::tick() {
+ TageSCL::tick();
+ if (simulateTiming) {
+ tickPrefetchQueue();
+ }
+}
+
+void LLBP::btbMiss() {
+ if (simulateTiming) {
+ squashPrefetchQueue(true);
+ }
+}
+
// Switches the predictor between warmup and measurement phase.
// During warmup the prefetch machinery is disabled (see prefetch()).
void LLBP::setState(bool _warmup) {
    warmup = _warmup;
}
+
+
+
// Prints the final LLBP statistics report.
// First walks the LLBP storage to compute utilization (allocated and
// useful patterns/contexts), then prints per-provider accuracy, pattern
// buffer / prefetch behavior, and (optionally) histograms.
// @param instr number of simulated instructions, used for MPKI.
void LLBP::PrintStat(double instr) {

    TageSCL::PrintStat(instr);

    // Analyze the branch context
    numHistPerContext.reset();
    numUsefulHistPerContext.reset();

    int nPattern = 0, nUseful = 0;
    int nCtx = 0, nCtxUseful = 0;

    // Walk every context and count its allocated and useful patterns.
    for (auto& ctx_pair : llbpStorage.getMap()) {

        auto& ctx = ctx_pair.second->second;

        int nuseful = 0;
        for (auto& pt : ctx.patterns.getMap()) {
            if (pt.second->second.useful > 0) {
                nuseful++;
            }
        }

        int n = ctx.patterns.size();
        numHistPerContext.insert(n);
        numUsefulHistPerContext.insert(nuseful);
        nPattern += n;
        nUseful += nuseful;
        if (nuseful) {
            nCtxUseful++;
        }
    }


    printf("LLBP branch predictor stats -------\n");

    printf("LLBP:: CtxHit:%i(%.4f), PtrnHit:%i(%.4f)\n",
        llbpstats.l2CtxHit, llbpstats.l2CtxHit / (double)stats.total,
        llbpstats.l2PtrnHit, llbpstats.l2PtrnHit / (double)stats.total
        );


    printf("PROVIDER:: BIM:[P:%i(%.4f), C:%i(%.4f), W:%i(%.4f) MPKI:%.4f] \n",
        llbpstats.baseProv, (double)llbpstats.baseProv / (double)stats.total,
        llbpstats.baseCorrect, (double)llbpstats.baseCorrect / (double)llbpstats.baseProv,
        llbpstats.baseWrong, (double)llbpstats.baseWrong / (double)llbpstats.baseProv,
        (double)llbpstats.baseWrong / (double)instr * 1000
        );

    printf("PROVIDER:: TAGE:[P:%i(%.4f), C:%i(%.4f), W:%i(%.4f) MPKI:%.4f], \n",
        llbpstats.tageProv, (double)llbpstats.tageProv / (double)stats.total,
        llbpstats.tageCorrect, (double)llbpstats.tageCorrect / (double)llbpstats.tageProv,
        llbpstats.tageWrong, (double)llbpstats.tageWrong / (double)llbpstats.tageProv,
        (double)llbpstats.tageWrong / (double)instr * 1000);

    printf("PROVIDER:: SCL:[P:%i(%.4f), C:%i(%.4f), W:%i(%.4f) MPKI:%.4f], \n",
        llbpstats.sclProv, (double)llbpstats.sclProv / (double)stats.total,
        llbpstats.sclCorrect, (double)llbpstats.sclCorrect / (double)llbpstats.sclProv,
        llbpstats.sclWrong, (double)llbpstats.sclWrong / (double)llbpstats.sclProv,
        (double)llbpstats.sclWrong / (double)instr * 1000);

    printf("PROVIDER:: LLBP:[P:%i(%.4f), C:%i(%.4f), W:%i(%.4f) MPKI:%.4f], \n",
        llbpstats.l2Prov, (double)llbpstats.l2Prov / (double)stats.total,
        llbpstats.l2Correct, (double)llbpstats.l2Correct / (double)llbpstats.l2Prov,
        llbpstats.l2Wrong, (double)llbpstats.l2Wrong / (double)llbpstats.l2Prov,
        (double)llbpstats.l2Wrong / (double)instr * 1000);


    printf("LLBP:: CtxHit:%i, PtrnHit:%i, Provider:%i(%.4f), NoProvider:[Shorter:%i(%.4f), NoPrefetch:%i(%.4f)]\n",
        llbpstats.l2CtxHit, llbpstats.l2PtrnHit, llbpstats.l2Prov, (double)llbpstats.l2Prov / (double)llbpstats.l2PtrnHit,
        llbpstats.l2notBecauseShorter, (double)llbpstats.l2notBecauseShorter / (double)llbpstats.l2PtrnHit,
        llbpstats.l2notBecauseNotPrefetched, (double)llbpstats.l2notBecauseNotPrefetched / (double)llbpstats.l2PtrnHit
        );


    printf("LLBP:: PB Prefetch:[HitInPfq:%i, HitInPB:%i, HitInCI:%i], dropped[locked:%i, misp:%i, btbmiss:%i]\n",
        llbpstats.l2PFHitInQueue, llbpstats.l2PFHitInCache, llbpstats.l2PFHitInCI, llbpstats.pfDroppedLocked, llbpstats.pfDroppedMispredict, llbpstats.pfDroppedBTBMiss
        );

    auto tot_evicts = llbpstats.l2cacheDirtyEvict + llbpstats.l2cacheCleanEvict;
    printf("LLBP:: PB Evict:[Clean:%i(%.3f) Dirty:%i(%.3f)]\n",
        llbpstats.l2cacheCleanEvict, (double)llbpstats.l2cacheCleanEvict / (double)tot_evicts,
        llbpstats.l2cacheDirtyEvict, (double)llbpstats.l2cacheDirtyEvict / (double)tot_evicts
        );


    printf("LLBP:: LLBPHits:[NoOv:%i, SameCorr:%i, SameWrong:%i, GoodOv:%i, BadOv:%i] Alias:[P:%i(%.4f),N:%i(%.4f)]\n",
        llbpstats.l2NoOverride, llbpstats.l2OverrideSameCorr, llbpstats.l2OverrideSameWrong, llbpstats.l2OverrideGood, llbpstats.l2OverrideBad,
        llbpstats.ovrPosAlias, llbpstats.ovrPosAlias / (double)llbpstats.l2Prov,
        llbpstats.ovrNegAlias, llbpstats.ovrNegAlias / (double)llbpstats.l2Prov
        );


    // Theoretical pattern capacity of the whole LLBP storage.
    auto tot_pattern = (numPatterns * numContexts);


    nCtx = llbpStorage.getMap().size();
    printf(
        "LLBP:: Utilization: Patterns:[Total:%i,Alloc:%i(%.4f),Useful:%i(%.4f)], Ctx:[Total:%i,Alloc:%i(%.4f),Useful:%i(%.4f)]\n",

        tot_pattern, nPattern, nPattern / (double)tot_pattern,
        nUseful, nUseful / (double)tot_pattern,

        numContexts, nCtx, nCtx / (double)numContexts,
        nCtxUseful, nCtxUseful / (double)numContexts
        );

#define PRINTHIST

#ifdef PRINTHIST


    printf("Hist Histories per context\n");
    printf("%s\n", numHistPerContext.print(true,true).c_str());

    printf("Hist Useful histories per context\n");
    printf("%s\n", numUsefulHistPerContext.print(true,true).c_str());

    printf("Hist primary mispredict length (incorrect)\n");
    printf("%s\n", primMispLength.print(true,true).c_str());

    printf("Hist LLBP mispredict length (incorrect)\n");
    printf("%s\n", llbpMispLength.print(true,true).c_str());

    printf("Hist primary provider length\n");
    printf("%s\n", primProvLength.print(true,true).c_str());

    printf("Hist LLBP provider length\n");
    printf("%s\n", llbpProvLength.print(true,true).c_str());


#endif


}
+
+void LLBP::resetStats() {
+ TageSCL::resetStats();
+ llbpstats = {};
+
+ primMispLength.reset();
+ llbpMispLength.reset();
+ primProvLength.reset();
+ llbpProvLength.reset();
+ numHistPerContext.reset();
+ numUsefulHistPerContext.reset();
+}
+
+
+}; // namespace LLBP
diff --git a/src/cpu/pred/llbpref/llbp.h b/src/cpu/pred/llbpref/llbp.h
new file mode 100644
index 00000000000..585236c2d21
--- /dev/null
+++ b/src/cpu/pred/llbpref/llbp.h
@@ -0,0 +1,641 @@
+/* MIT License
+ *
+ * Copyright (c) 2024 David Schall and EASE lab
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#pragma once
+
+#include
+// #include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+#include "tage_scl.h"
+
+// #include "utils/fileutils.h"
+#include "counters.h"
+#include "cache.h"
+#include "histogram.h"
+
+namespace LLBP {
+
+
+struct LLBPConfig;
+
+class LLBP : public TageSCL {
+
+ public:
+ LLBP(LLBPConfig config);
+ ~LLBP();
+
+ void UpdatePredictor(uint64_t PC, bool resolveDir,
+ bool predDir, uint64_t branchTarget) override;
+ void TrackOtherInst(uint64_t PC, OpType opType, bool taken,
+ uint64_t branchTarget) override;
+ void PrintStat(double NUMINST) override;
+ void tick() override;
+ void btbMiss() override;
+ void setState(bool warmup) override;
+
+ private:
+
+
+ // Override some base class functions
+ bool predict(uint64_t pc) override;
+ void updateTables(uint64_t pc, bool resolveDir, bool predDir) override;
+ void updateStats(bool taken, bool predtaken, uint64_t PC) override;
+ void updateGHist(const bool bit) override;
+ int allocate(int idx, uint64_t pc, bool taken) override;
+ void resetStats() override;
+
+
+ typedef uint64_t Key;
+ Key KEY[MAXNHIST]; //
+
+
+ /********************************************************************
+ * LLBP Pattern
+ *
+ * Consists of the history length field and the tag.
+ * In the model we concatenate both to form a key.
+ * key = (tag << 10) | length
+ * This simplifies model complexity
+ ******************************************************************/
    struct Pattern {
        int length;           // history length this pattern was trained on
        uint tag;             // pattern tag (upper part of the key)
        int idx;              // index within the pattern set
        int8_t ctr;           // prediction counter; >= 0 means "taken"
        uint replace;         // replacement metadata
        bool dir;             // direction at allocation time
        // Statistics (not part of the hardware state):
        int useful = 0;       // times this pattern beat the primary predictor
        int correct = 0;      // correct predictions provided
        int incorrect = 0;    // incorrect predictions provided
        Key key = 0;          // full key = (tag << 10) | length-index
        int evicted = 0;      // eviction counter
        int evicted_ctx = 0;  // evictions caused by context replacement
        uint64_t pc = 0;      // PC that allocated this pattern (alias check)
    };
+
+
+ /********************************************************************
+ * Pattern Set
+ *
+ * The pattern sets are implemented as set associative cache. The
+ * lower bits of the key - to lookup a pattern in the pattern set
+ * - are used for the history length which realizes the four way
+ * associativity. In the constructor we assign each history an
+ * index
+ ******************************************************************/
+ struct PatternSet : public BaseCache{
+ PatternSet(size_t max_size, size_t assoc) :
+ BaseCache(max_size, assoc)
+ {}
+
+ Pattern* insert(const uint64_t &key) {
+ return BaseCache::insert(key);
+ }
+ };
+
+ /********************************************************************
+ * Program Context
+ *
+ * A program context contains one pattern set and is indexed by
+ * a key formed by hashing W unconditional branches.
+ * This struct contains some additional meta data for replacement
+ * and statistics.
+ ********************************************************************/
+ struct Context {
+ bool valid;
+ uint64_t key;
+ uint64_t pc;
+ int correct;
+ int incorrect;
+ int useful;
+ int conflict;
+ uint replace;
+ int ctr;
+ int usefulPtrns;
+
+ // The contexts pattern set.
+ PatternSet patterns;
+
+ Context(uint64_t k, uint64_t p, int n, int assoc)
+ : valid(true), key(k), pc(p),
+ correct(0), incorrect(0), useful(0), conflict(0),
+ replace(0), ctr(0), usefulPtrns(0),
+ patterns(n, assoc)
+ {}
+
+ // Before a pattern in the pattern set is replaced, the patterns are
+ // sorted from the highest to the lowest confidence. This is done to
+ // determine which pattern should be evicted.
+ void sortPatters(const uint64_t key) {
+ auto& set = patterns.getSet(key);
+ set.sort(
+ [](const std::pair& a, const std::pair& b)
+ {
+ return abs(center(a.second.ctr)) > abs(center(b.second.ctr));
+ });
+ }
+ };
+
+
+ /********************************************************************
+ * LLBP Storage
+ *
+ * LLBPs high-capacity structure to store all pattern sets.
+ * It's implemented as a set associative cache.
+ * The Context directory (CD) can be thought of as the tag array while the
+ * LLBPStorage is the data array. In this simulation model, both LLBP
+ * and CD are represented with a single data structure.
+ ********************************************************************/
+ class LLBPStorage : public BaseCache{
+ typedef typename std::pair key_value_pair_t;
+ typedef typename std::list::iterator list_iterator_t;
+ const int n_patterns;
+ const int _ptrn_assoc;
+
+ public:
+
+ LLBPStorage(int n_ctx, int n_patterns, int ctx_assoc, int ptrn_assoc)
+ : BaseCache