From da72656193bd039a7bafea2406a81338ee05953a Mon Sep 17 00:00:00 2001 From: Amir Ayupov <aaupov@fb.com> Date: Fri, 9 May 2025 16:23:30 -0700 Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20change?= =?UTF-8?q?s=20to=20main=20this=20commit=20is=20based=20on?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.4 [skip ci] --- bolt/include/bolt/Core/BinaryFunction.h | 15 +- bolt/include/bolt/Profile/DataAggregator.h | 20 +- bolt/include/bolt/Profile/DataReader.h | 3 + bolt/include/bolt/Profile/Heatmap.h | 4 +- bolt/include/bolt/Utils/CommandLineOpts.h | 1 + bolt/lib/Core/BinaryFunction.cpp | 2 +- bolt/lib/Passes/BinaryPasses.cpp | 2 +- bolt/lib/Profile/DataAggregator.cpp | 283 ++++++++---------- bolt/lib/Profile/DataReader.cpp | 11 +- bolt/lib/Profile/Heatmap.cpp | 33 +- bolt/lib/Profile/YAMLProfileReader.cpp | 6 +- bolt/lib/Rewrite/RewriteInstance.cpp | 11 +- bolt/lib/Utils/CommandLineOpts.cpp | 5 + .../X86/bolt-address-translation-yaml.test | 4 +- bolt/test/X86/heatmap-preagg.test | 35 +++ bolt/test/X86/pre-aggregated-perf.test | 4 +- bolt/test/perf2bolt/perf_test.test | 9 +- 17 files changed, 222 insertions(+), 226 deletions(-) create mode 100644 bolt/test/X86/heatmap-preagg.test diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index a52998564ee1b..e82b857446ce2 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -386,8 +386,8 @@ class BinaryFunction { /// Profile match ratio. float ProfileMatchRatio{0.0f}; - /// Raw branch count for this function in the profile. - uint64_t RawBranchCount{0}; + /// Raw sample/branch count for this function in the profile. + uint64_t RawSampleCount{0}; /// Dynamically executed function bytes, used for density computation. uint64_t SampleCountInBytes{0}; @@ -1880,13 +1880,12 @@ class BinaryFunction { /// Return COUNT_NO_PROFILE if there's no profile info. uint64_t getExecutionCount() const { return ExecutionCount; } - /// Return the raw profile information about the number of branch - /// executions corresponding to this function. - uint64_t getRawBranchCount() const { return RawBranchCount; } + /// Return the raw profile information about the number of samples (basic + /// profile) or branch executions (branch profile) recorded in this function. + uint64_t getRawSampleCount() const { return RawSampleCount; } - /// Set the profile data about the number of branch executions corresponding - /// to this function. - void setRawBranchCount(uint64_t Count) { RawBranchCount = Count; } + /// Set raw count of samples or branches recorded in this function. + void setRawSampleCount(uint64_t Count) { RawSampleCount = Count; } /// Return the number of dynamically executed bytes, from raw perf data. uint64_t getSampleCountInBytes() const { return SampleCountInBytes; } diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h index c4ee75e7a6da6..d66d198e37d61 100644 --- a/bolt/include/bolt/Profile/DataAggregator.h +++ b/bolt/include/bolt/Profile/DataAggregator.h @@ -92,16 +92,6 @@ class DataAggregator : public DataReader { uint64_t Addr; }; - /// Used for parsing specific pre-aggregated input files. - struct AggregatedLBREntry { - enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN, TRACE }; - Location From; - Location To; - uint64_t Count; - uint64_t Mispreds; - Type EntryType; - }; - struct Trace { uint64_t From; uint64_t To; @@ -131,7 +121,6 @@ class DataAggregator : public DataReader { /// and use them later for processing and assigning profile. std::unordered_map<Trace, TakenBranchInfo, TraceHash> BranchLBRs; std::unordered_map<Trace, FTInfo, TraceHash> FallthroughLBRs; - std::vector<AggregatedLBREntry> AggregatedLBRs; std::unordered_map<uint64_t, uint64_t> BasicSamples; std::vector<PerfMemSample> MemSamples; @@ -416,14 +405,7 @@ class DataAggregator : public DataReader { /// F 41be90 41be90 4 /// B 4b1942 39b57f0 3 0 /// B 4b196f 4b19e0 2 0 - void parsePreAggregated(); - - /// Parse the full output of pre-aggregated LBR samples generated by - /// an external tool. - std::error_code parsePreAggregatedLBRSamples(); - - /// Process parsed pre-aggregated data. - void processPreAggregated(); + std::error_code parsePreAggregated(); /// If \p Address falls into the binary address space based on memory /// mapping info \p MMI, then adjust it for further processing by subtracting diff --git a/bolt/include/bolt/Profile/DataReader.h b/bolt/include/bolt/Profile/DataReader.h index 314dcc9115586..a7a0933bd4f03 100644 --- a/bolt/include/bolt/Profile/DataReader.h +++ b/bolt/include/bolt/Profile/DataReader.h @@ -252,6 +252,9 @@ struct FuncSampleData { /// Get the number of samples recorded in [Start, End) uint64_t getSamples(uint64_t Start, uint64_t End) const; + /// Returns the total number of samples recorded in this function. + uint64_t getSamples() const; + /// Aggregation helper DenseMap<uint64_t, size_t> Index; diff --git a/bolt/include/bolt/Profile/Heatmap.h b/bolt/include/bolt/Profile/Heatmap.h index 74d7eedc30782..fc1e2cd30011e 100644 --- a/bolt/include/bolt/Profile/Heatmap.h +++ b/bolt/include/bolt/Profile/Heatmap.h @@ -57,9 +57,9 @@ class Heatmap { } /// Register a single sample at \p Address. - void registerAddress(uint64_t Address) { + void registerAddress(uint64_t Address, uint64_t Count) { if (!ignoreAddress(Address)) - ++Map[Address / BucketSize]; + Map[Address / BucketSize] += Count; } /// Register \p Count samples at [\p StartAddress, \p EndAddress ]. diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h index 3de945f6a1507..b5a7be53e4189 100644 --- a/bolt/include/bolt/Utils/CommandLineOpts.h +++ b/bolt/include/bolt/Utils/CommandLineOpts.h @@ -44,6 +44,7 @@ extern llvm::cl::opt<unsigned> HeatmapBlock; extern llvm::cl::opt<unsigned long long> HeatmapMaxAddress; extern llvm::cl::opt<unsigned long long> HeatmapMinAddress; extern llvm::cl::opt<bool> HeatmapPrintMappings; +extern llvm::cl::opt<bool> HeatmapStats; extern llvm::cl::opt<bool> HotData; extern llvm::cl::opt<bool> HotFunctionsAtEnd; extern llvm::cl::opt<bool> HotText; diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 9773e21aa7522..fc521dcbc7a10 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -473,7 +473,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) { OS << "\n Image : 0x" << Twine::utohexstr(getImageAddress()); if (ExecutionCount != COUNT_NO_PROFILE) { OS << "\n Exec Count : " << ExecutionCount; - OS << "\n Branch Count: " << RawBranchCount; + OS << "\n Branch Count: " << RawSampleCount; OS << "\n Profile Acc : " << format("%.1f%%", ProfileMatchRatio * 100.0f); } diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index d8628c62d8654..420ffc8e01c5c 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1445,7 +1445,7 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { if (!Function.hasProfile()) continue; - uint64_t SampleCount = Function.getRawBranchCount(); + uint64_t SampleCount = Function.getRawSampleCount(); TotalSampleCount += SampleCount; if (Function.hasValidProfile()) { diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 80f4ea0c1b707..a47bba296c137 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -164,6 +164,10 @@ void DataAggregator::findPerfExecutable() { void DataAggregator::start() { outs() << "PERF2BOLT: Starting data aggregation job for " << Filename << "\n"; + // Turn on heatmap building if requested by --print-heatmap-stats flag. + if (opts::HeatmapStats) + opts::HeatmapMode = true; + // Don't launch perf for pre-aggregated files or when perf input is specified // by the user. if (opts::ReadPreAggregated || !opts::ReadPerfEvents.empty()) @@ -349,25 +353,29 @@ bool DataAggregator::checkPerfDataMagic(StringRef FileName) { return false; } -void DataAggregator::parsePreAggregated() { - std::string Error; +std::error_code DataAggregator::parsePreAggregated() { + outs() << "PERF2BOLT: parsing pre-aggregated profile...\n"; + NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); ErrorOr<std::unique_ptr<MemoryBuffer>> MB = MemoryBuffer::getFileOrSTDIN(Filename); - if (std::error_code EC = MB.getError()) { - errs() << "PERF2BOLT-ERROR: cannot open " << Filename << ": " - << EC.message() << "\n"; - exit(1); - } + if (std::error_code EC = MB.getError()) + return EC; FileBuf = std::move(*MB); ParsingBuf = FileBuf->getBuffer(); Col = 0; Line = 1; - if (parsePreAggregatedLBRSamples()) { - errs() << "PERF2BOLT: failed to parse samples\n"; - exit(1); + size_t AggregatedLBRs = 0; + while (hasData()) { + if (std::error_code EC = parseAggregatedLBREntry()) + return EC; + ++AggregatedLBRs; } + + outs() << "PERF2BOLT: read " << AggregatedLBRs << " aggregated LBR entries\n"; + return std::error_code(); } void DataAggregator::filterBinaryMMapInfo() { @@ -446,19 +454,6 @@ int DataAggregator::prepareToParse(StringRef Name, PerfProcessInfo &Process, Error DataAggregator::preprocessProfile(BinaryContext &BC) { this->BC = &BC; - if (opts::ReadPreAggregated) { - parsePreAggregated(); - return Error::success(); - } - - if (std::optional<StringRef> FileBuildID = BC.getFileBuildID()) { - outs() << "BOLT-INFO: binary build-id is: " << *FileBuildID << "\n"; - processFileBuildID(*FileBuildID); - } else { - errs() << "BOLT-WARNING: build-id will not be checked because we could " - "not read one from input binary\n"; - } - auto ErrorCallback = [](int ReturnCode, StringRef ErrBuf) { errs() << "PERF-ERROR: return code " << ReturnCode << "\n" << ErrBuf; exit(1); @@ -471,6 +466,20 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) { ErrorCallback(ReturnCode, ErrBuf); }; + if (opts::ReadPreAggregated) { + if (std::error_code EC = parsePreAggregated()) + return errorCodeToError(EC); + goto heatmap; + } + + if (std::optional<StringRef> FileBuildID = BC.getFileBuildID()) { + outs() << "BOLT-INFO: binary build-id is: " << *FileBuildID << "\n"; + processFileBuildID(*FileBuildID); + } else { + errs() << "BOLT-WARNING: build-id will not be checked because we could " + "not read one from input binary\n"; + } + if (BC.IsLinuxKernel) { // Current MMap parsing logic does not work with linux kernel. // MMap entries for linux kernel uses PERF_RECORD_MMAP @@ -499,29 +508,28 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) { filterBinaryMMapInfo(); prepareToParse("events", MainEventsPPI, ErrorCallback); - if (opts::HeatmapMode) { - if (std::error_code EC = printLBRHeatMap()) { - errs() << "ERROR: failed to print heat map: " << EC.message() << '\n'; - exit(1); - } - exit(0); - } - - if ((!opts::BasicAggregation && parseBranchEvents()) || - (opts::BasicAggregation && parseBasicEvents())) + if (opts::BasicAggregation ? parseBasicEvents() : parseBranchEvents()) errs() << "PERF2BOLT: failed to parse samples\n"; // Special handling for memory events - if (prepareToParse("mem events", MemEventsPPI, MemEventsErrorCallback)) + if (!prepareToParse("mem events", MemEventsPPI, MemEventsErrorCallback)) + if (const std::error_code EC = parseMemEvents()) + errs() << "PERF2BOLT: failed to parse memory events: " << EC.message() + << '\n'; + + deleteTempFiles(); + +heatmap: + if (!opts::HeatmapMode) return Error::success(); - if (const std::error_code EC = parseMemEvents()) - errs() << "PERF2BOLT: failed to parse memory events: " << EC.message() - << '\n'; + if (std::error_code EC = printLBRHeatMap()) + return errorCodeToError(EC); - deleteTempFiles(); + if (opts::HeatmapStats) + return Error::success(); - return Error::success(); + exit(0); } Error DataAggregator::readProfile(BinaryContext &BC) { @@ -557,9 +565,7 @@ bool DataAggregator::mayHaveProfileData(const BinaryFunction &Function) { } void DataAggregator::processProfile(BinaryContext &BC) { - if (opts::ReadPreAggregated) - processPreAggregated(); - else if (opts::BasicAggregation) + if (opts::BasicAggregation) processBasicEvents(); else processBranchEvents(); @@ -567,15 +573,14 @@ void DataAggregator::processProfile(BinaryContext &BC) { processMemEvents(); // Mark all functions with registered events as having a valid profile. - const auto Flags = opts::BasicAggregation ? BinaryFunction::PF_SAMPLE - : BinaryFunction::PF_LBR; for (auto &BFI : BC.getBinaryFunctions()) { BinaryFunction &BF = BFI.second; - FuncBranchData *FBD = getBranchData(BF); - if (FBD || getFuncSampleData(BF.getNames())) { - BF.markProfiled(Flags); - if (FBD) - BF.RawBranchCount = FBD->getNumExecutedBranches(); + if (FuncBranchData *FBD = getBranchData(BF)) { + BF.markProfiled(BinaryFunction::PF_LBR); + BF.RawSampleCount = FBD->getNumExecutedBranches(); + } else if (FuncSampleData *FSD = getFuncSampleData(BF.getNames())) { + BF.markProfiled(BinaryFunction::PF_SAMPLE); + BF.RawSampleCount = FSD->getSamples(); } } @@ -588,7 +593,6 @@ void DataAggregator::processProfile(BinaryContext &BC) { // Release intermediate storage. clear(BranchLBRs); clear(FallthroughLBRs); - clear(AggregatedLBRs); clear(BasicSamples); clear(MemSamples); } @@ -632,10 +636,18 @@ StringRef DataAggregator::getLocationName(const BinaryFunction &Func, bool DataAggregator::doSample(BinaryFunction &OrigFunc, uint64_t Address, uint64_t Count) { + // To record executed bytes, use basic block size as is regardless of BAT. + uint64_t BlockSize = 0; + if (BinaryBasicBlock *BB = OrigFunc.getBasicBlockContainingOffset( + Address - OrigFunc.getAddress())) + BlockSize = BB->getOriginalSize(); + BinaryFunction *ParentFunc = getBATParentFunction(OrigFunc); BinaryFunction &Func = ParentFunc ? *ParentFunc : OrigFunc; - if (ParentFunc || (BAT && !BAT->isBATFunction(OrigFunc.getAddress()))) + if (ParentFunc || (BAT && !BAT->isBATFunction(Func.getAddress()))) NumColdSamples += Count; + // Attach executed bytes to parent function in case of cold fragment. + Func.SampleCountInBytes += Count * BlockSize; auto I = NamesToSamples.find(Func.getOneName()); if (I == NamesToSamples.end()) { @@ -1209,15 +1221,14 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { ErrorOr<StringRef> TypeOrErr = parseString(FieldSeparator); if (std::error_code EC = TypeOrErr.getError()) return EC; - auto Type = AggregatedLBREntry::TRACE; - if (LLVM_LIKELY(TypeOrErr.get() == "T")) { - } else if (TypeOrErr.get() == "B") { - Type = AggregatedLBREntry::BRANCH; - } else if (TypeOrErr.get() == "F") { - Type = AggregatedLBREntry::FT; - } else if (TypeOrErr.get() == "f") { - Type = AggregatedLBREntry::FT_EXTERNAL_ORIGIN; - } else { + enum TType { TRACE, BRANCH, FT, FT_EXTERNAL_ORIGIN, INVALID }; + auto Type = StringSwitch<TType>(TypeOrErr.get()) + .Case("T", TRACE) + .Case("B", BRANCH) + .Case("F", FT) + .Case("f", FT_EXTERNAL_ORIGIN) + .Default(INVALID); + if (Type == INVALID) { reportError("expected T, B, F or f"); return make_error_code(llvm::errc::io_error); } @@ -1235,7 +1246,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { return EC; ErrorOr<Location> TraceFtEnd = std::error_code(); - if (Type == AggregatedLBREntry::TRACE) { + if (Type == TRACE) { while (checkAndConsumeFS()) { } TraceFtEnd = parseLocationOrOffset(); @@ -1245,13 +1256,12 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { while (checkAndConsumeFS()) { } - ErrorOr<int64_t> Frequency = - parseNumberField(FieldSeparator, Type != AggregatedLBREntry::BRANCH); + ErrorOr<int64_t> Frequency = parseNumberField(FieldSeparator, Type != BRANCH); if (std::error_code EC = Frequency.getError()) return EC; uint64_t Mispreds = 0; - if (Type == AggregatedLBREntry::BRANCH) { + if (Type == BRANCH) { while (checkAndConsumeFS()) { } ErrorOr<int64_t> MispredsOrErr = parseNumberField(FieldSeparator, true); @@ -1273,13 +1283,28 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { BF->setHasProfileAvailable(); uint64_t Count = static_cast<uint64_t>(Frequency.get()); - AggregatedLBREntry Entry{From.get(), To.get(), Count, Mispreds, Type}; - AggregatedLBRs.emplace_back(Entry); - if (Type == AggregatedLBREntry::TRACE) { - auto FtType = (FromFunc == ToFunc) ? AggregatedLBREntry::FT - : AggregatedLBREntry::FT_EXTERNAL_ORIGIN; - AggregatedLBREntry TraceFt{To.get(), TraceFtEnd.get(), Count, 0, FtType}; - AggregatedLBRs.emplace_back(TraceFt); + + Trace Trace(From->Offset, To->Offset); + // Taken trace + if (Type == TRACE || Type == BRANCH) { + TakenBranchInfo &Info = BranchLBRs[Trace]; + Info.TakenCount += Count; + Info.MispredCount += Mispreds; + + NumTotalSamples += Count; + } + // Construct fallthrough part of the trace + if (Type == TRACE) { + Trace.From = To->Offset; + Trace.To = TraceFtEnd->Offset; + Type = FromFunc == ToFunc ? FT : FT_EXTERNAL_ORIGIN; + } + // Add fallthrough trace + if (Type != BRANCH) { + FTInfo &Info = FallthroughLBRs[Trace]; + (Type == FT ? Info.InternCount : Info.ExternCount) += Count; + + NumTraces += Count; } return std::error_code(); @@ -1301,53 +1326,6 @@ std::error_code DataAggregator::printLBRHeatMap() { } Heatmap HM(opts::HeatmapBlock, opts::HeatmapMinAddress, opts::HeatmapMaxAddress, getTextSections(BC)); - uint64_t NumTotalSamples = 0; - - if (opts::BasicAggregation) { - while (hasData()) { - ErrorOr<PerfBasicSample> SampleRes = parseBasicSample(); - if (std::error_code EC = SampleRes.getError()) { - if (EC == errc::no_such_process) - continue; - return EC; - } - PerfBasicSample &Sample = SampleRes.get(); - HM.registerAddress(Sample.PC); - NumTotalSamples++; - } - outs() << "HEATMAP: read " << NumTotalSamples << " basic samples\n"; - } else { - while (hasData()) { - ErrorOr<PerfBranchSample> SampleRes = parseBranchSample(); - if (std::error_code EC = SampleRes.getError()) { - if (EC == errc::no_such_process) - continue; - return EC; - } - - PerfBranchSample &Sample = SampleRes.get(); - - // LBRs are stored in reverse execution order. NextLBR refers to the next - // executed branch record. - const LBREntry *NextLBR = nullptr; - for (const LBREntry &LBR : Sample.LBR) { - if (NextLBR) { - // Record fall-through trace. - const uint64_t TraceFrom = LBR.To; - const uint64_t TraceTo = NextLBR->From; - ++FallthroughLBRs[Trace(TraceFrom, TraceTo)].InternCount; - } - NextLBR = &LBR; - } - if (!Sample.LBR.empty()) { - HM.registerAddress(Sample.LBR.front().To); - HM.registerAddress(Sample.LBR.back().From); - } - NumTotalSamples += Sample.LBR.size(); - } - outs() << "HEATMAP: read " << NumTotalSamples << " LBR samples\n"; - outs() << "HEATMAP: " << FallthroughLBRs.size() << " unique traces\n"; - } if (!NumTotalSamples) { if (opts::BasicAggregation) { @@ -1363,6 +1341,8 @@ std::error_code DataAggregator::printLBRHeatMap() { outs() << "HEATMAP: building heat map...\n"; + for (const auto &[PC, Hits] : BasicSamples) + HM.registerAddress(PC, Hits); for (const auto &LBR : FallthroughLBRs) { const Trace &Trace = LBR.first; const FTInfo &Info = LBR.second; @@ -1377,6 +1357,10 @@ std::error_code DataAggregator::printLBRHeatMap() { exit(1); } + if (opts::HeatmapStats) { + HM.printSectionHotness(outs()); + return std::error_code(); + } HM.print(opts::OutputFilename); if (opts::OutputFilename == "-") HM.printCDF(opts::OutputFilename); @@ -1412,7 +1396,10 @@ void DataAggregator::parseLBRSample(const PerfBranchSample &Sample, const uint64_t TraceTo = NextLBR->From; const BinaryFunction *TraceBF = getBinaryFunctionContainingAddress(TraceFrom); - if (TraceBF && TraceBF->containsAddress(TraceTo)) { + if (opts::HeatmapMode) { + FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)]; + ++Info.InternCount; + } else if (TraceBF && TraceBF->containsAddress(TraceTo)) { FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)]; if (TraceBF->containsAddress(LBR.From)) ++Info.InternCount; @@ -1446,6 +1433,11 @@ void DataAggregator::parseLBRSample(const PerfBranchSample &Sample, } NextLBR = &LBR; + if (opts::HeatmapMode) { + TakenBranchInfo &Info = BranchLBRs[Trace(LBR.From, LBR.To)]; + ++Info.TakenCount; + continue; + } uint64_t From = getBinaryFunctionContainingAddress(LBR.From) ? LBR.From : 0; uint64_t To = getBinaryFunctionContainingAddress(LBR.To) ? LBR.To : 0; if (!From && !To) @@ -1454,6 +1446,10 @@ void DataAggregator::parseLBRSample(const PerfBranchSample &Sample, ++Info.TakenCount; Info.MispredCount += LBR.Mispred; } + if (opts::HeatmapMode && !Sample.LBR.empty()) { + ++BasicSamples[Sample.LBR.front().To]; + ++BasicSamples[Sample.LBR.back().From]; + } } void DataAggregator::printColdSamplesDiagnostic() const { @@ -1589,7 +1585,6 @@ std::error_code DataAggregator::parseBranchEvents() { printBranchStacksDiagnostics(NumTotalSamples - NumSamples); } } - printBranchSamplesDiagnostics(); return std::error_code(); } @@ -1617,6 +1612,7 @@ void DataAggregator::processBranchEvents() { const TakenBranchInfo &Info = AggrLBR.second; doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount); } + printBranchSamplesDiagnostics(); } std::error_code DataAggregator::parseBasicEvents() { @@ -1630,6 +1626,7 @@ std::error_code DataAggregator::parseBasicEvents() { if (!Sample->PC) continue; + ++NumTotalSamples; if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Sample->PC)) BF->setHasProfileAvailable(); @@ -1637,6 +1634,7 @@ std::error_code DataAggregator::parseBasicEvents() { ++BasicSamples[Sample->PC]; EventNames.insert(Sample->EventName); } + outs() << "PERF2BOLT: read " << NumTotalSamples << " basic samples\n"; return std::error_code(); } @@ -1649,7 +1647,6 @@ void DataAggregator::processBasicEvents() { for (auto &Sample : BasicSamples) { const uint64_t PC = Sample.first; const uint64_t HitCount = Sample.second; - NumTotalSamples += HitCount; BinaryFunction *Func = getBinaryFunctionContainingAddress(PC); if (!Func) { OutOfRangeSamples += HitCount; @@ -1658,7 +1655,6 @@ void DataAggregator::processBasicEvents() { doSample(*Func, PC, HitCount); } - outs() << "PERF2BOLT: read " << NumTotalSamples << " samples\n"; printBasicSamplesDiagnostics(OutOfRangeSamples); } @@ -1722,49 +1718,6 @@ void DataAggregator::processMemEvents() { } } -std::error_code DataAggregator::parsePreAggregatedLBRSamples() { - outs() << "PERF2BOLT: parsing pre-aggregated profile...\n"; - NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events", - TimerGroupName, TimerGroupDesc, opts::TimeAggregator); - while (hasData()) - if (std::error_code EC = parseAggregatedLBREntry()) - return EC; - - return std::error_code(); -} - -void DataAggregator::processPreAggregated() { - outs() << "PERF2BOLT: processing pre-aggregated profile...\n"; - NamedRegionTimer T("processAggregated", "Processing aggregated branch events", - TimerGroupName, TimerGroupDesc, opts::TimeAggregator); - - for (const AggregatedLBREntry &AggrEntry : AggregatedLBRs) { - switch (AggrEntry.EntryType) { - case AggregatedLBREntry::BRANCH: - case AggregatedLBREntry::TRACE: - doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count, - AggrEntry.Mispreds); - NumTotalSamples += AggrEntry.Count; - break; - case AggregatedLBREntry::FT: - case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: { - LBREntry First{AggrEntry.EntryType == AggregatedLBREntry::FT - ? AggrEntry.From.Offset - : 0, - AggrEntry.From.Offset, false}; - LBREntry Second{AggrEntry.To.Offset, AggrEntry.To.Offset, false}; - doTrace(First, Second, AggrEntry.Count); - NumTraces += AggrEntry.Count; - break; - } - } - } - - outs() << "PERF2BOLT: read " << AggregatedLBRs.size() - << " aggregated LBR entries\n"; - printBranchSamplesDiagnostics(); -} - std::optional<int32_t> DataAggregator::parseCommExecEvent() { size_t LineEnd = ParsingBuf.find_first_of("\n"); if (LineEnd == StringRef::npos) { diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp index f2e999bbfdc6d..4a92c9eb0a912 100644 --- a/bolt/lib/Profile/DataReader.cpp +++ b/bolt/lib/Profile/DataReader.cpp @@ -128,6 +128,13 @@ uint64_t FuncSampleData::getSamples(uint64_t Start, uint64_t End) const { return Result; } +uint64_t FuncSampleData::getSamples() const { + uint64_t Result = 0; + for (const SampleInfo &I : Data) + Result += I.Hits; + return Result; +} + void FuncSampleData::bumpCount(uint64_t Offset, uint64_t Count) { auto Iter = Index.find(Offset); if (Iter == Index.end()) { @@ -407,12 +414,12 @@ void DataReader::matchProfileData(BinaryFunction &BF) { FuncBranchData *FBD = getBranchData(BF); if (FBD) { BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD); - BF.RawBranchCount = FBD->getNumExecutedBranches(); + BF.RawSampleCount = FBD->getNumExecutedBranches(); if (BF.ProfileMatchRatio == 1.0f) { if (fetchProfileForOtherEntryPoints(BF)) { BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD); BF.ExecutionCount = FBD->ExecutionCount; - BF.RawBranchCount = FBD->getNumExecutedBranches(); + BF.RawSampleCount = FBD->getNumExecutedBranches(); } return; } diff --git a/bolt/lib/Profile/Heatmap.cpp b/bolt/lib/Profile/Heatmap.cpp index 5fc3e0669352d..14ef68d8a024a 100644 --- a/bolt/lib/Profile/Heatmap.cpp +++ b/bolt/lib/Profile/Heatmap.cpp @@ -297,6 +297,7 @@ void Heatmap::printSectionHotness(StringRef FileName) const { void Heatmap::printSectionHotness(raw_ostream &OS) const { uint64_t NumTotalCounts = 0; StringMap<uint64_t> SectionHotness; + StringMap<uint64_t> BucketUtilization; unsigned TextSectionIndex = 0; if (TextSections.empty()) @@ -304,10 +305,11 @@ void Heatmap::printSectionHotness(raw_ostream &OS) const { uint64_t UnmappedHotness = 0; auto RecordUnmappedBucket = [&](uint64_t Address, uint64_t Frequency) { - errs() << "Couldn't map the address bucket [0x" << Twine::utohexstr(Address) - << ", 0x" << Twine::utohexstr(Address + BucketSize) - << "] containing " << Frequency - << " samples to a text section in the binary."; + if (opts::Verbosity >= 1) + errs() << "Couldn't map the address bucket [0x" + << Twine::utohexstr(Address) << ", 0x" + << Twine::utohexstr(Address + BucketSize) << "] containing " + << Frequency << " samples to a text section in the binary."; UnmappedHotness += Frequency; }; @@ -325,23 +327,26 @@ void Heatmap::printSectionHotness(raw_ostream &OS) const { continue; } SectionHotness[TextSections[TextSectionIndex].Name] += KV.second; + ++BucketUtilization[TextSections[TextSectionIndex].Name]; } assert(NumTotalCounts > 0 && "total number of heatmap buckets should be greater than 0"); - OS << "Section Name, Begin Address, End Address, Percentage Hotness\n"; - for (auto &TextSection : TextSections) { - OS << TextSection.Name << ", 0x" - << Twine::utohexstr(TextSection.BeginAddress) << ", 0x" - << Twine::utohexstr(TextSection.EndAddress) << ", " - << format("%.4f", - 100.0 * SectionHotness[TextSection.Name] / NumTotalCounts) - << "\n"; + OS << "Section Name, Begin Address, End Address, Percentage Hotness, " + << "Utilization Pct, Partition Score\n"; + for (const auto [Name, Begin, End] : TextSections) { + const float Hotness = 1. * SectionHotness[Name] / NumTotalCounts; + const uint64_t NumBuckets = + End / BucketSize + !!(End % BucketSize) - Begin / BucketSize; + const float Utilization = 1. * BucketUtilization[Name] / NumBuckets; + const float PartitionScore = Hotness * Utilization; + OS << formatv("{0}, {1:x}, {2:x}, {3:f4}, {4:f4}, {5:f4}\n", Name, Begin, + End, 100. * Hotness, 100. * Utilization, PartitionScore); } if (UnmappedHotness > 0) - OS << "[unmapped], 0x0, 0x0, " - << format("%.4f", 100.0 * UnmappedHotness / NumTotalCounts) << "\n"; + OS << formatv("[unmapped], 0x0, 0x0, {0:f4}, 0, 0\n", + 100.0 * UnmappedHotness / NumTotalCounts); } } // namespace bolt } // namespace llvm diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index f5636bfe3e1f1..88b806c7a9ca2 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -177,11 +177,11 @@ bool YAMLProfileReader::parseFunctionProfile( BF.setExecutionCount(YamlBF.ExecCount); - uint64_t FuncRawBranchCount = 0; + uint64_t FuncRawSampleCount = 0; for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) - FuncRawBranchCount += YamlSI.Count; - BF.setRawBranchCount(FuncRawBranchCount); + FuncRawSampleCount += YamlSI.Count; + BF.setRawSampleCount(FuncRawSampleCount); if (BF.empty()) return true; diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 1c24200059035..e7030e721c55b 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -1997,13 +1997,10 @@ Error RewriteInstance::readSpecialSections() { if (ErrorOr<BinarySection &> BATSec = BC->getUniqueSectionByName(BoltAddressTranslation::SECTION_NAME)) { BC->HasBATSection = true; - // Do not read BAT when plotting a heatmap - if (!opts::HeatmapMode) { - if (std::error_code EC = BAT->parse(BC->outs(), BATSec->getContents())) { - BC->errs() << "BOLT-ERROR: failed to parse BOLT address translation " - "table.\n"; - exit(1); - } + if (std::error_code EC = BAT->parse(BC->outs(), BATSec->getContents())) { + BC->errs() << "BOLT-ERROR: failed to parse BOLT address translation " + "table.\n"; + exit(1); } } diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp index ad714371436e0..98ae00b34652a 100644 --- a/bolt/lib/Utils/CommandLineOpts.cpp +++ b/bolt/lib/Utils/CommandLineOpts.cpp @@ -119,6 +119,11 @@ cl::opt<bool> HeatmapPrintMappings( "sections (default false)"), cl::Optional, cl::cat(HeatmapCategory)); +cl::opt<bool> HeatmapStats( + "print-heatmap-stats", + cl::desc("print heatmap statistics without producing the heatmap"), + cl::Optional, cl::cat(HeatmapCategory)); + cl::opt<bool> HotData("hot-data", cl::desc("hot data symbols support (relocation mode)"), cl::cat(BoltCategory)); diff --git a/bolt/test/X86/bolt-address-translation-yaml.test b/bolt/test/X86/bolt-address-translation-yaml.test index a6a212d9c1b38..c253b2b1d606e 100644 --- a/bolt/test/X86/bolt-address-translation-yaml.test +++ b/bolt/test/X86/bolt-address-translation-yaml.test @@ -28,7 +28,7 @@ ORDER-YAML-CHECK-NEXT: calls: [ { off: 0x26, fid: [[#]], cnt: 20 } ] ORDER-YAML-CHECK-NEXT: succ: [ { bid: 5, cnt: 7 } ## Large profile test RUN: perf2bolt %t.out --pa -p %p/Inputs/blarge_new_bat.preagg.txt -w %t.yaml -o %t.fdata \ -RUN: 2>&1 | FileCheck --check-prefix READ-BAT-CHECK %s +RUN: --print-heatmap-stats 2>&1 | FileCheck --check-prefix READ-BAT-CHECK %s RUN: FileCheck --input-file %t.yaml --check-prefix YAML-BAT-CHECK %s ## Check that YAML converted from fdata matches YAML created directly with BAT. RUN: llvm-bolt %t.exe -data %t.fdata -w %t.yaml-fdata -o /dev/null \ @@ -46,6 +46,8 @@ WRITE-BAT-CHECK: BOLT-INFO: BAT section size (bytes): 404 READ-BAT-CHECK-NOT: BOLT-ERROR: unable to save profile in YAML format for input file processed by BOLT READ-BAT-CHECK: BOLT-INFO: Parsed 5 BAT entries READ-BAT-CHECK: PERF2BOLT: read 79 aggregated LBR entries +READ-BAT-CHECK: HEATMAP: building heat map +READ-BAT-CHECK: .text, 0x800000, 0x8002cc, 38.7595, 91.6667, 0.3553 READ-BAT-CHECK: BOLT-INFO: 5 out of 21 functions in the binary (23.8%) have non-empty execution profile READ-BAT-FDATA-CHECK: BOLT-INFO: 5 out of 16 functions in the binary (31.2%) have non-empty execution profile diff --git a/bolt/test/X86/heatmap-preagg.test b/bolt/test/X86/heatmap-preagg.test new file mode 100644 index 0000000000000..702dc804f5133 --- /dev/null +++ b/bolt/test/X86/heatmap-preagg.test @@ -0,0 +1,35 @@ +## Test heatmap with pre-aggregated profile + +RUN: yaml2obj %p/Inputs/blarge_new.yaml &> %t.exe +## Non-BOLTed input binary +RUN: llvm-bolt-heatmap %t.exe -o %t --pa -p %p/Inputs/blarge_new.preagg.txt \ +RUN: 2>&1 | FileCheck --check-prefix CHECK-HEATMAP %s +RUN: FileCheck %s --check-prefix CHECK-SEC-HOT --input-file %t-section-hotness.csv + +## BOLTed input binary +RUN: llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt \ +RUN: --reorder-blocks=ext-tsp --split-functions --split-strategy=cdsplit \ +RUN: --reorder-functions=cdsort --enable-bat --dyno-stats --skip-funcs=main +RUN: llvm-bolt-heatmap %t.out -o %t2 --pa -p %p/Inputs/blarge_new_bat.preagg.txt \ +RUN: 2>&1 | FileCheck --check-prefix CHECK-HEATMAP-BAT %s +RUN: FileCheck %s --check-prefix CHECK-SEC-HOT-BAT --input-file %t2-section-hotness.csv + +CHECK-HEATMAP: PERF2BOLT: read 81 aggregated LBR entries +CHECK-HEATMAP: HEATMAP: invalid traces: 1 + +CHECK-SEC-HOT: Section Name, Begin Address, End Address, Percentage Hotness, Utilization Pct, Partition Score +CHECK-SEC-HOT-NEXT: .init, 0x401000, 0x40101b, 16.8545, 100.0000, 0.1685 +CHECK-SEC-HOT-NEXT: .plt, 0x401020, 0x4010b0, 4.7583, 66.6667, 0.0317 +CHECK-SEC-HOT-NEXT: .text, 0x4010b0, 0x401c25, 78.3872, 85.1064, 0.6671 +CHECK-SEC-HOT-NEXT: .fini, 0x401c28, 0x401c35, 0.0000, 0.0000, 0.0000 + +CHECK-HEATMAP-BAT: PERF2BOLT: read 79 aggregated LBR entries +CHECK-HEATMAP-BAT: HEATMAP: invalid traces: 2 + +CHECK-SEC-HOT-BAT: Section Name, Begin Address, End Address, Percentage Hotness, Utilization Pct, Partition Score +CHECK-SEC-HOT-BAT-NEXT: .init, 0x401000, 0x40101b, 17.2888, 100.0000, 0.1729 +CHECK-SEC-HOT-BAT-NEXT: .plt, 0x401020, 0x4010b0, 5.6132, 66.6667, 0.0374 +CHECK-SEC-HOT-BAT-NEXT: .bolt.org.text, 0x4010b0, 0x401c25, 38.3385, 51.0638, 0.1958 +CHECK-SEC-HOT-BAT-NEXT: .fini, 0x401c28, 0x401c35, 0.0000, 0.0000, 0.0000 +CHECK-SEC-HOT-BAT-NEXT: .text, 0x800000, 0x8002cc, 38.7595, 91.6667, 0.3553 +CHECK-SEC-HOT-BAT-NEXT: .text.cold, 0x800300, 0x800415, 0.0000, 0.0000, 0.0000 diff --git a/bolt/test/X86/pre-aggregated-perf.test b/bolt/test/X86/pre-aggregated-perf.test index cf745ca7bf7b6..da0d2950f037a 100644 --- a/bolt/test/X86/pre-aggregated-perf.test +++ b/bolt/test/X86/pre-aggregated-perf.test @@ -11,10 +11,12 @@ REQUIRES: system-linux RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated.txt -w %t.new \ -RUN: --show-density \ +RUN: --show-density --print-heatmap-stats \ RUN: --profile-density-threshold=9 --profile-density-cutoff-hot=970000 \ RUN: --profile-use-dfs | FileCheck %s --check-prefix=CHECK-P2B +CHECK-P2B: HEATMAP: building heat map +CHECK-P2B: .text, 0x400680, 0x401232, 100.0000, 4.2553, 0.0426 CHECK-P2B: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile CHECK-P2B: BOLT-INFO: Functions with density >= 21.7 account for 97.00% total sample counts. diff --git a/bolt/test/perf2bolt/perf_test.test b/bolt/test/perf2bolt/perf_test.test index 7bec4420214d6..79771e29714c7 100644 --- a/bolt/test/perf2bolt/perf_test.test +++ b/bolt/test/perf2bolt/perf_test.test @@ -4,11 +4,16 @@ REQUIRES: system-linux, perf RUN: %clang %S/Inputs/perf_test.c -fuse-ld=lld -Wl,--script=%S/Inputs/perf_test.lds -o %t RUN: perf record -Fmax -e cycles:u -o %t2 -- %t -RUN: perf2bolt %t -p=%t2 -o %t3 -nl -ignore-build-id 2>&1 | FileCheck %s +RUN: perf2bolt %t -p=%t2 -o %t3 -nl -ignore-build-id --print-heatmap-stats \ +RUN: 2>&1 | FileCheck %s CHECK-NOT: PERF2BOLT-ERROR CHECK-NOT: !! WARNING !! This high mismatch ratio indicates the input binary is probably not the same binary used during profiling collection. +CHECK: HEATMAP: building heat map +CHECK-NEXT: Section Name, Begin Address, End Address, Percentage Hotness, Utilization Pct, Partition Score +CHECK: BOLT-INFO: Functions with density >= {{.*}} account for 99.00% total sample counts. RUN: %clang %S/Inputs/perf_test.c -no-pie -fuse-ld=lld -o %t4 RUN: perf record -Fmax -e cycles:u -o %t5 -- %t4 -RUN: perf2bolt %t4 -p=%t5 -o %t6 -nl -ignore-build-id 2>&1 | FileCheck %s +RUN: perf2bolt %t4 -p=%t5 -o %t6 -nl -ignore-build-id --print-heatmap-stats \ +RUN: 2>&1 | FileCheck %s