BOLT: accept pre-aggregated trace ('T') records.

Summary of what this patch does, as visible in the hunks below:
  * Adds AggregatedLBREntry::TRACE and a 'T' record kind carrying three
    locations (from, to, fall-through end).
  * parseAggregatedLBREntry() now returns std::error_code, marks the
    containing functions as having profile, and appends entries to
    AggregatedLBRs itself. A 'T' record is stored as the TRACE (branch) entry
    followed by a fall-through entry: FT when both branch ends are in the
    same function, FT_EXTERNAL_ORIGIN otherwise.
  * doBranch() drops its IsPreagg parameter; the call-to-continuation
    conversion is instead gated by the new member flag
    NeedsConvertRetProfileToCallCont, which is set when a B, F or f record
    is parsed (and not for T records).
  * link_fdata.py and callcont-fallthru.s are updated to cover 'T' records.

NOTE(review): this patch text was recovered from a copy in which all line
breaks/indentation were collapsed and every "<...>" token had been stripped.
Line structure was rebuilt to match each hunk's line counts. Template
arguments (ErrorOr<...>, static_cast<...>), regex group names and the
"<placeholder>" names in format comments were reconstructed -- confirm them,
the exact indentation, and tab-vs-space in the .s file against upstream
before applying.

diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index aa83d7f9b13ab..56eb463fc98fc 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -94,7 +94,7 @@ class DataAggregator : public DataReader {
 
   /// Used for parsing specific pre-aggregated input files.
   struct AggregatedLBREntry {
-    enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN };
+    enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN, TRACE };
     Location From;
     Location To;
     uint64_t Count;
@@ -197,6 +197,10 @@ class DataAggregator : public DataReader {
 
   BoltAddressTranslation *BAT{nullptr};
 
+  /// Whether pre-aggregated profile needs to convert branch profile into call
+  /// to continuation fallthrough profile.
+  bool NeedsConvertRetProfileToCallCont{false};
+
   /// Update function execution profile with a recorded trace.
   /// A trace is region of code executed between two LBR entries supplied in
   /// execution order.
@@ -268,8 +272,7 @@ class DataAggregator : public DataReader {
                      uint64_t Mispreds);
 
   /// Register a \p Branch.
-  bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds,
-                bool IsPreagg);
+  bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
 
   /// Register a trace between two LBR entries supplied in execution order.
   bool doTrace(const LBREntry &First, const LBREntry &Second,
@@ -298,7 +301,7 @@ class DataAggregator : public DataReader {
   ErrorOr<PerfMemSample> parseMemSample();
 
   /// Parse pre-aggregated LBR samples created by an external tool
-  ErrorOr<AggregatedLBREntry> parseAggregatedLBREntry();
+  std::error_code parseAggregatedLBREntry();
 
   /// Parse either buildid:offset or just offset, representing a location in the
   /// binary. Used exclusively for pre-aggregated LBR samples.
@@ -384,14 +387,15 @@ class DataAggregator : public DataReader {
   /// memory.
   ///
   /// File format syntax:
-  /// {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count>
-  ///       [<mispred_count>]
+  /// {B|F|f|T} [<start_id>:]<start_offset> [<end_id>:]<end_offset> [<ft_end>]
+  ///       <count> [<mispred_count>]
   ///
   /// B - indicates an aggregated branch
   /// F - an aggregated fall-through
   /// f - an aggregated fall-through with external origin - used to disambiguate
   ///       between a return hitting a basic block head and a regular internal
   ///       jump to the block
+  /// T - an aggregated trace: branch with a fall-through (from, to, ft_end)
   ///
   /// <start_id> - build id of the object containing the start address. We can
   /// skip it for the main binary and use "X" for an unknown object. This will
@@ -402,6 +406,8 @@ class DataAggregator : public DataReader {
   ///
   /// <end_id>, <end_offset> - same for the end address.
   ///
+  /// <ft_end> - same for the fallthrough_end address.
+  ///
   /// <count> - total aggregated count of the branch or a fall-through.
   ///
   /// <mispred_count> - the number of times the branch was mispredicted.
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index de9ec6c1723d5..a859f27569385 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -711,7 +711,7 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
 }
 
 bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
-                              uint64_t Mispreds, bool IsPreagg) {
+                              uint64_t Mispreds) {
   // Returns whether \p Offset in \p Func contains a return instruction.
   auto checkReturn = [&](const BinaryFunction &Func, const uint64_t Offset) {
     auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
@@ -772,7 +772,8 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
     return false;
 
   // Record call to continuation trace.
-  if (IsPreagg && FromFunc != ToFunc && (IsReturn || IsCallCont)) {
+  if (NeedsConvertRetProfileToCallCont && FromFunc != ToFunc &&
+      (IsReturn || IsCallCont)) {
     LBREntry First{ToOrig - 1, ToOrig - 1, false};
     LBREntry Second{ToOrig, ToOrig, false};
     return doTrace(First, Second, Count);
@@ -1216,23 +1217,30 @@ ErrorOr<Location> DataAggregator::parseLocationOrOffset() {
   return Location(true, BuildID.get(), Offset.get());
 }
 
-ErrorOr<DataAggregator::AggregatedLBREntry>
-DataAggregator::parseAggregatedLBREntry() {
+std::error_code DataAggregator::parseAggregatedLBREntry() {
   while (checkAndConsumeFS()) {
   }
 
   ErrorOr<StringRef> TypeOrErr = parseString(FieldSeparator);
   if (std::error_code EC = TypeOrErr.getError())
     return EC;
+  // Pre-aggregated profile with branches and fallthroughs needs to convert
+  // return profile into call to continuation fall-through.
   auto Type = AggregatedLBREntry::BRANCH;
   if (TypeOrErr.get() == "B") {
+    NeedsConvertRetProfileToCallCont = true;
     Type = AggregatedLBREntry::BRANCH;
   } else if (TypeOrErr.get() == "F") {
+    NeedsConvertRetProfileToCallCont = true;
     Type = AggregatedLBREntry::FT;
   } else if (TypeOrErr.get() == "f") {
+    NeedsConvertRetProfileToCallCont = true;
     Type = AggregatedLBREntry::FT_EXTERNAL_ORIGIN;
+  } else if (TypeOrErr.get() == "T") {
+    // Trace is expanded into B and [Ff]
+    Type = AggregatedLBREntry::TRACE;
   } else {
-    reportError("expected B, F or f");
+    reportError("expected T, B, F or f");
     return make_error_code(llvm::errc::io_error);
   }
 
@@ -1248,6 +1256,15 @@ DataAggregator::parseAggregatedLBREntry() {
   if (std::error_code EC = To.getError())
     return EC;
 
+  ErrorOr<Location> TraceFtEnd = std::error_code();
+  if (Type == AggregatedLBREntry::TRACE) {
+    while (checkAndConsumeFS()) {
+    }
+    TraceFtEnd = parseLocationOrOffset();
+    if (std::error_code EC = TraceFtEnd.getError())
+      return EC;
+  }
+
   while (checkAndConsumeFS()) {
   }
   ErrorOr<int64_t> Frequency =
@@ -1270,9 +1287,24 @@ DataAggregator::parseAggregatedLBREntry() {
     return make_error_code(llvm::errc::io_error);
   }
 
-  return AggregatedLBREntry{From.get(), To.get(),
-                            static_cast<uint64_t>(Frequency.get()), Mispreds,
-                            Type};
+  BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From->Offset);
+  BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To->Offset);
+
+  for (BinaryFunction *BF : {FromFunc, ToFunc})
+    if (BF)
+      BF->setHasProfileAvailable();
+
+  uint64_t Count = static_cast<uint64_t>(Frequency.get());
+  AggregatedLBREntry Entry{From.get(), To.get(), Count, Mispreds, Type};
+  AggregatedLBRs.emplace_back(Entry);
+  if (Type == AggregatedLBREntry::TRACE) {
+    auto FtType = (FromFunc == ToFunc) ? AggregatedLBREntry::FT
+                                       : AggregatedLBREntry::FT_EXTERNAL_ORIGIN;
+    AggregatedLBREntry TraceFt{To.get(), TraceFtEnd.get(), Count, 0, FtType};
+    AggregatedLBRs.emplace_back(TraceFt);
+  }
+
+  return std::error_code();
 }
 
 bool DataAggregator::ignoreKernelInterrupt(LBREntry &LBR) const {
@@ -1585,8 +1617,7 @@ void DataAggregator::processBranchEvents() {
   for (const auto &AggrLBR : BranchLBRs) {
     const Trace &Loc = AggrLBR.first;
     const TakenBranchInfo &Info = AggrLBR.second;
-    doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount,
-             /*IsPreagg*/ false);
+    doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount);
   }
 }
 
@@ -1722,18 +1753,10 @@ std::error_code DataAggregator::parsePreAggregatedLBRSamples() {
   outs() << "PERF2BOLT: parsing pre-aggregated profile...\n";
   NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events",
                      TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
-  while (hasData()) {
-    ErrorOr<AggregatedLBREntry> AggrEntry = parseAggregatedLBREntry();
-    if (std::error_code EC = AggrEntry.getError())
+  while (hasData())
+    if (std::error_code EC = parseAggregatedLBREntry())
       return EC;
 
-    for (const uint64_t Addr : {AggrEntry->From.Offset, AggrEntry->To.Offset})
-      if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Addr))
-        BF->setHasProfileAvailable();
-
-    AggregatedLBRs.emplace_back(std::move(AggrEntry.get()));
-  }
-
   return std::error_code();
 }
 
@@ -1746,8 +1769,9 @@ void DataAggregator::processPreAggregated() {
   for (const AggregatedLBREntry &AggrEntry : AggregatedLBRs) {
     switch (AggrEntry.EntryType) {
     case AggregatedLBREntry::BRANCH:
+    case AggregatedLBREntry::TRACE:
       doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count,
-               AggrEntry.Mispreds, /*IsPreagg*/ true);
+               AggrEntry.Mispreds);
       break;
     case AggregatedLBREntry::FT:
     case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: {
diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s
index d76f869c971fd..95cb4c5fc2df4 100644
--- a/bolt/test/X86/callcont-fallthru.s
+++ b/bolt/test/X86/callcont-fallthru.s
@@ -4,19 +4,21 @@
 # RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
 ## Link against a DSO to ensure PLT entries.
 # RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib
-# RUN: link_fdata %s %t %t.pa1 PREAGG
+# RUN: link_fdata %s %t %t.pa1 PREAGG1
 # RUN: link_fdata %s %t %t.pa2 PREAGG2
 # RUN: link_fdata %s %t %t.pa3 PREAGG3
-# RUN: link_fdata %s %t %t.pa4 PREAGG4
+# RUN: link_fdata %s %t %t.pat PREAGGT1
+# RUN: link_fdata %s %t %t.pat2 PREAGGT2
 
 ## Check normal case: fallthrough is not LP or secondary entry.
-# RUN: llvm-strip --strip-unneeded %t -o %t.exe
-# RUN: llvm-bolt %t.exe --pa -p %t.pa1 -o %t.out \
+# RUN: llvm-strip --strip-unneeded %t -o %t.strip
+# RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh
+# RUN: llvm-bolt %t.strip --pa -p %t.pa1 -o %t.out \
 # RUN:   --print-cfg --print-only=main | FileCheck %s
 
 ## Check that getFallthroughsInTrace correctly handles a trace starting at plt
 ## call continuation
-# RUN: llvm-bolt %t.exe --pa -p %t.pa2 -o %t.out2 \
+# RUN: llvm-bolt %t.strip --pa -p %t.pa2 -o %t.out2 \
 # RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK2
 
 ## Check that we don't treat secondary entry points as call continuation sites.
@@ -24,8 +26,21 @@
 # RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
 
 ## Check fallthrough to a landing pad case.
-# RUN: llvm-bolt %t.exe --pa -p %t.pa4 -o %t.out \
-# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK4
+# RUN: llvm-bolt %t.strip --pa -p %t.pa3 -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
+
+## Check pre-aggregated traces attach call continuation fallthrough count
+# RUN: llvm-bolt %t.noeh --pa -p %t.pat -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s
+
+## Check pre-aggregated traces don't attach call continuation fallthrough count
+## to secondary entry point (unstripped)
+# RUN: llvm-bolt %t --pa -p %t.pat2 -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
+## Check pre-aggregated traces don't attach call continuation fallthrough count
+## to landing pad (stripped, LP)
+# RUN: llvm-bolt %t.strip --pa -p %t.pat2 -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
 
   .globl foo
   .type foo, %function
@@ -51,8 +66,9 @@ main:
 	movl	%edi, -0x8(%rbp)
 	movq	%rsi, -0x10(%rbp)
 	callq	puts@PLT
-## Target is a call continuation
-# PREAGG: B X:0 #Ltmp1# 2 0
+## Target is an external-origin call continuation
+# PREAGG1: B X:0 #Ltmp1# 2 0
+# PREAGGT1: T X:0 #Ltmp1# #Ltmp4_br# 2
 # CHECK:      callq puts@PLT
 # CHECK-NEXT: count: 2
 
@@ -63,14 +79,16 @@ Ltmp1:
 
 Ltmp4:
 	cmpl	$0x0, -0x14(%rbp)
+Ltmp4_br:
 	je	Ltmp0
 # CHECK2:      je .Ltmp0
 # CHECK2-NEXT: count: 3
 
 	movl	$0xa, -0x18(%rbp)
 	callq	foo
-## Target is a call continuation
-# PREAGG: B #Lfoo_ret# #Ltmp3# 1 0
+## Target is a binary-local call continuation
+# PREAGG1: B #Lfoo_ret# #Ltmp3# 1 0
+# PREAGGT1: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
 # CHECK:      callq foo
 # CHECK-NEXT: count: 1
 
@@ -79,16 +97,12 @@ Ltmp4:
 # CHECK2:      callq foo
 # CHECK2-NEXT: count: 3
 
-## Target is a secondary entry point
+## Target is a secondary entry point (unstripped) or a landing pad (stripped)
 # PREAGG3: B X:0 #Ltmp3# 2 0
+# PREAGGT2: T X:0 #Ltmp3# #Ltmp3_br# 2
 # CHECK3:      callq foo
 # CHECK3-NEXT: count: 0
 
-## Target is a landing pad
-# PREAGG4: B X:0 #Ltmp3# 2 0
-# CHECK4:      callq puts@PLT
-# CHECK4-NEXT: count: 0
-
 Ltmp3:
 	cmpl	$0x0, -0x18(%rbp)
 Ltmp3_br:
diff --git a/bolt/test/link_fdata.py b/bolt/test/link_fdata.py
index 3837e394ccc87..028823a69ce00 100755
--- a/bolt/test/link_fdata.py
+++ b/bolt/test/link_fdata.py
@@ -34,9 +34,9 @@
 fdata_pat = re.compile(r"([01].*) (?P<exec>\d+) (?P<mispred>\d+)")
 
 # Pre-aggregated profile:
-# {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count>
-#       [<mispred_count>]
-preagg_pat = re.compile(r"(?P<type>[BFf]) (?P<offsets_count>.*)")
+# {T|B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> [<ft_end>]
+#       <count> [<mispred_count>]
+preagg_pat = re.compile(r"(?P<type>[TBFf]) (?P<offsets_count>.*)")
 
 # No-LBR profile:
 # <is symbol?> <closest elf symbol or DSO name> <relative address> <count>