-
Notifications
You must be signed in to change notification settings - Fork 11.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MC][NFC] Count pseudo probes and function records #102774
[MC][NFC] Count pseudo probes and function records #102774
Conversation
Created using spr 1.3.4
@llvm/pr-subscribers-mc @llvm/pr-subscribers-bolt Author: Amir Ayupov (aaupov) ChangesPre-parse pseudo probes section counting the number of probes and Additional benefit is avoiding error handling during parsing. This pre-parsing is fast: for a 404MiB .pseudo_probe section with Full diff: https://github.com/llvm/llvm-project/pull/102774.diff 3 Files Affected:
diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
index 886bbdbf9d686e..37a5b937ebcaa3 100644
--- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
+++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
@@ -143,7 +143,6 @@ void PseudoProbeRewriter::parsePseudoProbe() {
if (!ProbeDecoder.buildAddress2ProbeMap(
reinterpret_cast<const uint8_t *>(Contents.data()), Contents.size(),
GuidFilter, FuncStartAddrs)) {
- ProbeDecoder.getAddress2ProbesMap().clear();
errs() << "BOLT-WARNING: fail in building Address2ProbeMap\n";
return;
}
diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h
index 54a7f832e50081..b42bbf2057f161 100644
--- a/llvm/include/llvm/MC/MCPseudoProbe.h
+++ b/llvm/include/llvm/MC/MCPseudoProbe.h
@@ -369,6 +369,11 @@ class MCPseudoProbeDecoder {
// Decode pseudo_probe_desc section to build GUID to PseudoProbeFuncDesc map.
bool buildGUID2FuncDescMap(const uint8_t *Start, std::size_t Size);
+ // Decode pseudo_probe section to count the number of probes and inlined
+ // function records for each function record.
+ bool countRecords(bool IsTopLevelFunc, bool &Discard, uint32_t &ProbeCount,
+ uint32_t &InlinedCount, const Uint64Set &GuidFilter);
+
// Decode pseudo_probe section to build address to probes map for specifed
// functions only.
bool buildAddress2ProbeMap(const uint8_t *Start, std::size_t Size,
diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp
index a5a030e19b849e..bb9538d55b7aa2 100644
--- a/llvm/lib/MC/MCPseudoProbe.cpp
+++ b/llvm/lib/MC/MCPseudoProbe.cpp
@@ -18,6 +18,7 @@
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"
@@ -429,17 +430,11 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
Index = Cur->getChildren().size();
} else {
// Read inline site for inlinees
- auto ErrorOrIndex = readUnsignedNumber<uint32_t>();
- if (!ErrorOrIndex)
- return false;
- Index = std::move(*ErrorOrIndex);
+ Index = cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
}
// Read guid
- auto ErrorOrCurGuid = readUnencodedNumber<uint64_t>();
- if (!ErrorOrCurGuid)
- return false;
- uint64_t Guid = std::move(*ErrorOrCurGuid);
+ uint64_t Guid = cantFail(errorOrToExpected(readUnencodedNumber<uint64_t>()));
// Decide if top-level node should be disgarded.
if (IsTopLevelFunc && !GuidFilter.empty() && !GuidFilter.count(Guid))
@@ -457,41 +452,27 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
}
// Read number of probes in the current node.
- auto ErrorOrNodeCount = readUnsignedNumber<uint32_t>();
- if (!ErrorOrNodeCount)
- return false;
- uint32_t NodeCount = std::move(*ErrorOrNodeCount);
+ uint32_t NodeCount =
+ cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
// Read number of direct inlinees
- auto ErrorOrCurChildrenToProcess = readUnsignedNumber<uint32_t>();
- if (!ErrorOrCurChildrenToProcess)
- return false;
+ uint32_t ChildrenToProcess =
+ cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
// Read all probes in this node
for (std::size_t I = 0; I < NodeCount; I++) {
// Read index
- auto ErrorOrIndex = readUnsignedNumber<uint32_t>();
- if (!ErrorOrIndex)
- return false;
- uint32_t Index = std::move(*ErrorOrIndex);
+ uint32_t Index =
+ cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
// Read type | flag.
- auto ErrorOrValue = readUnencodedNumber<uint8_t>();
- if (!ErrorOrValue)
- return false;
- uint8_t Value = std::move(*ErrorOrValue);
+ uint8_t Value = cantFail(errorOrToExpected(readUnencodedNumber<uint8_t>()));
uint8_t Kind = Value & 0xf;
uint8_t Attr = (Value & 0x70) >> 4;
// Read address
uint64_t Addr = 0;
if (Value & 0x80) {
- auto ErrorOrOffset = readSignedNumber<int64_t>();
- if (!ErrorOrOffset)
- return false;
- int64_t Offset = std::move(*ErrorOrOffset);
+ int64_t Offset = cantFail(errorOrToExpected(readSignedNumber<int64_t>()));
Addr = LastAddr + Offset;
} else {
- auto ErrorOrAddr = readUnencodedNumber<int64_t>();
- if (!ErrorOrAddr)
- return false;
- Addr = std::move(*ErrorOrAddr);
+ Addr = cantFail(errorOrToExpected(readUnencodedNumber<int64_t>()));
if (isSentinelProbe(Attr)) {
// For sentinel probe, the addr field actually stores the GUID of the
// split function. Convert it to the real address.
@@ -508,10 +489,8 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
uint32_t Discriminator = 0;
if (hasDiscriminator(Attr)) {
- auto ErrorOrDiscriminator = readUnsignedNumber<uint32_t>();
- if (!ErrorOrDiscriminator)
- return false;
- Discriminator = std::move(*ErrorOrDiscriminator);
+ Discriminator =
+ cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
}
if (Cur && !isSentinelProbe(Attr)) {
@@ -524,17 +503,109 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
LastAddr = Addr;
}
- uint32_t ChildrenToProcess = std::move(*ErrorOrCurChildrenToProcess);
for (uint32_t I = 0; I < ChildrenToProcess; I++) {
buildAddress2ProbeMap(Cur, LastAddr, GuidFilter, FuncStartAddrs);
}
+ return true;
+}
+
+bool MCPseudoProbeDecoder::countRecords(bool IsTopLevelFunc, bool &Discard,
+ uint32_t &ProbeCount,
+ uint32_t &InlinedCount,
+ const Uint64Set &GuidFilter) {
+ if (!IsTopLevelFunc)
+ // Read inline site for inlinees
+ if (!readUnsignedNumber<uint32_t>())
+ return false;
+
+ // Read guid
+ auto ErrorOrCurGuid = readUnencodedNumber<uint64_t>();
+ if (!ErrorOrCurGuid)
+ return false;
+ uint64_t Guid = std::move(*ErrorOrCurGuid);
+
+ // Decide if top-level node should be disgarded.
+ if (IsTopLevelFunc) {
+ Discard = !GuidFilter.empty() && !GuidFilter.count(Guid);
+ if (!Discard)
+ // Allocate an entry for top-level function record.
+ ++InlinedCount;
+ }
+
+ // Read number of probes in the current node.
+ auto ErrorOrNodeCount = readUnsignedNumber<uint32_t>();
+ if (!ErrorOrNodeCount)
+ return false;
+ uint32_t NodeCount = std::move(*ErrorOrNodeCount);
+ uint32_t CurrentProbeCount = 0;
+
+ // Read number of direct inlinees
+ auto ErrorOrCurChildrenToProcess = readUnsignedNumber<uint32_t>();
+ if (!ErrorOrCurChildrenToProcess)
+ return false;
+ uint32_t ChildrenToProcess = std::move(*ErrorOrCurChildrenToProcess);
+
+ // Read all probes in this node
+ for (std::size_t I = 0; I < NodeCount; I++) {
+ // Read index
+ if (!readUnsignedNumber<uint32_t>())
+ return false;
+
+ // Read type | flag.
+ auto ErrorOrValue = readUnencodedNumber<uint8_t>();
+ if (!ErrorOrValue)
+ return false;
+ uint8_t Value = std::move(*ErrorOrValue);
+
+ uint8_t Attr = (Value & 0x70) >> 4;
+ if (Value & 0x80) {
+ // Offset
+ if (!readSignedNumber<int64_t>())
+ return false;
+ } else {
+ // Addr
+ if (!readUnencodedNumber<int64_t>())
+ return false;
+ }
+
+ if (hasDiscriminator(Attr))
+ // Discriminator
+ if (!readUnsignedNumber<uint32_t>())
+ return false;
+
+ if (!Discard && !isSentinelProbe(Attr))
+ ++CurrentProbeCount;
+ }
+ if (!Discard) {
+ ProbeCount += CurrentProbeCount;
+ InlinedCount += ChildrenToProcess;
+ }
+
+ for (uint32_t I = 0; I < ChildrenToProcess; I++)
+ if (!countRecords(false, Discard, ProbeCount, InlinedCount, GuidFilter))
+ return false;
return true;
}
bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
const uint8_t *Start, std::size_t Size, const Uint64Set &GuidFilter,
const Uint64Map &FuncStartAddrs) {
+ // For function records in the order of their appearance in the encoded data
+ // (DFS), count the number of contained probes and inlined function records.
+ uint32_t ProbeCount = 0;
+ uint32_t InlinedCount = 0;
+ uint32_t TopLevelFuncs = 0;
+ Data = Start;
+ End = Data + Size;
+ bool Discard = false;
+ while (Data < End) {
+ if (!countRecords(true, Discard, ProbeCount, InlinedCount, GuidFilter))
+ return false;
+ TopLevelFuncs += !Discard;
+ }
+ assert(Data == End && "Have unprocessed data in pseudo_probe section");
+
Data = Start;
End = Data + Size;
uint64_t LastAddr = 0;
|
Can you add @wlei-llvm as reviewer for MC changes touching pseudo-probe going forward? thanks. |
Created using spr 1.3.5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks.
…unction records Use #102774 to allocate storage for decoded probes (`PseudoProbeVec`) and function records (`InlineTreeVec`). Leverage that to also shrink sizes of `MCDecodedPseudoProbe`: - Drop Guid since it's accessible via `InlineTree`. `MCDecodedPseudoProbeInlineTree`: - Keep track of probes and inlinees using `ArrayRef`s now that probes and function records belonging to the same function are allocated contiguously. This reduces peak RSS from 13.7 GiB to 9.7 GiB and pseudo probe parsing time (as part of perf2bolt) from 15.3s to 9.6s for a large binary with 400MiB .pseudo_probe section containing 43M probes and 25M function records. Depends on: #102774 #102787 #102788 Reviewers: maksfb, rafaelauler, dcci, ayermolo, wlei-llvm Reviewed By: wlei-llvm Pull Request: #102789
Pre-parse pseudo probes section counting the number of probes and function records. These numbers are used in follow-up diff to pre-allocate vectors for decoded probes and inline tree nodes. Additional benefit is avoiding error handling during parsing. This pre-parsing is fast: for a 404MiB .pseudo_probe section with 43373881 probes and 25228770 function records, it only takes 0.68±0.01s. The total time of buildAddress2ProbeMap is 21s. Reviewers: dcci, maksfb, rafaelauler, wlei-llvm, ayermolo Reviewed By: wlei-llvm Pull Request: llvm#102774
…unction records Use llvm#102774 to allocate storage for decoded probes (`PseudoProbeVec`) and function records (`InlineTreeVec`). Leverage that to also shrink sizes of `MCDecodedPseudoProbe`: - Drop Guid since it's accessible via `InlineTree`. `MCDecodedPseudoProbeInlineTree`: - Keep track of probes and inlinees using `ArrayRef`s now that probes and function records belonging to the same function are allocated contiguously. This reduces peak RSS from 13.7 GiB to 9.7 GiB and pseudo probe parsing time (as part of perf2bolt) from 15.3s to 9.6s for a large binary with 400MiB .pseudo_probe section containing 43M probes and 25M function records. Depends on: llvm#102774 llvm#102787 llvm#102788 Reviewers: maksfb, rafaelauler, dcci, ayermolo, wlei-llvm Reviewed By: wlei-llvm Pull Request: llvm#102789
Pre-parse pseudo probes section counting the number of probes and function records. These numbers are used in follow-up diff to pre-allocate vectors for decoded probes and inline tree nodes. Additional benefit is avoiding error handling during parsing. This pre-parsing is fast: for a 404MiB .pseudo_probe section with 43373881 probes and 25228770 function records, it only takes 0.68±0.01s. The total time of buildAddress2ProbeMap is 21s. Reviewers: dcci, maksfb, rafaelauler, wlei-llvm, ayermolo Reviewed By: wlei-llvm Pull Request: llvm#102774
…unction records Use llvm#102774 to allocate storage for decoded probes (`PseudoProbeVec`) and function records (`InlineTreeVec`). Leverage that to also shrink sizes of `MCDecodedPseudoProbe`: - Drop Guid since it's accessible via `InlineTree`. `MCDecodedPseudoProbeInlineTree`: - Keep track of probes and inlinees using `ArrayRef`s now that probes and function records belonging to the same function are allocated contiguously. This reduces peak RSS from 13.7 GiB to 9.7 GiB and pseudo probe parsing time (as part of perf2bolt) from 15.3s to 9.6s for a large binary with 400MiB .pseudo_probe section containing 43M probes and 25M function records. Depends on: llvm#102774 llvm#102787 llvm#102788 Reviewers: maksfb, rafaelauler, dcci, ayermolo, wlei-llvm Reviewed By: wlei-llvm Pull Request: llvm#102789
Pre-parse pseudo probes section counting the number of probes and function records. These numbers are used in follow-up diff to pre-allocate vectors for decoded probes and inline tree nodes. Additional benefit is avoiding error handling during parsing. This pre-parsing is fast: for a 404MiB .pseudo_probe section with 43373881 probes and 25228770 function records, it only takes 0.68±0.01s. The total time of buildAddress2ProbeMap is 21s. Reviewers: dcci, maksfb, rafaelauler, wlei-llvm, ayermolo Reviewed By: wlei-llvm Pull Request: #102774
…unction records Use #102774 to allocate storage for decoded probes (`PseudoProbeVec`) and function records (`InlineTreeVec`). Leverage that to also shrink sizes of `MCDecodedPseudoProbe`: - Drop Guid since it's accessible via `InlineTree`. `MCDecodedPseudoProbeInlineTree`: - Keep track of probes and inlinees using `ArrayRef`s now that probes and function records belonging to the same function are allocated contiguously. This reduces peak RSS from 13.7 GiB to 9.7 GiB and pseudo probe parsing time (as part of perf2bolt) from 15.3s to 9.6s for a large binary with 400MiB .pseudo_probe section containing 43M probes and 25M function records. Depends on: #102774 #102787 #102788 Reviewers: maksfb, rafaelauler, dcci, ayermolo, wlei-llvm Reviewed By: wlei-llvm Pull Request: #102789
Pre-parse pseudo probes section counting the number of probes and
function records. These numbers are used in follow-up diff to
pre-allocate vectors for decoded probes and inline tree nodes.
Additional benefit is avoiding error handling during parsing.
This pre-parsing is fast: for a 404MiB .pseudo_probe section with
43373881 probes and 25228770 function records, it only takes 0.68±0.01s.
The total time of buildAddress2ProbeMap is 21s.