Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup avalanche bootstrapping fetching #2947

Merged
merged 119 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
119 commits
Select commit Hold shift + click to select a range
16bf45a
Cleanup ID initialization
StephenButtolph Jan 31, 2024
e8724d1
wip
StephenButtolph Feb 1, 2024
11749d3
Merge branch 'master' into parallelize-bootstrapping-fetching
StephenButtolph Feb 10, 2024
5f18895
wip
StephenButtolph Feb 11, 2024
d016bae
Add checkpoints and validators
StephenButtolph Feb 19, 2024
27206ba
Fetch checkpoints during bootstrapping
StephenButtolph Feb 19, 2024
31ca68f
Fix comment
StephenButtolph Feb 19, 2024
a1a9f18
Improve comment
StephenButtolph Feb 19, 2024
b61359a
Merge branch 'master' into add-checkpoints
StephenButtolph Feb 20, 2024
c5b7aa9
Implement interval tree for syncing
StephenButtolph Feb 20, 2024
63efd3b
Persist intervals
StephenButtolph Feb 20, 2024
2454c46
rename
StephenButtolph Feb 20, 2024
0efa5eb
Add block execution
StephenButtolph Feb 21, 2024
6a65a2a
Add length + TODOs
StephenButtolph Feb 21, 2024
f2234bd
nit
StephenButtolph Feb 21, 2024
1f63238
error rather than panic
StephenButtolph Feb 21, 2024
6947876
Refactor database passing
StephenButtolph Feb 21, 2024
28207f4
Merge branch 'interval-tree-syncing' into interval-tree-syncing-integ…
StephenButtolph Mar 19, 2024
be64296
Add interval tests
StephenButtolph Mar 19, 2024
884f39c
Fix too frequent iterator releases and batch writes
StephenButtolph Mar 19, 2024
e4e92cc
nit + support cancellation
StephenButtolph Mar 19, 2024
72314d1
Add invariant comment + tests
StephenButtolph Mar 19, 2024
e494033
Merge branch 'interval-tree-syncing' into interval-tree-syncing-integ…
StephenButtolph Mar 19, 2024
a64b863
wip
StephenButtolph Mar 19, 2024
782a02c
wip
StephenButtolph Mar 20, 2024
8851470
Merge branch 'master' into interval-tree-syncing
StephenButtolph Mar 20, 2024
2b07179
Merge branch 'interval-tree-syncing' into interval-tree-syncing-integ…
StephenButtolph Mar 20, 2024
5358587
support cancellation
StephenButtolph Mar 20, 2024
9d24b20
save
StephenButtolph Mar 20, 2024
0de6142
remove test code
StephenButtolph Mar 20, 2024
e658404
keep acceptor callback
StephenButtolph Mar 20, 2024
03f60c6
lint
StephenButtolph Mar 20, 2024
30ac2fb
Merge branch 'interval-tree-syncing' into interval-tree-syncing-integ…
StephenButtolph Mar 20, 2024
f81281a
fix test
StephenButtolph Mar 20, 2024
410db8c
Merge branch 'interval-tree-syncing-integration' of github.com:ava-la…
StephenButtolph Mar 20, 2024
984849c
remove incorrect db cleanup
StephenButtolph Mar 20, 2024
f4a724a
nits
StephenButtolph Mar 20, 2024
f01b6a3
allow specifying the log level
StephenButtolph Mar 20, 2024
9d7383f
merged
StephenButtolph Mar 20, 2024
0021f3e
cleanup metrics
StephenButtolph Mar 20, 2024
f1933d6
lint
StephenButtolph Mar 20, 2024
dd4a57f
add error information
StephenButtolph Mar 20, 2024
d5da1e9
upstream
StephenButtolph Mar 20, 2024
7373a89
Merge branch 'interval-tree-syncing' into interval-tree-syncing-integ…
StephenButtolph Mar 20, 2024
25e2801
cleanup
StephenButtolph Mar 20, 2024
5ca206b
reduce copied code
StephenButtolph Mar 20, 2024
cd90ef1
Merge branch 'interval-tree-syncing-integration' of github.com:ava-la…
StephenButtolph Mar 20, 2024
d9a7f8a
nit cleanup
StephenButtolph Mar 20, 2024
bd2b2d2
nit cleanup
StephenButtolph Mar 20, 2024
f0c5c10
Merge branch 'master' into interval-tree-syncing
StephenButtolph Mar 20, 2024
68ea448
Merge branch 'interval-tree-syncing' into interval-tree-syncing-integ…
StephenButtolph Mar 20, 2024
bef4ed5
merged
StephenButtolph Mar 20, 2024
f05e324
nit
StephenButtolph Mar 20, 2024
b6d6ba9
Merge branch 'master' into interval-tree-syncing
StephenButtolph Mar 21, 2024
281dc8d
merged
StephenButtolph Mar 21, 2024
06fd656
Merge branch 'master' into interval-tree-syncing
StephenButtolph Mar 21, 2024
672c23f
merged
StephenButtolph Mar 21, 2024
175ab8e
nit
StephenButtolph Mar 21, 2024
c89d24d
Remove redundant interface
StephenButtolph Mar 21, 2024
249819c
Merge branch 'interval-tree-syncing' into interval-tree-syncing-integ…
StephenButtolph Mar 21, 2024
a736e6c
nit
StephenButtolph Mar 21, 2024
faa5329
nit
StephenButtolph Mar 21, 2024
bc3fedf
nit
StephenButtolph Mar 25, 2024
8134e16
nit
StephenButtolph Mar 25, 2024
9991e39
Move GetMissingBlockIDs and Execute out of the interval package
StephenButtolph Mar 25, 2024
2877502
Merge branch 'master' into interval-tree-syncing
StephenButtolph Mar 25, 2024
50754ad
Merge branch 'interval-tree-syncing' into interval-tree-syncing-integ…
StephenButtolph Mar 25, 2024
a7e2458
wip
StephenButtolph Mar 25, 2024
f1dd0a6
nit
StephenButtolph Mar 26, 2024
bf29d86
remove storage
StephenButtolph Mar 26, 2024
0626129
merged
StephenButtolph Mar 26, 2024
5a8fadc
nit
StephenButtolph Mar 26, 2024
69ed897
nit
StephenButtolph Mar 26, 2024
9815047
nit
StephenButtolph Mar 26, 2024
5bf4899
nit
StephenButtolph Mar 26, 2024
9da3bc1
nit
StephenButtolph Mar 26, 2024
93aac21
Merge branch 'interval-tree-syncing' into interval-tree-syncing-integ…
StephenButtolph Mar 26, 2024
bd5687a
nit
StephenButtolph Mar 26, 2024
b6e3ee1
merged
StephenButtolph Mar 26, 2024
c376df2
add process test
StephenButtolph Mar 26, 2024
53bdb78
nit
StephenButtolph Mar 26, 2024
7310832
nit
StephenButtolph Mar 26, 2024
7de5499
wip
StephenButtolph Mar 26, 2024
4f88827
wip
StephenButtolph Mar 26, 2024
21d1fd1
remove duplicate code
StephenButtolph Mar 26, 2024
dd98c10
nit
StephenButtolph Mar 26, 2024
81f4b67
nit fix log check
StephenButtolph Mar 26, 2024
03a22b8
nit
StephenButtolph Mar 26, 2024
3499480
merged
StephenButtolph Mar 26, 2024
f19715d
merged
StephenButtolph Apr 16, 2024
ab8688c
add comment
StephenButtolph Apr 16, 2024
29913c9
update checkpoints
StephenButtolph Apr 16, 2024
baec0cc
fix merge
StephenButtolph Apr 16, 2024
5ab0a6e
update validators
StephenButtolph Apr 16, 2024
348cb0f
Manually track recent validators
StephenButtolph Apr 16, 2024
e3b7eed
merged
StephenButtolph Apr 16, 2024
a8ff0a1
parallel
StephenButtolph Apr 16, 2024
667372c
increase number of checkpoints
StephenButtolph Apr 16, 2024
db7c0c8
update validators
StephenButtolph Apr 16, 2024
ed549e3
Add regression test
StephenButtolph Apr 16, 2024
266d775
merged
StephenButtolph Apr 16, 2024
2802c19
Remove dead code
StephenButtolph Apr 16, 2024
bb1748f
fix tests
StephenButtolph Apr 16, 2024
ad60d8a
Apply suggestions from code review
StephenButtolph Apr 16, 2024
bcf1c26
nits
StephenButtolph Apr 16, 2024
754fd1c
regen
StephenButtolph Apr 16, 2024
1348b66
nit
StephenButtolph Apr 16, 2024
28a6c3b
Merge branch 'add-checkpoints' into parallelize-bootstrapping-fetching
StephenButtolph Apr 16, 2024
71513e7
nit
StephenButtolph Apr 16, 2024
6578195
Merge branch 'parallelize-bootstrapping-fetching' of github.com:ava-l…
StephenButtolph Apr 16, 2024
918acdf
nit
StephenButtolph Apr 16, 2024
713a06f
this test sucks
StephenButtolph Apr 16, 2024
bdeb1b8
Cleanup avalanche bootstrapping fetching
StephenButtolph Apr 17, 2024
965dc19
merged
StephenButtolph Apr 17, 2024
951f30c
Merge branch 'parallelize-bootstrapping-fetching' into parallelize-bo…
StephenButtolph Apr 17, 2024
76ad408
Merge branch 'master' into parallelize-bootstrapping-fetching
StephenButtolph Apr 17, 2024
8bcc74c
Merge branch 'parallelize-bootstrapping-fetching' into parallelize-bo…
StephenButtolph Apr 17, 2024
adcde6e
merged
StephenButtolph Apr 18, 2024
6d2043a
merged
StephenButtolph Apr 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion chains/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -885,9 +885,9 @@ func (m *manager) createAvalancheChain(
avalancheBootstrapperConfig := avbootstrap.Config{
AllGetsServer: avaGetHandler,
Ctx: ctx,
Beacons: vdrs,
StartupTracker: startupTracker,
Sender: avalancheMessageSender,
PeerTracker: peerTracker,
AncestorsMaxContainersReceived: m.BootstrapAncestorsMaxContainersReceived,
VtxBlocked: vtxBlocker,
TxBlocked: txBlocker,
Expand Down
169 changes: 89 additions & 80 deletions snow/engine/avalanche/bootstrap/bootstrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package bootstrap
import (
"context"
"fmt"
"time"

"go.uber.org/zap"

Expand All @@ -18,7 +19,6 @@ import (
"github.com/ava-labs/avalanchego/snow/engine/common"
"github.com/ava-labs/avalanchego/utils/bimap"
"github.com/ava-labs/avalanchego/utils/heap"
"github.com/ava-labs/avalanchego/utils/logging"
"github.com/ava-labs/avalanchego/utils/set"
"github.com/ava-labs/avalanchego/version"
)
Expand All @@ -38,6 +38,8 @@ const (
// maxOutstandingGetAncestorsRequests is the maximum number of GetAncestors
// sent but not yet responded to/failed
maxOutstandingGetAncestorsRequests = 10

epsilon = 1e-6 // small amount to add to time to avoid division by 0
)

var _ common.BootstrapableEngine = (*bootstrapper)(nil)
Expand All @@ -58,7 +60,8 @@ func New(
ChitsHandler: common.NewNoOpChitsHandler(config.Ctx.Log),
AppHandler: config.VM,

outstandingRequests: bimap.New[common.Request, ids.ID](),
outstandingRequests: bimap.New[common.Request, ids.ID](),
outstandingRequestTimes: make(map[common.Request]time.Time),

processedCache: &cache.LRU[ids.ID, struct{}]{Size: cacheSize},
onFinished: onFinished,
Expand All @@ -85,7 +88,8 @@ type bootstrapper struct {
metrics

// tracks which validators were asked for which containers in which requests
outstandingRequests *bimap.BiMap[common.Request, ids.ID]
outstandingRequests *bimap.BiMap[common.Request, ids.ID]
outstandingRequestTimes map[common.Request]time.Time

// IDs of vertices that we will send a GetAncestors request for once we are
// not at the max number of outstanding requests
Expand Down Expand Up @@ -125,84 +129,76 @@ func (b *bootstrapper) Clear(context.Context) error {
// response to a GetAncestors message to [nodeID] with request ID [requestID].
// Expects vtxs[0] to be the vertex requested in the corresponding GetAncestors.
func (b *bootstrapper) Ancestors(ctx context.Context, nodeID ids.NodeID, requestID uint32, vtxs [][]byte) error {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function was refactored to more closely match the snowman code. The major difference is that we no longer "optimistically" accept any vertices: if we didn't request a vertex, we drop it. (This should never happen anyway, because such a message should now be dropped at the network level.)

request := common.Request{
NodeID: nodeID,
RequestID: requestID,
}
requestedVtxID, ok := b.outstandingRequests.DeleteKey(request)
if !ok { // this message isn't in response to a request we made
b.Ctx.Log.Debug("received unexpected Ancestors",
zap.Stringer("nodeID", nodeID),
zap.Uint32("requestID", requestID),
)
return nil
}
requestTime := b.outstandingRequestTimes[request]
delete(b.outstandingRequestTimes, request)

lenVtxs := len(vtxs)
if lenVtxs == 0 {
b.Ctx.Log.Debug("Ancestors contains no vertices",
zap.Stringer("nodeID", nodeID),
zap.Uint32("requestID", requestID),
)
return b.GetAncestorsFailed(ctx, nodeID, requestID)

b.PeerTracker.RegisterFailure(nodeID)
return b.fetch(ctx, requestedVtxID)
}

if lenVtxs > b.Config.AncestorsMaxContainersReceived {
vtxs = vtxs[:b.Config.AncestorsMaxContainersReceived]

b.Ctx.Log.Debug("ignoring containers in Ancestors",
zap.Stringer("nodeID", nodeID),
zap.Uint32("requestID", requestID),
zap.Int("numIgnored", lenVtxs-b.Config.AncestorsMaxContainersReceived),
)

vtxs = vtxs[:b.Config.AncestorsMaxContainersReceived]
}

requestedVtxID, requested := b.outstandingRequests.DeleteKey(common.Request{
NodeID: nodeID,
RequestID: requestID,
})
vtx, err := b.Manager.ParseVtx(ctx, vtxs[0]) // first vertex should be the one we requested in GetAncestors request
vtx, err := b.Manager.ParseVtx(ctx, vtxs[0])
if err != nil {
if !requested {
b.Ctx.Log.Debug("failed to parse unrequested vertex",
zap.Stringer("nodeID", nodeID),
zap.Uint32("requestID", requestID),
zap.Error(err),
)
return nil
}
if b.Ctx.Log.Enabled(logging.Verbo) {
b.Ctx.Log.Verbo("failed to parse requested vertex",
zap.Stringer("nodeID", nodeID),
zap.Uint32("requestID", requestID),
zap.Stringer("vtxID", requestedVtxID),
zap.Binary("vtxBytes", vtxs[0]),
zap.Error(err),
)
} else {
b.Ctx.Log.Debug("failed to parse requested vertex",
zap.Stringer("nodeID", nodeID),
zap.Uint32("requestID", requestID),
zap.Stringer("vtxID", requestedVtxID),
zap.Error(err),
)
}
return b.fetch(ctx, requestedVtxID)
}

vtxID := vtx.ID()
// If the vertex is neither the requested vertex nor a needed vertex, return early and re-fetch if necessary
if requested && requestedVtxID != vtxID {
b.Ctx.Log.Debug("received incorrect vertex",
b.Ctx.Log.Debug("failed to parse requested vertex",
zap.Stringer("nodeID", nodeID),
zap.Uint32("requestID", requestID),
zap.Stringer("vtxID", vtxID),
zap.Stringer("vtxID", requestedVtxID),
zap.Error(err),
)

b.PeerTracker.RegisterFailure(nodeID)
return b.fetch(ctx, requestedVtxID)
}
if !requested && !b.outstandingRequests.HasValue(vtxID) && !b.needToFetch.Contains(vtxID) {
b.Ctx.Log.Debug("received un-needed vertex",

if actualID := vtx.ID(); actualID != requestedVtxID {
b.Ctx.Log.Debug("received incorrect vertex",
zap.Stringer("nodeID", nodeID),
zap.Uint32("requestID", requestID),
zap.Stringer("vtxID", vtxID),
zap.Stringer("vtxID", actualID),
)
return nil

b.PeerTracker.RegisterFailure(nodeID)
return b.fetch(ctx, requestedVtxID)
}

// Do not remove from outstanding requests if this did not answer a specific outstanding request
// to ensure that real responses are not dropped in favor of potentially byzantine Ancestors messages that
// could force the node to bootstrap 1 vertex at a time.
b.needToFetch.Remove(vtxID)
b.needToFetch.Remove(requestedVtxID)

// All vertices added to [verticesToProcess] have received transitive votes
// from the accepted frontier.
var (
numBytes = len(vtxs[0])
verticesToProcess = make([]avalanche.Vertex, 1, len(vtxs))
)
verticesToProcess[0] = vtx

// All vertices added to [processVertices] have received transitive votes from the accepted frontier
processVertices := make([]avalanche.Vertex, 1, len(vtxs)) // Process all of the valid vertices in this message
processVertices[0] = vtx
parents, err := vtx.Parents()
if err != nil {
return err
Expand All @@ -212,20 +208,14 @@ func (b *bootstrapper) Ancestors(ctx context.Context, nodeID ids.NodeID, request
eligibleVertices.Add(parent.ID())
}

for _, vtxBytes := range vtxs[1:] { // Parse/persist all the vertices
for _, vtxBytes := range vtxs[1:] {
vtx, err := b.Manager.ParseVtx(ctx, vtxBytes) // Persists the vtx
if err != nil {
b.Ctx.Log.Debug("failed to parse vertex",
zap.Stringer("nodeID", nodeID),
zap.Uint32("requestID", requestID),
zap.Error(err),
)
b.Ctx.Log.Debug("failed to parse vertex",
zap.Stringer("nodeID", nodeID),
zap.Uint32("requestID", requestID),
zap.Binary("vtxBytes", vtxBytes),
zap.Error(err),
)
break
}
vtxID := vtx.ID()
Expand All @@ -245,26 +235,41 @@ func (b *bootstrapper) Ancestors(ctx context.Context, nodeID ids.NodeID, request
for _, parent := range parents {
eligibleVertices.Add(parent.ID())
}
processVertices = append(processVertices, vtx)

numBytes += len(vtxBytes)
verticesToProcess = append(verticesToProcess, vtx)
b.needToFetch.Remove(vtxID) // No need to fetch this vertex since we have it now
}

return b.process(ctx, processVertices...)
// TODO: Calculate bandwidth based on the vertices that were persisted to
// disk.
var (
requestLatency = time.Since(requestTime).Seconds() + epsilon
marun marked this conversation as resolved.
Show resolved Hide resolved
bandwidth = float64(numBytes) / requestLatency
)
b.PeerTracker.RegisterResponse(nodeID, bandwidth)

return b.process(ctx, verticesToProcess...)
}

func (b *bootstrapper) GetAncestorsFailed(ctx context.Context, nodeID ids.NodeID, requestID uint32) error {
vtxID, ok := b.outstandingRequests.DeleteKey(common.Request{
request := common.Request{
NodeID: nodeID,
RequestID: requestID,
})
}
vtxID, ok := b.outstandingRequests.DeleteKey(request)
if !ok {
b.Ctx.Log.Debug("skipping GetAncestorsFailed call",
zap.String("reason", "no matching outstanding request"),
b.Ctx.Log.Debug("unexpectedly called GetAncestorsFailed",
zap.Stringer("nodeID", nodeID),
zap.Uint32("requestID", requestID),
)
return nil
}
delete(b.outstandingRequestTimes, request)

// This node timed out their request.
b.PeerTracker.RegisterFailure(nodeID)

// Send another request for the vertex
return b.fetch(ctx, vtxID)
}
Expand Down Expand Up @@ -411,21 +416,25 @@ func (b *bootstrapper) fetch(ctx context.Context, vtxIDs ...ids.ID) error {
continue
}

validatorIDs, err := b.Config.Beacons.Sample(b.Ctx.SubnetID, 1) // validator to send request to
if err != nil {
return fmt.Errorf("dropping request for %s as there are no validators", vtxID)
nodeID, ok := b.PeerTracker.SelectPeer()
if !ok {
// If we aren't connected to any peers, we send a request to ourselves,
// which is guaranteed to fail. We send this message to use the
// message timeout as a retry mechanism. Once we are connected to
// another node again, we will select that node to request from instead.
nodeID = b.Ctx.NodeID
}
validatorID := validatorIDs[0]
b.requestID++

b.outstandingRequests.Put(
common.Request{
NodeID: validatorID,
RequestID: b.requestID,
},
vtxID,
)
b.Config.Sender.SendGetAncestors(ctx, validatorID, b.requestID, vtxID) // request vertex and ancestors
b.PeerTracker.RegisterRequest(nodeID)

b.requestID++
request := common.Request{
NodeID: nodeID,
RequestID: b.requestID,
}
b.outstandingRequests.Put(request, vtxID)
b.outstandingRequestTimes[request] = time.Now()
b.Config.Sender.SendGetAncestors(ctx, nodeID, b.requestID, vtxID) // request vertex and ancestors
}
return b.checkFinish(ctx)
}
Expand Down
Loading
Loading