Skip to content

Commit

Permalink
fix: don't record failures on context cancellation
Browse files Browse the repository at this point in the history
Gate the case where the retriever (parent) cancels a protocol retriever because
another has completed successfully. If a retrieval fails in this case, it's not
a failure that we should either report in the event system, or record against
the SP's performance stats.

Fixes: #364

I believe this fixes #364, but it's hard to reproduce reliably so I can't be
sure.
  • Loading branch information
rvagg committed Aug 11, 2023
1 parent 175ec2a commit 26c5ca7
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 10 deletions.
11 changes: 9 additions & 2 deletions pkg/retriever/bitswapretriever.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package retriever
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"math"
Expand Down Expand Up @@ -242,8 +243,14 @@ func (br *bitswapRetrieval) RetrieveFromAsyncCandidates(ayncCandidates types.Inb
br.routing.RemoveProviders(br.request.RetrievalID)
br.bstore.RemoveLinkSystem(br.request.RetrievalID)
if err != nil {
// record failure
br.events(events.FailedRetrieval(br.clock.Now(), br.request.RetrievalID, bitswapCandidate, multicodec.TransportBitswap, err.Error()))
// If the local context was canceled and the parent wasn't, it could be from a block timeout
// (see lastBytesReceivedTimer), in which case it needs to be recorded as a failure.
// However, if the parent context has an error we want to exclude cancellation errors
// since the failure could be because another protocol finished, not because of a bitswap
// problem.
if br.ctx.Err() == nil || !errors.Is(err, context.Canceled) {
br.events(events.FailedRetrieval(br.clock.Now(), br.request.RetrievalID, bitswapCandidate, multicodec.TransportBitswap, err.Error()))
}
return nil, err
}
duration := br.clock.Since(startTime)
Expand Down
22 changes: 14 additions & 8 deletions pkg/retriever/parallelpeerretriever.go
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,9 @@ func (retrieval *retrieval) runRetrievalCandidate(
// Setup in parallel
connectTime, err := retrieval.Protocol.Connect(connectCtx, retrieval, startTime, candidate)
if err != nil {
if ctx.Err() == nil { // not cancelled, maybe timed out though
// Exclude the case where the context was cancelled by the parent, whichikely means that
// another protocol has succeeded.
if ctx.Err() == nil || !errors.Is(err, context.Canceled) {
logger.Warnf("Failed to connect to SP %s on protocol %s: %v", candidate.MinerPeer.ID, retrieval.Protocol.Code().String(), err)
retrievalErr = fmt.Errorf("%w: %v", ErrConnectFailed, err)
if err := retrieval.Session.RecordFailure(retrieval.request.RetrievalID, candidate.MinerPeer.ID); err != nil {
Expand All @@ -347,13 +349,17 @@ func (retrieval *retrieval) runRetrievalCandidate(
stats, retrievalErr = retrieval.Protocol.Retrieve(ctx, retrieval, shared, timeout, candidate)

if retrievalErr != nil {
msg := retrievalErr.Error()
if errors.Is(retrievalErr, ErrRetrievalTimedOut) {
msg = fmt.Sprintf("timeout after %s", timeout)
}
shared.sendEvent(ctx, events.FailedRetrieval(retrieval.parallelPeerRetriever.Clock.Now(), retrieval.request.RetrievalID, candidate, retrieval.Protocol.Code(), msg))
if err := retrieval.Session.RecordFailure(retrieval.request.RetrievalID, candidate.MinerPeer.ID); err != nil {
logger.Errorf("Error recording retrieval failure for protocol %s: %v", retrieval.Protocol.Code().String(), err)
// Exclude the case where the context was cancelled by the parent, which likely
// means that another protocol has succeeded.
if ctx.Err() == nil || !errors.Is(err, context.Canceled) {
msg := retrievalErr.Error()
if errors.Is(retrievalErr, ErrRetrievalTimedOut) {
msg = fmt.Sprintf("timeout after %s", timeout)
}
shared.sendEvent(ctx, events.FailedRetrieval(retrieval.parallelPeerRetriever.Clock.Now(), retrieval.request.RetrievalID, candidate, retrieval.Protocol.Code(), msg))
if err := retrieval.Session.RecordFailure(retrieval.request.RetrievalID, candidate.MinerPeer.ID); err != nil {
logger.Errorf("Error recording retrieval failure for protocol %s: %v", retrieval.Protocol.Code().String(), err)
}
}
} else {
shared.sendEvent(ctx, events.Success(
Expand Down

0 comments on commit 26c5ca7

Please sign in to comment.