diff --git a/dial_error.go b/dial_error.go index d2179617..f2986348 100644 --- a/dial_error.go +++ b/dial_error.go @@ -2,6 +2,7 @@ package swarm import ( "fmt" + "os" "strings" "github.com/libp2p/go-libp2p-core/peer" @@ -20,6 +21,10 @@ type DialError struct { Skipped int } +func (e *DialError) Timeout() bool { + return os.IsTimeout(e.Cause) +} + func (e *DialError) recordErr(addr ma.Multiaddr, err error) { if len(e.DialErrors) >= maxDialDialErrors { e.Skipped++ @@ -48,11 +53,7 @@ func (e *DialError) Error() string { // Unwrap implements https://godoc.org/golang.org/x/xerrors#Wrapper. func (e *DialError) Unwrap() error { - // If we have a context error, that's the "ultimate" error. - if e.Cause != nil { - return e.Cause - } - return nil + return e.Cause } var _ error = (*DialError)(nil) diff --git a/dial_sync.go b/dial_sync.go index f746b9a9..4c1230f5 100644 --- a/dial_sync.go +++ b/dial_sync.go @@ -2,11 +2,15 @@ package swarm import ( "context" + "errors" "sync" "github.com/libp2p/go-libp2p-core/peer" ) +// TODO: change this text when we fix the bug +var errDialCanceled = errors.New("dial was aborted internally, likely due to https://git.io/Je2wW") + // DialFunc is the type of function expected by DialSync. type DialFunc func(context.Context, peer.ID) (*Conn, error) @@ -78,6 +82,16 @@ func (ad *activeDial) decref() { func (ad *activeDial) start(ctx context.Context) { ad.conn, ad.err = ad.ds.dialFunc(ctx, ad.id) + + // This isn't the user's context so we should fix the error. + switch ad.err { + case context.Canceled: + // The dial was canceled with `CancelDial`. + ad.err = errDialCanceled + case context.DeadlineExceeded: + // We hit an internal timeout, not a context timeout. + ad.err = ErrDialTimeout + } close(ad.waitch) ad.cancel() } diff --git a/swarm.go b/swarm.go index 5366bdcc..f7c57717 100644 --- a/swarm.go +++ b/swarm.go @@ -40,6 +40,9 @@ var ErrSwarmClosed = errors.New("swarm closed") // transport is misbehaving. var ErrAddrFiltered = errors.New("address filtered") +// ErrDialTimeout is returned when one a dial times out due to the global timeout +var ErrDialTimeout = errors.New("dial timed out") + // Swarm is a connection muxer, allowing connections to other peers to // be opened and closed, while still using the same Chan for all // communication. The Chan sends/receives Messages, which note the diff --git a/swarm_dial.go b/swarm_dial.go index 054902e7..475a7e25 100644 --- a/swarm_dial.go +++ b/swarm_dial.go @@ -221,12 +221,23 @@ func (s *Swarm) dialPeer(ctx context.Context, p peer.ID) (*Conn, error) { defer cancel() conn, err = s.dsync.DialLock(ctx, p) - if err != nil { - return nil, err + if err == nil { + return conn, nil } log.Debugf("network for %s finished dialing %s", s.local, p) - return conn, err + + if ctx.Err() != nil { + // Context error trumps any dial errors as it was likely the ultimate cause. + return nil, ctx.Err() + } + + if s.ctx.Err() != nil { + // Ok, so the swarm is shutting down. + return nil, ErrSwarmClosed + } + + return nil, err } // doDial is an ugly shim method to retain all the logging and backoff logic @@ -317,13 +328,14 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) { connC, dialErr := s.dialAddrs(ctx, p, goodAddrsChan) if dialErr != nil { logdial["error"] = dialErr.Cause.Error() - if dialErr.Cause == context.Canceled { - // always prefer the "context canceled" error. - // we rely on behing able to check `err == context.Canceled` + switch dialErr.Cause { + case context.Canceled, context.DeadlineExceeded: + // Always prefer the context errors as we rely on being + // able to check them. // // Removing this will BREAK backoff (causing us to // backoff when canceling dials). - return nil, context.Canceled + return nil, dialErr.Cause } return nil, dialErr }