Skip to content
This repository has been archived by the owner on May 26, 2022. It is now read-only.

improve dial errors #145

Merged
merged 3 commits into from
Nov 5, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions dial_error.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package swarm

import (
"fmt"
"os"
"strings"

"github.com/libp2p/go-libp2p-core/peer"
Expand All @@ -20,6 +21,10 @@ type DialError struct {
Skipped int
}

func (e *DialError) Timeout() bool {
return os.IsTimeout(e.Cause)
}

func (e *DialError) recordErr(addr ma.Multiaddr, err error) {
if len(e.DialErrors) >= maxDialDialErrors {
e.Skipped++
Expand Down Expand Up @@ -48,11 +53,7 @@ func (e *DialError) Error() string {

// Unwrap implements https://godoc.org/golang.org/x/xerrors#Wrapper.
func (e *DialError) Unwrap() error {
// If we have a context error, that's the "ultimate" error.
if e.Cause != nil {
return e.Cause
}
return nil
return e.Cause
}

var _ error = (*DialError)(nil)
Expand Down
14 changes: 14 additions & 0 deletions dial_sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,15 @@ package swarm

import (
"context"
"errors"
"sync"

"github.com/libp2p/go-libp2p-core/peer"
)

// TODO: change this text when we fix the bug
var errDialCanceled = errors.New("dial was aborted internally, likely due to https://git.io/Je2wW")

// DialFunc is the type of function expected by DialSync.
type DialFunc func(context.Context, peer.ID) (*Conn, error)

Expand Down Expand Up @@ -78,6 +82,16 @@ func (ad *activeDial) decref() {

func (ad *activeDial) start(ctx context.Context) {
ad.conn, ad.err = ad.ds.dialFunc(ctx, ad.id)

// This isn't the user's context so we should fix the error.
switch ad.err {
case context.Canceled:
// The dial was canceled with `CancelDial`.
ad.err = errDialCanceled
case context.DeadlineExceeded:
// We hit an internal timeout, not a context timeout.
ad.err = ErrDialTimeout
}
close(ad.waitch)
ad.cancel()
}
Expand Down
3 changes: 3 additions & 0 deletions swarm.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ var ErrSwarmClosed = errors.New("swarm closed")
// transport is misbehaving.
var ErrAddrFiltered = errors.New("address filtered")

// ErrDialTimeout is returned when one a dial times out due to the global timeout
var ErrDialTimeout = errors.New("dial timed out")

// Swarm is a connection muxer, allowing connections to other peers to
// be opened and closed, while still using the same Chan for all
// communication. The Chan sends/receives Messages, which note the
Expand Down
26 changes: 19 additions & 7 deletions swarm_dial.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,12 +221,23 @@ func (s *Swarm) dialPeer(ctx context.Context, p peer.ID) (*Conn, error) {
defer cancel()

conn, err = s.dsync.DialLock(ctx, p)
if err != nil {
return nil, err
if err == nil {
return conn, nil
}

log.Debugf("network for %s finished dialing %s", s.local, p)
return conn, err

if ctx.Err() != nil {
// Context error trumps any dial errors as it was likely the ultimate cause.
return nil, ctx.Err()
}

if s.ctx.Err() != nil {
// Ok, so the swarm is shutting down.
return nil, ErrSwarmClosed
}

return nil, err
}

// doDial is an ugly shim method to retain all the logging and backoff logic
Expand Down Expand Up @@ -317,13 +328,14 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) {
connC, dialErr := s.dialAddrs(ctx, p, goodAddrsChan)
if dialErr != nil {
logdial["error"] = dialErr.Cause.Error()
if dialErr.Cause == context.Canceled {
// always prefer the "context canceled" error.
// we rely on behing able to check `err == context.Canceled`
switch dialErr.Cause {
case context.Canceled, context.DeadlineExceeded:
// Always prefer the context errors as we rely on being
// able to check them.
//
// Removing this will BREAK backoff (causing us to
// backoff when canceling dials).
return nil, context.Canceled
return nil, dialErr.Cause
}
return nil, dialErr
}
Expand Down