Merge pull request #4666 from filecoin-project/fix/sched-issues
Fix worker reenabling, handle multiple restarts in worker
magik6k authored Oct 30, 2020
2 parents 6cdbf52 + 69e44eb commit 094ea3f
Showing 5 changed files with 130 additions and 34 deletions.
48 changes: 28 additions & 20 deletions cmd/lotus-seal-worker/main.go
@@ -451,14 +451,24 @@ var runCmd = &cli.Command{
return xerrors.Errorf("getting miner session: %w", err)
}

waitQuietCh := func() chan struct{} {
out := make(chan struct{})
go func() {
workerApi.LocalWorker.WaitQuiet()
close(out)
}()
return out
}

go func() {
heartbeats := time.NewTicker(stores.HeartbeatInterval)
defer heartbeats.Stop()

var connected, reconnect bool
var redeclareStorage bool
var readyCh chan struct{}
for {
// If we're reconnecting, redeclare storage first
if reconnect {
if redeclareStorage {
log.Info("Redeclaring local storage")

if err := localStore.Redeclare(ctx); err != nil {
@@ -471,14 +481,13 @@
}
continue
}

connected = false
}

log.Info("Making sure no local tasks are running")

// TODO: we could get rid of this, but that requires tracking resources for restarted tasks correctly
workerApi.LocalWorker.WaitQuiet()
if readyCh == nil {
log.Info("Making sure no local tasks are running")
readyCh = waitQuietCh()
}

for {
curSession, err := nodeApi.Session(ctx)
@@ -489,29 +498,28 @@
minerSession = curSession
break
}

if !connected {
if err := nodeApi.WorkerConnect(ctx, "http://"+address+"/rpc/v0"); err != nil {
log.Errorf("Registering worker failed: %+v", err)
cancel()
return
}

log.Info("Worker registered successfully, waiting for tasks")
connected = true
}
}

select {
case <-readyCh:
if err := nodeApi.WorkerConnect(ctx, "http://"+address+"/rpc/v0"); err != nil {
log.Errorf("Registering worker failed: %+v", err)
cancel()
return
}

log.Info("Worker registered successfully, waiting for tasks")

readyCh = nil
case <-heartbeats.C:
case <-ctx.Done():
return // graceful shutdown
case <-heartbeats.C:
}
}

log.Errorf("LOTUS-MINER CONNECTION LOST")

reconnect = true
redeclareStorage = true
}
}()

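The main.go change replaces the blocking `WaitQuiet()` call inside the reconnect loop with a `waitQuietCh` helper plus a `readyCh` channel, so the loop keeps servicing heartbeats and shutdown while local tasks drain, and can run the drain-and-reregister sequence again after every miner restart. Below is a minimal, self-contained sketch of that pattern with stand-in names, not the lotus code: a blocking wait wrapped in a channel and combined with a ticker in a single select.

```go
// Sketch: turning a blocking "wait until idle" call into a channel so a
// reconnect loop can keep ticking instead of blocking on it.
package main

import (
	"context"
	"fmt"
	"time"
)

// waitQuiet stands in for LocalWorker.WaitQuiet: it blocks until all
// locally running tasks have finished.
func waitQuiet() {
	time.Sleep(2 * time.Second)
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	heartbeats := time.NewTicker(500 * time.Millisecond)
	defer heartbeats.Stop()

	// Wrap the blocking call in a channel that closes when it returns.
	waitQuietCh := func() chan struct{} {
		out := make(chan struct{})
		go func() {
			waitQuiet()
			close(out)
		}()
		return out
	}

	var readyCh chan struct{}
	for {
		// Only start a new drain if one isn't already in flight; after a
		// reconnect the channel is reset to nil so the drain runs again.
		if readyCh == nil {
			fmt.Println("making sure no local tasks are running")
			readyCh = waitQuietCh()
		}

		select {
		case <-readyCh:
			// In the real loop this is where the worker re-registers with
			// the miner and sets readyCh back to nil, so a later reconnect
			// starts a fresh drain. The sketch just stops here.
			fmt.Println("drained; (re)register with the miner here")
			return
		case <-heartbeats.C:
			fmt.Println("heartbeat: session check would run here")
		case <-ctx.Done():
			return // graceful shutdown
		}
	}
}
```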
75 changes: 75 additions & 0 deletions extern/sector-storage/manager_test.go
@@ -10,6 +10,7 @@ import (
"path/filepath"
"strings"
"sync"
"sync/atomic"
"testing"
"time"

@@ -376,3 +377,77 @@ func TestRestartWorker(t *testing.T) {
require.NoError(t, err)
require.Empty(t, uf)
}

func TestReenableWorker(t *testing.T) {
logging.SetAllLoggers(logging.LevelDebug)
stores.HeartbeatInterval = 5 * time.Millisecond

ctx, done := context.WithCancel(context.Background())
defer done()

ds := datastore.NewMapDatastore()

m, lstor, stor, idx, cleanup := newTestMgr(ctx, t, ds)
defer cleanup()

localTasks := []sealtasks.TaskType{
sealtasks.TTAddPiece, sealtasks.TTPreCommit1, sealtasks.TTCommit1, sealtasks.TTFinalize, sealtasks.TTFetch,
}

wds := datastore.NewMapDatastore()

arch := make(chan chan apres)
w := newLocalWorker(func() (ffiwrapper.Storage, error) {
return &testExec{apch: arch}, nil
}, WorkerConfig{
SealProof: 0,
TaskTypes: localTasks,
}, stor, lstor, idx, m, statestore.New(wds))

err := m.AddWorker(ctx, w)
require.NoError(t, err)

time.Sleep(time.Millisecond * 100)

i, _ := m.sched.Info(ctx)
require.Len(t, i.(SchedDiagInfo).OpenWindows, 2)

// disable
atomic.StoreInt64(&w.testDisable, 1)

for i := 0; i < 100; i++ {
if !m.WorkerStats()[w.session].Enabled {
break
}

time.Sleep(time.Millisecond * 3)
}
require.False(t, m.WorkerStats()[w.session].Enabled)

i, _ = m.sched.Info(ctx)
require.Len(t, i.(SchedDiagInfo).OpenWindows, 0)

// reenable
atomic.StoreInt64(&w.testDisable, 0)

for i := 0; i < 100; i++ {
if m.WorkerStats()[w.session].Enabled {
break
}

time.Sleep(time.Millisecond * 3)
}
require.True(t, m.WorkerStats()[w.session].Enabled)

for i := 0; i < 100; i++ {
info, _ := m.sched.Info(ctx)
if len(info.(SchedDiagInfo).OpenWindows) != 0 {
break
}

time.Sleep(time.Millisecond * 3)
}

i, _ = m.sched.Info(ctx)
require.Len(t, i.(SchedDiagInfo).OpenWindows, 2)
}
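TestReenableWorker flips the new `testDisable` flag, polls the manager until the scheduler reports the worker as disabled and its open windows disappear, then clears the flag and polls again until the two windows come back; bounded polling with a short tick avoids flaky fixed sleeps at the 5 ms heartbeat interval. A small sketch of that polling pattern, with a helper name (`waitFor`) that is ours rather than lotus', could look like this:

```go
// Sketch: the bounded-poll pattern used by the test, factored into a helper.
package sketch

import (
	"testing"
	"time"

	"github.com/stretchr/testify/require"
)

// waitFor polls cond up to `attempts` times, sleeping `tick` between tries,
// and returns as soon as cond holds.
func waitFor(attempts int, tick time.Duration, cond func() bool) bool {
	for i := 0; i < attempts; i++ {
		if cond() {
			return true
		}
		time.Sleep(tick)
	}
	return cond()
}

func TestWaitForSketch(t *testing.T) {
	deadline := time.Now().Add(10 * time.Millisecond)
	// Stand-in condition; in the real test this would be something like
	// m.WorkerStats()[w.session].Enabled or the open-window count.
	enabled := func() bool { return time.Now().After(deadline) }

	require.True(t, waitFor(100, 3*time.Millisecond, enabled))
}
```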
5 changes: 3 additions & 2 deletions extern/sector-storage/sched.go
@@ -7,6 +7,7 @@ import (
"sync"
"time"

"github.com/google/uuid"
"golang.org/x/xerrors"

"github.com/filecoin-project/go-state-types/abi"
@@ -217,7 +218,7 @@ type SchedDiagRequestInfo struct {

type SchedDiagInfo struct {
Requests []SchedDiagRequestInfo
OpenWindows []WorkerID
OpenWindows []string
}

func (sh *scheduler) runSched() {
@@ -324,7 +325,7 @@ func (sh *scheduler) diag() SchedDiagInfo {
defer sh.workersLk.RUnlock()

for _, window := range sh.openWindows {
out.OpenWindows = append(out.OpenWindows, window.worker)
out.OpenWindows = append(out.OpenWindows, uuid.UUID(window.worker).String())
}

return out
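In sched.go, `SchedDiagInfo.OpenWindows` switches from `[]WorkerID` to `[]string`, and `diag()` renders each open window's worker ID through `uuid.UUID(...).String()`. A hedged sketch of the same idea, converting a uuid-backed ID type to its canonical string form for a diagnostics struct (types here are illustrative, not the scheduler's own):

```go
// Sketch: exposing a uuid-backed identifier as a plain string in diag output.
package main

import (
	"encoding/json"
	"fmt"

	"github.com/google/uuid"
)

// WorkerID mirrors the idea of an identifier type defined on top of uuid.UUID.
type WorkerID uuid.UUID

type SchedDiagInfo struct {
	OpenWindows []string // was []WorkerID before the change
}

func main() {
	w := WorkerID(uuid.New())

	var out SchedDiagInfo
	// uuid.UUID(w).String() gives the canonical textual form, which
	// marshals cleanly in JSON diagnostics.
	out.OpenWindows = append(out.OpenWindows, uuid.UUID(w).String())

	b, _ := json.Marshal(out)
	fmt.Println(string(b))
}
```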
26 changes: 16 additions & 10 deletions extern/sector-storage/sched_worker.go
@@ -104,14 +104,16 @@ func (sw *schedWorker) handleWorker() {
defer sw.heartbeatTimer.Stop()

for {
sched.workersLk.Lock()
enabled := worker.enabled
sched.workersLk.Unlock()

// ask for more windows if we need them (non-blocking)
if enabled {
if !sw.requestWindows() {
return // graceful shutdown
{
sched.workersLk.Lock()
enabled := worker.enabled
sched.workersLk.Unlock()

// ask for more windows if we need them (non-blocking)
if enabled {
if !sw.requestWindows() {
return // graceful shutdown
}
}
}

@@ -123,12 +125,16 @@
}

// session looks good
if !enabled {
{
sched.workersLk.Lock()
enabled := worker.enabled
worker.enabled = true
sched.workersLk.Unlock()

// we'll send window requests on the next loop
if !enabled {
// go send window requests
break
}
}

// wait for more tasks to be assigned by the main scheduler or for the worker
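The sched_worker.go change re-reads `worker.enabled` under the lock on every pass and, once the session check succeeds, records the old value while setting the flag back to true, breaking out to request new scheduling windows only on the disabled-to-enabled transition. A standalone sketch of that transition pattern, with our own names rather than the scheduler's:

```go
// Sketch: read-and-set a mutex-protected flag in one locked scope, then act
// on the transition outside the lock.
package main

import (
	"fmt"
	"sync"
)

type worker struct {
	lk      sync.Mutex
	enabled bool
}

// markEnabled returns true only on the disabled -> enabled transition,
// which is the moment the scheduler should hand out new windows.
func (w *worker) markEnabled() (transitioned bool) {
	w.lk.Lock()
	defer w.lk.Unlock()

	wasEnabled := w.enabled
	w.enabled = true
	return !wasEnabled
}

func main() {
	w := &worker{}

	if w.markEnabled() {
		fmt.Println("worker came back: request scheduling windows")
	}
	if !w.markEnabled() {
		fmt.Println("already enabled: nothing to do")
	}
}
```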
10 changes: 8 additions & 2 deletions extern/sector-storage/worker_local.go
@@ -8,6 +8,7 @@ import (
"reflect"
"runtime"
"sync"
"sync/atomic"
"time"

"github.com/elastic/go-sysinfo"
@@ -51,8 +52,9 @@ type LocalWorker struct {
acceptTasks map[sealtasks.TaskType]struct{}
running sync.WaitGroup

session uuid.UUID
closing chan struct{}
session uuid.UUID
testDisable int64
closing chan struct{}
}

func newLocalWorker(executor ExecutorFunc, wcfg WorkerConfig, store stores.Store, local *stores.Local, sindex stores.SectorIndex, ret storiface.WorkerReturn, cst *statestore.StateStore) *LocalWorker {
@@ -501,6 +503,10 @@ func (l *LocalWorker) Info(context.Context) (storiface.WorkerInfo, error) {
}

func (l *LocalWorker) Session(ctx context.Context) (uuid.UUID, error) {
if atomic.LoadInt64(&l.testDisable) == 1 {
return uuid.UUID{}, xerrors.Errorf("disabled")
}

select {
case <-l.closing:
return ClosedWorkerID, nil
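worker_local.go gains a `testDisable` field that `Session()` checks with `sync/atomic`: while the flag is set, the session lookup fails, which the scheduler treats as the worker having gone away, and clearing the flag lets it come back, which is exactly what TestReenableWorker exercises. A minimal sketch of that test hook (stand-in types, and a plain error where the real code uses xerrors):

```go
// Sketch: an atomically toggled int64 that makes Session() report failure, so
// tests can simulate a worker dropping off without killing the process.
package main

import (
	"errors"
	"fmt"
	"sync/atomic"

	"github.com/google/uuid"
)

type localWorker struct {
	session     uuid.UUID
	testDisable int64
}

func (l *localWorker) Session() (uuid.UUID, error) {
	if atomic.LoadInt64(&l.testDisable) == 1 {
		return uuid.UUID{}, errors.New("disabled")
	}
	return l.session, nil
}

func main() {
	w := &localWorker{session: uuid.New()}

	if _, err := w.Session(); err == nil {
		fmt.Println("worker healthy")
	}

	atomic.StoreInt64(&w.testDisable, 1) // simulate the worker going away
	if _, err := w.Session(); err != nil {
		fmt.Println("session check fails:", err)
	}

	atomic.StoreInt64(&w.testDisable, 0) // bring it back
	if _, err := w.Session(); err == nil {
		fmt.Println("worker healthy again")
	}
}
```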
