Skip to content

Commit

Permalink
fix: don't access agentState when it may be nil (#8921)
Browse files Browse the repository at this point in the history
AgentState is only set on the agent master-side whenever it is "started", which
is when we've made it through all the initialization messages. But an agent can
crash while it is starting, and then this access into AgentState causes a NPE,
which probably ends up being a convincing red herring when debugging.
  • Loading branch information
stoksc authored Mar 6, 2024
1 parent dcaa893 commit 191a144
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 11 deletions.
24 changes: 13 additions & 11 deletions master/internal/rm/agentrm/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ func (a *agent) stop(cause error) {
}
}

a.syslog.Infof("removing agent: %s", a.agentState.agentID())
a.syslog.Infof("removing agent: %s", a.id)
err := a.updateAgentEndStats(string(a.id))
if err != nil {
a.syslog.WithError(err).Error("failed to update agent end stats")
Expand Down Expand Up @@ -885,16 +885,18 @@ func (a *agent) socketDisconnected() {
timer := time.AfterFunc(a.agentReconnectWait, a.HandleReconnectTimeout)
a.reconnectTimers = append(a.reconnectTimers, timer)

a.preDisconnectEnabled = a.agentState.enabled
a.preDisconnectDraining = a.agentState.draining
// Mark ourselves as draining to avoid action on ourselves while we recover. While the
// system is technically correct without this, it's better because we avoid any waste
// effort scheduling things only to have them suffer AgentErrors later.
a.agentState.disable(true)
a.agentState.patchAllSlotsState(patchAllSlotsState{
enabled: &a.agentState.enabled,
drain: &a.agentState.draining,
})
if a.agentState != nil { // This is nil for a bit after `a.socket` is connected but before `a.started` is true.
a.preDisconnectEnabled = a.agentState.enabled
a.preDisconnectDraining = a.agentState.draining
// Mark ourselves as draining to avoid action on ourselves while we recover. While the
// system is technically correct without this, it's better because we avoid any waste
// effort scheduling things only to have them suffer AgentErrors later.
a.agentState.disable(true)
a.agentState.patchAllSlotsState(patchAllSlotsState{
enabled: &a.agentState.enabled,
drain: &a.agentState.draining,
})
}
a.notifyListeners()
}

Expand Down
66 changes: 66 additions & 0 deletions master/internal/rm/agentrm/agent_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package agentrm

import (
"fmt"
"net/http/httptest"
"strings"
"sync/atomic"
"testing"
"time"

"github.com/gorilla/websocket"
"github.com/labstack/echo/v4"
"github.com/stretchr/testify/require"

"github.com/determined-ai/determined/master/internal/config"
"github.com/determined-ai/determined/master/pkg/aproto"
"github.com/determined-ai/determined/master/pkg/model"
"github.com/determined-ai/determined/master/pkg/syncx/queue"
"github.com/determined-ai/determined/master/pkg/ws"
)

func TestAgentFastFailAfterFirstConnect2(t *testing.T) {
var closed atomic.Bool
a := newAgent(
"test",
queue.New[agentUpdatedEvent](),
"default",
&config.ResourcePoolConfig{},
&aproto.MasterSetAgentOptions{
MasterInfo: aproto.MasterInfo{},
LoggingOptions: model.LoggingConfig{
DefaultLoggingConfig: &model.DefaultLoggingConfig{},
},
ContainersToReattach: []aproto.ContainerReattach{},
},
nil,
func() { closed.Store(true) },
)

// Connect a fake websocket.
e := echo.New()
e.GET("/", func(c echo.Context) error {
err := a.HandleWebsocketConnection(webSocketRequest{echoCtx: c})
require.NoError(t, err)
return nil
})
server := httptest.NewServer(e.Server.Handler)

var dialer websocket.Dialer
conn, _, err := dialer.Dial(fmt.Sprintf("ws://%s", strings.TrimPrefix(server.URL, "http://")), nil)
require.NoError(t, err)
_, err = ws.Wrap[*aproto.MasterMessage, aproto.AgentMessage]("test", conn)
require.NoError(t, err)

// Close the underlying conn to simulate a failure.
err = conn.UnderlyingConn().Close()
require.NoError(t, err)

for {
if closed.Load() {
// The agent should close without a panic. A panic in the agent would bubble up and fail this test.
return
}
time.Sleep(50 * time.Millisecond)
}
}

0 comments on commit 191a144

Please sign in to comment.