-
Notifications
You must be signed in to change notification settings - Fork 354
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: don't access agentState when it may be nil (#8921)
AgentState is only set on the master side of the agent once the agent is "started", i.e. once it has made it through all of the initialization messages. But an agent can crash while it is still starting, and in that case this access into AgentState causes a nil pointer dereference, which probably ends up being a convincing red herring when debugging.
- Loading branch information
Showing
2 changed files
with
79 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package agentrm | ||
|
||
import ( | ||
"fmt" | ||
"net/http/httptest" | ||
"strings" | ||
"sync/atomic" | ||
"testing" | ||
"time" | ||
|
||
"github.com/gorilla/websocket" | ||
"github.com/labstack/echo/v4" | ||
"github.com/stretchr/testify/require" | ||
|
||
"github.com/determined-ai/determined/master/internal/config" | ||
"github.com/determined-ai/determined/master/pkg/aproto" | ||
"github.com/determined-ai/determined/master/pkg/model" | ||
"github.com/determined-ai/determined/master/pkg/syncx/queue" | ||
"github.com/determined-ai/determined/master/pkg/ws" | ||
) | ||
|
||
func TestAgentFastFailAfterFirstConnect2(t *testing.T) { | ||
var closed atomic.Bool | ||
a := newAgent( | ||
"test", | ||
queue.New[agentUpdatedEvent](), | ||
"default", | ||
&config.ResourcePoolConfig{}, | ||
&aproto.MasterSetAgentOptions{ | ||
MasterInfo: aproto.MasterInfo{}, | ||
LoggingOptions: model.LoggingConfig{ | ||
DefaultLoggingConfig: &model.DefaultLoggingConfig{}, | ||
}, | ||
ContainersToReattach: []aproto.ContainerReattach{}, | ||
}, | ||
nil, | ||
func() { closed.Store(true) }, | ||
) | ||
|
||
// Connect a fake websocket. | ||
e := echo.New() | ||
e.GET("/", func(c echo.Context) error { | ||
err := a.HandleWebsocketConnection(webSocketRequest{echoCtx: c}) | ||
require.NoError(t, err) | ||
return nil | ||
}) | ||
server := httptest.NewServer(e.Server.Handler) | ||
|
||
var dialer websocket.Dialer | ||
conn, _, err := dialer.Dial(fmt.Sprintf("ws://%s", strings.TrimPrefix(server.URL, "http://")), nil) | ||
require.NoError(t, err) | ||
_, err = ws.Wrap[*aproto.MasterMessage, aproto.AgentMessage]("test", conn) | ||
require.NoError(t, err) | ||
|
||
// Close the underlying conn to simulate a failure. | ||
err = conn.UnderlyingConn().Close() | ||
require.NoError(t, err) | ||
|
||
for { | ||
if closed.Load() { | ||
// The agent should close without a panic. A panic in the agent would bubble up and fail this test. | ||
return | ||
} | ||
time.Sleep(50 * time.Millisecond) | ||
} | ||
} |