diff --git a/x-pack/elastic-agent/CHANGELOG.asciidoc b/x-pack/elastic-agent/CHANGELOG.asciidoc index 14ca5ccb3a4..9eb0f512ae3 100644 --- a/x-pack/elastic-agent/CHANGELOG.asciidoc +++ b/x-pack/elastic-agent/CHANGELOG.asciidoc @@ -53,6 +53,7 @@ - Fix issues with merging of elastic-agent.yml and fleet.yml {pull}20026[20026] - Unzip failures on Windows 8/Windows server 2012 {pull}20088[20088] - Fix failing unit tests on windows {pull}20127[20127] +- Improve GRPC stop to be more relaxed {pull}20118[20118] ==== New features diff --git a/x-pack/elastic-agent/pkg/core/server/server.go b/x-pack/elastic-agent/pkg/core/server/server.go index 4cd5c8386ec..12885e2f012 100644 --- a/x-pack/elastic-agent/pkg/core/server/server.go +++ b/x-pack/elastic-agent/pkg/core/server/server.go @@ -526,6 +526,7 @@ func (as *ApplicationState) WriteConnInfo(w io.Writer) error { // the application times out during stop and ErrApplication func (as *ApplicationState) Stop(timeout time.Duration) error { as.checkinLock.Lock() + wasConn := as.checkinDone != nil cfgIdx := as.statusConfigIdx as.expected = proto.StateExpected_STOPPING as.checkinLock.Unlock() @@ -548,8 +549,10 @@ func (as *ApplicationState) Stop(timeout time.Duration) error { s := as.status doneChan := as.checkinDone as.checkinLock.RUnlock() - if s == proto.StateObserved_STOPPING && doneChan == nil { - // sent stopping and now is disconnected (so its stopped) + if (wasConn && doneChan == nil) || (!wasConn && s == proto.StateObserved_STOPPING && doneChan == nil) { + // either occurred + // * client was connected then disconnected on stop + // * client was not connected; connected; received stopping; then disconnected as.Destroy() return nil } diff --git a/x-pack/elastic-agent/pkg/core/server/server_test.go b/x-pack/elastic-agent/pkg/core/server/server_test.go index 608be5641f9..424efb14311 100644 --- a/x-pack/elastic-agent/pkg/core/server/server_test.go +++ b/x-pack/elastic-agent/pkg/core/server/server_test.go @@ -416,6 +416,57 @@ func TestServer_Stop(t *testing.T) { assert.NoError(t, stopErr) } +func TestServer_StopJustDisconnect(t *testing.T) { + initConfig := "initial_config" + app := &StubApp{} + srv := createAndStartServer(t, &StubHandler{}) + defer srv.Stop() + as, err := srv.Register(app, initConfig) + require.NoError(t, err) + cImpl := &StubClientImpl{} + c := newClientFromApplicationState(t, as, cImpl) + require.NoError(t, c.Start(context.Background())) + defer c.Stop() + + // clients should get initial check-ins then set as healthy + require.NoError(t, waitFor(func() error { + if cImpl.Config() != initConfig { + return fmt.Errorf("client never got intial config") + } + return nil + })) + c.Status(proto.StateObserved_HEALTHY, "Running", nil) + assert.NoError(t, waitFor(func() error { + if app.Status() != proto.StateObserved_HEALTHY { + return fmt.Errorf("server never updated currect application state") + } + return nil + })) + + // send stop to the client + done := make(chan bool) + var stopErr error + go func() { + stopErr = as.Stop(time.Second * 5) + close(done) + }() + + // process of testing the flow + // 1. server sends stop + // 2. client disconnects + require.NoError(t, waitFor(func() error { + if cImpl.Stop() == 0 { + return fmt.Errorf("client never got expected stop") + } + return nil + })) + c.Stop() + <-done + + // no error on stop + assert.NoError(t, stopErr) +} + func TestServer_StopTimeout(t *testing.T) { initConfig := "initial_config" app := &StubApp{}