Skip to content

Commit

Permalink
[ASCII-2295] Fix the forwarder health check when api key is invalid (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
pgimalac committed Sep 18, 2024
1 parent 34af062 commit 65e0623
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 21 deletions.
29 changes: 21 additions & 8 deletions comp/forwarder/defaultforwarder/forwarder_health.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,15 +149,28 @@ func (fh *forwarderHealth) healthCheckLoop() {
}

for {
select {
case <-fh.stop:
return
case <-validateTicker.C:
valid := fh.checkValidAPIKey()
if !valid {
fh.log.Errorf("No valid api key found, reporting the forwarder as unhealthy.")
// only read from the health channel if the api key is valid
if valid {
select {
case <-fh.stop:
return
case <-validateTicker.C:
valid = fh.checkValidAPIKey()
if !valid {
fh.log.Errorf("No valid api key found, reporting the forwarder as unhealthy.")
}
case <-fh.health.C:
}
} else {
select {
case <-fh.stop:
return
case <-validateTicker.C:
valid = fh.checkValidAPIKey()
if !valid {
fh.log.Errorf("No valid api key found, reporting the forwarder as unhealthy.")
}
}
case <-fh.health.C:
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Each section from every release note are combined when the
# CHANGELOG.rst is rendered. So the text needs to be worded so that
# it does not depend on any information only available in another
# section. This may mean repeating some details, but each section
# must be readable independently of the other.
#
# Each section note must be formatted as reStructuredText.
---
fixes:
- |
Fix the forwarder health check so that it reports unhealthy when the API key is invalid.
41 changes: 28 additions & 13 deletions test/new-e2e/tests/agent-subcommands/health/health_nix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@
package health

import (
"net/http"
"testing"
"time"

"github.com/DataDog/test-infra-definitions/components/datadog/agentparams"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/DataDog/datadog-agent/test/fakeintake/api"
"github.com/DataDog/datadog-agent/test/new-e2e/pkg/e2e"
Expand All @@ -28,23 +31,35 @@ func TestLinuxHealthSuite(t *testing.T) {
func (v *linuxHealthSuite) TestDefaultInstallUnhealthy() {
// the fakeintake says that any API key is invalid by sending a 403 code
override := api.ResponseOverride{
Endpoint: "/api/v1/validate",
StatusCode: 403,
ContentType: "text/plain",
Body: []byte("invalid API key"),
Endpoint: "/api/v1/validate",
StatusCode: 403,
Method: http.MethodGet,
Body: []byte("invalid API key"),
}
v.Env().FakeIntake.Client().ConfigureOverride(override)
err := v.Env().FakeIntake.Client().ConfigureOverride(override)
require.NoError(v.T(), err)

// restart the agent, which validates the key using the fakeintake at startup
v.UpdateEnv(awshost.Provisioner(
awshost.WithAgentOptions(agentparams.WithAgentConfig("log_level: info\n")),
awshost.WithAgentOptions(agentparams.WithAgentConfig("log_level: info\nforwarder_apikey_validation_interval: 1")),
))

// agent should be unhealthy because the key is invalid
_, err := v.Env().Agent.Client.Health()
if err == nil {
assert.Fail(v.T(), "agent expected to be unhealthy, but no error found!")
return
}
assert.Contains(v.T(), err.Error(), "Agent health: FAIL")
require.EventuallyWithT(v.T(), func(collect *assert.CollectT) {
// forwarder should be unhealthy because the key is invalid
_, err = v.Env().Agent.Client.Health()
assert.ErrorContains(collect, err, "Agent health: FAIL")
assert.ErrorContains(collect, err, "=== 1 unhealthy components ===\nforwarder")
}, time.Second*30, time.Second)

// the fakeintake now says that the api key is valid
override.StatusCode = 200
override.Body = []byte("valid API key")
err = v.Env().FakeIntake.Client().ConfigureOverride(override)
require.NoError(v.T(), err)

// the agent will check every minute if the key is valid, so wait at most 1m30
require.EventuallyWithT(v.T(), func(collect *assert.CollectT) {
_, err = v.Env().Agent.Client.Health()
assert.NoError(collect, err)
}, time.Second*90, 3*time.Second)
}

0 comments on commit 65e0623

Please sign in to comment.