Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

etcdserver/api/etcdhttp: add reason field for /health response #11983

Merged
merged 3 commits into from
Jun 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG-3.5.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ Note that any `etcd_debugging_*` metrics are experimental and subject to change.
- See https://github.com/etcd-io/etcd/issues/11918.
- Improve logging around snapshot send and receive.
- [Push down RangeOptions.limit argv into index tree to reduce memory overhead](https://github.com/etcd-io/etcd/pull/11990).
- Add [reason field for /health response](https://github.com/etcd-io/etcd/pull/11983).

### Package `embed`

Expand Down
11 changes: 11 additions & 0 deletions etcdserver/api/etcdhttp/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ func init() {
// Health is the JSON body returned by the /health endpoint.
//
// TODO: remove manual parsing in etcdctl cluster-health
type Health struct {
// Health reports the health status as the string "true" or "false"
// (a string, not a JSON boolean).
Health string `json:"health"`
// Reason names the cause when Health is "false", e.g. "ALARM NOSPACE",
// "RAFT NO LEADER" or "QGET ERROR". It is always serialized (no omitempty)
// and is the empty string when the server is healthy.
Reason string `json:"reason"`
}

// TODO: serve NOSPACE, etcdserver.ErrNoLeader in health API
Expand All @@ -109,13 +110,22 @@ func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) (h Health) {
if len(as) > 0 {
h.Health = "false"
for _, v := range as {
switch v.Alarm {
case etcdserverpb.AlarmType_NOSPACE:
h.Reason = "ALARM NOSPACE"
case etcdserverpb.AlarmType_CORRUPT:
h.Reason = "ALARM CORRUPT"
default:
h.Reason = "ALARM UNKNOWN"
}
lg.Warn("serving /health false due to an alarm", zap.String("alarm", v.String()))
}
return
}

if uint64(srv.Leader()) == raft.None {
h.Health = "false"
h.Reason = "RAFT NO LEADER"
lg.Warn("serving /health false; no leader")
return
}
Expand All @@ -125,6 +135,7 @@ func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) (h Health) {
cancel()
if err != nil {
h.Health = "false"
h.Reason = "QGET ERROR"
lg.Warn("serving /health false; QGET fails", zap.Error(err))
}

Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/ctl_v3_alarm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func alarmTest(cx ctlCtx) {
}

// '/health' handler should return 'false'
if err := cURLGet(cx.epc, cURLReq{endpoint: "/health", expected: `{"health":"false"}`}); err != nil {
if err := cURLGet(cx.epc, cURLReq{endpoint: "/health", expected: `{"health":"false","reason":"ALARM NOSPACE"}`}); err != nil {
cx.t.Fatalf("failed get with curl (%v)", err)
}

Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func metricsTest(cx ctlCtx) {
{"/metrics", fmt.Sprintf("etcd_mvcc_delete_total 3")},
{"/metrics", fmt.Sprintf(`etcd_server_version{server_version="%s"} 1`, version.Version)},
{"/metrics", fmt.Sprintf(`etcd_cluster_version{cluster_version="%s"} 1`, version.Cluster(version.Version))},
{"/health", `{"health":"true"}`},
{"/health", `{"health":"true","reason":""}`},
} {
i++
if err := ctlV3Put(cx, fmt.Sprintf("%d", i), "v", ""); err != nil {
Expand Down