
[SPM] Differentiate null from no error data #4985

Merged 5 commits on Dec 4, 2023
2 changes: 1 addition & 1 deletion docker-compose/monitor/Makefile
@@ -2,7 +2,7 @@
build: export DOCKER_TAG = dev
build: clean-jaeger
cd ../../ && \
-	make build-all-in-one && \
+	make build-all-in-one-linux && \
Contributor (Author):
The Docker images rely on Linux binaries.

make docker-images-jaeger-backend

# starts up the system required for SPM using the latest otel image and a development jaeger image.
4 changes: 2 additions & 2 deletions docker-compose/monitor/README.md
@@ -62,13 +62,13 @@ make build
## Bring up the dev environment

```bash
-make run-dev
+make dev
Member:

Would it make sense to add a CI workflow that builds and runs this example?

Contributor (Author):

Interesting idea, thanks. I'll have a think about this and put together a separate PR for it.

```

## Backwards compatibility testing with spanmetrics processor

```bash
-make run-dev-processor
+make dev-processor
```

For each "run" make target, you should expect to see the following in the Monitor tab after a few minutes:
33 changes: 32 additions & 1 deletion plugin/metrics/prometheus/metricsstore/reader.go
@@ -211,7 +211,38 @@ func (m MetricsReader) GetErrorRates(ctx context.Context, requestParams *metrics
)
},
}
-	return m.executeQuery(ctx, metricsParams)
+	errorMetrics, err := m.executeQuery(ctx, metricsParams)
Contributor (Author):
This is the main change; the rest of the PR supports it, mostly for testing purposes.

	if err != nil {
		return nil, fmt.Errorf("failed getting error metrics: %w", err)
	}
	// Non-zero error rates are available.
	if len(errorMetrics.Metrics) > 0 {
		return errorMetrics, nil
	}

	// Check for the presence of call rate metrics to differentiate the absence of error rates from
	// the absence of call rate metrics altogether.
	callMetrics, err := m.GetCallRates(ctx, &metricsstore.CallRateQueryParameters{BaseQueryParameters: requestParams.BaseQueryParameters})
	if err != nil {
		return nil, fmt.Errorf("failed getting call metrics: %w", err)
	}
	// No call rate metrics are available, which by association means no error rate metrics are available either.
	if len(callMetrics.Metrics) == 0 {
		return errorMetrics, nil
	}

	// Non-zero call rate metrics are available, which implies there are simply no errors, so we report a zero error rate.
	zeroErrorMetrics := make([]*metrics.Metric, 0, len(callMetrics.Metrics))
	for _, cm := range callMetrics.Metrics {
		zm := *cm
		for i := 0; i < len(zm.MetricPoints); i++ {
			zm.MetricPoints[i].Value = &metrics.MetricPoint_GaugeValue{GaugeValue: &metrics.GaugeValue{Value: &metrics.GaugeValue_DoubleValue{DoubleValue: 0.0}}}
		}
		zeroErrorMetrics = append(zeroErrorMetrics, &zm)
	}

	errorMetrics.Metrics = zeroErrorMetrics
	return errorMetrics, nil
}

// GetMinStepDuration gets the minimum step duration (the smallest possible duration between two data points in a time series) supported.
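To make the new contract concrete, here is a minimal caller-side sketch. It is not part of this PR; the helper name interpretErrorRates and the import path are assumptions. Under the new semantics, an empty Metrics slice means Prometheus holds no data at all for the service, while points carrying a zero gauge value mean the service receives traffic but produces no errors. The type assertions mirror those used in TestGetErrorRatesZero below.

```go
package main

// Import path assumed from the Jaeger repo layout.
import "github.com/jaegertracing/jaeger/proto-gen/api_v2/metrics"

// interpretErrorRates is a hypothetical helper (not from the PR) that
// distinguishes the three outcomes GetErrorRates can now produce.
func interpretErrorRates(family *metrics.MetricFamily) string {
	if len(family.Metrics) == 0 {
		// Neither error nor call rate data exists: Prometheus has no data.
		return "no data available"
	}
	for _, m := range family.Metrics {
		for _, mp := range m.MetricPoints {
			gauge, ok := mp.Value.(*metrics.MetricPoint_GaugeValue)
			if !ok {
				continue
			}
			if dv, ok := gauge.GaugeValue.Value.(*metrics.GaugeValue_DoubleValue); ok && dv.DoubleValue > 0 {
				return "errors observed"
			}
		}
	}
	// Call rate data exists but every error-rate point was zeroed by the reader.
	return "traffic present, zero errors"
}
```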
204 changes: 203 additions & 1 deletion plugin/metrics/prometheus/metricsstore/reader_test.go
@@ -487,6 +487,208 @@ func TestGetErrorRates(t *testing.T) {
}
}

func TestGetErrorRatesZero(t *testing.T) {
	params := metricsstore.ErrorRateQueryParameters{
		BaseQueryParameters: buildTestBaseQueryParametersFrom(metricsTestCase{
			serviceNames: []string{"emailservice"},
			spanKinds:    []string{"SPAN_KIND_SERVER"},
		}),
	}
	tracer, exp, closer := tracerProvider(t)
	defer closer()

	const (
		queryErrorRate = `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
			`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` +
			`sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
		queryCallRate = `sum(rate(calls{service_name =~ "emailservice", ` +
			`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
	)
	wantPromQLQueries := []string{queryErrorRate, queryCallRate}
	responses := []string{"testdata/empty_response.json", "testdata/service_datapoint_response.json"}
	var callCount int

	mockPrometheus := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		body, _ := io.ReadAll(r.Body)
		defer r.Body.Close()

		u, err := url.Parse("http://" + r.Host + r.RequestURI + "?" + string(body))
		require.NoError(t, err)

		q := u.Query()
		promQuery := q.Get("query")
		assert.Equal(t, wantPromQLQueries[callCount], promQuery)
		sendResponse(t, w, responses[callCount])
		callCount++
	}))

	logger := zap.NewNop()
	address := mockPrometheus.Listener.Addr().String()

	cfg := defaultConfig
	cfg.ServerURL = "http://" + address
	cfg.ConnectTimeout = defaultTimeout

	reader, err := NewMetricsReader(cfg, logger, tracer)
	require.NoError(t, err)

	defer mockPrometheus.Close()

	m, err := reader.GetErrorRates(context.Background(), &params)
	require.NoError(t, err)

	require.Len(t, m.Metrics, 1)
	mps := m.Metrics[0].MetricPoints

	require.Len(t, mps, 1)

	// Assert that we essentially zeroed the call rate data point.
	// That is, the timestamp is the same as the call rate's data point, but the value is 0.
	actualVal := mps[0].Value.(*metrics.MetricPoint_GaugeValue).GaugeValue.Value.(*metrics.GaugeValue_DoubleValue).DoubleValue
	assert.Zero(t, actualVal)
	assert.Equal(t, int64(1620351786), mps[0].Timestamp.GetSeconds())
	assert.Len(t, exp.GetSpans(), 2, "expected an error rate query and a call rate query to be made")
}

func TestGetErrorRatesNull(t *testing.T) {
	params := metricsstore.ErrorRateQueryParameters{
		BaseQueryParameters: buildTestBaseQueryParametersFrom(metricsTestCase{
			serviceNames: []string{"emailservice"},
			spanKinds:    []string{"SPAN_KIND_SERVER"},
		}),
	}
	tracer, exp, closer := tracerProvider(t)
	defer closer()

	const (
		queryErrorRate = `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
			`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` +
			`sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
		queryCallRate = `sum(rate(calls{service_name =~ "emailservice", ` +
			`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
	)
	wantPromQLQueries := []string{queryErrorRate, queryCallRate}
	responses := []string{"testdata/empty_response.json", "testdata/empty_response.json"}
	var callCount int

	mockPrometheus := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		body, _ := io.ReadAll(r.Body)
		defer r.Body.Close()

		u, err := url.Parse("http://" + r.Host + r.RequestURI + "?" + string(body))
		require.NoError(t, err)

		q := u.Query()
		promQuery := q.Get("query")
		assert.Equal(t, wantPromQLQueries[callCount], promQuery)
		sendResponse(t, w, responses[callCount])
		callCount++
	}))

	logger := zap.NewNop()
	address := mockPrometheus.Listener.Addr().String()

	cfg := defaultConfig
	cfg.ServerURL = "http://" + address
	cfg.ConnectTimeout = defaultTimeout

	reader, err := NewMetricsReader(cfg, logger, tracer)
	require.NoError(t, err)

	defer mockPrometheus.Close()

	m, err := reader.GetErrorRates(context.Background(), &params)
	require.NoError(t, err)
	assert.Empty(t, m.Metrics, "expect no error data available")
	assert.Len(t, exp.GetSpans(), 2, "expected an error rate query and a call rate query to be made")
}

func TestGetErrorRatesErrors(t *testing.T) {
	for _, tc := range []struct {
		name               string
		failErrorRateQuery bool
		failCallRateQuery  bool
		wantErr            string
	}{
		{
			name:               "error rate query failure",
			failErrorRateQuery: true,
			wantErr:            "failed getting error metrics: failed executing metrics query: server_error: server error: 500",
		},
		{
			name:              "call rate query failure",
			failCallRateQuery: true,
			wantErr:           "failed getting call metrics: failed executing metrics query: server_error: server error: 500",
		},
	} {
		t.Run(tc.name, func(t *testing.T) {
			params := metricsstore.ErrorRateQueryParameters{
				BaseQueryParameters: buildTestBaseQueryParametersFrom(metricsTestCase{
					serviceNames: []string{"emailservice"},
					spanKinds:    []string{"SPAN_KIND_SERVER"},
				}),
			}
			tracer, _, closer := tracerProvider(t)
			defer closer()

			const (
				queryErrorRate = `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
					`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` +
					`sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
				queryCallRate = `sum(rate(calls{service_name =~ "emailservice", ` +
					`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
			)
			wantPromQLQueries := []string{queryErrorRate, queryCallRate}
			responses := []string{"testdata/empty_response.json", "testdata/service_datapoint_response.json"}
			var callCount int

			mockPrometheus := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
				body, _ := io.ReadAll(r.Body)
				defer r.Body.Close()

				u, err := url.Parse("http://" + r.Host + r.RequestURI + "?" + string(body))
				require.NoError(t, err)

				q := u.Query()
				promQuery := q.Get("query")
				assert.Equal(t, wantPromQLQueries[callCount], promQuery)

				switch promQuery {
				case queryErrorRate:
					if tc.failErrorRateQuery {
						w.WriteHeader(http.StatusInternalServerError)
					} else {
						sendResponse(t, w, responses[callCount])
					}
				case queryCallRate:
					if tc.failCallRateQuery {
						w.WriteHeader(http.StatusInternalServerError)
					} else {
						sendResponse(t, w, responses[callCount])
					}
				}
				callCount++
			}))

			logger := zap.NewNop()
			address := mockPrometheus.Listener.Addr().String()

			cfg := defaultConfig
			cfg.ServerURL = "http://" + address
			cfg.ConnectTimeout = defaultTimeout

			reader, err := NewMetricsReader(cfg, logger, tracer)
			require.NoError(t, err)

			defer mockPrometheus.Close()

			_, err = reader.GetErrorRates(context.Background(), &params)
			require.Error(t, err)
			assert.EqualError(t, err, tc.wantErr)
		})
	}
}

func TestInvalidLatencyUnit(t *testing.T) {
	defer func() {
		if r := recover(); r == nil {
@@ -515,7 +717,7 @@ func TestWarningResponse(t *testing.T) {
	m, err := reader.GetErrorRates(context.Background(), &params)
	require.NoError(t, err)
	assert.NotNil(t, m)
-	assert.Len(t, exp.GetSpans(), 1, "HTTP request was traced and span reported")
+	assert.Len(t, exp.GetSpans(), 2, "expected an error rate query and a call rate query to be made")
}

type fakePromServer struct {
7 changes: 7 additions & 0 deletions plugin/metrics/prometheus/metricsstore/testdata/empty_response.json
@@ -0,0 +1,7 @@
{
  "status": "success",
  "data": {
    "resultType": "matrix",
    "result": []
  }
}
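The tests above stream these fixtures through a sendResponse helper that already exists in reader_test.go but falls outside this diff. The following sketch is an assumption inferred only from the helper's call sites (a *testing.T, the http.ResponseWriter, and a fixture path), not the actual implementation:

```go
import (
	"net/http"
	"os"
	"testing"

	"github.com/stretchr/testify/require"
)

// Assumed behavior of the existing sendResponse test helper: read a JSON
// fixture from disk and write it verbatim as the mock Prometheus response.
func sendResponse(t *testing.T, w http.ResponseWriter, responseFile string) {
	body, err := os.ReadFile(responseFile)
	require.NoError(t, err)
	_, err = w.Write(body)
	require.NoError(t, err)
}
```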