Commit

Merge pull request #22 from box/metrics-update
Refactor metrics, add tests
gregory-lyons committed Aug 1, 2017
2 parents 5128ed0 + 687fa05 commit 6ebfe2d
Showing 12 changed files with 240 additions and 151 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -123,6 +123,7 @@ In rare cases, you may wish to trigger a kube-applier run without checking in a
![screenshot](https://github.com/box/kube-applier/raw/master/static/img/status_page_screenshot.png "Status Page Screenshot")

kube-applier hosts a status page on a webserver, served at the service endpoint URL. The status page displays information about the most recent apply run, including:
* Run Type
* Start and end times
* Latency
* Most recent commit
@@ -135,7 +136,7 @@ The HTML template for the status page lives in `templates/status.html`, and `sta

### Metrics
kube-applier uses [Prometheus](https://github.com/prometheus/client_golang) for metrics. Metrics are hosted on the webserver at /metrics (status UI is the index page). In addition to the Prometheus default metrics, the following custom metrics are included:
* **run_latency_seconds** - A [Summary](https://godoc.org/github.com/prometheus/client_golang/prometheus#Summary) that keeps track of the durations of each apply run, tagged with a boolean for whether or not the run was a success (i.e. no failed apply attempts).
* **run_latency_seconds** - A [Summary](https://godoc.org/github.com/prometheus/client_golang/prometheus#Summary) that keeps track of the durations of each apply run, tagged with the run type and a boolean for whether or not the run was a success (i.e. no failed apply attempts).
* **file_apply_count** - A [Counter](https://godoc.org/github.com/prometheus/client_golang/prometheus#Counter) for each file that has had an apply attempt over the lifetime of the container, incremented with each apply attempt and tagged by the filepath and the result of the attempt.

The Prometheus [HTTP API](https://prometheus.io/docs/querying/api/) (also see the [Go library](https://github.com/prometheus/client_golang/tree/master/api/prometheus)) can be used for querying the metrics server.
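For reference, a scrape of the /metrics endpoint returns these metrics in the standard Prometheus text exposition format. The sample below is purely illustrative: the file path, the counts, and the exact `run_type` label value are placeholders, but the metric and label names match those registered in `metrics/prometheus.go`.

```
# HELP file_apply_count Success metric for every file applied
# TYPE file_apply_count counter
file_apply_count{file="deployments/app.yaml",success="true"} 4
file_apply_count{file="deployments/app.yaml",success="false"} 1
# TYPE run_latency_seconds summary
run_latency_seconds_count{run_type="FullRun",success="true"} 3
run_latency_seconds_sum{run_type="FullRun",success="true"} 42.7
```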
15 changes: 10 additions & 5 deletions main.go
@@ -43,9 +43,6 @@ func main() {
log.Fatalf("Invalid DIFF_URL_FORMAT, must contain %q: %v", "%s", diffURLFormat)
}

metrics := &metrics.Prometheus{}
metrics.Init()

clock := &sysutil.Clock{}

if err := sysutil.WaitForDir(repoPath, clock, waitForRepoInterval); err != nil {
@@ -55,7 +52,6 @@ func main() {
kubeClient := &kube.Client{Server: server}
kubeClient.Configure()

batchApplier := &run.BatchApplier{kubeClient, metrics}
gitUtil := &git.GitUtil{repoPath}
fileSystem := &sysutil.FileSystem{}
listFactory := &applylist.Factory{repoPath, blacklistPath, whitelistPath, fileSystem}
@@ -74,6 +70,10 @@ func main() {
// Limit of 5 is arbitrary - there is significant delay between sends, and receives are handled near instantaneously.
runResults := make(chan run.Result, 5)

// Runner sends run results to the runMetrics channel; the metrics handler receives the results and updates its metrics.
// Limit of 5 is arbitrary - there is significant delay between sends, and receives are handled near instantaneously.
runMetrics := make(chan run.Result, 5)

// Runner, webserver, and scheduler all send fatal errors to errors channel, and main() exits upon receiving an error.
// No limit needed, as a single fatal error will exit the program anyway.
errors := make(chan error)
@@ -84,6 +84,10 @@ func main() {
// The runner will block on popping the current count until it is updated.
runCount := make(chan int)

metrics := &metrics.Prometheus{RunMetrics: runMetrics}
metrics.Configure()
batchApplier := &run.BatchApplier{kubeClient}

pollTicker := time.Tick(pollInterval)
fullRunTicker := time.Tick(fullRunInterval)

@@ -92,18 +96,19 @@ func main() {
listFactory,
gitUtil,
clock,
metrics,
diffURLFormat,
"",
quickRunQueue,
fullRunQueue,
runResults,
runMetrics,
errors,
runCount,
}
scheduler := &run.Scheduler{gitUtil, pollTicker, fullRunTicker, quickRunQueue, fullRunQueue, errors, ""}
webserver := &webserver.WebServer{listenPort, clock, metrics.GetHandler(), fullRunQueue, runResults, errors}

go metrics.StartMetricsLoop()
go scheduler.Start()
go runner.StartRunCounter()
go runner.StartQuickLoop()
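The wiring above is the core of this refactor: rather than BatchApplier calling metrics methods directly, the Runner now sends each run.Result onto the buffered runMetrics channel, and the Prometheus struct consumes it in StartMetricsLoop. The following is a minimal, self-contained sketch of that producer/consumer pattern; the type and field names are illustrative stand-ins, not kube-applier's actual API.

```go
package main

import (
	"fmt"
	"time"
)

// result is a stand-in for kube-applier's run.Result; the fields here are
// illustrative, not the real struct.
type result struct {
	start, finish time.Time
	failures      []string
}

// metricsSink is a stand-in for metrics.Prometheus: it owns a receive-only
// channel and updates its metrics in a single goroutine, so producers never
// need a reference to the metrics object.
type metricsSink struct {
	results <-chan result
}

// startLoop drains the channel and "updates metrics" for each result.
func (m *metricsSink) startLoop(done chan<- struct{}) {
	for r := range m.results {
		fmt.Printf("run success=%v latency=%v failures=%d\n",
			len(r.failures) == 0, r.finish.Sub(r.start), len(r.failures))
	}
	close(done)
}

func main() {
	// Buffered, mirroring the "limit of 5" channels in main.go.
	results := make(chan result, 5)
	done := make(chan struct{})
	sink := &metricsSink{results: results}
	go sink.startLoop(done)

	// A producer (the Runner, in kube-applier) just sends results on the channel.
	results <- result{start: time.Now(), finish: time.Now().Add(2 * time.Second)}
	close(results)
	<-done // wait for the loop to drain before exiting
}
```

Because only the metrics goroutine touches the counters, the Runner and BatchApplier no longer need a metrics dependency, which is why the PrometheusInterface mock (metrics/mock_prometheus.go, deleted below) is no longer needed.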
45 changes: 0 additions & 45 deletions metrics/mock_prometheus.go

This file was deleted.

42 changes: 25 additions & 17 deletions metrics/prometheus.go
@@ -1,21 +1,17 @@
package metrics

import (
"github.com/box/kube-applier/run"
"github.com/prometheus/client_golang/prometheus"
"net/http"
"strconv"
)

// PrometheusInterface allows for mocking out the functionality of Prometheus when testing the full process of an apply run.
type PrometheusInterface interface {
UpdateFileSuccess(string, bool)
UpdateRunLatency(float64, bool)
}

// Prometheus implements instrumentation of metrics for kube-applier.
// fileApplyCount is a Counter vector to increment the number of successful and failed apply attempts for each file in the repo.
// runLatency is a Summary vector that keeps track of the duration for apply runs.
type Prometheus struct {
RunMetrics <-chan run.Result
fileApplyCount *prometheus.CounterVec
runLatency *prometheus.SummaryVec
}
@@ -25,8 +21,8 @@ func (p *Prometheus) GetHandler() http.Handler {
return prometheus.UninstrumentedHandler()
}

// Init creates and registers the custom metrics for kube-applier.
func (p *Prometheus) Init() {
// Configure creates and registers the custom metrics for kube-applier; StartMetricsLoop is run separately to receive run results.
func (p *Prometheus) Configure() {
p.fileApplyCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "file_apply_count",
Help: "Success metric for every file applied",
@@ -45,23 +41,35 @@ func (p *Prometheus) Init() {
[]string{
// Result: true if the run was successful, false otherwise
"success",
// FullRun or QuickRun
"run_type",
},
)

prometheus.MustRegister(p.fileApplyCount)
prometheus.MustRegister(p.runLatency)
}

// UpdateFileSuccess increments the given file's Counter for either successful apply attempts or failed apply attempts.
func (p *Prometheus) UpdateFileSuccess(file string, success bool) {
p.fileApplyCount.With(prometheus.Labels{
"file": file, "success": strconv.FormatBool(success),
}).Inc()
// StartMetricsLoop receives from the RunMetrics channel and calls processResult when a run result comes in.
func (p *Prometheus) StartMetricsLoop() {
for result := range p.RunMetrics {
p.processResult(result)
}
}

// UpdateRunLatency adds a data point (latency of the most recent run) to the run_latency_seconds Summary metric, with a tag indicating whether or not the run was successful.
func (p *Prometheus) UpdateRunLatency(runLatency float64, success bool) {
// processResult parses a run result for info and updates the metrics (file_apply_count and run_latency_seconds).
func (p *Prometheus) processResult(result run.Result) {
runSuccess := len(result.Failures) == 0
runType := result.RunType
latency := result.Finish.Sub(result.Start).Seconds()
for _, successFile := range result.Successes {
p.fileApplyCount.With(prometheus.Labels{"file": successFile.FilePath, "success": "true"}).Inc()
}
for _, failureFile := range result.Failures {
p.fileApplyCount.With(prometheus.Labels{"file": failureFile.FilePath, "success": "false"}).Inc()
}
p.runLatency.With(prometheus.Labels{
"success": strconv.FormatBool(success),
}).Observe(runLatency)
"success": strconv.FormatBool(runSuccess),
"run_type": string(runType),
}).Observe(latency)
}
139 changes: 139 additions & 0 deletions metrics/prometheus_test.go
@@ -0,0 +1,139 @@
package metrics

import (
"fmt"
"github.com/box/kube-applier/run"
"github.com/stretchr/testify/assert"
"net/http"
"net/http/httptest"
"regexp"
"testing"
)

type testCase struct {
successes []run.ApplyAttempt
failures []run.ApplyAttempt
runType run.RunType
expectedPatterns []string
}

// TestPrometheusProcessResult tests the processResult() function to ensure that the metrics page is updated properly.
// With each "test case", we construct a fake run.Result and call processResult.
// We then make a request to the metrics page handler and use regexp patterns to check
// that the raw output has the expected state for each metric.
// Note that filenames are reused across test cases to ensure that the metrics accumulate correctly from one run to the next.
func TestPrometheusProcessResult(t *testing.T) {
runMetrics := make(chan run.Result, 5)
p := &Prometheus{RunMetrics: runMetrics}
p.Configure()

testCases := []testCase{
// Case 1: No successes, no failures, full run
{
[]run.ApplyAttempt{},
[]run.ApplyAttempt{},
run.FullRun,
[]string{
// Expect count 1 for latency metric with run_type=fullRun, success=true
makeLatencyPattern(run.FullRun, true, 1),
},
},
// Case 2: Successes, no failures, full run
{
[]run.ApplyAttempt{{FilePath: "file1"}, {FilePath: "file2"}},
[]run.ApplyAttempt{},
run.FullRun,
[]string{
// Expect count 2 for latency metric with run_type=fullRun, success=true
makeLatencyPattern(run.FullRun, true, 2),
// Expect count 1 for file1 with success=true
makeFilePattern("file1", true, 1),
// Expect count 1 for file2 with success=true
makeFilePattern("file2", true, 1),
},
},
// Case 3: Successes, failures, full run
{
[]run.ApplyAttempt{{FilePath: "file1"}, {FilePath: "file3"}},
[]run.ApplyAttempt{{FilePath: "file2"}},
run.FullRun,
[]string{
// Expect count 1 for latency metric with run_type=fullRun, success=false
makeLatencyPattern(run.FullRun, false, 1),
// Expect count 2 for file1 with success=true
makeFilePattern("file1", true, 2),
// Expect count 1 for file3 with success=true
makeFilePattern("file3", true, 1),
// Expect count 1 for file2 with success=false
makeFilePattern("file2", false, 1),

// Ensure that previous metrics remain unchanged.
// Expect count 2 for latency metric with run_type=fullRun, success=true
makeLatencyPattern(run.FullRun, true, 2),
// Expect count 1 for file2 with success=true
makeFilePattern("file2", true, 1),
},
},
// Case 4: Successes, failures, quick run
{
[]run.ApplyAttempt{{FilePath: "file1"}, {FilePath: "file3"}},
[]run.ApplyAttempt{{FilePath: "file2"}},
run.QuickRun,
[]string{
// Expect count 1 for latency metric with run_type=quickRun, success=false
makeLatencyPattern(run.QuickRun, false, 1),
// Expect count 3 for file1 with success=true
makeFilePattern("file1", true, 3),
// Expect count 2 for file3 with success=true
makeFilePattern("file3", true, 2),
// Expect count 2 for file2 with success=false
makeFilePattern("file2", false, 2),

// Ensure that previous metrics remain unchanged.
// Expect count 2 for latency metric with run_type=fullRun, success=true
makeLatencyPattern(run.FullRun, true, 2),
// Expect count 1 for latency metric with run_type=fullRun, success=false
makeLatencyPattern(run.FullRun, false, 1),
// Expect count 1 for file2 with success=true
makeFilePattern("file2", true, 1),
},
},
}

for _, tc := range testCases {
processAndCheckOutput(t, p, tc)
}
}

// Request content body from the handler.
func requestContentBody(handler http.Handler) string {
req, _ := http.NewRequest("GET", "", nil)
w := httptest.NewRecorder()
handler.ServeHTTP(w, req)
return w.Body.String()
}

// Build a regex pattern for file_apply_count metric.
func makeFilePattern(filename string, success bool, count int) string {
return fmt.Sprintf(
"\\bfile_apply_count\\{file\\=\"%v\",success\\=\"%v\"\\} %v\\b",
filename, success, count)
}

// Build a regex pattern for run_latency_seconds_count metric.
func makeLatencyPattern(runType run.RunType, success bool, count int) string {
return fmt.Sprintf(
"\\brun_latency_seconds_count\\{run_type\\=\"%v\",success\\=\"%v\"\\} %v\\b",
runType, success, count)
}

// Process the test case and check that the metrics output contains the expected patterns.
func processAndCheckOutput(t *testing.T, p *Prometheus, tc testCase) {
assert := assert.New(t)
result := run.Result{Successes: tc.successes, Failures: tc.failures, RunType: tc.runType}
p.processResult(result)
metricsRaw := requestContentBody(p.GetHandler())
for _, pattern := range tc.expectedPatterns {
assert.True(regexp.MatchString(pattern, metricsRaw))
}
}
5 changes: 1 addition & 4 deletions run/batch_applier.go
@@ -2,7 +2,6 @@ package run

import (
"github.com/box/kube-applier/kube"
"github.com/box/kube-applier/metrics"
"log"
)

@@ -19,10 +18,9 @@ type BatchApplierInterface interface {
Apply(int, []string) (successes []ApplyAttempt, failures []ApplyAttempt)
}

// BatchApplier makes apply calls for a batch of files, and updates metrics based on the results of each call.
// BatchApplier makes apply calls for a batch of files.
type BatchApplier struct {
KubeClient kube.ClientInterface
Metrics metrics.PrometheusInterface
}

// Apply takes a list of files and attempts an apply command on each, labeling logs with the run ID.
@@ -47,7 +45,6 @@ func (a *BatchApplier) Apply(id int, applyList []string) (successes []ApplyAttem
failures = append(failures, appliedFile)
log.Printf("RUN %v: %v\n%v\n%v", id, cmd, output, appliedFile.ErrorMessage)
}
a.Metrics.UpdateFileSuccess(path, success)
}
return successes, failures
}