From a6aa2ba55b8d07a070097a306ba8aa5491fcdc68 Mon Sep 17 00:00:00 2001 From: Saylor Berman Date: Fri, 8 Mar 2024 12:26:56 -0700 Subject: [PATCH] Code review, gather logs from gcloud --- tests/Makefile | 2 +- tests/README.md | 8 +++-- tests/scripts/create-gke-cluster.sh | 3 +- tests/scripts/run-tests-gcp-vm.sh | 26 ++++++++++++-- tests/suite/longevity_test.go | 54 ++++------------------------- 5 files changed, 40 insertions(+), 53 deletions(-) diff --git a/tests/Makefile b/tests/Makefile index a561b7ea8..463fb6f01 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -80,7 +80,7 @@ run-tests-on-vm: ## Run the functional tests on a GCP VM .PHONY: nfr-test nfr-test: ## Run the NFR tests on a GCP VM - bash scripts/run-tests-gcp-vm.sh true + NFR=true bash scripts/run-tests-gcp-vm.sh .PHONY: start-longevity-test start-longevity-test: ## Start the longevity test to run for 4 days in GKE diff --git a/tests/README.md b/tests/README.md index 9c81467dd..50d0f5945 100644 --- a/tests/README.md +++ b/tests/README.md @@ -218,13 +218,17 @@ To start the longevity test, set up your VM (`create-and-setup-vm`) and run make start-longevity-test ``` + +> Note: If you want to change the time period for which the test runs, update the `wrk` commands in `suite/scripts/longevity-wrk.sh` to the time period you want, and run `make sync-files-to-vm`. + + > Note: If you want to re-run the longevity test, you need to clear out the `cafe.example.com` entry from the `/etc/hosts` file on your VM. You can verify the test is working by checking nginx logs to see traffic flow, and check that the cronjob is running and redeploying apps. -To complete the longevity test and collect results, first visit the [GCP Monitoring Dashboards](https://console.cloud.google.com/monitoring/dashboards) page and select the `NGF Longevity Test` dashboard. Take PNG screenshots of each chart for the time period in which your test ran, and save those to be added to the results file. 
+After 4 days (96h), you can complete the longevity tests and collect results. To ensure that the traffic has stopped flowing, you can ssh to the VM using `gcloud compute ssh` and run `ps aux | grep wrk` to verify the `wrk` commands are no longer running. Then, visit the [GCP Monitoring Dashboards](https://console.cloud.google.com/monitoring/dashboards) page and select the `NGF Longevity Test` dashboard. Take PNG screenshots of each chart for the time period in which your test ran, and save those to be added to the results file. -Next, run: +Finally, run ```makefile make stop-longevity-test diff --git a/tests/scripts/create-gke-cluster.sh b/tests/scripts/create-gke-cluster.sh index 1e05db4d4..9d034e1c6 100644 --- a/tests/scripts/create-gke-cluster.sh +++ b/tests/scripts/create-gke-cluster.sh @@ -15,7 +15,8 @@ gcloud container clusters create ${GKE_CLUSTER_NAME} \ --enable-private-nodes \ --master-ipv4-cidr 172.16.${ip_random_digit}.32/28 \ --metadata=block-project-ssh-keys=TRUE \ - --monitoring=SYSTEM,POD,DEPLOYMENT + --monitoring=SYSTEM,POD,DEPLOYMENT \ + --logging=SYSTEM,WORKLOAD # Add current IP to GKE master control node access, if this script is not invoked during a CI run. if [ "${IS_CI}" = "false" ]; then diff --git a/tests/scripts/run-tests-gcp-vm.sh b/tests/scripts/run-tests-gcp-vm.sh index 521cd078a..7f35e26c0 100644 --- a/tests/scripts/run-tests-gcp-vm.sh +++ b/tests/scripts/run-tests-gcp-vm.sh @@ -2,8 +2,6 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -NFR=${1:-false} - source scripts/vars.env SCRIPT=run-tests.sh @@ -21,3 +19,27 @@ gcloud compute ssh --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} username@ if [ "${NFR}" = "true" ]; then gcloud compute scp --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} --recurse username@${RESOURCE_NAME}:~/nginx-gateway-fabric/tests/results . 
fi + +## If tearing down the longevity test, we need to collect logs from gcloud and append them to the results file (log text is always passed to printf as a %s argument, never as the format string, so '%' and backslashes in log lines are written verbatim) +if [ "${STOP_LONGEVITY}" = "true" ]; then + version=${NGF_VERSION} + if [ "$version" = "" ]; then + version=${TAG} + fi + + results="${SCRIPT_DIR}/../results/longevity/$version/$version.md" + printf "\n## Error Logs\n\n" >> "$results" + + ## ngf error logs (queried twice because entries may carry either a textPayload or a jsonPayload) + ngfErrText=$(gcloud logging read --project=${GKE_PROJECT} 'resource.labels.cluster_name='"${RESOURCE_NAME}"' AND resource.type=k8s_container AND resource.labels.container_name=nginx-gateway AND severity=ERROR AND SEARCH("error")' --format "value(textPayload)") + ngfErrJSON=$(gcloud logging read --project=${GKE_PROJECT} 'resource.labels.cluster_name='"${RESOURCE_NAME}"' AND resource.type=k8s_container AND resource.labels.container_name=nginx-gateway AND severity=ERROR AND SEARCH("error")' --format "value(jsonPayload)") + printf "### nginx-gateway\n%s\n%s\n\n" "$ngfErrText" "$ngfErrJSON" >> "$results" + + ## nginx error logs + ngxErr=$(gcloud logging read --project=${GKE_PROJECT} 'resource.labels.cluster_name='"${RESOURCE_NAME}"' AND resource.type=k8s_container AND resource.labels.container_name=nginx AND severity=ERROR AND SEARCH("`[warn]`") OR SEARCH("`[error]`") OR SEARCH("`[emerg]`")' --format "value(textPayload)") + printf "### nginx\n%s\n\n" "$ngxErr" >> "$results" + + ## nginx non-200 responses (also filter out 499 since wrk cancels connections) + ngxNon200=$(gcloud logging read --project=${GKE_PROJECT} 'resource.labels.cluster_name='"${RESOURCE_NAME}"' AND resource.type=k8s_container AND resource.labels.container_name=nginx AND "GET" "HTTP/1.1" -"200" -"499" -"client prematurely closed connection"' --format "value(textPayload)") + printf "%s\n\n" "$ngxNon200" >> "$results" +fi diff --git a/tests/suite/longevity_test.go b/tests/suite/longevity_test.go index 4542c2a84..0f1382620 100644 --- a/tests/suite/longevity_test.go +++ b/tests/suite/longevity_test.go @@ -1,11 +1,7 @@ package suite import ( - "bufio" - "bytes" - "context" 
"fmt" - "io" "os" "path/filepath" "strings" @@ -81,57 +77,21 @@ var _ = Describe("Longevity", Label("longevity-setup", "longevity-teardown"), fu Expect(err).ToNot(HaveOccurred()) Expect(framework.WriteContent(resultsFile, "\n## Traffic\n")) - writeTrafficResults(resultsFile, homeDir, "coffee.txt", "HTTP") - writeTrafficResults(resultsFile, homeDir, "tea.txt", "HTTPS") - - // gather any error logs - names, err := framework.GetReadyNGFPodNames(k8sClient, ngfNamespace, releaseName, timeoutConfig.GetTimeout) - Expect(err).ToNot(HaveOccurred()) - - Expect(framework.WriteContent(resultsFile, "\n## Error Logs\n")) - writeErrorLogs(resultsFile, names[0], "nginx-gateway") - writeErrorLogs(resultsFile, names[0], "nginx") + Expect(writeTrafficResults(resultsFile, homeDir, "coffee.txt", "HTTP")).To(Succeed()) + Expect(writeTrafficResults(resultsFile, homeDir, "tea.txt", "HTTPS")).To(Succeed()) Expect(resourceManager.DeleteFromFiles(files, ns.Name)).To(Succeed()) Expect(resourceManager.Delete([]client.Object{ns})).To(Succeed()) }) }) -func writeTrafficResults(resultsFile *os.File, homeDir, filename, testname string) { +func writeTrafficResults(resultsFile *os.File, homeDir, filename, testname string) error { file := fmt.Sprintf("%s/%s", homeDir, filename) content, err := os.ReadFile(file) - Expect(err).ToNot(HaveOccurred()) - - formattedContent := fmt.Sprintf("%s:\n\n```text\n%s```\n", testname, string(content)) - Expect(framework.WriteContent(resultsFile, formattedContent)).To(Succeed()) -} - -func writeErrorLogs(resultsFile *os.File, pod, container string) { - logReq := clientGoClient.CoreV1().Pods(ngfNamespace).GetLogs(pod, &core.PodLogOptions{Container: container}) - - ctx, cancel := context.WithTimeout(context.Background(), timeoutConfig.GetTimeout) - defer cancel() - - logs, err := logReq.Stream(ctx) - Expect(err).ToNot(HaveOccurred()) - defer logs.Close() - - buf := new(bytes.Buffer) - _, err = io.Copy(buf, logs) - Expect(err).ToNot(HaveOccurred()) - - 
Expect(framework.WriteContent(resultsFile, fmt.Sprintf("\n### %s\n", container))) - - scanner := bufio.NewScanner(strings.NewReader(buf.String())) - for scanner.Scan() { - line := scanner.Text() - if isError(line) { - Expect(framework.WriteContent(resultsFile, line)).To(Succeed()) - } + if err != nil { + return err } - Expect(scanner.Err()).ToNot(HaveOccurred()) -} -func isError(line string) bool { - return strings.Contains(line, "error") || strings.Contains(line, "warn") || strings.Contains(line, "emerg") + formattedContent := fmt.Sprintf("%s:\n\n```text\n%s```\n", testname, string(content)) + return framework.WriteContent(resultsFile, formattedContent) }