diff --git a/Makefile b/Makefile index 3f656da46..262f98b9d 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,12 @@ run-appmesh: -slack-url=https://hooks.slack.com/services/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 \ -slack-channel="devops-alerts" +run-nginx: + go run cmd/flagger/* -kubeconfig=$$HOME/.kube/config -log-level=info -mesh-provider=nginx -namespace=nginx \ + -metrics-server=http://prometheus-weave.istio.weavedx.com \ + -slack-url=https://hooks.slack.com/services/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 \ + -slack-channel="devops-alerts" + build: docker build -t weaveworks/flagger:$(TAG) . -f Dockerfile diff --git a/artifacts/flagger/account.yaml b/artifacts/flagger/account.yaml index 9bbd84107..0fac89ad3 100644 --- a/artifacts/flagger/account.yaml +++ b/artifacts/flagger/account.yaml @@ -31,6 +31,12 @@ rules: resources: - horizontalpodautoscalers verbs: ["*"] + - apiGroups: + - "extensions" + resources: + - ingresses + - ingresses/status + verbs: ["*"] - apiGroups: - flagger.app resources: diff --git a/artifacts/flagger/crd.yaml b/artifacts/flagger/crd.yaml index 134020e63..61eac4087 100644 --- a/artifacts/flagger/crd.yaml +++ b/artifacts/flagger/crd.yaml @@ -69,6 +69,18 @@ spec: type: string name: type: string + ingressRef: + anyOf: + - type: string + - type: object + required: ['apiVersion', 'kind', 'name'] + properties: + apiVersion: + type: string + kind: + type: string + name: + type: string service: type: object required: ['port'] diff --git a/artifacts/nginx/canary.yaml b/artifacts/nginx/canary.yaml new file mode 100644 index 000000000..bca0a7098 --- /dev/null +++ b/artifacts/nginx/canary.yaml @@ -0,0 +1,59 @@ +apiVersion: flagger.app/v1alpha3 +kind: Canary +metadata: + name: podinfo + namespace: test +spec: + # deployment reference + targetRef: + apiVersion: apps/v1 + kind: Deployment + name: podinfo + # ingress reference + ingressRef: + apiVersion: extensions/v1beta1 + kind: Ingress + name: podinfo + # HPA reference (optional) + autoscalerRef: + apiVersion: autoscaling/v2beta1 + kind: HorizontalPodAutoscaler + name: podinfo + # the maximum time in seconds for the canary deployment + # to make progress before it is rollback (default 600s) + progressDeadlineSeconds: 60 + service: + # container port + port: 9898 + canaryAnalysis: + # schedule interval (default 60s) + interval: 10s + # max number of failed metric checks before rollback + threshold: 10 + # max traffic percentage routed to canary + # percentage (0-100) + maxWeight: 50 + # canary increment step + # percentage (0-100) + stepWeight: 5 + # NGINX Prometheus checks + metrics: + - name: request-success-rate + # minimum req success rate (non 5xx responses) + # percentage (0-100) + threshold: 99 + interval: 1m + - name: request-duration + # maximum avg req duration + # milliseconds + threshold: 500 + interval: 1m + # external checks (optional) + webhooks: + - name: load-test + url: http://flagger-loadtester.test/ + timeout: 5s + metadata: + type: cmd + cmd: "hey -z 1m -q 10 -c 2 http://app.example.com/" + logCmdOutput: "true" diff --git a/artifacts/nginx/deployment.yaml b/artifacts/nginx/deployment.yaml new file mode 100644 index 000000000..814dd9c2f --- /dev/null +++ b/artifacts/nginx/deployment.yaml @@ -0,0 +1,69 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: podinfo + namespace: test + labels: + app: podinfo +spec: + replicas: 1 + strategy: + rollingUpdate: + maxUnavailable: 0 + type: RollingUpdate + selector: + matchLabels: + app: podinfo + template: + metadata: + annotations: + 
prometheus.io/scrape: "true" + labels: + app: podinfo + spec: + containers: + - name: podinfod + image: quay.io/stefanprodan/podinfo:1.4.0 + imagePullPolicy: IfNotPresent + ports: + - containerPort: 9898 + name: http + protocol: TCP + command: + - ./podinfo + - --port=9898 + - --level=info + - --random-delay=false + - --random-error=false + env: + - name: PODINFO_UI_COLOR + value: green + livenessProbe: + exec: + command: + - podcli + - check + - http + - localhost:9898/healthz + failureThreshold: 3 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 2 + readinessProbe: + exec: + command: + - podcli + - check + - http + - localhost:9898/readyz + failureThreshold: 3 + periodSeconds: 3 + successThreshold: 1 + timeoutSeconds: 2 + resources: + limits: + cpu: 1000m + memory: 256Mi + requests: + cpu: 100m + memory: 16Mi diff --git a/artifacts/nginx/hpa.yaml b/artifacts/nginx/hpa.yaml new file mode 100644 index 000000000..fa2b5a6f4 --- /dev/null +++ b/artifacts/nginx/hpa.yaml @@ -0,0 +1,19 @@ +apiVersion: autoscaling/v2beta1 +kind: HorizontalPodAutoscaler +metadata: + name: podinfo + namespace: test +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: podinfo + minReplicas: 2 + maxReplicas: 4 + metrics: + - type: Resource + resource: + name: cpu + # scale up if usage is above + # 99% of the requested CPU (100m) + targetAverageUtilization: 99 diff --git a/artifacts/nginx/ingress.yaml b/artifacts/nginx/ingress.yaml new file mode 100644 index 000000000..5cb6b8269 --- /dev/null +++ b/artifacts/nginx/ingress.yaml @@ -0,0 +1,17 @@ +apiVersion: extensions/v1beta1 +kind: Ingress +metadata: + name: podinfo + namespace: test + labels: + app: podinfo + annotations: + kubernetes.io/ingress.class: "nginx" +spec: + rules: + - host: app.exmaple.com + http: + paths: + - backend: + serviceName: podinfo + servicePort: 9898 diff --git a/charts/flagger/templates/crd.yaml b/charts/flagger/templates/crd.yaml index aad996b89..f189a31a0 100644 --- a/charts/flagger/templates/crd.yaml +++ b/charts/flagger/templates/crd.yaml @@ -70,6 +70,18 @@ spec: type: string name: type: string + ingressRef: + anyOf: + - type: string + - type: object + required: ['apiVersion', 'kind', 'name'] + properties: + apiVersion: + type: string + kind: + type: string + name: + type: string service: type: object required: ['port'] diff --git a/charts/flagger/templates/deployment.yaml b/charts/flagger/templates/deployment.yaml index c23ce93b2..0df390b75 100644 --- a/charts/flagger/templates/deployment.yaml +++ b/charts/flagger/templates/deployment.yaml @@ -38,7 +38,11 @@ spec: {{- if .Values.meshProvider }} - -mesh-provider={{ .Values.meshProvider }} {{- end }} + {{- if .Values.prometheus.install }} + - -metrics-server=http://{{ template "flagger.fullname" . }}-prometheus:9090 + {{- else }} - -metrics-server={{ .Values.metricsServer }} + {{- end }} {{- if .Values.namespace }} - -namespace={{ .Values.namespace }} {{- end }} diff --git a/charts/flagger/templates/prometheus.yaml b/charts/flagger/templates/prometheus.yaml new file mode 100644 index 000000000..f1fe583b4 --- /dev/null +++ b/charts/flagger/templates/prometheus.yaml @@ -0,0 +1,292 @@ +{{- if .Values.prometheus.install }} +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: {{ template "flagger.fullname" . }}-prometheus + labels: + helm.sh/chart: {{ template "flagger.chart" . }} + app.kubernetes.io/name: {{ template "flagger.name" . 
}} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/instance: {{ .Release.Name }} +rules: + - apiGroups: [""] + resources: + - nodes + - services + - endpoints + - pods + - nodes/proxy + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: + - configmaps + verbs: ["get"] + - nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: {{ template "flagger.fullname" . }}-prometheus + labels: + helm.sh/chart: {{ template "flagger.chart" . }} + app.kubernetes.io/name: {{ template "flagger.name" . }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/instance: {{ .Release.Name }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ template "flagger.fullname" . }}-prometheus +subjects: + - kind: ServiceAccount + name: {{ template "flagger.serviceAccountName" . }}-prometheus + namespace: {{ .Release.Namespace }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ template "flagger.serviceAccountName" . }}-prometheus + namespace: {{ .Release.Namespace }} + labels: + helm.sh/chart: {{ template "flagger.chart" . }} + app.kubernetes.io/name: {{ template "flagger.name" . }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/instance: {{ .Release.Name }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ template "flagger.fullname" . }}-prometheus + namespace: {{ .Release.Namespace }} + labels: + helm.sh/chart: {{ template "flagger.chart" . }} + app.kubernetes.io/name: {{ template "flagger.name" . }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/instance: {{ .Release.Name }} +data: + prometheus.yml: |- + global: + scrape_interval: 5s + scrape_configs: + + # Scrape config for AppMesh Envoy sidecar + - job_name: 'appmesh-envoy' + metrics_path: /stats/prometheus + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + action: keep + regex: '^envoy$' + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: ${1}:9901 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + + # Exclude high cardinality metrics + metric_relabel_configs: + - source_labels: [ cluster_name ] + regex: '(outbound|inbound|prometheus_stats).*' + action: drop + - source_labels: [ tcp_prefix ] + regex: '(outbound|inbound|prometheus_stats).*' + action: drop + - source_labels: [ listener_address ] + regex: '(.+)' + action: drop + - source_labels: [ http_conn_manager_listener_prefix ] + regex: '(.+)' + action: drop + - source_labels: [ http_conn_manager_prefix ] + regex: '(.+)' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_tls.*' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_tcp_downstream.*' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_http_(stats|admin).*' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_cluster_(lb|retry|bind|internal|max|original).*' + action: drop + + # Scrape config for API servers + - job_name: 'kubernetes-apiservers' + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - default + scheme: https + 
tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: kubernetes;https + + # Scrape config for nodes + - job_name: 'kubernetes-nodes' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + + # scrape config for cAdvisor + - job_name: 'kubernetes-cadvisor' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + + # scrape config for pods + - job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - source_labels: [ __address__ ] + regex: '.*9901.*' + action: drop + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_pod_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: kubernetes_namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: kubernetes_pod_name +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ template "flagger.fullname" . }}-prometheus + namespace: {{ .Release.Namespace }} + labels: + helm.sh/chart: {{ template "flagger.chart" . }} + app.kubernetes.io/name: {{ template "flagger.name" . }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/instance: {{ .Release.Name }} +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: {{ template "flagger.name" . }}-prometheus + app.kubernetes.io/instance: {{ .Release.Name }} + template: + metadata: + labels: + app.kubernetes.io/name: {{ template "flagger.name" . }}-prometheus + app.kubernetes.io/instance: {{ .Release.Name }} + annotations: + appmesh.k8s.aws/sidecarInjectorWebhook: disabled + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: {{ template "flagger.serviceAccountName" . 
}}-prometheus + containers: + - name: prometheus + image: "docker.io/prom/prometheus:v2.7.1" + imagePullPolicy: IfNotPresent + args: + - '--storage.tsdb.retention=6h' + - '--config.file=/etc/prometheus/prometheus.yml' + ports: + - containerPort: 9090 + name: http + livenessProbe: + httpGet: + path: /-/healthy + port: 9090 + readinessProbe: + httpGet: + path: /-/ready + port: 9090 + resources: + requests: + cpu: 10m + memory: 128Mi + volumeMounts: + - name: config-volume + mountPath: /etc/prometheus + - name: data-volume + mountPath: /prometheus/data + + volumes: + - name: config-volume + configMap: + name: {{ template "flagger.fullname" . }}-prometheus + - name: data-volume + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ template "flagger.fullname" . }}-prometheus + namespace: {{ .Release.Namespace }} + labels: + helm.sh/chart: {{ template "flagger.chart" . }} + app.kubernetes.io/name: {{ template "flagger.name" . }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/instance: {{ .Release.Name }} +spec: + selector: + app.kubernetes.io/name: {{ template "flagger.name" . }}-prometheus + app.kubernetes.io/instance: {{ .Release.Name }} + ports: + - name: http + protocol: TCP + port: 9090 +{{- end }} diff --git a/charts/flagger/templates/rbac.yaml b/charts/flagger/templates/rbac.yaml index 95a2f2498..e03755c08 100644 --- a/charts/flagger/templates/rbac.yaml +++ b/charts/flagger/templates/rbac.yaml @@ -27,6 +27,12 @@ rules: resources: - horizontalpodautoscalers verbs: ["*"] + - apiGroups: + - "extensions" + resources: + - ingresses + - ingresses/status + verbs: ["*"] - apiGroups: - flagger.app resources: diff --git a/charts/flagger/values.yaml b/charts/flagger/values.yaml index 0f1195ab2..0d1a07afc 100644 --- a/charts/flagger/values.yaml +++ b/charts/flagger/values.yaml @@ -7,7 +7,7 @@ image: metricsServer: "http://prometheus:9090" -# accepted values are istio or appmesh (defaults to istio) +# accepted values are istio, appmesh, nginx or supergloo:mesh.namespace (defaults to istio) meshProvider: "" # single namespace restriction @@ -49,3 +49,7 @@ nodeSelector: {} tolerations: [] affinity: {} + +prometheus: + # to be used with AppMesh or nginx ingress + install: false diff --git a/cmd/flagger/main.go b/cmd/flagger/main.go index 9297577e0..0c09b7dc3 100644 --- a/cmd/flagger/main.go +++ b/cmd/flagger/main.go @@ -99,7 +99,7 @@ func main() { canaryInformer := flaggerInformerFactory.Flagger().V1alpha3().Canaries() - logger.Infof("Starting flagger version %s revision %s", version.VERSION, version.REVISION) + logger.Infof("Starting flagger version %s revision %s mesh provider %s", version.VERSION, version.REVISION, meshProvider) ver, err := kubeClient.Discovery().ServerVersion() if err != nil { diff --git a/docs/diagrams/flagger-nginx-overview.png b/docs/diagrams/flagger-nginx-overview.png new file mode 100644 index 000000000..f8dcaadcd Binary files /dev/null and b/docs/diagrams/flagger-nginx-overview.png differ diff --git a/docs/gitbook/usage/nginx-progressive-delivery.md b/docs/gitbook/usage/nginx-progressive-delivery.md new file mode 100644 index 000000000..2fa7a5493 --- /dev/null +++ b/docs/gitbook/usage/nginx-progressive-delivery.md @@ -0,0 +1,355 @@ +# NGNIX Ingress Controller Canary Deployments + +This guide shows you how to use the NGINX ingress controller and Flagger to automate canary deployments and A/B testing. 
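At a high level, Flagger does this by cloning the referenced ingress into a `<ingress-name>-canary` ingress and adjusting the NGINX canary annotations on that clone as the analysis progresses (this is what the ingress router added later in this patch does). The following is a minimal client-go sketch of that mechanism, independent of Flagger; the namespace, ingress name and kubeconfig path are illustrative assumptions, and it uses the same pre-context, extensions/v1beta1 client surface as the rest of this patch:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// out-of-cluster client, assuming the default kubeconfig location
	kubeconfig := filepath.Join(os.Getenv("HOME"), ".kube", "config")
	config, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
	if err != nil {
		panic(err)
	}
	client, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err)
	}

	// the clone Flagger maintains next to your ingress (<ingress-name>-canary)
	ing, err := client.ExtensionsV1beta1().Ingresses("test").Get("podinfo-canary", metav1.GetOptions{})
	if err != nil {
		panic(err)
	}
	if ing.Annotations == nil {
		ing.Annotations = map[string]string{}
	}

	// send 10% of the traffic matched by this ingress to the canary backend
	ing.Annotations["nginx.ingress.kubernetes.io/canary"] = "true"
	ing.Annotations["nginx.ingress.kubernetes.io/canary-weight"] = "10"

	if _, err := client.ExtensionsV1beta1().Ingresses("test").Update(ing); err != nil {
		panic(err)
	}
	fmt.Println("canary weight set to 10")
}
```

Flagger issues the equivalent update on every analysis interval, stepping the weight by `stepWeight` up to `maxWeight`.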
+
+![Flagger NGINX Ingress Controller](https://github.com/raw/weaveworks/flagger/master/docs/diagrams/flagger-nginx-overview.png)
+
+### Prerequisites
+
+Flagger requires a Kubernetes cluster **v1.11** or newer and NGINX ingress **0.24** or newer.
+
+Install NGINX with Helm:
+
+```bash
+helm upgrade -i nginx-ingress stable/nginx-ingress \
+--namespace ingress-nginx \
+--set controller.stats.enabled=true \
+--set controller.metrics.enabled=true
+```
+
+Install Flagger and the Prometheus add-on in the same namespace as NGINX:
+
+```bash
+helm repo add flagger https://flagger.app
+
+helm upgrade -i flagger flagger/flagger \
+--namespace ingress-nginx \
+--set prometheus.install=true \
+--set meshProvider=nginx
+```
+
+Optionally you can enable Slack notifications:
+
+```bash
+helm upgrade -i flagger flagger/flagger \
+--reuse-values \
+--namespace ingress-nginx \
+--set slack.url=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK \
+--set slack.channel=general \
+--set slack.user=flagger
+```
+
+### Bootstrap
+
+Flagger takes a Kubernetes deployment and optionally a horizontal pod autoscaler (HPA),
+then creates a series of objects (Kubernetes deployments, ClusterIP services and canary ingress).
+These objects expose the application outside the cluster and drive the canary analysis and promotion.
+
+Create a test namespace:
+
+```bash
+kubectl create ns test
+```
+
+Create a deployment and a horizontal pod autoscaler:
+
+```bash
+kubectl apply -f ${REPO}/artifacts/nginx/deployment.yaml
+kubectl apply -f ${REPO}/artifacts/nginx/hpa.yaml
+```
+
+Deploy the load testing service to generate traffic during the canary analysis:
+
+```bash
+helm upgrade -i flagger-loadtester flagger/loadtester \
+--namespace=test
+```
+
+Create an ingress definition (replace `app.example.com` with your own domain):
+
+```yaml
+apiVersion: extensions/v1beta1
+kind: Ingress
+metadata:
+  name: podinfo
+  namespace: test
+  labels:
+    app: podinfo
+  annotations:
+    kubernetes.io/ingress.class: "nginx"
+spec:
+  rules:
+    - host: app.example.com
+      http:
+        paths:
+          - backend:
+              serviceName: podinfo
+              servicePort: 9898
+```
+
+Save the above resource as podinfo-ingress.yaml and then apply it:
+
+```bash
+kubectl apply -f ./podinfo-ingress.yaml
+```
+
+Create a canary custom resource (replace `app.example.com` with your own domain):
+
+```yaml
+apiVersion: flagger.app/v1alpha3
+kind: Canary
+metadata:
+  name: podinfo
+  namespace: test
+spec:
+  # deployment reference
+  targetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: podinfo
+  # ingress reference
+  ingressRef:
+    apiVersion: extensions/v1beta1
+    kind: Ingress
+    name: podinfo
+  # HPA reference (optional)
+  autoscalerRef:
+    apiVersion: autoscaling/v2beta1
+    kind: HorizontalPodAutoscaler
+    name: podinfo
+  # the maximum time in seconds for the canary deployment
+  # to make progress before it is rolled back (default 600s)
+  progressDeadlineSeconds: 60
+  service:
+    # container port
+    port: 9898
+  canaryAnalysis:
+    # schedule interval (default 60s)
+    interval: 10s
+    # max number of failed metric checks before rollback
+    threshold: 10
+    # max traffic percentage routed to canary
+    # percentage (0-100)
+    maxWeight: 50
+    # canary increment step
+    # percentage (0-100)
+    stepWeight: 5
+    # NGINX Prometheus checks
+    metrics:
+    - name: request-success-rate
+      # minimum req success rate (non 5xx responses)
+      # percentage (0-100)
+      threshold: 99
+      interval: 1m
+    # load testing (optional)
+    webhooks:
+    - name: load-test
+      url: http://flagger-loadtester.test/
+      timeout: 5s
+      metadata:
type: cmd + cmd: "hey -z 1m -q 10 -c 2 http://app.example.com/" +``` + +Save the above resource as podinfo-canary.yaml and then apply it: + +```bash +kubectl apply -f ./podinfo-canary.yaml +``` + +After a couple of seconds Flagger will create the canary objects: + +```bash +# applied +deployment.apps/podinfo +horizontalpodautoscaler.autoscaling/podinfo +ingresses.extensions/podinfo +canary.flagger.app/podinfo + +# generated +deployment.apps/podinfo-primary +horizontalpodautoscaler.autoscaling/podinfo-primary +service/podinfo +service/podinfo-canary +service/podinfo-primary +ingresses.extensions/podinfo-canary +``` + +### Automated canary promotion + +Flagger implements a control loop that gradually shifts traffic to the canary while measuring key performance indicators +like HTTP requests success rate, requests average duration and pod health. +Based on analysis of the KPIs a canary is promoted or aborted, and the analysis result is published to Slack. + +![Flagger Canary Stages](https://github.com/raw/weaveworks/flagger/master/docs/diagrams/flagger-canary-steps.png) + +Trigger a canary deployment by updating the container image: + +```bash +kubectl -n test set image deployment/podinfo \ +podinfod=quay.io/stefanprodan/podinfo:1.4.1 +``` + +Flagger detects that the deployment revision changed and starts a new rollout: + +```text +kubectl -n test describe canary/podinfo + +Status: + Canary Weight: 0 + Failed Checks: 0 + Phase: Succeeded +Events: + Type Reason Age From Message + ---- ------ ---- ---- ------- + Normal Synced 3m flagger New revision detected podinfo.test + Normal Synced 3m flagger Scaling up podinfo.test + Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available + Normal Synced 3m flagger Advance podinfo.test canary weight 5 + Normal Synced 3m flagger Advance podinfo.test canary weight 10 + Normal Synced 3m flagger Advance podinfo.test canary weight 15 + Normal Synced 2m flagger Advance podinfo.test canary weight 20 + Normal Synced 2m flagger Advance podinfo.test canary weight 25 + Normal Synced 1m flagger Advance podinfo.test canary weight 30 + Normal Synced 1m flagger Advance podinfo.test canary weight 35 + Normal Synced 55s flagger Advance podinfo.test canary weight 40 + Normal Synced 45s flagger Advance podinfo.test canary weight 45 + Normal Synced 35s flagger Advance podinfo.test canary weight 50 + Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test + Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available + Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test +``` + +**Note** that if you apply new changes to the deployment during the canary analysis, Flagger will restart the analysis. + +You can monitor all canaries with: + +```bash +watch kubectl get canaries --all-namespaces + +NAMESPACE NAME STATUS WEIGHT LASTTRANSITIONTIME +test podinfo Progressing 15 2019-05-06T14:05:07Z +prod frontend Succeeded 0 2019-05-05T16:15:07Z +prod backend Failed 0 2019-05-04T17:05:07Z +``` + +### Automated rollback + +During the canary analysis you can generate HTTP 500 errors to test if Flagger pauses and rolls back the faulted version. 
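Concretely, on every interval the controller queries the NGINX success rate for the ingress: a rate below the metric threshold halts the advancement and counts as a failed check, and once the failed checks reach the analysis `threshold` the canary weight drops back to zero. A simplified, self-contained sketch of that gate follows; the real logic lives in the scheduler, and the names and simulated rates here are illustrative:

```go
package main

import "fmt"

// analysis settings from the canary spec above
const (
	successRateThreshold = 99.0 // request-success-rate threshold (%)
	failedChecksLimit    = 10   // canaryAnalysis.threshold
)

// advance returns the next canary weight for one interval: it halts on a
// bad success rate, and signals a rollback once too many checks have failed.
func advance(weight, step, maxWeight int, successRate float64, failedChecks *int) (next int, rollback bool) {
	if successRate < successRateThreshold {
		*failedChecks++
		if *failedChecks >= failedChecksLimit {
			return 0, true // roll back: all traffic returns to the primary
		}
		return weight, false // halt: keep the current weight
	}
	if weight+step <= maxWeight {
		return weight + step, false
	}
	return weight, false // maxWeight reached: ready for promotion
}

func main() {
	weight, failed := 0, 0
	// simulate a rollout where the success rate collapses after three intervals
	rates := []float64{100, 99.5, 99.2, 69.17, 61.39, 55.06, 47.00, 38.08, 30.1, 24.8, 20.0, 15.5, 10.2}
	for _, r := range rates {
		next, rollback := advance(weight, 5, 50, r, &failed)
		if rollback {
			fmt.Printf("success rate %.2f%% < %v%% -> rolling back, failed checks %d\n", r, successRateThreshold, failed)
			return
		}
		weight = next
		fmt.Printf("success rate %.2f%% -> canary weight %d, failed checks %d\n", r, weight, failed)
	}
}
```

With the spec above (stepWeight 5, threshold 10) this reproduces the pattern in the failed rollout shown next: a few advances, a series of halts, then a rollback.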
+
+Trigger another canary deployment:
+
+```bash
+kubectl -n test set image deployment/podinfo \
+podinfod=quay.io/stefanprodan/podinfo:1.4.2
+```
+
+Generate HTTP 500 errors:
+
+```bash
+watch curl http://app.example.com/status/500
+```
+
+When the number of failed checks reaches the canary analysis threshold, the traffic is routed back to the primary,
+the canary is scaled to zero and the rollout is marked as failed.
+
+```text
+kubectl -n test describe canary/podinfo
+
+Status:
+  Canary Weight:  0
+  Failed Checks:  10
+  Phase:          Failed
+Events:
+  Type     Reason  Age   From     Message
+  ----     ------  ----  ----     -------
+  Normal   Synced  3m    flagger  Starting canary deployment for podinfo.test
+  Normal   Synced  3m    flagger  Advance podinfo.test canary weight 5
+  Normal   Synced  3m    flagger  Advance podinfo.test canary weight 10
+  Normal   Synced  3m    flagger  Advance podinfo.test canary weight 15
+  Normal   Synced  3m    flagger  Halt podinfo.test advancement success rate 69.17% < 99%
+  Normal   Synced  2m    flagger  Halt podinfo.test advancement success rate 61.39% < 99%
+  Normal   Synced  2m    flagger  Halt podinfo.test advancement success rate 55.06% < 99%
+  Normal   Synced  2m    flagger  Halt podinfo.test advancement success rate 47.00% < 99%
+  Normal   Synced  2m    flagger  (combined from similar events): Halt podinfo.test advancement success rate 38.08% < 99%
+  Warning  Synced  1m    flagger  Rolling back podinfo.test failed checks threshold reached 10
+  Warning  Synced  1m    flagger  Canary failed! Scaling down podinfo.test
+```
+
+### A/B Testing
+
+Besides weighted routing, Flagger can be configured to route traffic to the canary based on HTTP match conditions.
+In an A/B testing scenario, you'll be using HTTP headers or cookies to target a certain segment of your users.
+This is particularly useful for frontend applications that require session affinity.
+
+![Flagger A/B Testing Stages](https://github.com/raw/weaveworks/flagger/master/docs/diagrams/flagger-abtest-steps.png)
+
+Edit the canary analysis, remove the max/step weight and add the match conditions and iterations:
+
+```yaml
+  canaryAnalysis:
+    interval: 1m
+    threshold: 10
+    iterations: 10
+    match:
+      # curl -H 'X-Canary: insider' http://app.example.com
+      - headers:
+          x-canary:
+            exact: "insider"
+      # curl -b 'canary=always' http://app.example.com
+      - headers:
+          cookie:
+            exact: "canary"
+    metrics:
+    - name: request-success-rate
+      threshold: 99
+      interval: 1m
+    webhooks:
+    - name: load-test
+      url: http://localhost:8888/
+      timeout: 5s
+      metadata:
+        type: cmd
+        cmd: "hey -z 1m -q 10 -c 2 -H 'Cookie: canary=always' http://app.example.com/"
+        logCmdOutput: "true"
+```
+
+The above configuration will run an analysis for ten minutes targeting users that have a `canary` cookie set to `always` or
+those that call the service using the `X-Canary: insider` header.
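Under the hood the router translates those match conditions into NGINX canary annotations on the `podinfo-canary` ingress, so matched requests hit the canary and everyone else stays on the primary. A condensed sketch of that translation follows; the match list is flattened to plain maps here for brevity, whereas the real spec uses `exact` matchers as in the YAML above:

```go
package main

import "fmt"

// match conditions from the canaryAnalysis above, condensed to plain maps
var match = []map[string]string{
	{"x-canary": "insider"}, // header name -> exact value
	{"cookie": "canary"},    // the special "cookie" key selects canary-by-cookie
}

// abAnnotations mirrors how the ingress router builds the NGINX
// annotations for an A/B testing run.
func abAnnotations(match []map[string]string) map[string]string {
	res := map[string]string{
		"nginx.ingress.kubernetes.io/canary":        "true",
		"nginx.ingress.kubernetes.io/canary-weight": "0",
	}
	for _, m := range match {
		for name, exact := range m {
			if name == "cookie" {
				// NGINX routes requests to the canary when this cookie is set to "always"
				res["nginx.ingress.kubernetes.io/canary-by-cookie"] = exact
			} else {
				res["nginx.ingress.kubernetes.io/canary-by-header"] = name
				res["nginx.ingress.kubernetes.io/canary-by-header-value"] = exact
			}
		}
	}
	return res
}

func main() {
	for k, v := range abAnnotations(match) {
		fmt.Printf("%s: %s\n", k, v)
	}
}
```

Because the routing decision is per-request rather than weighted, Flagger reports the canary at full weight during an A/B run, which is why the status below shows iterations instead of canary weights.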
+ +Trigger a canary deployment by updating the container image: + +```bash +kubectl -n test set image deployment/podinfo \ +podinfod=quay.io/stefanprodan/podinfo:1.5.0 +``` + +Flagger detects that the deployment revision changed and starts the A/B testing: + +```text +kubectl -n test describe canary/podinfo + +Status: + Failed Checks: 0 + Phase: Succeeded +Events: + Type Reason Age From Message + ---- ------ ---- ---- ------- + Normal Synced 3m flagger New revision detected podinfo.test + Normal Synced 3m flagger Scaling up podinfo.test + Warning Synced 3m flagger Waiting for podinfo.test rollout to finish: 0 of 1 updated replicas are available + Normal Synced 3m flagger Advance podinfo.test canary iteration 1/10 + Normal Synced 3m flagger Advance podinfo.test canary iteration 2/10 + Normal Synced 3m flagger Advance podinfo.test canary iteration 3/10 + Normal Synced 2m flagger Advance podinfo.test canary iteration 4/10 + Normal Synced 2m flagger Advance podinfo.test canary iteration 5/10 + Normal Synced 1m flagger Advance podinfo.test canary iteration 6/10 + Normal Synced 1m flagger Advance podinfo.test canary iteration 7/10 + Normal Synced 55s flagger Advance podinfo.test canary iteration 8/10 + Normal Synced 45s flagger Advance podinfo.test canary iteration 9/10 + Normal Synced 35s flagger Advance podinfo.test canary iteration 10/10 + Normal Synced 25s flagger Copying podinfo.test template spec to podinfo-primary.test + Warning Synced 15s flagger Waiting for podinfo-primary.test rollout to finish: 1 of 2 updated replicas are available + Normal Synced 5s flagger Promotion completed! Scaling down podinfo.test +``` + diff --git a/pkg/apis/flagger/v1alpha3/types.go b/pkg/apis/flagger/v1alpha3/types.go index 4b0b9d129..e9eb9a49c 100755 --- a/pkg/apis/flagger/v1alpha3/types.go +++ b/pkg/apis/flagger/v1alpha3/types.go @@ -52,6 +52,10 @@ type CanarySpec struct { // +optional AutoscalerRef *hpav1.CrossVersionObjectReference `json:"autoscalerRef,omitempty"` + // reference to NGINX ingress resource + // +optional + IngressRef *hpav1.CrossVersionObjectReference `json:"ingressRef,omitempty"` + // virtual service spec Service CanaryService `json:"service"` diff --git a/pkg/apis/flagger/v1alpha3/zz_generated.deepcopy.go b/pkg/apis/flagger/v1alpha3/zz_generated.deepcopy.go index 4bb9551cc..dc710448a 100644 --- a/pkg/apis/flagger/v1alpha3/zz_generated.deepcopy.go +++ b/pkg/apis/flagger/v1alpha3/zz_generated.deepcopy.go @@ -205,6 +205,11 @@ func (in *CanarySpec) DeepCopyInto(out *CanarySpec) { *out = new(v1.CrossVersionObjectReference) **out = **in } + if in.IngressRef != nil { + in, out := &in.IngressRef, &out.IngressRef + *out = new(v1.CrossVersionObjectReference) + **out = **in + } in.Service.DeepCopyInto(&out.Service) in.CanaryAnalysis.DeepCopyInto(&out.CanaryAnalysis) if in.ProgressDeadlineSeconds != nil { diff --git a/pkg/controller/scheduler.go b/pkg/controller/scheduler.go index 5c16cb4e3..0fc069789 100644 --- a/pkg/controller/scheduler.go +++ b/pkg/controller/scheduler.go @@ -632,6 +632,41 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool { } } + // NGINX checks + if c.meshProvider == "nginx" { + if metric.Name == "request-success-rate" { + val, err := c.observer.GetNginxSuccessRate(r.Spec.IngressRef.Name, r.Namespace, metric.Name, metric.Interval) + if err != nil { + if strings.Contains(err.Error(), "no values found") { + c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic", + metric.Name, r.Spec.TargetRef.Name, 
r.Namespace) + } else { + c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observer.GetMetricsServer(), err) + } + return false + } + if float64(metric.Threshold) > val { + c.recordEventWarningf(r, "Halt %s.%s advancement success rate %.2f%% < %v%%", + r.Name, r.Namespace, val, metric.Threshold) + return false + } + } + + if metric.Name == "request-duration" { + val, err := c.observer.GetNginxRequestDuration(r.Spec.IngressRef.Name, r.Namespace, metric.Name, metric.Interval) + if err != nil { + c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observer.GetMetricsServer(), err) + return false + } + t := time.Duration(metric.Threshold) * time.Millisecond + if val > t { + c.recordEventWarningf(r, "Halt %s.%s advancement request duration %v > %v", + r.Name, r.Namespace, val, t) + return false + } + } + } + // custom checks if metric.Query != "" { val, err := c.observer.GetScalar(metric.Query) diff --git a/pkg/metrics/nginx.go b/pkg/metrics/nginx.go new file mode 100644 index 000000000..7c6eb56e7 --- /dev/null +++ b/pkg/metrics/nginx.go @@ -0,0 +1,122 @@ +package metrics + +import ( + "fmt" + "net/url" + "strconv" + "time" +) + +const nginxSuccessRateQuery = ` +sum(rate( +nginx_ingress_controller_requests{kubernetes_namespace="{{ .Namespace }}", +ingress="{{ .Name }}", +status!~"5.*"} +[{{ .Interval }}])) +/ +sum(rate( +nginx_ingress_controller_requests{kubernetes_namespace="{{ .Namespace }}", +ingress="{{ .Name }}"} +[{{ .Interval }}])) +* 100 +` + +// GetNginxSuccessRate returns the requests success rate (non 5xx) using nginx_ingress_controller_requests metric +func (c *Observer) GetNginxSuccessRate(name string, namespace string, metric string, interval string) (float64, error) { + if c.metricsServer == "fake" { + return 100, nil + } + + meta := struct { + Name string + Namespace string + Interval string + }{ + name, + namespace, + interval, + } + + query, err := render(meta, nginxSuccessRateQuery) + if err != nil { + return 0, err + } + + var rate *float64 + querySt := url.QueryEscape(query) + result, err := c.queryMetric(querySt) + if err != nil { + return 0, err + } + + for _, v := range result.Data.Result { + metricValue := v.Value[1] + switch metricValue.(type) { + case string: + f, err := strconv.ParseFloat(metricValue.(string), 64) + if err != nil { + return 0, err + } + rate = &f + } + } + if rate == nil { + return 0, fmt.Errorf("no values found for metric %s", metric) + } + return *rate, nil +} + +const nginxRequestDurationQuery = ` +sum(rate( +nginx_ingress_controller_ingress_upstream_latency_seconds_sum{kubernetes_namespace="{{ .Namespace }}", +ingress="{{ .Name }}"}[{{ .Interval }}])) +/ +sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_count{kubernetes_namespace="{{ .Namespace }}", +ingress="{{ .Name }}"}[{{ .Interval }}])) * 1000 +` + +// GetNginxRequestDuration returns the avg requests latency using nginx_ingress_controller_ingress_upstream_latency_seconds_sum metric +func (c *Observer) GetNginxRequestDuration(name string, namespace string, metric string, interval string) (time.Duration, error) { + if c.metricsServer == "fake" { + return 1, nil + } + + meta := struct { + Name string + Namespace string + Interval string + }{ + name, + namespace, + interval, + } + + query, err := render(meta, nginxRequestDurationQuery) + if err != nil { + return 0, err + } + + var rate *float64 + querySt := url.QueryEscape(query) + result, err := c.queryMetric(querySt) + if err != nil { + return 0, err + } + + for _, v := range result.Data.Result { + 
metricValue := v.Value[1] + switch metricValue.(type) { + case string: + f, err := strconv.ParseFloat(metricValue.(string), 64) + if err != nil { + return 0, err + } + rate = &f + } + } + if rate == nil { + return 0, fmt.Errorf("no values found for metric %s", metric) + } + ms := time.Duration(int64(*rate)) * time.Millisecond + return ms, nil +} diff --git a/pkg/metrics/nginx_test.go b/pkg/metrics/nginx_test.go new file mode 100644 index 000000000..e2a93c5f1 --- /dev/null +++ b/pkg/metrics/nginx_test.go @@ -0,0 +1,51 @@ +package metrics + +import ( + "testing" +) + +func Test_NginxSuccessRateQueryRender(t *testing.T) { + meta := struct { + Name string + Namespace string + Interval string + }{ + "podinfo", + "nginx", + "1m", + } + + query, err := render(meta, nginxSuccessRateQuery) + if err != nil { + t.Fatal(err) + } + + expected := `sum(rate(nginx_ingress_controller_requests{kubernetes_namespace="nginx",ingress="podinfo",status!~"5.*"}[1m])) / sum(rate(nginx_ingress_controller_requests{kubernetes_namespace="nginx",ingress="podinfo"}[1m])) * 100` + + if query != expected { + t.Errorf("\nGot %s \nWanted %s", query, expected) + } +} + +func Test_NginxRequestDurationQueryRender(t *testing.T) { + meta := struct { + Name string + Namespace string + Interval string + }{ + "podinfo", + "nginx", + "1m", + } + + query, err := render(meta, nginxRequestDurationQuery) + if err != nil { + t.Fatal(err) + } + + expected := `sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_sum{kubernetes_namespace="nginx",ingress="podinfo"}[1m])) /sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_count{kubernetes_namespace="nginx",ingress="podinfo"}[1m])) * 1000` + + if query != expected { + t.Errorf("\nGot %s \nWanted %s", query, expected) + } +} diff --git a/pkg/router/factory.go b/pkg/router/factory.go index 9d8f66ecf..ab836d318 100644 --- a/pkg/router/factory.go +++ b/pkg/router/factory.go @@ -41,9 +41,14 @@ func (factory *Factory) KubernetesRouter(label string) *KubernetesRouter { } } -// MeshRouter returns a service mesh router (Istio or AppMesh) +// MeshRouter returns a service mesh router func (factory *Factory) MeshRouter(provider string) Interface { switch { + case provider == "nginx": + return &IngressRouter{ + logger: factory.logger, + kubeClient: factory.kubeClient, + } case provider == "appmesh": return &AppMeshRouter{ logger: factory.logger, diff --git a/pkg/router/ingress.go b/pkg/router/ingress.go new file mode 100644 index 000000000..ffc663b1f --- /dev/null +++ b/pkg/router/ingress.go @@ -0,0 +1,231 @@ +package router + +import ( + "fmt" + "github.com/google/go-cmp/cmp" + flaggerv1 "github.com/weaveworks/flagger/pkg/apis/flagger/v1alpha3" + "go.uber.org/zap" + "k8s.io/api/extensions/v1beta1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/kubernetes" + "strconv" + "strings" +) + +type IngressRouter struct { + kubeClient kubernetes.Interface + logger *zap.SugaredLogger +} + +func (i *IngressRouter) Reconcile(canary *flaggerv1.Canary) error { + if canary.Spec.IngressRef == nil || canary.Spec.IngressRef.Name == "" { + return fmt.Errorf("ingress selector is empty") + } + + targetName := canary.Spec.TargetRef.Name + canaryName := fmt.Sprintf("%s-canary", targetName) + canaryIngressName := fmt.Sprintf("%s-canary", canary.Spec.IngressRef.Name) + + ingress, err := i.kubeClient.ExtensionsV1beta1().Ingresses(canary.Namespace).Get(canary.Spec.IngressRef.Name, metav1.GetOptions{}) 
+ if err != nil { + return err + } + + ingressClone := ingress.DeepCopy() + + // change backend to -canary + backendExists := false + for k, v := range ingressClone.Spec.Rules { + for x, y := range v.HTTP.Paths { + if y.Backend.ServiceName == targetName { + ingressClone.Spec.Rules[k].HTTP.Paths[x].Backend.ServiceName = canaryName + backendExists = true + break + } + } + } + + if !backendExists { + return fmt.Errorf("backend %s not found in ingress %s", targetName, canary.Spec.IngressRef.Name) + } + + canaryIngress, err := i.kubeClient.ExtensionsV1beta1().Ingresses(canary.Namespace).Get(canaryIngressName, metav1.GetOptions{}) + + if errors.IsNotFound(err) { + ing := &v1beta1.Ingress{ + ObjectMeta: metav1.ObjectMeta{ + Name: canaryIngressName, + Namespace: canary.Namespace, + OwnerReferences: []metav1.OwnerReference{ + *metav1.NewControllerRef(canary, schema.GroupVersionKind{ + Group: flaggerv1.SchemeGroupVersion.Group, + Version: flaggerv1.SchemeGroupVersion.Version, + Kind: flaggerv1.CanaryKind, + }), + }, + Annotations: i.makeAnnotations(ingressClone.Annotations), + Labels: ingressClone.Labels, + }, + Spec: ingressClone.Spec, + } + + _, err := i.kubeClient.ExtensionsV1beta1().Ingresses(canary.Namespace).Create(ing) + if err != nil { + return err + } + + i.logger.With("canary", fmt.Sprintf("%s.%s", canary.Name, canary.Namespace)). + Infof("Ingress %s.%s created", ing.GetName(), canary.Namespace) + return nil + } + + if err != nil { + return fmt.Errorf("ingress %s query error %v", canaryIngressName, err) + } + + if diff := cmp.Diff(ingressClone.Spec, canaryIngress.Spec); diff != "" { + iClone := canaryIngress.DeepCopy() + iClone.Spec = ingressClone.Spec + + _, err := i.kubeClient.ExtensionsV1beta1().Ingresses(canary.Namespace).Update(iClone) + if err != nil { + return fmt.Errorf("ingress %s update error %v", canaryIngressName, err) + } + + i.logger.With("canary", fmt.Sprintf("%s.%s", canary.Name, canary.Namespace)). 
+ Infof("Ingress %s updated", canaryIngressName) + } + + return nil +} + +func (i *IngressRouter) GetRoutes(canary *flaggerv1.Canary) ( + primaryWeight int, + canaryWeight int, + err error, +) { + canaryIngressName := fmt.Sprintf("%s-canary", canary.Spec.IngressRef.Name) + canaryIngress, err := i.kubeClient.ExtensionsV1beta1().Ingresses(canary.Namespace).Get(canaryIngressName, metav1.GetOptions{}) + if err != nil { + return 0, 0, err + } + + // A/B testing + if len(canary.Spec.CanaryAnalysis.Match) > 0 { + for k := range canaryIngress.Annotations { + if k == "nginx.ingress.kubernetes.io/canary-by-cookie" || k == "nginx.ingress.kubernetes.io/canary-by-header" { + return 0, 100, nil + } + } + } + + // Canary + for k, v := range canaryIngress.Annotations { + if k == "nginx.ingress.kubernetes.io/canary-weight" { + val, err := strconv.Atoi(v) + if err != nil { + return 0, 0, err + } + + canaryWeight = val + break + } + } + + primaryWeight = 100 - canaryWeight + return +} + +func (i *IngressRouter) SetRoutes( + canary *flaggerv1.Canary, + primaryWeight int, + canaryWeight int, +) error { + canaryIngressName := fmt.Sprintf("%s-canary", canary.Spec.IngressRef.Name) + canaryIngress, err := i.kubeClient.ExtensionsV1beta1().Ingresses(canary.Namespace).Get(canaryIngressName, metav1.GetOptions{}) + if err != nil { + return err + } + + iClone := canaryIngress.DeepCopy() + + // A/B testing + if len(canary.Spec.CanaryAnalysis.Match) > 0 { + cookie := "" + header := "" + headerValue := "" + for _, m := range canary.Spec.CanaryAnalysis.Match { + for k, v := range m.Headers { + if k == "cookie" { + cookie = v.Exact + } else { + header = k + headerValue = v.Exact + } + } + } + + iClone.Annotations = i.makeHeaderAnnotations(iClone.Annotations, header, headerValue, cookie) + } else { + // canary + iClone.Annotations["nginx.ingress.kubernetes.io/canary-weight"] = fmt.Sprintf("%v", canaryWeight) + } + + // toggle canary + if canaryWeight > 0 { + iClone.Annotations["nginx.ingress.kubernetes.io/canary"] = "true" + } else { + iClone.Annotations = i.makeAnnotations(iClone.Annotations) + } + + _, err = i.kubeClient.ExtensionsV1beta1().Ingresses(canary.Namespace).Update(iClone) + if err != nil { + return fmt.Errorf("ingress %s update error %v", canaryIngressName, err) + } + + return nil +} + +func (i *IngressRouter) makeAnnotations(annotations map[string]string) map[string]string { + res := make(map[string]string) + for k, v := range annotations { + if !strings.Contains(k, "nginx.ingress.kubernetes.io/canary") && + !strings.Contains(k, "kubectl.kubernetes.io/last-applied-configuration") { + res[k] = v + } + } + + res["nginx.ingress.kubernetes.io/canary"] = "false" + res["nginx.ingress.kubernetes.io/canary-weight"] = "0" + + return res +} + +func (i *IngressRouter) makeHeaderAnnotations(annotations map[string]string, + header string, headerValue string, cookie string) map[string]string { + res := make(map[string]string) + for k, v := range annotations { + if !strings.Contains(v, "nginx.ingress.kubernetes.io/canary") { + res[k] = v + } + } + + res["nginx.ingress.kubernetes.io/canary"] = "true" + res["nginx.ingress.kubernetes.io/canary-weight"] = "0" + + if cookie != "" { + res["nginx.ingress.kubernetes.io/canary-by-cookie"] = cookie + } + + if header != "" { + res["nginx.ingress.kubernetes.io/canary-by-header"] = header + } + + if headerValue != "" { + res["nginx.ingress.kubernetes.io/canary-by-header-value"] = headerValue + } + + return res +}
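Finally, a note on how the metric checks reach Prometheus: `GetNginxSuccessRate` renders the PromQL template and hands it to the observer's `queryMetric` helper, which is not shown in this diff. The sketch below performs an equivalent call against the standard Prometheus HTTP API; the Prometheus service name (as created by the chart when `prometheus.install=true`, which depends on the release name) and the ingress/namespace/interval values are illustrative assumptions:

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"strconv"
)

// same shape as the rendered nginxSuccessRateQuery for ingress "podinfo"
// in namespace "test" over a 1m window
const query = `sum(rate(nginx_ingress_controller_requests{kubernetes_namespace="test",ingress="podinfo",status!~"5.*"}[1m])) / sum(rate(nginx_ingress_controller_requests{kubernetes_namespace="test",ingress="podinfo"}[1m])) * 100`

// minimal subset of the Prometheus /api/v1/query response
type promResponse struct {
	Data struct {
		Result []struct {
			Value []interface{} `json:"value"` // [timestamp, "value"]
		} `json:"result"`
	} `json:"data"`
}

func main() {
	// flagger-prometheus is the in-cluster service installed by the chart
	promURL := "http://flagger-prometheus:9090/api/v1/query?query=" + url.QueryEscape(query)

	resp, err := http.Get(promURL)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var res promResponse
	if err := json.NewDecoder(resp.Body).Decode(&res); err != nil {
		panic(err)
	}
	if len(res.Data.Result) == 0 {
		fmt.Println("no values found: the ingress is not receiving traffic yet")
		return
	}

	rate, err := strconv.ParseFloat(res.Data.Result[0].Value[1].(string), 64)
	if err != nil {
		panic(err)
	}
	fmt.Printf("request success rate: %.2f%%\n", rate)
}
```

An empty result set corresponds to the "no values found" warning emitted by the scheduler when the target is not yet receiving traffic.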