From f7388c5c5a432a9e24a6c2d65315ab25d509183b Mon Sep 17 00:00:00 2001 From: sanposhiho <44139130+sanposhiho@users.noreply.github.com> Date: Tue, 2 Nov 2021 07:23:47 +0900 Subject: [PATCH] KEP-3022: Tuning the number of domains on Pod Topology Spread --- keps/prod-readiness/sig-scheduling/3022.yaml | 6 + .../README.md | 513 ++++++++++++++++++ .../kep.yaml | 28 + 3 files changed, 547 insertions(+) create mode 100644 keps/prod-readiness/sig-scheduling/3022.yaml create mode 100644 keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/README.md create mode 100644 keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/kep.yaml diff --git a/keps/prod-readiness/sig-scheduling/3022.yaml b/keps/prod-readiness/sig-scheduling/3022.yaml new file mode 100644 index 000000000000..703595b4ee23 --- /dev/null +++ b/keps/prod-readiness/sig-scheduling/3022.yaml @@ -0,0 +1,6 @@ +# The KEP must have an approver from the +# "prod-readiness-approvers" group +# of http://git.k8s.io/enhancements/OWNERS_ALIASES +kep-number: 3022 +alpha: + approver: "@wojtek-t" diff --git a/keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/README.md b/keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/README.md new file mode 100644 index 000000000000..a88bb5e22538 --- /dev/null +++ b/keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/README.md @@ -0,0 +1,513 @@ +# KEP-3022: Tuning the number of domains on Pod Topology Spread + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) +- [Proposal](#proposal) + - [User Stories](#user-stories) + - [Story 1](#story-1) + - [Story 2](#story-2) +- [Design Details](#design-details) + - [API](#api) + - [How user stories are addressed](#how-user-stories-are-addressed) + - [Test Plan](#test-plan) + - [Graduation Criteria](#graduation-criteria) + - [Alpha (v1.24):](#alpha-v124) + - [Beta (v1.25):](#beta-v125) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + +Items marked with (R) are required _prior to targeting to a milestone / release_. 
- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR)
- [ ] (R) KEP approvers have approved the KEP status as `implementable`
- [ ] (R) Design details are appropriately documented
- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors)
  - [ ] e2e Tests for all Beta API Operations (endpoints)
  - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
  - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free
- [ ] (R) Graduation criteria is in place
  - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
- [ ] (R) Production readiness review completed
- [ ] (R) Production readiness review approved
- [ ] "Implementation History" section is up-to-date for milestone
- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io]
- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes

[kubernetes.io]: https://kubernetes.io/
[kubernetes/enhancements]: https://git.k8s.io/enhancements
[kubernetes/kubernetes]: https://git.k8s.io/kubernetes
[kubernetes/website]: https://git.k8s.io/website

## Summary

With [Pod Topology Spread](/keps/sig-scheduling/895-pod-topology-spread), users can define rules for spreading Pods across their cluster among failure-domains.
We propose adding the parameters `minDomains` and `maxDomains` to control the number of domains on Pod Topology Spread.

## Motivation

Pod Topology Spread has the [`maxSkew` parameter](https://github.com/kubernetes/enhancements/tree/11a976c74e1358efccf251d4c7611d05ce27feb3/keps/sig-scheduling/895-pod-topology-spread#maxskew), which controls the degree to which Pods may be unevenly distributed.
However, there is no way to control the number of domains over which Pods should be spread.

### Goals

Users can define the maximum number of domains with the `maxDomains` parameter.

When the number of Pods is more than `maxDomains`,

- when `whenUnsatisfiable` equals `DoNotSchedule`, the scheduler schedules Pods only onto Nodes that already have matching Pods,
  i.e. the scheduler filters out Nodes that don't have matching Pods in the `Filter` phase.
- when `whenUnsatisfiable` equals `ScheduleAnyway`, the scheduler prefers Nodes that have matching Pods,
  i.e. the scheduler gives a higher score to Nodes that have matching Pods in the `Score` phase.

Users can also define the minimum number of domains with the `minDomains` parameter.

When the number of Pods is less than `minDomains`,

- when `whenUnsatisfiable` equals `DoNotSchedule`, the scheduler doesn't schedule a matching Pod onto Nodes that already have matching Pods,
  i.e. the scheduler filters out Nodes that have matching Pods in the `Filter` phase.
- when `whenUnsatisfiable` equals `ScheduleAnyway`, the scheduler prefers Nodes that don't have matching Pods,
  i.e. the scheduler gives a higher score to Nodes that don't have matching Pods in the `Score` phase.

## Proposal

### User Stories

#### Story 1

I am using the cluster autoscaler and I want to force spreading a Deployment over at least 10 Nodes.
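For illustration, here is a minimal Deployment sketch of this story, assuming the `minDomains` field proposed in [Design Details](#design-details) below. The Deployment name, the container image, and `maxSkew: 1` are placeholders, and `kubernetes.io/hostname` is used as the per-node topology key (the shorter examples later in this KEP simply use `node`):

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-service
spec:
  replicas: 10
  selector:
    matchLabels:
      foo: bar
  template:
    metadata:
      labels:
        foo: bar
    spec:
      topologySpreadConstraints:
        - maxSkew: 1
          minDomains: 10          # proposed field; not yet part of the API
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: DoNotSchedule
          labelSelector:
            matchLabels:
              foo: bar
      containers:
        - name: app
          image: registry.k8s.io/pause:3.9
```

With `whenUnsatisfiable: DoNotSchedule`, replicas that cannot be placed on a new Node stay pending until enough Nodes are available.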
#### Story 2

I am using preferred topology spread and I have a large cluster, but I don't want to spread the service over many Nodes; I prefer to spread it over at most 4 Nodes before spreading to others.

## Design Details

### API

New parameters called `MaxDomains` and `MinDomains` are introduced to `TopologySpreadConstraint`.

```go
type TopologySpreadConstraint struct {
	// ...existing fields...

	// MaxDomains describes the maximum number of domains.
	// When the number of Pods is more than `maxDomains`,
	// - when `whenUnsatisfiable` equals `DoNotSchedule`, the scheduler schedules Pods only onto Nodes that already have matching Pods,
	//   i.e. the scheduler filters out Nodes that don't have matching Pods in the `Filter` phase.
	// - when `whenUnsatisfiable` equals `ScheduleAnyway`, the scheduler prefers Nodes that have matching Pods,
	//   i.e. the scheduler gives a higher score to Nodes that have matching Pods in the `Score` phase.
	// +optional
	MaxDomains int32
	// MinDomains describes the minimum number of domains.
	// When the number of Pods is less than `minDomains`,
	// - when `whenUnsatisfiable` equals `DoNotSchedule`,
	//   the scheduler doesn't schedule a matching Pod onto Nodes that already have matching Pods,
	//   i.e. the scheduler filters out Nodes that have matching Pods in the `Filter` phase.
	// - when `whenUnsatisfiable` equals `ScheduleAnyway`, the scheduler prefers Nodes that don't have matching Pods,
	//   i.e. the scheduler gives a higher score to Nodes that don't have matching Pods in the `Score` phase.
	// +optional
	MinDomains int32
}
```

### How user stories are addressed

For story 1, users can set `minDomains` together with `whenUnsatisfiable: DoNotSchedule`.

```yaml
spec:
  topologySpreadConstraints:
  - minDomains: 10
    topologyKey: node
    whenUnsatisfiable: DoNotSchedule
    labelSelector:
      matchLabels:
        foo: bar
```

With this, until 10 Pods have been scheduled, the scheduler places each Pod on a different Node.
This means that, for example, if there are only 5 schedulable Nodes, at most 5 Pods can be scheduled.

---

For story 2, users can set `maxDomains` together with `whenUnsatisfiable: ScheduleAnyway`.

```yaml
spec:
  topologySpreadConstraints:
  - maxDomains: 4
    topologyKey: node
    whenUnsatisfiable: ScheduleAnyway
    labelSelector:
      matchLabels:
        foo: bar
```

With this, when it is not possible for some reason to schedule a Pod onto one of the 4 Nodes that already have matching Pods,
the scheduler still prefers to spread the remaining Pods over as few additional Nodes as possible.

### Test Plan

To ensure this feature is rolled out with high quality, the following tests are mandatory:

- **Unit Tests**: All core changes must be covered by unit tests.
- **Integration Tests / E2E Tests:** All use cases discussed in this KEP must
  be covered by either integration tests or e2e tests.
- **Benchmark Tests:** We can tolerate a slight performance overhead for users of
  this feature, but it shouldn't impose a penalty on users who are not using it.
  We will verify this by designing benchmark tests.

### Graduation Criteria

#### Alpha (v1.24):

- [ ] Add parameters to `TopologySpreadConstraint` and feature gating.
- [ ] Score extension point implementation.
- [ ] Filter extension point implementation.
- [ ] Unit test cases mentioned in the [Test Plan](#test-plan).

#### Beta (v1.25):

- [ ] This feature will be enabled by default as a Beta feature in v1.25.
- [ ] Add necessary integration/end-to-end tests.
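To make the `Filter`-phase behavior described in the [API](#api) section concrete, the following is a rough, self-contained sketch of how a `DoNotSchedule` filter decision could take the proposed `minDomains` and `maxDomains` into account. This is only an illustration, not the actual kube-scheduler plugin code; all type and function names are hypothetical.

```go
package main

import "fmt"

// domainFilterArgs is a hypothetical, simplified view of the state the
// PodTopologySpread Filter phase would already have available.
type domainFilterArgs struct {
	// podsPerDomain maps each topology domain (e.g. a node name when the
	// topologyKey is per-node) to the number of matching Pods in it.
	podsPerDomain map[string]int
	minDomains    int32
	maxDomains    int32
}

// feasibleWithDoNotSchedule reports whether placing the incoming Pod in
// candidateDomain would be allowed under the proposed semantics when
// whenUnsatisfiable is DoNotSchedule. A zero value means "not set".
func feasibleWithDoNotSchedule(args domainFilterArgs, candidateDomain string) bool {
	domainsWithPods := 0
	for _, n := range args.podsPerDomain {
		if n > 0 {
			domainsWithPods++
		}
	}
	hasPods := args.podsPerDomain[candidateDomain] > 0

	// minDomains: while fewer than minDomains domains have matching Pods,
	// filter out domains that already have matching Pods so the incoming
	// Pod is forced into a new domain.
	if args.minDomains > 0 && int32(domainsWithPods) < args.minDomains && hasPods {
		return false
	}
	// maxDomains: once maxDomains domains already have matching Pods,
	// filter out domains without matching Pods so no new domain is used.
	if args.maxDomains > 0 && int32(domainsWithPods) >= args.maxDomains && !hasPods {
		return false
	}
	return true
}

func main() {
	args := domainFilterArgs{
		podsPerDomain: map[string]int{"node-a": 1, "node-b": 0},
		minDomains:    3,
	}
	fmt.Println(feasibleWithDoNotSchedule(args, "node-a")) // false: node-a already has a matching Pod
	fmt.Println(feasibleWithDoNotSchedule(args, "node-b")) // true: the Pod spreads to a new domain
}
```

The `ScheduleAnyway` cases would follow the same counting logic, but expressed as a scoring preference in the `Score` phase rather than a hard filter.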
## Production Readiness Review Questionnaire

### Feature Enablement and Rollback

###### How can this feature be enabled / disabled in a live cluster?

- [x] Feature gate (also fill in values in `kep.yaml`)
  - Feature gate name: `TuneTheNumberOfDomainsOnPodTopologySpread`
  - Components depending on the feature gate: kube-scheduler

###### Does enabling the feature change any default behavior?

No.

###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)?

The feature can be disabled in Alpha and Beta versions.
For Stable versions, users can choose to opt out by not setting the
`pod.spec.topologySpreadConstraints.maxDomains` and `pod.spec.topologySpreadConstraints.minDomains` fields.

###### What happens if we reenable the feature if it was previously rolled back?

N/A.

###### Are there any tests for feature enablement/disablement?

No.

### Rollout, Upgrade and Rollback Planning

###### How can a rollout or rollback fail? Can it impact already running workloads?

###### What specific metrics should inform a rollback?

###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?

###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.?

### Monitoring Requirements

###### How can an operator determine if the feature is in use by workloads?

###### How can someone using this feature know that it is working for their instance?

- [ ] Events
  - Event Reason:
- [ ] API .status
  - Condition name:
  - Other field:
- [ ] Other (treat as last resort)
  - Details:

###### What are the reasonable SLOs (Service Level Objectives) for the enhancement?

###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service?

- [ ] Metrics
  - Metric name:
  - [Optional] Aggregation method:
  - Components exposing the metric:
- [ ] Other (treat as last resort)
  - Details:

###### Are there any missing metrics that would be useful to have to improve observability of this feature?

### Dependencies

###### Does this feature depend on any specific services running in the cluster?

### Scalability

###### Will enabling / using this feature result in any new API calls?

###### Will enabling / using this feature result in introducing new API types?

###### Will enabling / using this feature result in any new calls to the cloud provider?

###### Will enabling / using this feature result in increasing size or count of the existing API objects?

###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs?

###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components?

### Troubleshooting

###### How does this feature react if the API server and/or etcd is unavailable?

###### What are other known failure modes?

###### What steps should be taken if SLOs are not being met to determine the problem?
+ +## Implementation History + + + +## Drawbacks + + + +## Alternatives + + + +## Infrastructure Needed (Optional) + + diff --git a/keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/kep.yaml b/keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/kep.yaml new file mode 100644 index 000000000000..79f3f410ecfa --- /dev/null +++ b/keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/kep.yaml @@ -0,0 +1,28 @@ +title: Tuning the number of domains in PodTopologySpread +kep-number: 3022 +authors: + - "@sanposhiho" +owning-sig: sig-scheduling +status: provisional +creation-date: 2021-10-28 +reviewers: + - "@alculquicondor" + - "@ahg-g" + - "@Huang-Wei" + - "@damemi" +approvers: + - "@alculquicondor" + - "@ahg-g" + - "@Huang-Wei" + - "@damemi" +stage: alpha +latest-milestone: "v1.24" +milestone: + alpha: "v1.24" + beta: "v1.25" + stable: "v1.26" +disable-supported: true +feature-gates: + - name: TuneTheNumberOfDomainsOnPodTopologySpread + components: + - kube-scheduler