From f14bfee34358d8e946a5b109e73b7779d531674d Mon Sep 17 00:00:00 2001
From: sanposhiho <44139130+sanposhiho@users.noreply.github.com>
Date: Tue, 2 Nov 2021 07:23:47 +0900
Subject: [PATCH] KEP-3022: Tuning the number of domains on Pod Topology Spread

---
 keps/prod-readiness/sig-scheduling/3022.yaml |   6 +
 .../README.md                                | 522 ++++++++++++++++++
 .../kep.yaml                                 |  27 +
 3 files changed, 555 insertions(+)
 create mode 100644 keps/prod-readiness/sig-scheduling/3022.yaml
 create mode 100644 keps/sig-scheduling/3022-min-domains-in-pod-topology-spread/README.md
 create mode 100644 keps/sig-scheduling/3022-min-domains-in-pod-topology-spread/kep.yaml

diff --git a/keps/prod-readiness/sig-scheduling/3022.yaml b/keps/prod-readiness/sig-scheduling/3022.yaml
new file mode 100644
index 00000000000..703595b4ee2
--- /dev/null
+++ b/keps/prod-readiness/sig-scheduling/3022.yaml
@@ -0,0 +1,6 @@
# The KEP must have an approver from the
# "prod-readiness-approvers" group
# of http://git.k8s.io/enhancements/OWNERS_ALIASES
kep-number: 3022
alpha:
  approver: "@wojtek-t"

diff --git a/keps/sig-scheduling/3022-min-domains-in-pod-topology-spread/README.md b/keps/sig-scheduling/3022-min-domains-in-pod-topology-spread/README.md
new file mode 100644
index 00000000000..79d8c8e08af
--- /dev/null
+++ b/keps/sig-scheduling/3022-min-domains-in-pod-topology-spread/README.md
@@ -0,0 +1,522 @@
# KEP-3022: min domains in Pod Topology Spread

- [Release Signoff Checklist](#release-signoff-checklist)
- [Summary](#summary)
- [Motivation](#motivation)
  - [Goals](#goals)
  - [Non-Goals](#non-goals)
- [Proposal](#proposal)
  - [User Story](#user-story)
- [Design Details](#design-details)
  - [API](#api)
  - [Implementation details](#implementation-details)
  - [How user stories are addressed](#how-user-stories-are-addressed)
  - [Test Plan](#test-plan)
  - [Graduation Criteria](#graduation-criteria)
    - [Alpha (v1.24):](#alpha-v124)
    - [Beta (v1.25):](#beta-v125)
- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire)
  - [Feature Enablement and Rollback](#feature-enablement-and-rollback)
  - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning)
  - [Monitoring Requirements](#monitoring-requirements)
  - [Dependencies](#dependencies)
  - [Scalability](#scalability)
  - [Troubleshooting](#troubleshooting)
- [Implementation History](#implementation-history)
- [Drawbacks](#drawbacks)
- [Alternatives](#alternatives)
  - [Support minDomains in ScheduleAnyway as well](#support--in-scheduleanyway-as-well)
- [Infrastructure Needed (Optional)](#infrastructure-needed-optional)

## Release Signoff Checklist

Items marked with (R) are required _prior to targeting to a milestone / release_.
- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR)
- [ ] (R) KEP approvers have approved the KEP status as `implementable`
- [ ] (R) Design details are appropriately documented
- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors)
  - [ ] e2e Tests for all Beta API Operations (endpoints)
  - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
  - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free
- [ ] (R) Graduation criteria is in place
  - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
- [ ] (R) Production readiness review completed
- [ ] (R) Production readiness review approved
- [ ] "Implementation History" section is up-to-date for milestone
- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io]
- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes

[kubernetes.io]: https://kubernetes.io/
[kubernetes/enhancements]: https://git.k8s.io/enhancements
[kubernetes/kubernetes]: https://git.k8s.io/kubernetes
[kubernetes/website]: https://git.k8s.io/website

## Summary

A new field `minDomains` is introduced to `PodSpec.TopologySpreadConstraint[*]` to specify
the minimum number of topology domains to spread Pods over.
`minDomains` can be used only when `whenUnsatisfiable=DoNotSchedule`.

## Motivation

Pod Topology Spread has the [`maxSkew` parameter](https://github.com/kubernetes/enhancements/tree/11a976c74e1358efccf251d4c7611d05ce27feb3/keps/sig-scheduling/895-pod-topology-spread#maxskew), which controls the degree to which Pods may be unevenly distributed.
However, there is no way to control the number of domains over which Pods should spread.
In some cases, users want to force Pods to spread over a minimum number of domains and, if there aren't enough domains already present, make the cluster autoscaler provision them.

### Goals

- Allow users to specify `minDomains` to enforce a minimum number of topology domains when using `whenUnsatisfiable=DoNotSchedule`.

### Non-Goals

- Adding a new field to limit the maximum number of topology domains.
- Supporting `minDomains` in a best-effort manner with `whenUnsatisfiable=ScheduleAnyway`.

## Proposal

### User Story

I am using the cluster autoscaler and want to force a Deployment to spread over at least 5 Nodes.

## Design Details

Users can define a minimum number of domains with the `minDomains` parameter.
This parameter only applies when `whenUnsatisfiable=DoNotSchedule`.

Pod Topology Spread has the semantics of a "global minimum": the minimum number of Pods that match the label selector in a single topology domain.

However, the global minimum is only calculated over the Nodes that currently exist and match the node affinity. In other words, if a topology domain was scaled down to zero (for example, because of low utilization), this topology domain is unknown to the scheduler, so it is not considered in the global-minimum calculation.

The new `minDomains` field can help with this problem.
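To illustrate, the following standalone Go sketch (illustrative only; the function and the numbers are made up and not taken from the scheduler code base) shows why a domain scaled down to zero Nodes cannot pull the global minimum down:

```go
package main

import "fmt"

// globalMinimum returns the smallest number of matching Pods among the
// observed domains, or 0 if no domain is observed at all.
func globalMinimum(matchingPodsPerDomain map[string]int) int {
	min := -1
	for _, n := range matchingPodsPerDomain {
		if min == -1 || n < min {
			min = n
		}
	}
	if min == -1 {
		return 0
	}
	return min
}

func main() {
	// zone-a and zone-b each run 3 matching Pods; zone-c was scaled down to
	// zero Nodes, so it never appears in the scheduler's view at all.
	observed := map[string]int{"zone-a": 3, "zone-b": 3}
	fmt.Println(globalMinimum(observed)) // 3, even though zone-c effectively has 0 matching Pods
}
```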
When the number of domains with matching topology keys is less than `minDomains`,
Pod Topology Spread treats the "global minimum" as 0; otherwise, the "global minimum"
is equal to the minimum number of matching Pods in a domain.

As a result, when the number of domains is less than `minDomains`, the scheduler doesn't schedule a matching Pod onto Nodes in domains that already have `maxSkew` or more matching Pods.

`minDomains` is an optional parameter and the default value is 0.

### API

A new optional parameter called `MinDomains` is introduced to `PodSpec.TopologySpreadConstraint[*]`.

```go
type TopologySpreadConstraint struct {
	// ... other existing fields ...

	// When the number of domains with matching topology keys is less than `minDomains`,
	// Pod Topology Spread treats the "global minimum" as 0.
	// As a result, when the number of domains is less than `minDomains`,
	// the scheduler doesn't schedule a matching Pod onto Nodes in domains that already have `maxSkew` or more matching Pods.
	// Default value is 0. When the value is different from 0, WhenUnsatisfiable must be DoNotSchedule.
	// +optional
	MinDomains *int32
}
```

### Implementation details

In the Filter extension point of Pod Topology Spread, the current filtering criterion is

```
('existing matching num' + 'if self-match (1 or 0)' - 'global min matching num') <= 'maxSkew'
```

- `existing matching num` denotes the number of matching Pods currently existing in the domain.
- `if self-match` denotes whether the incoming Pod's labels match the constraint's selector (1 if they match, 0 otherwise).
- `global min matching num` denotes the minimum number of matching Pods across all eligible domains.

For `whenUnsatisfiable: DoNotSchedule`, Pod Topology Spread will treat `global min matching num` as 0
when the number of domains with matching topology keys is less than `minDomains`.

We can calculate the number of domains with matching topology keys in PreFilter, along with the calculation of [`TpPairToMatchNum`](https://github.com/kubernetes/kubernetes/blob/0153febd9f0098d4b8d0d484927710eaf899ef40/pkg/scheduler/framework/plugins/podtopologyspread/filtering.go#L49).
This extra calculation doesn't increase the complexity of the PreFilter logic.
Pod Topology Spread can then use the number of domains to determine the value of `global min matching num` when evaluating the filtering criterion.

### How user stories are addressed

Users can set `minDomains` together with `whenUnsatisfiable: DoNotSchedule` to achieve this.

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-deployment
spec:
  selector:
    matchLabels:
      foo: bar
  replicas: 10
  template:
    metadata:
      labels:
        foo: bar
    spec:
      containers:
      - name: nginx
        image: nginx:1.14.2
        ports:
        - containerPort: 80
      topologySpreadConstraints:
      - maxSkew: 2
        minDomains: 5
        topologyKey: kubernetes.io/hostname
        whenUnsatisfiable: DoNotSchedule
        labelSelector:
          matchLabels:
            foo: bar
```

Consider the case where the cluster has only 3 Nodes that these Pods can be scheduled to.

6 Pods will be scheduled to those Nodes (at most 2 per Node), and the remaining 4 Pods can only be scheduled once 2 more Nodes join the cluster.

With this flow, the Deployment is spread over at least 5 Nodes while still honoring the `maxSkew` constraint.
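To make the arithmetic above concrete, here is a minimal, self-contained Go sketch of the filtering criterion from the [Implementation details](#implementation-details) section with the `minDomains` adjustment applied (the function name and signature are simplified and hypothetical; the real logic lives in the PodTopologySpread plugin):

```go
package main

import "fmt"

// satisfiesConstraint applies the criterion
// ('existing matching num' + 'if self-match' - 'global min matching num') <= 'maxSkew',
// treating the global minimum as 0 when fewer than minDomains domains exist.
func satisfiesConstraint(existingMatchingNum, selfMatch, globalMinMatchingNum,
	maxSkew, domainCount, minDomains int32) bool {
	globalMin := globalMinMatchingNum
	if domainCount < minDomains {
		globalMin = 0
	}
	return existingMatchingNum+selfMatch-globalMin <= maxSkew
}

func main() {
	// Example above: maxSkew=2, minDomains=5, but only 3 Nodes (domains) exist.
	// A Node that already runs 2 matching Pods is rejected (2 + 1 - 0 > 2),
	// so at most 2 of the 10 replicas land on each of the 3 Nodes.
	fmt.Println(satisfiesConstraint(2, 1, 2, 2, 3, 5)) // false
	// Once 2 more Nodes join (5 domains), the real global minimum is used
	// again and an empty Node can accept the remaining Pods.
	fmt.Println(satisfiesConstraint(0, 1, 0, 2, 5, 5)) // true
}
```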
### Test Plan

To ensure this feature is rolled out with high quality, the following tests are mandatory:

- **Unit Tests**: All core changes must be covered by unit tests.
- **Integration Tests / E2E Tests:** Tests to ensure the behavior of this feature must
  be covered by either integration tests or e2e tests.
- **Benchmark Tests:** We can accept a slight performance overhead for users who are
  using this feature, but it must not impose a penalty on users who are not using
  this feature. We will verify this by designing benchmark tests.

### Graduation Criteria

#### Alpha (v1.24):

- [ ] Add the new parameter `MinDomains` to `TopologySpreadConstraint` and feature gating.
- [ ] Filter extension point implementation.
- [ ] Implement all tests mentioned in the [Test Plan](#test-plan).

#### Beta (v1.25):

- [ ] This feature will be enabled by default as a Beta feature in v1.25.

## Production Readiness Review Questionnaire

### Feature Enablement and Rollback

###### How can this feature be enabled / disabled in a live cluster?

- [x] Feature gate (also fill in values in `kep.yaml`)
  - Feature gate name: `MinDomainsInPodTopologySpread`
  - Components depending on the feature gate: `kube-scheduler`, `kube-apiserver`

###### Does enabling the feature change any default behavior?

No.

###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)?

The feature can be disabled in Alpha and Beta versions
by restarting kube-apiserver and kube-scheduler with the feature gate turned off.
For Stable versions, users can choose to opt out by not setting the
`pod.spec.topologySpreadConstraints[*].minDomains` field.

###### What happens if we reenable the feature if it was previously rolled back?

Scheduling of new Pods is affected.

###### Are there any tests for feature enablement/disablement?

No. Unit and integration tests will be added.

### Rollout, Upgrade and Rollback Planning

###### How can a rollout or rollback fail? Can it impact already running workloads?

###### What specific metrics should inform a rollback?

###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?

###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.?

### Monitoring Requirements

###### How can an operator determine if the feature is in use by workloads?

###### How can someone using this feature know that it is working for their instance?

- [ ] Events
  - Event Reason:
- [ ] API .status
  - Condition name:
  - Other field:
- [ ] Other (treat as last resort)
  - Details:

###### What are the reasonable SLOs (Service Level Objectives) for the enhancement?

###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service?

- [ ] Metrics
  - Metric name:
  - [Optional] Aggregation method:
  - Components exposing the metric:
- [ ] Other (treat as last resort)
  - Details:

###### Are there any missing metrics that would be useful to have to improve observability of this feature?

### Dependencies

###### Does this feature depend on any specific services running in the cluster?

### Scalability

###### Will enabling / using this feature result in any new API calls?

No.

###### Will enabling / using this feature result in introducing new API types?

No.

###### Will enabling / using this feature result in any new calls to the cloud provider?

No.

###### Will enabling / using this feature result in increasing size or count of the existing API objects?
Yes.

- API type(s): Pod
- Estimated increase in size: the new field `.spec.topologySpreadConstraints[*].minDomains` adds about 4 bytes (int32)

###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs?

No. No performance degradation of the scheduler is expected.

###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components?

The scheduler has to process the `minDomains` parameter, which may result in a small increase in CPU usage.

### Troubleshooting

###### How does this feature react if the API server and/or etcd is unavailable?

###### What are other known failure modes?

###### What steps should be taken if SLOs are not being met to determine the problem?

## Implementation History

## Drawbacks

## Alternatives

### Support `minDomains` in ScheduleAnyway as well

When the number of domains with matching topology keys is less than `minDomains` and `whenUnsatisfiable` equals `ScheduleAnyway`,
Pod Topology Spread would give low scores to Nodes in domains that already have `maxSkew` or more matching Pods.

In Pod Topology Spread, the higher the raw score returned from Score, the lower the normalized score calculated by NormalizeScore. So Pod Topology Spread would have to give high raw scores to non-preferred Nodes in Score.

When the number of domains with matching topology keys is less than `minDomains`,
Pod Topology Spread would double the score for the constraint in Score (so that the normalized score becomes lower) if this criterion is met:

```
('existing matching num' + 'if self-match (1 or 0)' - 'global min matching num') > 'maxSkew'
```

- `existing matching num` denotes the number of matching Pods currently existing in the domain.
- `if self-match` denotes whether the incoming Pod's labels match the constraint's selector (1 if they match, 0 otherwise).
- `global min matching num` denotes the minimum number of matching Pods across all eligible domains.

We decided not to support `minDomains` with `ScheduleAnyway` for the following reasons:

- To support it, we would need to calculate the number of domains with matching topology keys and the minimum number of matching Pods in PreScore, as PreFilter does, so that Pod Topology Spread could decide how to evaluate each Node.

  This extra calculation may affect the performance of PreScore, because the current PreScore only looks at Nodes that have passed Filter, whereas this calculation requires looking at all Nodes (including Nodes that haven't passed Filter).

- `minDomains` is supported mainly for [the above user story](#user-story), which relies on the cluster autoscaler.

  The scheduler's scoring results don't affect the cluster autoscaler, so supporting this is not worth the performance degradation.
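For reference, here is a minimal Go sketch of what the scoring adjustment described above could have looked like (all names are hypothetical; this alternative was rejected for the reasons listed above):

```go
package main

import "fmt"

// scheduleAnywayScore doubles a constraint's raw score for a Node whose domain
// already violates maxSkew while fewer than minDomains domains exist.
// A higher raw score here would lead to a lower normalized score, i.e. the
// Node would be de-preferred rather than rejected.
func scheduleAnywayScore(rawScore, existingMatchingNum, selfMatch,
	globalMinMatchingNum, maxSkew, domainCount, minDomains int64) int64 {
	if domainCount < minDomains &&
		existingMatchingNum+selfMatch-globalMinMatchingNum > maxSkew {
		return rawScore * 2
	}
	return rawScore
}

func main() {
	// Only 3 of the requested 5 domains exist and the domain already holds
	// maxSkew matching Pods, so its raw score is doubled.
	fmt.Println(scheduleAnywayScore(10, 2, 1, 0, 2, 3, 5)) // 20
}
```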
## Infrastructure Needed (Optional)

diff --git a/keps/sig-scheduling/3022-min-domains-in-pod-topology-spread/kep.yaml b/keps/sig-scheduling/3022-min-domains-in-pod-topology-spread/kep.yaml
new file mode 100644
index 00000000000..38e7d7f4731
--- /dev/null
+++ b/keps/sig-scheduling/3022-min-domains-in-pod-topology-spread/kep.yaml
@@ -0,0 +1,27 @@
title: Tuning the number of domains in PodTopologySpread
kep-number: 3022
authors:
  - "@sanposhiho"
owning-sig: sig-scheduling
participating-sigs:
  - sig-autoscaling
status: implementable
creation-date: 2021-10-28
reviewers:
  - "@alculquicondor"
  - "@Huang-Wei"
  - "@x13n"
approvers:
  - "@alculquicondor"
  - "@Huang-Wei"
stage: alpha
latest-milestone: "v1.24"
milestone:
  alpha: "v1.24"
  beta: "v1.25"
  stable: "v1.26"
disable-supported: true
feature-gates:
  - name: MinDomainsInPodTopologySpread
    components:
      - kube-scheduler