From f7388c5c5a432a9e24a6c2d65315ab25d509183b Mon Sep 17 00:00:00 2001 From: sanposhiho <44139130+sanposhiho@users.noreply.github.com> Date: Tue, 2 Nov 2021 07:23:47 +0900 Subject: [PATCH] KEP-3022: Tuning the number of domains on Pod Topology Spread --- keps/prod-readiness/sig-scheduling/3022.yaml | 6 + .../README.md | 513 ++++++++++++++++++ .../kep.yaml | 28 + 3 files changed, 547 insertions(+) create mode 100644 keps/prod-readiness/sig-scheduling/3022.yaml create mode 100644 keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/README.md create mode 100644 keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/kep.yaml diff --git a/keps/prod-readiness/sig-scheduling/3022.yaml b/keps/prod-readiness/sig-scheduling/3022.yaml new file mode 100644 index 000000000000..703595b4ee23 --- /dev/null +++ b/keps/prod-readiness/sig-scheduling/3022.yaml @@ -0,0 +1,6 @@ +# The KEP must have an approver from the +# "prod-readiness-approvers" group +# of http://git.k8s.io/enhancements/OWNERS_ALIASES +kep-number: 3022 +alpha: + approver: "@wojtek-t" diff --git a/keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/README.md b/keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/README.md new file mode 100644 index 000000000000..a88bb5e22538 --- /dev/null +++ b/keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/README.md @@ -0,0 +1,513 @@ +# KEP-3022: Tuning the number of domains on Pod Topology Spread + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) +- [Proposal](#proposal) + - [User Stories](#user-stories) + - [Story 1](#story-1) + - [Story 2](#story-2) +- [Design Details](#design-details) + - [API](#api) + - [How user stories are addressed](#how-user-stories-are-addressed) + - [Test Plan](#test-plan) + - [Graduation Criteria](#graduation-criteria) + - [Alpha (v1.24):](#alpha-v124) + - [Beta (v1.25):](#beta-v125) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + +Items marked with (R) are required _prior to targeting to a milestone / release_. 
- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR)
- [ ] (R) KEP approvers have approved the KEP status as `implementable`
- [ ] (R) Design details are appropriately documented
- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors)
  - [ ] e2e Tests for all Beta API Operations (endpoints)
  - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
  - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free
- [ ] (R) Graduation criteria is in place
  - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md)
- [ ] (R) Production readiness review completed
- [ ] (R) Production readiness review approved
- [ ] "Implementation History" section is up-to-date for milestone
- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io]
- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes

[kubernetes.io]: https://kubernetes.io/
[kubernetes/enhancements]: https://git.k8s.io/enhancements
[kubernetes/kubernetes]: https://git.k8s.io/kubernetes
[kubernetes/website]: https://git.k8s.io/website

## Summary

With [Pod Topology Spread](/keps/sig-scheduling/895-pod-topology-spread), users can define rules for spreading Pods across their cluster among failure-domains.
We propose adding the parameters `minDomains` and `maxDomains` to control the number of domains on Pod Topology Spread.

## Motivation

Pod Topology Spread has the [`maxSkew` parameter](https://github.com/kubernetes/enhancements/tree/11a976c74e1358efccf251d4c7611d05ce27feb3/keps/sig-scheduling/895-pod-topology-spread#maxskew), which controls the degree to which Pods may be unevenly distributed.
However, there is no way to control the number of domains over which Pods should be spread.

### Goals

Users can define the maximum number of domains with the `maxDomains` parameter.

When the number of Pods is more than `maxDomains`,

- when `whenUnsatisfiable` equals `DoNotSchedule`, the scheduler schedules Pods only onto Nodes that already have matching Pods,
  i.e. the scheduler filters out Nodes that don't have matching Pods in the `Filter` phase.
- when `whenUnsatisfiable` equals `ScheduleAnyway`, the scheduler prefers Nodes that have matching Pods,
  i.e. the scheduler gives a higher score to Nodes that have matching Pods in the `Score` phase.

Users can also define the minimum number of domains with the `minDomains` parameter.

When the number of Pods is less than `minDomains`,

- when `whenUnsatisfiable` equals `DoNotSchedule`, the scheduler doesn't schedule a matching Pod onto Nodes that already have matching Pods,
  i.e. the scheduler filters out Nodes that have matching Pods in the `Filter` phase.
- when `whenUnsatisfiable` equals `ScheduleAnyway`, the scheduler prefers Nodes that don't have matching Pods,
  i.e. the scheduler gives a higher score to Nodes that don't have matching Pods in the `Score` phase.

## Proposal

### User Stories

#### Story 1

I am using the cluster autoscaler and I want to force spreading a Deployment over at least 10 Nodes.
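For illustration, here is a minimal Deployment sketch of this story, assuming the `minDomains` field proposed in [Design Details](#design-details) below. The Deployment name, the container image, and `maxSkew: 1` are placeholders, and `kubernetes.io/hostname` is used as the per-node topology key (the shorter examples later in this KEP simply use `node`):

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-service
spec:
  replicas: 10
  selector:
    matchLabels:
      foo: bar
  template:
    metadata:
      labels:
        foo: bar
    spec:
      topologySpreadConstraints:
        - maxSkew: 1
          minDomains: 10          # proposed field; not yet part of the API
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: DoNotSchedule
          labelSelector:
            matchLabels:
              foo: bar
      containers:
        - name: app
          image: registry.k8s.io/pause:3.9
```

With `whenUnsatisfiable: DoNotSchedule`, replicas that cannot be placed on a new Node stay pending until enough Nodes are available.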
#### Story 2

I am using preferred topology spread and I have a large cluster, but I don't want to spread the service over many Nodes; I prefer to spread it over at most 4 Nodes before spreading to others.

## Design Details

### API

New parameters called `MaxDomains` and `MinDomains` are introduced to `TopologySpreadConstraint`.

```go
type TopologySpreadConstraint struct {
	// ...existing fields...

	// MaxDomains describes the maximum number of domains.
	// When the number of Pods is more than `maxDomains`,
	// - when `whenUnsatisfiable` equals `DoNotSchedule`, the scheduler schedules Pods only onto Nodes that already have matching Pods,
	//   i.e. the scheduler filters out Nodes that don't have matching Pods in the `Filter` phase.
	// - when `whenUnsatisfiable` equals `ScheduleAnyway`, the scheduler prefers Nodes that have matching Pods,
	//   i.e. the scheduler gives a higher score to Nodes that have matching Pods in the `Score` phase.
	// +optional
	MaxDomains int32
	// MinDomains describes the minimum number of domains.
	// When the number of Pods is less than `minDomains`,
	// - when `whenUnsatisfiable` equals `DoNotSchedule`,
	//   the scheduler doesn't schedule a matching Pod onto Nodes that already have matching Pods,
	//   i.e. the scheduler filters out Nodes that have matching Pods in the `Filter` phase.
	// - when `whenUnsatisfiable` equals `ScheduleAnyway`, the scheduler prefers Nodes that don't have matching Pods,
	//   i.e. the scheduler gives a higher score to Nodes that don't have matching Pods in the `Score` phase.
	// +optional
	MinDomains int32
}
```

### How user stories are addressed

For story 1, users can set `minDomains` together with `whenUnsatisfiable: DoNotSchedule`.

```yaml
spec:
  topologySpreadConstraints:
  - minDomains: 10
    topologyKey: node
    whenUnsatisfiable: DoNotSchedule
    labelSelector:
      matchLabels:
        foo: bar
```

With this, until 10 Pods have been scheduled, the scheduler places each Pod on a different Node.
This means that, for example, if there are only 5 schedulable Nodes, at most 5 Pods can be scheduled.

---

For story 2, users can set `maxDomains` together with `whenUnsatisfiable: ScheduleAnyway`.

```yaml
spec:
  topologySpreadConstraints:
  - maxDomains: 4
    topologyKey: node
    whenUnsatisfiable: ScheduleAnyway
    labelSelector:
      matchLabels:
        foo: bar
```

With this, when it is not possible for some reason to schedule a Pod onto one of the 4 Nodes that already have matching Pods,
the scheduler still prefers to spread the remaining Pods over as few additional Nodes as possible.

### Test Plan

To ensure this feature is rolled out with high quality, the following tests are mandatory:

- **Unit Tests**: All core changes must be covered by unit tests.
- **Integration Tests / E2E Tests:** All use cases discussed in this KEP must
  be covered by either integration tests or e2e tests.
- **Benchmark Tests:** We can tolerate a slight performance overhead for users of
  this feature, but it shouldn't impose a penalty on users who are not using it.
  We will verify this by designing benchmark tests.

### Graduation Criteria

#### Alpha (v1.24):

- [ ] Add parameters to `TopologySpreadConstraint` and feature gating.
- [ ] Score extension point implementation.
- [ ] Filter extension point implementation.
- [ ] Unit test cases mentioned in the [Test Plan](#test-plan).

#### Beta (v1.25):

- [ ] This feature will be enabled by default as a Beta feature in v1.25.
- [ ] Add necessary integration/end-to-end tests.
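To make the `Filter`-phase behavior described in the [API](#api) section concrete, the following is a rough, self-contained sketch of how a `DoNotSchedule` filter decision could take the proposed `minDomains` and `maxDomains` into account. This is only an illustration, not the actual kube-scheduler plugin code; all type and function names are hypothetical.

```go
package main

import "fmt"

// domainFilterArgs is a hypothetical, simplified view of the state the
// PodTopologySpread Filter phase would already have available.
type domainFilterArgs struct {
	// podsPerDomain maps each topology domain (e.g. a node name when the
	// topologyKey is per-node) to the number of matching Pods in it.
	podsPerDomain map[string]int
	minDomains    int32
	maxDomains    int32
}

// feasibleWithDoNotSchedule reports whether placing the incoming Pod in
// candidateDomain would be allowed under the proposed semantics when
// whenUnsatisfiable is DoNotSchedule. A zero value means "not set".
func feasibleWithDoNotSchedule(args domainFilterArgs, candidateDomain string) bool {
	domainsWithPods := 0
	for _, n := range args.podsPerDomain {
		if n > 0 {
			domainsWithPods++
		}
	}
	hasPods := args.podsPerDomain[candidateDomain] > 0

	// minDomains: while fewer than minDomains domains have matching Pods,
	// filter out domains that already have matching Pods so the incoming
	// Pod is forced into a new domain.
	if args.minDomains > 0 && int32(domainsWithPods) < args.minDomains && hasPods {
		return false
	}
	// maxDomains: once maxDomains domains already have matching Pods,
	// filter out domains without matching Pods so no new domain is used.
	if args.maxDomains > 0 && int32(domainsWithPods) >= args.maxDomains && !hasPods {
		return false
	}
	return true
}

func main() {
	args := domainFilterArgs{
		podsPerDomain: map[string]int{"node-a": 1, "node-b": 0},
		minDomains:    3,
	}
	fmt.Println(feasibleWithDoNotSchedule(args, "node-a")) // false: node-a already has a matching Pod
	fmt.Println(feasibleWithDoNotSchedule(args, "node-b")) // true: the Pod spreads to a new domain
}
```

The `ScheduleAnyway` cases would follow the same counting logic, but expressed as a scoring preference in the `Score` phase rather than a hard filter.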
## Production Readiness Review Questionnaire

### Feature Enablement and Rollback

###### How can this feature be enabled / disabled in a live cluster?

- [x] Feature gate (also fill in values in `kep.yaml`)
  - Feature gate name: `TuneTheNumberOfDomainsOnPodTopologySpread`
  - Components depending on the feature gate: kube-scheduler

###### Does enabling the feature change any default behavior?

No.

###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)?

The feature can be disabled in Alpha and Beta versions.
For Stable versions, users can choose to opt out by not setting the
`pod.spec.topologySpreadConstraints.maxDomains` and `pod.spec.topologySpreadConstraints.minDomains` fields.

###### What happens if we reenable the feature if it was previously rolled back?

N/A.

###### Are there any tests for feature enablement/disablement?

No.

### Rollout, Upgrade and Rollback Planning

###### How can a rollout or rollback fail? Can it impact already running workloads?

###### What specific metrics should inform a rollback?

###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?

###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.?

### Monitoring Requirements

###### How can an operator determine if the feature is in use by workloads?

###### How can someone using this feature know that it is working for their instance?

- [ ] Events
  - Event Reason:
- [ ] API .status
  - Condition name:
  - Other field:
- [ ] Other (treat as last resort)
  - Details:

###### What are the reasonable SLOs (Service Level Objectives) for the enhancement?

###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service?

- [ ] Metrics
  - Metric name:
  - [Optional] Aggregation method:
  - Components exposing the metric:
- [ ] Other (treat as last resort)
  - Details:

###### Are there any missing metrics that would be useful to have to improve observability of this feature?

### Dependencies

###### Does this feature depend on any specific services running in the cluster?

### Scalability

###### Will enabling / using this feature result in any new API calls?

###### Will enabling / using this feature result in introducing new API types?

###### Will enabling / using this feature result in any new calls to the cloud provider?

###### Will enabling / using this feature result in increasing size or count of the existing API objects?

###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs?

###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components?

### Troubleshooting

###### How does this feature react if the API server and/or etcd is unavailable?

###### What are other known failure modes?

###### What steps should be taken if SLOs are not being met to determine the problem?
+ +## Implementation History + + + +## Drawbacks + + + +## Alternatives + + + +## Infrastructure Needed (Optional) + + diff --git a/keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/kep.yaml b/keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/kep.yaml new file mode 100644 index 000000000000..79f3f410ecfa --- /dev/null +++ b/keps/sig-scheduling/3022-tuning-the-number-of-domains-on-pod-topology-spread/kep.yaml @@ -0,0 +1,28 @@ +title: Tuning the number of domains in PodTopologySpread +kep-number: 3022 +authors: + - "@sanposhiho" +owning-sig: sig-scheduling +status: provisional +creation-date: 2021-10-28 +reviewers: + - "@alculquicondor" + - "@ahg-g" + - "@Huang-Wei" + - "@damemi" +approvers: + - "@alculquicondor" + - "@ahg-g" + - "@Huang-Wei" + - "@damemi" +stage: alpha +latest-milestone: "v1.24" +milestone: + alpha: "v1.24" + beta: "v1.25" + stable: "v1.26" +disable-supported: true +feature-gates: + - name: TuneTheNumberOfDomainsOnPodTopologySpread + components: + - kube-scheduler