Skip to content

Commit

Permalink
chore: multirm unique resource pool config changes [RM-74] (#8974)
Browse files Browse the repository at this point in the history
  • Loading branch information
NicholasBlaskey authored Mar 8, 2024
1 parent ca29879 commit 6e1acf4
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 85 deletions.
35 changes: 21 additions & 14 deletions master/internal/config/resource_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,45 +90,52 @@ func (r *ResourceConfig) ResolveResource() error {
r.RootManagerInternal.setName(DefaultRMName)
}

// Add a default resource pool for all non resource managers.
// Add a default resource pool for nonslurm default resource managers.
// TODO(multirm-slurm) rethink pool discovery.
if r.RootPoolsInternal == nil &&
(r.RootManagerInternal.AgentRM != nil || r.RootManagerInternal.KubernetesRM != nil) {
defaultPool := defaultRPConfig()

defaultPool.PoolName = defaultResourcePoolName
r.RootPoolsInternal = []ResourcePoolConfig{defaultPool}
}
for _, c := range r.AdditionalResourceManagersInternal {
if c.ResourcePools == nil &&
(c.ResourceManager.AgentRM != nil || c.ResourceManager.KubernetesRM != nil) {
defaultPool := defaultRPConfig()
defaultPool.PoolName = defaultResourcePoolName
c.ResourcePools = []ResourcePoolConfig{defaultPool}
}
}

return nil
}

// Validate implements the check.Validatable interface.
func (r ResourceConfig) Validate() []error {
seenResourceManagerNames := make(map[string]bool)
poolNames := make(map[string]bool)
var errs []error
for _, r := range r.ResourceManagers() {
// All non slurm resource managers must have a resource pool.
if len(r.ResourcePools) == 0 &&
(r.ResourceManager.AgentRM != nil || r.ResourceManager.KubernetesRM != nil) {
errs = append(errs, fmt.Errorf(
"for additional_resource_managers, you must specify at least one resource pool"))
}

name := r.ResourceManager.Name()
if _, ok := seenResourceManagerNames[name]; ok {
errs = append(errs, fmt.Errorf("resource manager has a duplicate name: %s", name))
}
seenResourceManagerNames[name] = true

poolNames := make(map[string]bool)
rmPoolNames := make(map[string]bool)
for _, rp := range r.ResourcePools {
if _, ok := poolNames[rp.PoolName]; ok {
errs = append(errs, fmt.Errorf(
"resource pool has a duplicate name: %s", rp.PoolName))
} else {
poolNames[rp.PoolName] = true
if _, ok := rmPoolNames[rp.PoolName]; ok {
errs = append(errs, fmt.Errorf(
"resource pool has a duplicate name: %s", rp.PoolName))
} else {
errs = append(errs, fmt.Errorf("resource pool has a duplicate name: %s "+
"They must be unique across even different resource managers", rp.PoolName))
}
}

rmPoolNames[rp.PoolName] = true
poolNames[rp.PoolName] = true
}
}

Expand Down
116 changes: 45 additions & 71 deletions master/internal/config/resource_config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,25 @@ resource_pools:
"a duplicate name: a",
},

{
"dupe pools in additional rm", `
resource_manager:
type: agent
name: a
resource_pools:
- pool_name: a
additional_resource_managers:
- resource_manager:
type: kubernetes
max_slots_per_pod: 2
name: b
resource_pools:
- pool_name: a`, nil, "Check Failed! 2 errors found:\n\terror found at root.ResourceConfig: " +
"resource pool has a duplicate name: a They must be unique across even different " +
"resource managers\n\terror found at root: resource pool has a duplicate name: a " +
"They must be unique across even different resource managers",
},

{"dupe rm names", `
resource_manager:
type: agent
Expand All @@ -131,7 +150,9 @@ additional_resource_managers:
- resource_manager:
type: kubernetes
name: a
max_slots_per_pod: 2`, nil, "Check Failed! 2 errors found:\n\terror found at " +
max_slots_per_pod: 2
resource_pools:
- pool_name: a`, nil, "Check Failed! 2 errors found:\n\terror found at " +
"root.ResourceConfig: resource manager has a duplicate name: a\n\terror " +
"found at root: resource manager has a duplicate name: a"},

Expand All @@ -142,7 +163,9 @@ resource_manager:
additional_resource_managers:
- resource_manager:
type: agent
name: b`, nil, "Check Failed! 2 errors found:\n\terror found at root.ResourceConfig: " +
name: b
resource_pools:
- pool_name: a`, nil, "Check Failed! 2 errors found:\n\terror found at root.ResourceConfig: " +
"additional_resource_managers only supports resource managers of type: " +
"kubernetes\n\terror found at root: additional_resource_managers only supports " +
"resource managers of type: kubernetes"},
Expand All @@ -154,10 +177,25 @@ resource_manager:
additional_resource_managers:
- resource_manager:
type: kubernetes
max_slots_per_pod: 12`, nil, "Check Failed! 1 errors found:\n\terror found at " +
max_slots_per_pod: 12
resource_pools:
- pool_name: a`, nil, "Check Failed! 1 errors found:\n\terror found at " +
"root.ResourceConfig.AdditionalResourceManagersInternal[0]." +
"ResourceManager.KubernetesRM: name is required: must be non-empty"},

{"additional rm not giving pools", `
resource_manager:
type: agent
name: a
additional_resource_managers:
- resource_manager:
type: kubernetes
name: test
max_slots_per_pod: 12`, nil, "Check Failed! 2 errors found:\n\terror found at " +
"root.ResourceConfig: for additional_resource_managers, you must specify at " +
"least one resource pool\n\terror found at root: for additional_resource_managers, " +
"you must specify at least one resource pool"},

{"k8s rocm config", `
resource_manager:
type: kubernetes
Expand Down Expand Up @@ -198,9 +236,6 @@ resource_manager:
}

func TestResolveConfig(t *testing.T) {
// defaultRPConf := defaultRPConfig()
// defaultRPConf.PoolName = defaultRPName

cases := []struct {
name string
yaml string
Expand Down Expand Up @@ -318,67 +353,6 @@ resource_manager:
type: agent
metadata:
region: "nw"
additional_resource_managers:
- resource_manager:
name: test
type: kubernetes
metadata:
test: "y"
name: k8s
max_slots_per_pod: 65`, Config{
ResourceConfig: ResourceConfig{
RootManagerInternal: &ResourceManagerConfig{
AgentRM: &AgentResourceManagerConfig{
Name: DefaultRMName,
DefaultAuxResourcePool: "default",
DefaultComputeResourcePool: "default",
Scheduler: DefaultSchedulerConfig(),
Metadata: map[string]string{
"region": "nw",
},
},
},
RootPoolsInternal: []ResourcePoolConfig{
{
PoolName: "default",
MaxAuxContainersPerAgent: 100,
AgentReconnectWait: model.Duration(aproto.AgentReconnectWait),
MaxCPUContainersPerAgent: -1,
},
},
AdditionalResourceManagersInternal: []*ResourceManagerWithPoolsConfig{
{
ResourceManager: &ResourceManagerConfig{
KubernetesRM: &KubernetesResourceManagerConfig{
Name: "test",
SlotType: "cuda",
DefaultAuxResourcePool: "default",
MaxSlotsPerPod: ptrs.Ptr(65),
DefaultComputeResourcePool: "default",
Metadata: map[string]string{
"test": "y",
"name": "k8s",
},
},
},
ResourcePools: []ResourcePoolConfig{
{
PoolName: "default",
MaxAuxContainersPerAgent: 100,
MaxCPUContainersPerAgent: -1,
AgentReconnectWait: model.Duration(aproto.AgentReconnectWait),
},
},
},
},
},
}},

{"two resource managers with pools", `
resource_manager:
type: agent
metadata:
region: "nw"
resource_pools:
- pool_name: a
- pool_name: b
Expand All @@ -391,8 +365,8 @@ additional_resource_managers:
name: k8s
max_slots_per_pod: 65
resource_pools:
- pool_name: b
- pool_name: c`, Config{
- pool_name: c
- pool_name: d`, Config{
ResourceConfig: ResourceConfig{
RootManagerInternal: &ResourceManagerConfig{
AgentRM: &AgentResourceManagerConfig{
Expand Down Expand Up @@ -434,12 +408,12 @@ additional_resource_managers:
},
ResourcePools: []ResourcePoolConfig{
{
PoolName: "b",
PoolName: "c",
MaxAuxContainersPerAgent: 100,
AgentReconnectWait: model.Duration(aproto.AgentReconnectWait),
},
{
PoolName: "c",
PoolName: "d",
MaxAuxContainersPerAgent: 100,
AgentReconnectWait: model.Duration(aproto.AgentReconnectWait),
},
Expand Down

0 comments on commit 6e1acf4

Please sign in to comment.