Skip to content

Commit

Permalink
chore: remove panics from rm initialization (#8983)
Browse files Browse the repository at this point in the history
  • Loading branch information
amandavialva01 committed Mar 11, 2024
1 parent 4ae0987 commit a22656d
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 19 deletions.
26 changes: 18 additions & 8 deletions master/internal/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -963,7 +963,7 @@ func buildRM(
tcd *model.TaskContainerDefaultsConfig,
opts *aproto.MasterSetAgentOptions,
cert *tls.Certificate,
) rm.ResourceManager {
) (rm.ResourceManager, error) {
if len(rmConfigs) <= 1 {
config := rmConfigs[0]
switch {
Expand All @@ -972,7 +972,7 @@ func buildRM(
case config.ResourceManager.KubernetesRM != nil:
return kubernetesrm.New(db, config, tcd, opts, cert)
default:
panic("no expected resource manager config is defined")
return nil, fmt.Errorf("no expected resource manager config is defined")
}
}

Expand All @@ -984,15 +984,23 @@ func buildRM(
c := cfg.ResourceManager
switch {
case c.AgentRM != nil:
rms[c.Name()] = agentrm.New(db, echo, cfg, opts, cert)
agentRM, err := agentrm.New(db, echo, cfg, opts, cert)
if err != nil {
return nil, fmt.Errorf("resource manager %s: %w", c.Name(), err)
}
rms[c.Name()] = agentRM
case c.KubernetesRM != nil:
rms[c.Name()] = kubernetesrm.New(db, cfg, tcd, opts, cert)
k8sRM, err := kubernetesrm.New(db, cfg, tcd, opts, cert)
if err != nil {
return nil, fmt.Errorf("resource manager %s: %w", c.Name(), err)
}
rms[c.Name()] = k8sRM
default:
panic("no expected resource manager config is defined")
return nil, fmt.Errorf("no expected resource manager config is defined")
}
}

return multirm.New(defaultRMName, rms)
return multirm.New(defaultRMName, rms), nil
}

// Run causes the Determined master to connect the database and begin listening for HTTP requests.
Expand Down Expand Up @@ -1183,14 +1191,16 @@ func (m *Master) Run(ctx context.Context, gRPCLogInitDone chan struct{}) error {
}

// Resource Manager.
m.rm = buildRM(m.db, m.echo, m.config.ResourceManagers(),
if m.rm, err = buildRM(m.db, m.echo, m.config.ResourceManagers(),
&m.config.TaskContainerDefaults,
&aproto.MasterSetAgentOptions{
MasterInfo: m.Info(),
LoggingOptions: m.config.Logging,
},
cert,
)
); err != nil {
return fmt.Errorf("could not initialize resource manager(s): %w", err)
}

jobservice.SetDefaultService(m.rm)

Expand Down
11 changes: 5 additions & 6 deletions master/internal/rm/agentrm/agent_resource_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func New(
config *config.ResourceManagerWithPoolsConfig,
opts *aproto.MasterSetAgentOptions,
cert *tls.Certificate,
) *ResourceManager {
) (*ResourceManager, error) {
agentService, agentUpdates := newAgentService(config.ResourcePools, opts)

e.GET("/agents", func(c echo.Context) error {
Expand Down Expand Up @@ -69,7 +69,7 @@ func newAgentResourceManager(
db *db.PgDB, config *config.ResourceManagerWithPoolsConfig,
cert *tls.Certificate, agentService *agents,
agentUpdates *queue.Queue[agentUpdatedEvent],
) *ResourceManager {
) (*ResourceManager, error) {
a := &ResourceManager{
syslog: logrus.WithField("component", "agentrm"),

Expand All @@ -85,9 +85,8 @@ func newAgentResourceManager(
for ix, config := range a.poolsConfig {
rp, err := a.createResourcePool(a.db, a.poolsConfig[ix], a.cert)
if err != nil {
// TODO(DET-9975): Don't panic.
a.syslog.WithError(err).Errorf("failed to create resource pool: %s", a.poolsConfig[ix].PoolName)
panic(err)
return nil, fmt.Errorf("failed to create resource pool: %s: %w",
a.poolsConfig[ix].PoolName, err)
}
a.pools[config.PoolName] = rp
}
Expand All @@ -103,7 +102,7 @@ func newAgentResourceManager(
}
}()

return a
return a, nil
}

// Allocate implements rm.ResourceManager.
Expand Down
3 changes: 2 additions & 1 deletion master/internal/rm/agentrm/resource_managers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ func TestResourceManagerForwardMessage(t *testing.T) {
},
}

rm := New(nil, echo.New(), conf.ResourceManagers()[0], nil, nil)
rm, err := New(nil, echo.New(), conf.ResourceManagers()[0], nil, nil)
assert.NilError(t, err, "error initializing resource manager")

taskSummary, err := rm.GetAllocationSummaries()
assert.NilError(t, err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,16 @@ func New(
taskContainerDefaults *model.TaskContainerDefaultsConfig,
opts *aproto.MasterSetAgentOptions,
cert *tls.Certificate,
) *ResourceManager {
) (*ResourceManager, error) {
tlsConfig, err := model.MakeTLSConfig(cert)
if err != nil {
panic(errors.Wrap(err, "failed to set up TLS config"))
return nil, fmt.Errorf("failed to set up TLS config: %w", err)
}

// TODO(DET-9833) clusterID should just be a `internal/config` package singleton.
clusterID, err := db.GetOrCreateClusterID("")
if err != nil {
panic(fmt.Errorf("getting clusterID: %w", err))
return nil, fmt.Errorf("getting clusterID: %w", err)
}
setClusterID(clusterID)

Expand Down Expand Up @@ -132,7 +132,7 @@ func New(
}()
k.pools[poolConfig.PoolName] = rp
}
return k
return k, nil
}

// Allocate implements rm.ResourceManager.
Expand Down

0 comments on commit a22656d

Please sign in to comment.