Skip to content

Commit

Permalink
feat: add slot stats to /agents endpoints (#9048)
Browse files Browse the repository at this point in the history
  • Loading branch information
hamidzr authored Mar 27, 2024
1 parent 10030a6 commit e8dba6d
Show file tree
Hide file tree
Showing 8 changed files with 700 additions and 150 deletions.
76 changes: 76 additions & 0 deletions harness/determined/common/api/bindings.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

63 changes: 63 additions & 0 deletions master/internal/api_agent_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package internal

import (
"testing"

"github.com/stretchr/testify/assert"

"github.com/determined-ai/determined/proto/pkg/agentv1"
"github.com/determined-ai/determined/proto/pkg/containerv1"
"github.com/determined-ai/determined/proto/pkg/devicev1"
)

func TestSummarizeSlots_EmptySlots(t *testing.T) {
slots := make(map[string]*agentv1.Slot)
stats := SummarizeSlots(slots)

assert.Equal(t, 0, len(stats.TypeStats))
assert.Equal(t, 0, len(stats.BrandStats))
}

// TestSummarizeSlots_VariousStates feeds SummarizeSlots three slots — a running
// enabled CUDA/Nvidia slot, a disabled CUDA/Nvidia slot, and a draining
// CPU/Intel slot — and checks the resulting per-type and per-brand counters.
func TestSummarizeSlots_VariousStates(t *testing.T) {
	input := make(map[string]*agentv1.Slot, 3)

	// Enabled CUDA slot with a running container.
	input["slot1"] = &agentv1.Slot{
		Device:    &devicev1.Device{Type: devicev1.Type_TYPE_CUDA, Brand: "Nvidia"},
		Enabled:   true,
		Draining:  false,
		Container: &containerv1.Container{State: containerv1.State_STATE_RUNNING},
	}
	// Disabled CUDA slot without a container.
	input["slot2"] = &agentv1.Slot{
		Device:   &devicev1.Device{Type: devicev1.Type_TYPE_CUDA, Brand: "Nvidia"},
		Enabled:  false,
		Draining: false,
	}
	// Enabled but draining CPU slot without a container.
	input["slot3"] = &agentv1.Slot{
		Device:   &devicev1.Device{Type: devicev1.Type_TYPE_CPU, Brand: "Intel"},
		Enabled:  true,
		Draining: true,
	}

	summary := SummarizeSlots(input)

	// Per-device-type expectations.
	cudaKey := devicev1.Type_TYPE_CUDA.String()
	cpuKey := devicev1.Type_TYPE_CPU.String()
	runningKey := containerv1.State_STATE_RUNNING.String()

	assert.Equal(t, 2, int(summary.TypeStats[cudaKey].Total))
	assert.Equal(t, 1, int(summary.TypeStats[cpuKey].Total))
	assert.Equal(t, 1, int(summary.TypeStats[cudaKey].Disabled))
	assert.Equal(t, 1, int(summary.TypeStats[cpuKey].Draining))
	assert.Equal(t, 1, int(summary.TypeStats[cudaKey].States[runningKey]))

	// Per-device-brand expectations.
	byBrand := summary.BrandStats
	assert.Equal(t, 2, int(byBrand["Nvidia"].Total))
	assert.Equal(t, 1, int(byBrand["Intel"].Total))
	assert.Equal(t, 1, int(byBrand["Nvidia"].Disabled))
	assert.Equal(t, 1, int(byBrand["Intel"].Draining))
}
60 changes: 60 additions & 0 deletions master/internal/api_agents.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,58 @@ import (
"github.com/determined-ai/determined/master/internal/cluster"
"github.com/determined-ai/determined/master/internal/grpcutil"
"github.com/determined-ai/determined/master/internal/rm/rmerrors"
"github.com/determined-ai/determined/proto/pkg/agentv1"
"github.com/determined-ai/determined/proto/pkg/apiv1"
)

// slotStats maps a string key to the aggregated device statistics for that
// key. SummarizeSlots uses it to build both the per-device-type and the
// per-device-brand maps of a SlotStats summary.
type slotStats map[string]*agentv1.DeviceStats

// SummarizeSlots aggregates a set of slots into per-device-type and
// per-device-brand statistics.
//
// For every slot, the bucket keyed by the device type's string name (in
// TypeStats) and the bucket keyed by the device brand (in BrandStats) are
// updated as follows:
//   - Total is incremented,
//   - Disabled is incremented when the slot is not enabled,
//   - Draining is incremented when the slot is draining,
//   - States[<container state name>] is incremented when the slot has a
//     container.
//
// Nil slot entries are skipped. A slot without a device is counted under the
// zero-value device type and the empty brand rather than causing a nil
// pointer dereference.
//
// The returned SlotStats always has non-nil TypeStats and BrandStats maps,
// including when slots is nil or empty.
func SummarizeSlots(slots map[string]*agentv1.Slot) *agentv1.SlotStats {
	stats := &agentv1.SlotStats{
		TypeStats:  make(slotStats),
		BrandStats: make(slotStats),
	}

	for _, slot := range slots {
		if slot == nil {
			continue
		}

		// Generated protobuf getters are nil-receiver safe, so a missing Device
		// yields the zero-value type and brand instead of panicking.
		device := slot.GetDevice()
		buckets := [...]*agentv1.DeviceStats{
			summarizeSlotsBucket(stats.TypeStats, device.GetType().String()),
			summarizeSlotsBucket(stats.BrandStats, device.GetBrand()),
		}

		container := slot.GetContainer()
		for _, bucket := range buckets {
			bucket.Total++
			if !slot.GetEnabled() {
				bucket.Disabled++
			}
			if slot.GetDraining() {
				bucket.Draining++
			}
			if container != nil {
				bucket.States[container.GetState().String()]++
			}
		}
	}
	return stats
}

// summarizeSlotsBucket returns the DeviceStats stored under key in byKey,
// creating and storing an empty one (with an initialized States map) on first
// use.
func summarizeSlotsBucket(
	byKey map[string]*agentv1.DeviceStats, key string,
) *agentv1.DeviceStats {
	bucket, ok := byKey[key]
	if !ok {
		bucket = &agentv1.DeviceStats{States: make(map[string]int32)}
		byKey[key] = bucket
	}
	return bucket
}

func (a *apiServer) GetAgents(
ctx context.Context, req *apiv1.GetAgentsRequest,
) (*apiv1.GetAgentsResponse, error) {
Expand All @@ -41,6 +90,17 @@ func (a *apiServer) GetAgents(
}
}

// PERF: can perhaps be done before RBAC.
for _, agent := range resp.Agents {
agent.SlotStats = SummarizeSlots(agent.Slots)
if req.ExcludeSlots {
agent.Slots = nil
}
if req.ExcludeContainers {
agent.Containers = nil
}
}

api.Sort(resp.Agents, req.OrderBy, req.SortBy, apiv1.GetAgentsRequest_SORT_BY_ID)
return resp, api.Paginate(&resp.Pagination, &resp.Agents, req.Offset, req.Limit)
}
Expand Down
Loading

0 comments on commit e8dba6d

Please sign in to comment.